"""BTSearcher site scrapers.

One class per torrent search site. Each class defines `name`,
`site_enable`, and the static scraping hooks used by `Sites.search`.
"""

import chardet
import requests
from bs4 import BeautifulSoup
def lst():
    """Return a tuple of every module-level object flagged as an enabled site.

    Site classes opt in by defining a truthy ``site_enable`` class attribute;
    everything else in the module namespace is silently skipped.
    """
    return tuple(
        value for value in globals().values()
        if getattr(value, 'site_enable', False)
    )
class Sites():
    """Base class for torrent search-site scrapers.

    Subclasses must provide ``name``, ``site_enable`` and the static hooks
    ``trns_key``, ``gen_url``, ``last_page``, ``get_item`` — plus ``get_link``
    when ``get_item`` yields a ``link_url`` that must be resolved to a magnet
    link via a second page fetch.
    """

    def __init__(self, signal):
        # Qt-style signal used to push each result dict back to the caller.
        self.send_item = signal
        print(self.name, 'start ...')

    def __del__(self):
        print(self.name, 'stop !')

    def stop(self):
        # Cooperative cancellation flag polled inside search().
        self.to_stop = True

    def search(self, key_word):
        """Crawl successive result pages for key_word, emitting one item per hit.

        Stops when stop() is called, a page fetch fails, or last_page()
        reports (or fails to report) that there are no more pages.
        """
        self.to_stop = False
        key_word = self.trns_key(key_word)
        for url in self.gen_url(key_word):
            if self.to_stop:
                return
            try:
                soup = self.fetch_soup(url)
            except Exception as e:
                print('\n' + str(e))
                break
            for item in self.get_item(soup):
                if self.to_stop:
                    return
                if 'link_url' in item:
                    # Two-step sites: fetch the detail page to obtain the link.
                    try:
                        item['link'] = self.get_link(self.fetch_soup(item['link_url']))
                    except Exception:
                        item['link'] = ''
                if item['link']:
                    self.send_item.emit(item)
            try:
                if self.last_page(soup):
                    break
            except Exception:
                # A malformed/missing pager is treated as the last page.
                break

    @staticmethod
    def fetch_soup(url):
        """Download url and return a BeautifulSoup tree with detected encoding."""
        print('fetching "' + url + '" ...', end=' ')
        req = requests.get(url)
        # Detect the charset from the raw response bytes: req.encoding may be
        # None (no charset header), and re-encoding req.text would bake in any
        # misdecoded characters before detection.
        det = chardet.detect(req.content)
        req.encoding = det['encoding']
        soup = BeautifulSoup(req.text, 'lxml')
        print('ok!')
        return soup

    @staticmethod
    def gen_url(key_word):
        raise NotImplementedError

    @staticmethod
    def last_page(soup):
        raise NotImplementedError

    @staticmethod
    def get_item(soup):
        raise NotImplementedError

    @staticmethod
    def get_link(url):
        raise NotImplementedError
class Bobobt(Sites):
    """Scraper for bobobt.com — magnet links appear directly on result pages."""
    name = 'Bobobt'
    site_enable = True

    @staticmethod
    def trns_key(key_word):
        # URL-encode spaces for the search path segment.
        return key_word.replace(' ', '%20')

    @staticmethod
    def gen_url(key_word):
        """Yield result-page URLs for pages 1, 2, 3, ..."""
        page_number = 0
        while True:
            page_number += 1
            yield 'https://www.bobobt.com/search/%s/%d/0/0.html' % (key_word, page_number)

    @staticmethod
    def last_page(soup):
        # Done when the pager's final link is not a "next page" marker.
        # NOTE(review): the two list entries render identically ('>>'); one may
        # originally have been a full-width variant lost in transit — confirm.
        return soup.find('div', 'pager').find_all('a')[-1].string not in ['>>', '>>']

    @staticmethod
    def get_item(soup):
        """Yield one result dict per search hit found on the page."""
        for dom_item in soup.find_all('div', 'ss'):
            try:
                la = dom_item.find_all('a')
                lb = dom_item.find_all('b')
                yield {
                    'hot': lb[4].string,
                    'size': lb[1].string,
                    'name': ''.join(la[0].strings).strip(),
                    'link': la[1].get('href')
                }
            except Exception:
                # Rows lacking the expected structure are not results; skip.
                continue
class BTcerise(Sites):
    """Scraper for btcerise.me — magnet links appear directly on result pages."""
    name = 'BTcerise'
    site_enable = True

    @staticmethod
    def trns_key(key_word):
        # URL-encode spaces for the query string.
        return key_word.replace(' ', '%20')

    @staticmethod
    def gen_url(key_word):
        """Yield result-page URLs for pages 1, 2, 3, ..."""
        page_number = 0
        while True:
            page_number += 1
            yield 'http://www.btcerise.me/search?keyword=%s&p=%d' % (key_word, page_number)

    @staticmethod
    def last_page(soup):
        # NOTE(review): `in` on a bs4 Tag tests membership among its children,
        # not its CSS classes; this may have been meant as
        # "'disable' in li.get('class', [])" — confirm against live markup.
        return 'disable' in soup.find('ul', 'pagination').find_all('li')[-1]

    @staticmethod
    def get_item(soup):
        """Yield one result dict per search hit found on the page."""
        for dom_item in soup.find_all('div', 'r'):
            try:
                yield {
                    'hot': '-',
                    'size': dom_item.find_all('span', 'prop_val')[1].string,
                    'name': ''.join(dom_item.find('h5').strings),
                    'link': dom_item.find('div').find('a').get('href')
                }
            except Exception:
                # Rows lacking the expected structure are not results; skip.
                continue
class Cililianc(Sites):
    """Scraper for cililianc.com (disabled via site_enable)."""
    name = 'Cililianc'
    site_enable = False

    @staticmethod
    def trns_key(key_word):
        # Spaces become '+' (double-encoded as %2B for the URL path).
        return key_word.replace(' ', '%2B')

    @staticmethod
    def gen_url(key_word):
        """Yield result-page URLs for pages 1, 2, 3, ..."""
        page_number = 0
        while True:
            page_number += 1
            yield 'http://cililianc.com/list/%s/%d.html' % (key_word, page_number)

    @staticmethod
    def last_page(soup):
        # Done when the pager's final link is not the "next page" label.
        return soup.find('div', 'pg').find_all('a')[-1].string not in ['下一页']

    @staticmethod
    def get_item(soup):
        """Yield one result dict per search hit found on the page."""
        for dom_item in soup.find('ul', 'mlist').find_all('li'):
            try:
                yield {
                    'hot': '-',
                    'size': dom_item.find('dt').find('span').string,
                    'name': ''.join(dom_item.find('a').strings),
                    'link': dom_item.find('div', 'dInfo').find('a').get('href')
                }
            except Exception:
                # Rows lacking the expected structure are not results; skip.
                continue
class BTdao(Sites):
    """Scraper for btdao.me — items carry a link_url resolved via get_link()."""
    name = 'BTdao'
    site_enable = True

    @staticmethod
    def trns_key(key_word):
        # Spaces become '+' (double-encoded as %2B for the URL path).
        return key_word.replace(' ', '%2B')

    @staticmethod
    def gen_url(key_word):
        """Yield result-page URLs for pages 1, 2, 3, ..."""
        page_number = 0
        while True:
            page_number += 1
            yield 'http://www.btdao.me/list/%s-s2d-%d.html' % (key_word, page_number)

    @staticmethod
    def last_page(soup):
        # Done when the pager's final link is not the "next page" label.
        return soup.find('div', 'pg').find_all('a')[-1].string not in ['下一页']

    @staticmethod
    def get_item(soup):
        """Yield one result dict per search hit; 'link_url' points at the detail page."""
        for dom_item in soup.find_all('li'):
            try:
                la = dom_item.find('a')
                ls = dom_item.find('dl').find_all('span')
                yield {
                    'hot': ls[3].string,
                    'size': ls[0].string,
                    'name': la.get('title'),
                    'link_url': 'http://www.btdao.me' + la.get('href')
                }
            except Exception:
                # Rows lacking the expected structure are not results; skip.
                continue

    @staticmethod
    def get_link(soup):
        """Extract the download link from a parsed detail page."""
        return soup.find('dl', 'BotInfo').find('a').get('href')
class BTrabbit(Sites):
    """Scraper for btrabbit.net — items carry a link_url resolved via get_link()."""
    name = 'BTrabbit'
    site_enable = True

    @staticmethod
    def trns_key(key_word):
        # URL-encode spaces for the search path segment.
        return key_word.replace(' ', '%20')

    @staticmethod
    def gen_url(key_word):
        """Yield result-page URLs for pages 1, 2, 3, ..."""
        page_number = 0
        while True:
            page_number += 1
            yield 'http://www.btrabbit.net/search/%s/default-%d.html' % (key_word, page_number)

    @staticmethod
    def last_page(soup):
        # Done when the pager's final link is not the "next" arrow.
        return soup.find('div', 'bottom-pager').find_all('a')[-1].string not in ['>']

    @staticmethod
    def get_item(soup):
        """Yield one result dict per search hit; 'link_url' points at the detail page."""
        for dom_item in soup.find_all('div', 'search-item'):
            try:
                la = dom_item.find('a')
                lb = dom_item.find('div', 'item-bar').find_all('b')
                yield {
                    'hot': lb[2].string,
                    # NOTE(review): both replace() arguments render as a plain
                    # space here; the original likely replaced a non-breaking
                    # space (U+00A0) with ' ' — verify against the site markup.
                    'size': lb[1].string.replace(' ', ' '),
                    'name': la.get('title'),
                    'link_url': 'http://www.btrabbit.net' + la.get('href')
                }
            except Exception:
                # Rows lacking the expected structure are not results; skip.
                continue

    @staticmethod
    def get_link(soup):
        """Extract the magnet text from the first <textarea> on a detail page."""
        return soup.find_all('textarea')[0].string
class BTanw(Sites):
    """Scraper for btanw.com — items carry a link_url resolved via get_link()."""
    name = 'BTanw'
    site_enable = True

    @staticmethod
    def trns_key(key_word):
        # URL-encode spaces for the search path segment.
        return key_word.replace(' ', '%20')

    @staticmethod
    def gen_url(key_word):
        """Yield result-page URLs (hot-sorted, descending) for pages 1, 2, 3, ..."""
        page_number = 0
        while True:
            page_number += 1
            yield 'http://www.btanw.com/search/%s-hot-desc-%d' % (key_word, page_number)

    @staticmethod
    def last_page(soup):
        # Done when the pager's final link has an empty href.
        return len(soup.find('div', 'bottom-pager').find_all('a')[-1].get('href')) == 0

    @staticmethod
    def get_item(soup):
        """Yield one result dict per search hit; 'link_url' points at the detail page."""
        for dom_item in soup.find_all('div', 'search-item'):
            try:
                la = dom_item.find('a')
                lb = dom_item.find('div', 'item-bar').find_all('b')
                yield {
                    'hot': lb[1].string,
                    'size': lb[3].string,
                    'name': ''.join(la.strings),
                    'link_url': 'http://www.btanw.com' + la.get('href')
                }
            except Exception:
                # Rows lacking the expected structure are not results; skip.
                continue

    @staticmethod
    def get_link(soup):
        """Extract the download link from the sixth <p> of the detail block."""
        return soup.find('div', 'fileDetail').find_all('p')[5].find('a').get('href')
class Ciliba(Sites):
    """Scraper for ciliba.org — items carry a link_url resolved via get_link()."""
    name = 'Ciliba'
    site_enable = True

    @staticmethod
    def trns_key(key_word):
        # Spaces become '+' in this site's search slug.
        return key_word.replace(' ', '+')

    @staticmethod
    def gen_url(key_word):
        """Yield result-page URLs (relevance-sorted) for pages 1, 2, 3, ..."""
        page_number = 0
        while True:
            page_number += 1
            yield 'https://www.ciliba.org/s/%s_rel_%d.html' % (key_word, page_number)

    @staticmethod
    def last_page(soup):
        # Done when the pagination's final link is not the "Last" label.
        return soup.find('ul', 'pagination').find_all('a')[-1].string not in ['Last']

    @staticmethod
    def get_item(soup):
        """Yield one result dict per search hit; 'link_url' points at the detail page."""
        for dom_item in soup.find_all('div', 'search-item'):
            try:
                la = dom_item.find('h3').find('a')
                lb = dom_item.find('div', 'item-bar').find_all('b')
                yield {
                    'hot': lb[2].string,
                    'size': lb[1].string,
                    'name': ''.join(la.strings),
                    'link_url': la.get('href')
                }
            except Exception:
                # Rows lacking the expected structure are not results; skip.
                continue

    @staticmethod
    def get_link(soup):
        """Extract the download link from the detail page's 'download' anchor."""
        return soup.find('a', 'download').get('href')