import requests
from bs4 import BeautifulSoup

# Exceptions that the DOM-walking code below can raise when a page does not
# have the expected structure (missing tag -> None attribute access, short
# result list, concatenating None onto a URL prefix, ...).
_PARSE_ERRORS = (AttributeError, IndexError, KeyError, TypeError)


class Sites:
    """Base class for a paginated search-result scraper.

    Subclasses provide the site-specific pieces as static methods:

    * ``trns_key(key_word)`` -- encode the search phrase for the URL.
    * ``gen_url(key_word)``  -- yield result-page URLs (page 1, 2, ...).
    * ``get_item(soup)``     -- yield one dict per result on a page, with
      keys ``'hot'``, ``'size'``, ``'name'`` and either ``'link'`` or
      ``'link_url'`` (a detail page that still has to be fetched).
    * ``get_link(soup)``     -- extract the link from a detail page; only
      needed when ``get_item`` yields ``'link_url'``.
    * ``last_page(soup)``    -- return True when no further page exists.

    Every item that ends up with a truthy ``'link'`` is handed to
    ``signal.emit(item)``.
    """

    # Fallback so that __init__/__del__ can always print a name, even for a
    # subclass that forgot to define one.
    name = 'Sites'

    def __init__(self, signal):
        """Store *signal* (any object exposing ``emit(item)``)."""
        self.send_item = signal
        print(self.name, 'start ...')

    def __del__(self):
        print(self.name, 'stop !')

    def search(self, key_word):
        """Walk the result pages for *key_word* and emit every usable item.

        Paging stops as soon as a page cannot be fetched, ``last_page``
        reports the final page, or ``last_page`` itself fails (which
        usually means the page has no pager at all).
        """
        key_word = self.trns_key(key_word)
        for url in self.gen_url(key_word):
            try:
                soup = self.fetch_soup(url)
            except Exception:
                # Best effort: network or parser failure ends this search.
                break

            for item in self.get_item(soup):
                if 'link_url' in item:
                    item['link'] = self._resolve_link(item['link_url'])
                if item.get('link'):
                    self.send_item.emit(item)

            try:
                if self.last_page(soup):
                    break
            except Exception:
                break

    def _resolve_link(self, link_url):
        """Fetch the detail page *link_url* and return its link, or ''."""
        try:
            return self.get_link(self.fetch_soup(link_url))
        except Exception:
            # A broken detail page only drops this one item.
            return ''

    @staticmethod
    def fetch_soup(url, timeout=10):
        """Download *url* and return it parsed with the ``lxml`` parser.

        *timeout* (seconds) is passed to ``requests.get`` so that a stalled
        server cannot block the search indefinitely.  Exceptions raised by
        ``requests`` or ``BeautifulSoup`` propagate to the caller.
        """
        print('fetching "%s" ...' % url, end=' ')
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')
        print('ok!')
        return soup

    @staticmethod
    def trns_key(key_word):
        """Encode *key_word* for use in the site's search URL."""
        raise NotImplementedError

    @staticmethod
    def gen_url(key_word):
        """Yield the URLs of successive result pages for *key_word*."""
        raise NotImplementedError

    @staticmethod
    def last_page(soup):
        """Return True when *soup* is the final result page."""
        raise NotImplementedError

    @staticmethod
    def get_item(soup):
        """Yield one item dict per search result found in *soup*."""
        raise NotImplementedError

    @staticmethod
    def get_link(url):
        """Extract the link from a detail page.

        NOTE: despite the parameter name, ``search`` passes the parsed
        detail page (a soup), and every subclass names it ``soup``.  The
        name is kept here only for backward compatibility.
        """
        raise NotImplementedError


class Bobobt(Sites):
    name = 'Bobobt'

    @staticmethod
    def trns_key(key_word):
        return key_word.replace(' ', '%20')

    @staticmethod
    def gen_url(key_word):
        page_number = 0
        while True:
            page_number += 1
            yield 'https://www.bobobt.com/search/%s/%d/0/0.html' % (
                key_word, page_number)

    @staticmethod
    def last_page(soup):
        # The last pager anchor reads '>>' while a next page exists.
        return soup.find('div', 'pager').find_all('a')[-1].string != '>>'

    @staticmethod
    def get_item(soup):
        for dom_item in soup.find_all('div', 'ss'):
            try:
                la = dom_item.find_all('a')
                lb = dom_item.find_all('b')
                item = {
                    'hot': lb[4].string,
                    'size': lb[1].string,
                    'name': ''.join(la[0].strings).strip(),
                    'link': la[1].get('href'),
                }
            except _PARSE_ERRORS:
                # Malformed entry: skip it.
                continue
            # Yield outside the try so closing the generator is never
            # swallowed by the handler above.
            yield item


class BTcerise(Sites):
    name = 'BTcerise'

    @staticmethod
    def trns_key(key_word):
        return key_word.replace(' ', '%20')

    @staticmethod
    def gen_url(key_word):
        page_number = 0
        while True:
            page_number += 1
            yield 'http://www.btcerise.me/search?keyword=%s&p=%d' % (
                key_word, page_number)

    @staticmethod
    def last_page(soup):
        """Return True when the last pagination entry is disabled.

        ``'disable' in tag`` only tests the tag's direct children, so on
        its own it practically never matches.  The CSS classes of the
        entry are therefore inspected as well.
        NOTE(review): assumes the site marks the inactive "next" entry
        with a class containing 'disable' -- confirm against live markup.
        """
        last_li = soup.find('ul', 'pagination').find_all('li')[-1]
        if 'disable' in last_li:
            return True
        return any('disable' in css for css in last_li.get('class') or [])

    @staticmethod
    def get_item(soup):
        for dom_item in soup.find_all('div', 'r'):
            try:
                item = {
                    'hot': '-',
                    'size': dom_item.find_all('span', 'prop_val')[1].string,
                    'name': ''.join(dom_item.find('h5').strings),
                    'link': dom_item.find('div').find('a').get('href'),
                }
            except _PARSE_ERRORS:
                continue
            yield item


class Cililianc(Sites):
    name = 'Cililianc'

    @staticmethod
    def trns_key(key_word):
        return key_word.replace(' ', '%2B')

    @staticmethod
    def gen_url(key_word):
        page_number = 0
        while True:
            page_number += 1
            yield 'http://cililianc.com/list/%s/%d.html' % (
                key_word, page_number)

    @staticmethod
    def last_page(soup):
        return soup.find('div', 'pg').find_all('a')[-1].string not in ['下一页']

    @staticmethod
    def get_item(soup):
        result_list = soup.find('ul', 'mlist')
        if result_list is None:
            # No result list on this page: nothing to yield (previously
            # this raised AttributeError out of the generator).
            return
        for dom_item in result_list.find_all('li'):
            try:
                item = {
                    'hot': '-',
                    'size': dom_item.find('dt').find('span').string,
                    'name': ''.join(dom_item.find('a').strings),
                    'link': dom_item.find('div', 'dInfo').find('a').get('href'),
                }
            except _PARSE_ERRORS:
                continue
            yield item


class BTdao(Sites):
    name = 'BTdao'

    @staticmethod
    def trns_key(key_word):
        return key_word.replace(' ', '%2B')

    @staticmethod
    def gen_url(key_word):
        page_number = 0
        while True:
            page_number += 1
            yield 'http://www.btdao.me/list/%s-s2d-%d.html' % (
                key_word, page_number)

    @staticmethod
    def last_page(soup):
        return soup.find('div', 'pg').find_all('a')[-1].string not in ['下一页']

    @staticmethod
    def get_item(soup):
        for dom_item in soup.find_all('li'):
            try:
                la = dom_item.find('a')
                ls = dom_item.find('dl').find_all('span')
                item = {
                    'hot': ls[3].string,
                    'size': ls[0].string,
                    'name': la.get('title'),
                    'link_url': 'http://www.btdao.me' + la.get('href'),
                }
            except _PARSE_ERRORS:
                continue
            yield item

    @staticmethod
    def get_link(soup):
        return soup.find('dl', 'BotInfo').find('a').get('href')


class BTrabbit(Sites):
    name = 'BTrabbit'

    @staticmethod
    def trns_key(key_word):
        return key_word.replace(' ', '%20')

    @staticmethod
    def gen_url(key_word):
        page_number = 0
        while True:
            page_number += 1
            yield 'http://www.btrabbit.net/search/%s/default-%d.html' % (
                key_word, page_number)

    @staticmethod
    def last_page(soup):
        return soup.find('div', 'bottom-pager').find_all('a')[-1].string not in ['>']

    @staticmethod
    def get_item(soup):
        for dom_item in soup.find_all('div', 'search-item'):
            try:
                la = dom_item.find('a')
                lb = dom_item.find('div', 'item-bar').find_all('b')
                item = {
                    'hot': lb[2].string,
                    # NOTE(review): presumably normalises non-breaking
                    # spaces in the size text -- confirm against the site.
                    'size': lb[1].string.replace('\xa0', ' '),
                    'name': la.get('title'),
                    'link_url': 'http://www.btrabbit.net' + la.get('href'),
                }
            except _PARSE_ERRORS:
                continue
            yield item

    @staticmethod
    def get_link(soup):
        return soup.find_all('textarea')[0].string


class BTanw(Sites):
    name = 'BTanw'

    @staticmethod
    def trns_key(key_word):
        return key_word.replace(' ', '%20')

    @staticmethod
    def gen_url(key_word):
        page_number = 0
        while True:
            page_number += 1
            yield 'http://www.btanw.com/search/%s-hot-desc-%d' % (
                key_word, page_number)

    @staticmethod
    def last_page(soup):
        # The last pager anchor has an empty (or missing) href on the
        # final page.
        href = soup.find('div', 'bottom-pager').find_all('a')[-1].get('href')
        return not href

    @staticmethod
    def get_item(soup):
        for dom_item in soup.find_all('div', 'search-item'):
            try:
                la = dom_item.find('a')
                lb = dom_item.find('div', 'item-bar').find_all('b')
                item = {
                    'hot': lb[1].string,
                    'size': lb[3].string,
                    'name': ''.join(la.strings),
                    'link_url': 'http://www.btanw.com' + la.get('href'),
                }
            except _PARSE_ERRORS:
                continue
            yield item

    @staticmethod
    def get_link(soup):
        return soup.find('div', 'fileDetail').find_all('p')[5].find('a').get('href')


class Ciliba(Sites):
    name = 'Ciliba'

    @staticmethod
    def trns_key(key_word):
        return key_word.replace(' ', '+')

    @staticmethod
    def gen_url(key_word):
        page_number = 0
        while True:
            page_number += 1
            yield 'https://www.ciliba.org/s/%s_rel_%d.html' % (
                key_word, page_number)

    @staticmethod
    def last_page(soup):
        return soup.find('ul', 'pagination').find_all('a')[-1].string not in ['Last']

    @staticmethod
    def get_item(soup):
        for dom_item in soup.find_all('div', 'search-item'):
            try:
                la = dom_item.find('h3').find('a')
                lb = dom_item.find('div', 'item-bar').find_all('b')
                item = {
                    'hot': lb[2].string,
                    'size': lb[1].string,
                    'name': ''.join(la.strings),
                    'link_url': la.get('href'),
                }
            except _PARSE_ERRORS:
                continue
            yield item

    @staticmethod
    def get_link(soup):
        return soup.find('a', 'download').get('href')


# All available site scrapers, in the order they are offered to callers.
lst = (Bobobt, BTcerise, Cililianc, BTdao, BTrabbit, BTanw, Ciliba)