一开始使用的保存函数会把原文件的内容整体覆盖掉;改用 xlutils.copy 先复制原有工作簿、再在其基础上追加写入并保存,问题即可解决。
# NOTE(review): this file was recovered from a one-line blog paste. The regex
# literals below that collapsed to a bare space were almost certainly HTML
# fragments ('\r', '<br />', '&nbsp;', ...) mangled by the scrape — confirm
# against the original source before relying on them.
from urllib.error import HTTPError

from xlutils.copy import copy
import re
import pymongo
import requests
import xlrd
import xlwt


class DataTool(object):
    """Cleans the description field (index 4) of a scraped book 6-tuple."""

    pattern_n = re.compile(r'\n', re.S)
    # NOTE(review): the next three patterns were garbled to a single space in
    # the paste; the originals were presumably '\r', '<br>' and '&nbsp;'.
    pattern_r = re.compile(r' ', re.S)
    pattern_br = re.compile(r' ', re.S)
    pattern_b = re.compile(r' ', re.S)

    def process_tuple_data(self, origin_tuple_data):
        """Return a copy of the 6-tuple with field 4 stripped of noise.

        :param origin_tuple_data: 6-tuple from ``parse_list_html``'s regex.
        :return: same 6-tuple with every pattern removed from element 4.
        """
        content = origin_tuple_data[4]
        for pattern in (self.pattern_n, self.pattern_r,
                        self.pattern_br, self.pattern_b):
            content = re.sub(pattern, '', content)
        return (origin_tuple_data[0], origin_tuple_data[1],
                origin_tuple_data[2], origin_tuple_data[3],
                content, origin_tuple_data[5])


class QiShuSpider(object):
    """Scrapes book listings from www.qisuu.la and appends them to qishu.xls."""

    # NOTE(review): this Mongo client is created but never used anywhere in
    # the class — either wire it up or remove it.
    client = pymongo.MongoClient('localhost')
    db = client['dbmovie']

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }
        self.base_url = 'https://www.qisuu.la/soft/sort01/'
        self.tool = DataTool()

    def get_total_page_num(self):
        """
        获取搜索结果的总页数
        :return: total page count parsed from the first listing page.
        """
        response = requests.get(self.base_url, headers=self.headers)
        html = response.text
        # NOTE(review): the tail of this pattern was garbled in the paste
        # (a space plus a raw line break); '\s' stands in for whatever
        # whitespace/entity originally followed the captured number.
        total_num_pattern = re.compile(r'<div class="tspage".*?页次:1/(.*?)\s', re.S)
        total_num = int(re.findall(total_num_pattern, html)[0])
        return total_num

    def get_list_html(self, page, page_num):
        """Fetch one listing page; return its HTML text, or None on HTTP error."""
        list_url = 'https://www.qisuu.la/soft/sort0{}/index_{}.html'.format(page, page_num)
        try:
            response = requests.get(list_url, headers=self.headers)
            html = response.text
        except HTTPError as e:
            print('列表页异常:url={}, error={}'.format(list_url, e))
            # BUG FIX: the original returned the 2-tuple (None, None), which
            # is truthy, so callers' `if list_html:` guard never filtered
            # failed requests (and the return type was inconsistent with the
            # success path). Return a plain None instead.
            return None
        else:
            return html

    def parse_list_html(self, html):
        """Extract book tuples from a listing page.

        :param html: listing-page HTML, or a falsy value.
        :return: list of cleaned 6-tuples, or None when html is falsy.
        """
        if html:
            pattern = re.compile(
                r'<div class="s">(.*?)<br.*?>(.*?)<br>.*?<em class="lstar3">.*?<br>(.*?)</div>.*?<img .*?>(.*?)</a>.*?<div class="u">(.*?)</div>.*?<a.*?>(.*?)</a>',
                re.S)
            raw_tuples = re.findall(pattern, html)
            # BUG FIX: the original printed the cleaned tuples but returned
            # the raw ones, silently discarding DataTool's cleanup before
            # data_write; return the cleaned list instead.
            cleaned = []
            for raw in raw_tuples:
                new_detail = self.tool.process_tuple_data(raw)
                print(new_detail)
                cleaned.append(new_detail)
            return cleaned
        else:
            print('html源代码为None')
            return None

    def data_write(self, k, qishu, datas):
        """Append the rows in `datas` to qishu.xls starting at row `k`."""
        # 注意一定要先创建.xls文件,而且因为版本问题不能是xlsx文件
        # (xlrd/xlwt cannot handle .xlsx, so the `qishu` argument — the
        # caller passes 'qishu.xlsx' — is intentionally ignored here.)
        old_file = xlrd.open_workbook('qishu.xls')
        # xlutils.copy preserves the rows already in the workbook, so this
        # appends instead of overwriting (see note at top of file).
        new_file = copy(old_file)
        new_sheet = new_file.get_sheet(0)
        row = k  # 已存在文件中的数据行数
        for data in datas:
            for col, value in enumerate(data):
                new_sheet.write(row, col, value)
            row += 1
        new_file.save('qishu.xls')
        print('写入成功')

    def start_spider(self, i, x, num):
        """Fetch, parse and persist listing page `num` of category `x`,
        writing rows starting at spreadsheet row `i`."""
        print('正在请求第{}页'.format(num))
        list_html = self.get_list_html(x, num)
        if list_html:
            detail_urls = self.parse_list_html(list_html)
            self.data_write(i, 'qishu.xlsx', detail_urls)


if __name__ == '__main__':
    obj = QiShuSpider()
    page = obj.get_total_page_num()
    # pool = Pool(1)
    # for x in range(1, 12):
    #     for y in range(1, page):
    #         pool.map(obj.start_spider, x, y)
    # pool.close()
    # pool.join()
    i = 0
    for x in range(1, 12):          # categories sort01 .. sort011
        for y in range(1, page):
            obj.start_spider(i, x, y)
            i = i + 15              # assumes 15 entries per listing page

# 运行结果: (run-result output was omitted in the original post)