# coding=utf-8
# @author: Mana_菜小刀
"""Baidu index checker.

Reads URLs (one per line) from a text file named 'urls', queries Baidu for
each one with a pool of worker threads, and appends a row per URL to
result.xls recording whether the page is indexed (收录/未收录) and whether
a thumbnail image is shown (有图/无图).
"""
import queue
import threading

import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}

# Serializes the open/copy/save cycle on result.xls across worker threads;
# xlrd + xlutils + xlwt cannot safely rewrite the same file concurrently.
_xls_lock = threading.Lock()

# Create the result workbook with a header row once, at startup.
myxls = xlwt.Workbook()
sheet1 = myxls.add_sheet('收录search')
lst_name = ['url', '收录/未收录', '图片']
for i, title in enumerate(lst_name):
    sheet1.write(0, i, title)
myxls.save('result.xls')


def log(*args, **kwargs):
    """Thin wrapper over print so output handling can be changed in one place."""
    print(*args, **kwargs)


class baiduSpider(threading.Thread):
    """Worker thread that drains a shared queue of Baidu search URLs and
    records the index status of each into result.xls."""

    def __init__(self, queue_li, name):
        """
        :param queue_li: queue.Queue of Baidu search URLs to process.
        :param name: label for this worker (kept for interface compatibility).
        """
        threading.Thread.__init__(self)
        self._queue = queue_li
        self._name = name

    def run(self):
        # Pull URLs until the shared queue is exhausted.
        while not self._queue.empty():
            url = self._queue.get()
            try:
                self.get_url(url)
            except Exception as e:
                # Best-effort crawler: log the failure and move on to the
                # next URL rather than killing the worker thread.
                log(e)

    def get_url(self, url):
        """Fetch one Baidu result page and append a result row to result.xls.

        :param url: full Baidu search URL ('http://www.baidu.com/s?wd=...').
        """
        requests.adapters.DEFAULT_RETRIES = 5
        session = requests.session()
        # Avoid exhausting sockets when checking many URLs in a row.
        session.keep_alive = False
        resp = session.get(url=url, headers=headers)

        page = etree.HTML(resp.text)
        # Result-count text, e.g. "百度为您找到相关结果约1,000个".
        strs = page.xpath('//span[@class="nums_text"]//text()')
        # Thumbnail images shown next to indexed results.
        imgs = page.xpath('//img[@class="c-img c-img6"]/@src')

        search_mo = ['收录', '未收录']
        img_mo = ['有图', '无图']
        url_mo = url.replace('http://www.baidu.com/s?wd=', '')

        # Indexed when a result-count string exists and it is not Baidu's
        # "about 0 results" message. Guarding on `strs` also prevents an
        # IndexError when the count span is missing from the page.
        indexed = bool(strs) and strs[0] != '百度为您找到相关结果约0个'

        if indexed and len(imgs) > 0:
            row = (url_mo, search_mo[0], img_mo[0])
        elif indexed:
            row = (url_mo, search_mo[0], img_mo[1])
        else:
            row = (url_mo, search_mo[1], img_mo[1])

        # Append the row under a lock: without it, concurrent workers clobber
        # each other's read-modify-write of the same .xls file.
        with _xls_lock:
            workbook = xlrd.open_workbook('result.xls', formatting_info=True)
            sheet = workbook.sheet_by_index(0)
            next_row = sheet.nrows  # first empty row index
            newbook = copy(workbook)
            newsheet = newbook.get_sheet(0)
            for col, value in enumerate(row):
                newsheet.write(next_row, col, value)
            newbook.save('result.xls')

        log(row[1], '丨', row[2], '丨', row[0])


def main():
    """Queue a Baidu search per URL from the 'urls' file and run the pool."""
    queue_li = queue.Queue()
    threads = []
    thread_count = 10

    # Put the URLs to check into a plain text file named 'urls',
    # one URL per line.
    with open('urls', 'r', encoding='utf-8', errors="ignore") as f:
        content = f.read()

    for url in content.split('\n'):
        if len(url) > 0:
            queue_li.put('http://www.baidu.com/s?wd={}'.format(url))

    # Fixed-size pool: every worker drains the same queue until empty.
    for i in range(thread_count):
        threads.append(baiduSpider(queue_li, 'spider-{}'.format(i)))
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    log("Mana好伟大!(^-^)V")
    main()