Optimized web crawler

Date: 2019-07-03

Author: Sun

The crawler code before optimization is as follows:

# -*- coding: utf-8 -*-  
__author__ = 'sun'
__date__ = '2019/7/3 10:53 AM'

from bs4 import BeautifulSoup as BSP4

import requests

g_set = set()

URL_LIST = [
   ('https://www.geyanw.com/lizhimingyan/list_33_1.html', '励志名言', 'lizhimingyan'),
   ('https://www.geyanw.com/renshenggeyan/list_32_1.html', '人生格言', 'renshenggeyan'),
   ('https://www.geyanw.com/mingyanjingju/list_37_1.html', '名言警句', 'mingyanjingju'),
   ('https://www.geyanw.com/html/mingrenmingyan/list_1_1.html', '名人名言', 'mingrenmingyan'),
   ('https://www.geyanw.com/html/dushumingyan/list_5_1.html', '读书名言', 'dushumingyan'),
   ('https://www.geyanw.com/html/jingdianmingyan/list_2_1.html', '经典名言', 'jingdianmingyan'),

]


def store_file(filename, response):
   html_doc = response.text
   with open("geyan_%s.html" % filename, "w", encoding="utf-8") as f:
      f.write(html_doc)


def download(url, filename="index", store_flag=True):
   '''
   :param url:        URL to crawl
   :param filename:   name of the HTML file to store
   :param store_flag: whether to persist the page locally
   :return:
   '''
   response = requests.get(url)

   if store_flag:
      store_file(filename, response)

   return response


def parse_page(page, ctype, url):
   response = download(url, store_flag=False)
   html_doc = response.content
   soup = BSP4(html_doc, "lxml")
   link_list = soup.select("#p_left .newlist ul h2 a")
   #print(link_list)
   index = 1
   for link in link_list:
      url_link = "https://www.geyanw.com" + link['href']
      print("ctype:" + ctype + ", page: " + str(page) + ", url_link: " + url_link)
      if url_link not in g_set:
         g_set.add(url_link)   # remember the link so the same article is not fetched twice
         index += 1
         # store_file() already appends ".html", so pass the bare name here
         response = download(url_link, filename="%s_%s" % (ctype, index), store_flag=False)


def parse(response):
   url = response.url
   #print(url)
   base_urls = url.split("/list_")
   print(base_urls)
   domain = base_urls[0]
   init_html = base_urls[-1]
   print(domain)
   print(init_html)
   ctype = init_html.split("_")[0]    # numeric list id from the URL, e.g. "33"
   cindex = init_html.split("_")[1].split(".")[0]    # page index, e.g. "1"
   g_set.add(url)

   html_doc  = response.content
   soup = BSP4(html_doc, "lxml")
   #page_list = soup.select("#p_left .newlist .pagelist li a") # pagination links
   #print(page_list)

   total_num = soup.select("#p_left .newlist .pagelist .pageinfo strong")[0]
   page_max = int(total_num.get_text())

   # crawl the remaining pages (2..page_max) of this category
   for page in range(2, page_max + 1):
      parse_page(page, ctype, "%s/list_%s_%s.html" % (domain, ctype, page))


def process(entry_url):
   try:
      response = download(entry_url, store_flag=False)
      parse(response)    # keep downloading and parsing separate
      return True
   except Exception as e:
      return False

'''
Crawl using multiple processes
'''
def multprocess_run():
   from multiprocessing import Pool
   pool = Pool(processes=8)
   result = []
   for (entry_url, name, type) in URL_LIST:
      pc = pool.apply_async(process, args=(entry_url,))
      result.append(pc)   # AsyncResult objects; pc.get() would return process()'s True/False

   pool.close()
   pool.join()


'''
Use coroutines to handle the concurrency
'''
import asyncio

@asyncio.coroutine
def async_io_loop(entry_url):
   # process() is an ordinary blocking function, so "yield from process(entry_url)"
   # would raise a TypeError; hand the call off to the default thread-pool executor
   loop = asyncio.get_event_loop()
   yield from loop.run_in_executor(None, process, entry_url)


def async_run():
   loop = asyncio.get_event_loop()
   tasks = [async_io_loop(url) for (url, name, type)  in  URL_LIST]
   loop.run_until_complete(asyncio.wait(tasks))
   loop.close()


import threading
import queue
import time

class Worker(threading.Thread):
   def __init__(self, name, queue):
      threading.Thread.__init__(self, name=name)   # pass the worker name through to Thread
      self.queue = queue
      self.start()

   def run(self):
      while True:
         try:
            # non-blocking get: another worker may drain the queue between an
            # empty() check and a blocking get(), which would hang this thread
            url = self.queue.get(block=False)
         except queue.Empty:
            break
         print(self.getName() + " process " + str(url))
         process(url)
         self.queue.task_done()


def multithread_run():
   squeue = queue.Queue()
   for (url, name, type) in URL_LIST:
      squeue.put(url)

   for i in range(10):
      threadName = 'Thread' + str(i)
      Worker(threadName, squeue)

   squeue.join()


def main():

   #multprocess_run()

   #async_run()

   multithread_run()

   # for (url, name, type) in URL_LIST:
   #  process(url, name, type)
   #[process(url, name, type)  for (url, name, type) in URL_LIST]
   # entry_url = "https://www.geyanw.com/lizhimingyan/list_33_1.html"
   # process(entry_url)

if __name__ == "__main__":
   main()
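
One further note on the coroutine variant above: process() ultimately calls requests.get(), which blocks, so even with an event loop the downloads cannot overlap unless they are handed off to threads. A fully non-blocking version would replace requests with an asynchronous HTTP client. The following is a minimal sketch, assuming the aiohttp package is installed; fetch_entry, crawl_all, and aiohttp_run are illustrative names and are not part of the original crawler.

import asyncio
import aiohttp

async def fetch_entry(session, entry_url):
   # download one entry page without blocking the event loop
   async with session.get(entry_url) as resp:
      return await resp.text()

async def crawl_all():
   # one shared session; all entry pages are fetched concurrently
   async with aiohttp.ClientSession() as session:
      tasks = [fetch_entry(session, url) for (url, name, ctype) in URL_LIST]
      return await asyncio.gather(*tasks)

def aiohttp_run():
   # plays the same role as async_run() above, but the downloads themselves are non-blocking
   loop = asyncio.get_event_loop()
   return loop.run_until_complete(crawl_all())

Each returned HTML document could then be fed to the same BeautifulSoup parsing used in parse() and parse_page() above.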

Source: www.cnblogs.com/sunBinary/p/11129828.html