My introduction to crawlers (2)


1. Use a thread pool

Add the following code:

with ThreadPoolExecutor(max_workers=10) as thread_pool:
    content_chapter_list = thread_pool.map(
        self.__get_content_chapter, link_chapter)

The map() method spreads the tasks across a pool of worker threads; max_workers=10 caps the pool at 10 threads. map() returns the results in the same order as the input iterable, no matter which task finishes first, so the chapters cannot end up out of order. Its first argument is the callable each thread runs, and its second is the iterable of arguments, one element per task; leaving the with block waits for every task to finish. Keep the thread count modest, which is friendlier to the server.
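For illustration, here is a minimal, self-contained sketch of the same pattern (fetch_length and the urls list are placeholders, not part of the crawler): each url is handed to a worker thread, and map() still yields the results in input order.

from concurrent.futures import ThreadPoolExecutor

import requests


def fetch_length(url):
    # Placeholder task: download one page and return the length of its text.
    resp = requests.get(url, timeout=10)
    return len(resp.text)


urls = ['http://www.xbiquge.la/0/745/'] * 3  # placeholder list of pages

with ThreadPoolExecutor(max_workers=10) as pool:
    # map() returns results in the order of `urls`, not in completion order.
    lengths = list(pool.map(fetch_length, urls))

print(lengths)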

2. Complete code

from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import requests
import time
import sys
import re
import threading


class fiction():

    __chapter_download = 0     # number of chapters downloaded so far
    __chapter_total = 0        # total number of chapters
    __lock = threading.Lock()  # protects the shared progress counter

    def __init__(self, name, url_ws, url_lp, encode,
                 attrs_div_lp={}, attrs_div_ct={}):
        self.__name = name                  # book name
        self.__url_ws = url_ws              # site url
        self.__url_lp = url_lp              # url of the link (table-of-contents) page
        self.__attrs_div_lp = attrs_div_lp  # attributes of the div holding the chapter links on the TOC page
        self.__attrs_div_ct = attrs_div_ct  # attributes of the div holding the text on a chapter page
        self.__encode = encode              # encoding to use

    def Update(self, name, url_ws, url_lp, encode,
               attrs_div_lp={}, attrs_div_ct={}):
        '''Reset the parameters.

        All parameters must be reset at the same time, otherwise errors may occur.

        '''
        self.__name = name                  # book name
        self.__url_ws = url_ws              # site url
        self.__url_lp = url_lp              # url of the link (table-of-contents) page
        self.__attrs_div_lp = attrs_div_lp  # attributes of the div holding the chapter links on the TOC page
        self.__attrs_div_ct = attrs_div_ct  # attributes of the div holding the text on a chapter page
        self.__encode = encode              # encoding to use
        self.__chapter_download = 0         # reset the progress counter for the new book

    def __get_Link_chapter(self):
        '''Get the url of each chapter from the table-of-contents page.

        Parse the TOC page, locate the div holding the chapter urls
        by its attributes, then collect and return the chapter links.

        '''

        # If the request fails (connection or timeout error), wait 1s and retry
        req_lp = None
        for try_counter in range(10):
            try:
                req_lp = requests.get(self.__url_lp, timeout=10)
                break
            except requests.exceptions.ConnectionError:
                print('Attempt %d to fetch the TOC page: ConnectionError' % (try_counter+1))
            except requests.exceptions.Timeout:
                print('Attempt %d to fetch the TOC page: Timeout' % (try_counter+1))
            except:
                print('Attempt %d to fetch the TOC page: other error' % (try_counter+1))
            time.sleep(1)

        if req_lp is None:
            print('Failed to fetch the TOC page')
            return
        else:
            try:
                req_lp.encoding = req_lp.apparent_encoding
                # Build a BeautifulSoup object with the lxml parser
                bs_lp = BeautifulSoup(req_lp.text, 'lxml')
                # Find every div with the given attributes
                div_list = bs_lp.find_all('div', attrs=self.__attrs_div_lp)
                # Collect every a tag inside those divs
                link_chapter = []
                for div in div_list:
                    link_chapter += div.find_all('a')
                return link_chapter
            except TypeError:
                print('Error while parsing the TOC page: TypeError')
                return
            # except:
            #     print('Error while parsing the TOC page: other error')
            #     return

    def __get_content_chapter(self, link):
        '''Get the content of one chapter.

        :param link: an a tag parsed from the TOC page,
                     holding the chapter name and url

        '''

        name_chapter = link.string
        url_chapter = self.__url_ws + link['href']  # join to get the chapter page url
        req_ct = None
        for try_counter in range(10):
            try:
                req_ct = requests.get(url_chapter, timeout=10)
                break
            except requests.exceptions.ConnectionError:
                print('Attempt %d to fetch the chapter page: ConnectionError' % (try_counter+1))
            except requests.exceptions.Timeout:
                print('Attempt %d to fetch the chapter page: Timeout' % (try_counter+1))
            except:
                print('Attempt %d to fetch the chapter page: other error' % (try_counter+1))
            time.sleep(1)

        if req_ct is None:
            print('Failed to fetch the chapter: ' + name_chapter)
            content_chapter = name_chapter + '\n\n'
        else:
            try:
                req_ct.encoding = self.__encode
                bs_ct = BeautifulSoup(req_ct.text, 'lxml')
                content = bs_ct.find('div', attrs=self.__attrs_div_ct)
                # Turn <br/> into newlines and non-breaking spaces into spaces,
                # then strip the remaining tags
                content = str(content).replace('<br/>', '\n').replace('\xa0', ' ')
                content = BeautifulSoup(content, 'lxml').get_text()
                content_chapter = name_chapter + '\n\n' + content + '\n\n'
            except TypeError:
                print('Error while parsing the chapter page: TypeError ' + name_chapter)
                content_chapter = name_chapter + '\n\n'
            except:
                print('Error while parsing the chapter page: other error ' + name_chapter)
                content_chapter = name_chapter + '\n\n'

        with self.__lock:  # the progress counter is shared by all worker threads
            self.__chapter_download += 1
            sys.stdout.write('Progress: %.1f%%\r' %
                             (self.__chapter_download / self.__chapter_total * 100))
        return content_chapter

    def write(self, path_save):
        '''Write the downloaded book to the given path.

        :param path_save: the directory in which to save the file

        '''
        path_save = path_save + '\\' + self.__name + '.txt'
        link_chapter = self.__get_Link_chapter()
        if link_chapter is None:
            return
        self.__chapter_total = len(link_chapter)
        # Start the thread pool
        with ThreadPoolExecutor(max_workers=10) as thread_pool:
            content_chapter_list = thread_pool.map(
                self.__get_content_chapter, link_chapter)

        with open(path_save, 'w+', encoding=self.__encode) as file:
            for content_chapter in content_chapter_list:
                file.write(content_chapter)
        print('<<' + self.__name + '>> download finished')


if __name__ == '__main__':
    start = time.time()
    f = fiction(name='雪中悍刀行',
                url_ws='http://www.xbiquge.la',
                url_lp='http://www.xbiquge.la/0/745/',
                attrs_div_lp={'id': 'list'},
                attrs_div_ct={'id': 'content'},
                encode='utf-8')
    f.write(r'C:\Users\HP\Desktop\pytxt')
    stop = time.time()
    print('Time taken: %ds' % (stop - start))
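Because Update() resets every parameter in one call, the same object can be reused for a second book. A hypothetical example (the site url, TOC path, and attribute dicts below are placeholders that would have to match the target site):

# Hypothetical reuse of the same crawler object for another book.
f.Update(name='another_book',                      # placeholder book name
         url_ws='http://www.example.com',          # placeholder site url
         url_lp='http://www.example.com/book/1/',  # placeholder TOC page url
         attrs_div_lp={'id': 'list'},              # div holding the chapter links
         attrs_div_ct={'id': 'content'},           # div holding the chapter text
         encode='utf-8')
f.write(r'C:\Users\HP\Desktop\pytxt')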
