My crawler notes (2)


1. Using a thread pool

The added code:

with ThreadPoolExecutor(max_workers=10) as thread_pool:
    content_chapter_list = thread_pool.map(
        self.__get_content_chapter, link_chapter)

The map() method runs the tasks concurrently on a pool of worker threads, capped here at 10 by max_workers. Its first argument is the function each thread executes; the second is the iterable of arguments, with one task per element. The results come back in the same order as the input list, so the chapters cannot end up shuffled even though they are downloaded in parallel. It is best not to open too many threads, which keeps the crawler friendly to the server.
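As a quick standalone illustration of that order guarantee, here is a minimal sketch; fetch is a made-up stand-in for a real download:

import time
from concurrent.futures import ThreadPoolExecutor


def fetch(n):
    '''Simulated download: later tasks finish sooner.'''
    time.sleep(0.1 * (5 - n))
    return 'result %d' % n


with ThreadPoolExecutor(max_workers=3) as pool:
    # Task 4 finishes first, but map() still yields results
    # in input order: result 0, result 1, ..., result 4.
    for result in pool.map(fetch, range(5)):
        print(result)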

2. Complete code

from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import requests
import time
import sys
import threading


class fiction():

    __chapter_download = 0   # number of chapters downloaded so far
    __chapter_total = 0  # total number of chapters
    __lock = threading.Lock()  # protects the shared progress counter

    def __init__(self, name, url_ws, url_lp, encode,
                 attrs_div_lp={}, attrs_div_ct={}):
        self.__name = name  # novel name
        self.__url_ws = url_ws  # website url
        self.__url_lp = url_lp  # url of the link (table of contents) page
        self.__attrs_div_lp = attrs_div_lp  # attributes of the div holding chapter links on the contents page
        self.__attrs_div_ct = attrs_div_ct  # attributes of the div holding the text on a chapter page
        self.__encode = encode  # encoding used to decode chapter pages and write the file

    def Update(self, name, url_ws, url_lp, encode,
               attrs_div_lp={}, attrs_div_ct={}):
        '''Reset the parameters.

        All parameters must be reset together, otherwise errors may occur.

        '''
        self.__name = name  # novel name
        self.__url_ws = url_ws  # website url
        self.__url_lp = url_lp  # url of the link (table of contents) page
        self.__attrs_div_lp = attrs_div_lp  # attributes of the div holding chapter links on the contents page
        self.__attrs_div_ct = attrs_div_ct  # attributes of the div holding the text on a chapter page
        self.__encode = encode

    def __get_Link_chapter(self):
        '''Get the url of each chapter from the contents page.

        Parse the contents page, locate the div holding the chapter
        urls by its attributes, then collect and return the chapter links.

        '''

        # On a request exception (connection error or timeout),
        # wait 1 s and try again, up to 10 times.
        # Note: requests raises its own exception classes, not the built-in
        # ConnectionError/TimeoutError, so those are what must be caught.
        req_lp = None
        for try_counter in range(10):
            try:
                req_lp = requests.get(self.__url_lp, timeout=10)
                break
            except requests.exceptions.ConnectionError:
                print('Fetching contents page, ConnectionError: %d' % (try_counter+1))
            except requests.exceptions.Timeout:
                print('Fetching contents page, Timeout: %d' % (try_counter+1))
            except Exception:
                print('Fetching contents page, other error: %d' % (try_counter+1))
            time.sleep(1)

        if req_lp is None:
            print('Failed to fetch the contents page')
            return
        else:
            try:
                req_lp.encoding = req_lp.apparent_encoding
                # Build the BeautifulSoup object with the lxml parser
                bs_lp = BeautifulSoup(req_lp.text, 'lxml')
                # Find every div with the matching attributes
                div_list = bs_lp.find_all('div', attrs=self.__attrs_div_lp)
                # Collect all the a tags inside them
                link_chapter = []
                for div in div_list:
                    link_chapter += div.find_all('a')
                return link_chapter
            except TypeError:
                print('Error parsing the contents page: TypeError')
                return

    def __get_content_chapter(self, link):
        '''Get the content of one chapter.

        :param link: an a tag parsed from the contents page,
                     holding the chapter name and url

        '''
        name_chapter = link.string
        url_chapter = self.__url_ws + link['href']  # join to get the chapter page url
        req_ct = None
        for try_counter in range(10):
            try:
                req_ct = requests.get(url_chapter, timeout=10)
                break
            except requests.exceptions.ConnectionError:
                print('Fetching chapter page, ConnectionError: %d' % (try_counter+1))
            except requests.exceptions.Timeout:
                print('Fetching chapter page, Timeout: %d' % (try_counter+1))
            except Exception:
                print('Fetching chapter page, other error: %d' % (try_counter+1))
            time.sleep(1)

        if req_ct is None:
            print('Failed to fetch the chapter: ' + name_chapter)
            content_chapter = name_chapter + '\n\n'
        else:
            try:
                req_ct.encoding = self.__encode
                bs_ct = BeautifulSoup(req_ct.text, 'lxml')
                content = bs_ct.find('div', attrs=self.__attrs_div_ct)
                # Turn <br/> into newlines and non-breaking spaces into plain spaces
                content = str(content).replace('<br/>', '\n').replace('\xa0', ' ')
                content = BeautifulSoup(content, 'lxml').get_text()
                content_chapter = name_chapter + '\n\n' + content + '\n\n'
            except TypeError:
                print('Error parsing chapter page: TypeError ' + name_chapter)
                content_chapter = name_chapter + '\n\n'
            except Exception:
                print('Error parsing chapter page: other error ' + name_chapter)
                content_chapter = name_chapter + '\n\n'

        with self.__lock:  # the counter is shared by all worker threads
            self.__chapter_download += 1
            sys.stdout.write('Progress: %.1f%%\r' % (
                self.__chapter_download / self.__chapter_total * 100))
        return content_chapter

    def write(self, path_save):
        '''Write the downloaded chapters to the given path.

        :param path_save: the directory to save into

        '''
        path_save = path_save + '\\' + self.__name + '.txt'
        link_chapter = self.__get_Link_chapter()
        if link_chapter is None:
            return
        self.__chapter_total = len(link_chapter)

        # Open the thread pool; map() submits one task per chapter link
        with ThreadPoolExecutor(max_workers=10) as thread_pool:
            content_chapter_list = thread_pool.map(
                self.__get_content_chapter, link_chapter)

        with open(path_save, 'w+', encoding=self.__encode) as file:
            for content_chapter in content_chapter_list:
                file.write(content_chapter)
        print('<<' + self.__name + '>> download complete')


if __name__ == '__main__':
    start = time.time()
    f = fiction(name='雪中悍刀行',
                url_ws='http://www.xbiquge.la',
                url_lp='http://www.xbiquge.la/0/745/',
                attrs_div_lp={'id': 'list'},
                attrs_div_ct={'id': 'content'},
                encode='utf-8')
    f.write(r'C:\Users\HP\Desktop\pytxt')
    stop = time.time()
    print('Elapsed: %ds' % (stop - start))
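One thing worth tidying: the 10-attempt retry loop with a 1-second sleep appears twice, in __get_Link_chapter and __get_content_chapter. Here is a minimal sketch of how it could be factored into a shared helper; get_with_retry is a name introduced purely for illustration, not part of the original code:

import time
import requests


def get_with_retry(url, tries=10, delay=1, timeout=10):
    '''Return the response for url, or None once all tries are used up.'''
    for try_counter in range(tries):
        try:
            return requests.get(url, timeout=timeout)
        except requests.exceptions.ConnectionError:
            print('ConnectionError: %d' % (try_counter + 1))
        except requests.exceptions.Timeout:
            print('Timeout: %d' % (try_counter + 1))
        except Exception:
            print('Other error: %d' % (try_counter + 1))
        time.sleep(delay)
    return None

Both methods would then reduce to req = get_with_retry(url) followed by a None check, keeping the retry policy in one place.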


Original post: blog.csdn.net/qq_36439722/article/details/106044630