Python多线程爬取小说

上一篇 Python爬取文章和小说内容

一、效果

（此处原为运行效果截图）

二、代码(可直接运行)

先安装所需要的库

1、pip install requests
2、pip install lxml
3、pip install bs4（可选：下面的代码并未用到 bs4）

安装完成后则可运行代码：

import requests
from lxml import etree
import os
from queue import Queue
import threading
class Biquge(threading.Thread):
    """Worker thread that saves novels from www.xbiquge.la as text files.

    An instance can be used in two ways:

    * directly, by calling :meth:`get_novels` to list the novel URLs found
      on the category page given as ``url``;
    * as a thread, by passing a shared queue of novel URLs as ``q_novels``
      and calling ``start()``; the thread takes URLs from the queue until it
      is empty and appends every chapter to ``./book/<book name>.txt``.
    """

    # Site root; chapter hrefs taken from a novel's index page are appended
    # to this to build the chapter URL.
    BASE_URL = 'http://www.xbiquge.la'
    # Seconds to wait for any single HTTP request before treating it as failed.
    REQUEST_TIMEOUT = 10
    # How many times one page is attempted before giving up on it.
    MAX_RETRIES = 5

    def __init__(self, url=None, name=None, q_novels=None):
        """Store the crawl settings and fetch an initial proxy.

        :param url: category page listing novels (used by ``get_novels``)
        :param name: thread name; ``None`` lets ``threading`` pick one
        :param q_novels: queue of novel index URLs consumed by ``run``
        """
        # Passing the name through the constructor keeps the automatic
        # "Thread-N" name when none is given, instead of the string 'None'.
        super().__init__(name=name)
        self.url = url
        self.q_novel = q_novels
        self.proxies = self.get_proxies()

    def get_proxies(self):
        """Ask the local proxy pool for a proxy.

        :return: a ``proxies`` dict for ``requests``, or ``None`` when the
            pool cannot be reached or replies with an empty body (requests
            then connects directly)
        """
        # NOTE(review): the pool at localhost:5000 presumably answers with a
        # bare "host:port" string -- confirm against the pool service.
        try:
            response = requests.get('http://localhost:5000/get',
                                    timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return None
        proxy = response.text.strip()
        if not proxy:
            # An empty reply would otherwise yield the invalid proxy 'http://'.
            return None
        return {'http': 'http://' + proxy}

    def get_xpath_by_requests(self, url, proxies):
        """Download ``url`` and parse it into an lxml element tree.

        Each failed attempt switches to a fresh proxy from the pool. The
        number of attempts is bounded by ``MAX_RETRIES``.

        :param url: absolute URL of the page to fetch
        :param proxies: proxy dict passed to ``requests`` (may be ``None``)
        :return: root element of the parsed HTML, ready for ``.xpath()``
        :raises RuntimeError: when every attempt failed
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            'Cookie': '_abcde_qweasd=0; _abcde_qweasd=0; bdshare_firstime=1577178973028; Hm_lvt_169609146ffe5972484b0957bd1b46d6=1577178973,1577186563,1577186739,1577235413; BAIDU_SSP_lcr=https://www.baidu.com/link?url=AvLJGcMiHKBXi90P2T0xOluezhPz2PeeTLAbP75dmma&wd=&eqid=e131d391001338d8000000025e02b3d2; Hm_lpvt_169609146ffe5972484b0957bd1b46d6=1577235422',
            'Referer': 'http://www.xbiquge.la/'
        }
        last_error = None
        for _ in range(self.MAX_RETRIES):
            try:
                response = requests.get(url, headers=headers, proxies=proxies,
                                        timeout=self.REQUEST_TIMEOUT)
                html = etree.HTML(response.content.decode('utf-8'))
            except Exception as exc:
                # Any failure (network, decoding, parsing) is treated as a
                # bad proxy, as before: swap the proxy and try again.
                last_error = exc
                proxies = self.get_proxies()
                print('更换{}代理ip！'.format(proxies))
            else:
                # Remember the proxy that worked so later requests do not
                # start from one that is already known to be dead.
                self.proxies = proxies
                return html
        raise RuntimeError(
            'giving up on {} after {} attempts'.format(url, self.MAX_RETRIES)
        ) from last_error

    def get_text(self, text):
        """Return the first item of an xpath result list, or '' if it is empty."""
        if text:
            return text[0]
        return ''

    def write_to_txt(self, text, book_name):
        """Append ``text`` to ``./book/<book_name>.txt``, creating the folder.

        :param text: text to append
        :param book_name: file name (without extension) inside ``./book``
        """
        filename = './book/' + book_name + '.txt'
        # exist_ok avoids a FileExistsError when two worker threads try to
        # create the folder at the same moment.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'a', encoding='utf-8') as fp:
            fp.write(text)

    @staticmethod
    def _format_chapter(chapter_name, content):
        """Lay out one chapter: title line, body, and a closing newline."""
        # A real newline is needed here; a raw string would write the two
        # characters backslash and 'n' into the file. The closing newline
        # keeps the next chapter's title from sticking to this body.
        return chapter_name + '\n' + content + '\n'

    def parse_chapter(self, url):
        """Download one chapter and append it to its book's text file.

        :param url: chapter href as found on the novel's index page; it is
            appended to ``BASE_URL``
        """
        html = self.get_xpath_by_requests(self.BASE_URL + url, self.proxies)
        chapter_name = self.get_text(html.xpath('//div[@class="bookname"]/h1/text()'))
        book_name = self.get_text(html.xpath('//div[@class="con_top"]/a[last()]/text()'))
        content = ''.join(html.xpath('//div[@id="content"]/text()'))
        self.write_to_txt(self._format_chapter(chapter_name, content), book_name)

    def parse_novel(self, url):
        """Download every chapter linked from the novel index page ``url``."""
        html = self.get_xpath_by_requests(url, self.proxies)
        for chapter in html.xpath('//div[@id="list"]/dl/dd/a/@href'):
            self.parse_chapter(chapter)

    def get_novels(self):
        """Return the novel URLs listed on the category page ``self.url``."""
        html = self.get_xpath_by_requests(self.url, self.proxies)
        return html.xpath('//span[@class="s2"]/a/@href')

    def run(self):
        """Thread body: process novels from the shared queue until it is empty."""
        from queue import Empty

        while True:
            # get_nowait() takes an item and detects emptiness in one step.
            # Checking empty() first and then calling a blocking get() lets
            # another worker take the last item in between, which would
            # leave this thread waiting forever.
            try:
                novel_url = self.q_novel.get_nowait()
            except Empty:
                break
            print('======={}==========@{}'.format(novel_url, self.name))
            try:
                self.parse_novel(novel_url)
            except RuntimeError as exc:
                # A page that stays unreachable should cost one novel, not
                # the whole worker thread.
                print('skipping {}: {}'.format(novel_url, exc))
if __name__ == '__main__':
    # One crawler instance is used only to read the category page and
    # collect the URL of every novel listed there.
    index_crawler = Biquge(url='http://www.xbiquge.la/xuanhuanxiaoshuo/')

    # Fill the shared task queue that all worker threads consume.
    q_novels = Queue()
    for novel_url in index_crawler.get_novels():
        q_novels.put(novel_url)

    # Start five worker threads named '1' .. '5'.
    for worker_name in map(str, range(1, 6)):
        Biquge(name=worker_name, q_novels=q_novels).start()

上一篇 Python爬取文章和小说内容

原文链接：https://blog.csdn.net/D_wart/article/details/103695881

Python多线程爬取小说

一、效果

二、代码(可直接运行)

猜你喜欢