Python: multi-threaded crawling of novels from the Dingdian ("Vertex") fiction site

import time
from queue import Empty, Queue
from threading import Thread

import requests
from lxml import etree


class MyThread(Thread):
    """Worker thread that downloads chapters and appends them to the novel in order.

    Each queue item is a ``(chapter_number, href)`` tuple.  The worker relies on
    module-level globals: ``root`` (base URL the href is appended to),
    ``headers`` (HTTP request headers), ``f`` (the open output file) and
    ``index`` (number of the last chapter already written to ``f``).
    """

    def __init__(self, q):
        """Store the shared work queue ``q``."""
        Thread.__init__(self)
        self.q = q

    def run(self):
        """Drain the queue, fetching each chapter and saving it in sequence."""
        global index
        while True:
            # A separate empty() check followed by a blocking get() can hang
            # forever when another worker takes the last item in between, so
            # fetch without blocking and stop once the queue is exhausted.
            try:
                data = self.q.get_nowait()
            except Empty:
                return

            number, href = data[0], data[1]
            url = root + ''.join(href)
            response = requests.get(url, headers=headers)
            page = etree.HTML(response.content)

            chapter = page.xpath("//h1/text()")
            chapter = ''.join(chapter)
            print("爬取 -> %s" % chapter, index)

            content = page.xpath("//div[@id='content']/text()")
            content = '\n'.join(content)
            # Four non-breaking spaces mark a paragraph indent; store it as a tab.
            content = content.replace("\xa0\xa0\xa0\xa0", "\t")

            # Wait until every earlier chapter has been saved.  Sleeping instead
            # of spinning leaves the CPU to the workers that are still downloading.
            # NOTE: index only advances when a chapter is saved, so if a worker
            # fails before saving its chapter, later chapters wait here forever.
            while number > index + 1:
                time.sleep(0.01)

            # This chapter is the next one in sequence: save it.
            if number == index + 1:
                print("save -> %s" % chapter, index)
                f.write('\n' + chapter + '\n')
                f.write(content)
                index += 1


if __name__ == '__main__':
    # Base URL of the novel's index page; chapter hrefs are appended to it.
    root = "http://www.booktxt.net/8_8455/"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }

    # Chapter marker: number of the last chapter saved so far (-1 = none yet).
    index = -1

    response = requests.get(root, headers=headers)
    page = etree.HTML(response.content)
    title = ''.join(page.xpath("//h1/text()"))      # novel title
    print(title)

    with open("%s.txt" % title, 'w', encoding='utf8') as f:
        f.write(title)      # write the novel title first
        hrefs = page.xpath("//div[@id='list']/dl/dt[2]/following-sibling::dd/a/@href")
        # Queue every chapter as (sequence number, href) so workers can save in order.
        q = Queue()
        for i,href in enumerate(hrefs):
            q.put((i,href))

        # Start five workers and keep the file open until all of them finish.
        ts = []
        for i in range(5):
            t = MyThread(q)
            t.start()
            ts.append(t)
        for t in ts:
            t.join()

  Reprinted from - https://www.cnblogs.com/twoice/p/11405677.html

You may also like

Source: www.cnblogs.com/fqqwz/p/11656074.html