爬虫多线程模板，xpath，etree


class QuiShi:
    """Multithreaded crawler template for the QSBK joke pages on lovehhy.net.

    Work flows through three queues: page URLs -> decoded HTML -> extracted
    items. Every stage runs in daemon worker threads that loop forever, and
    ``run`` returns once every queued task has been marked done.

    The enclosing module must provide ``Queue``, ``threading``, ``requests``
    and lxml's ``etree``.
    """

    def __init__(self, page_count=4, fetcher_count=3, timeout=10):
        """Set up the URL template, request headers and the three queues.

        Args:
            page_count: Number of listing pages to crawl, starting at page 1.
            fetcher_count: Number of threads downloading pages concurrently.
            timeout: Seconds to wait on each HTTP request before giving up.

        Raises:
            ValueError: If ``fetcher_count`` is smaller than 1, because
                queued URLs would then never be consumed and ``run`` would
                block forever.
        """
        if fetcher_count < 1:
            raise ValueError("fetcher_count must be at least 1")
        self.temp_url = "http://www.lovehhy.net/Joke/Detail/QSBK/{0}"
        self.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"}
        self.page_count = page_count
        self.fetcher_count = fetcher_count
        self.timeout = timeout
        # Stage 1: URLs waiting to be downloaded.
        self.url_query = Queue()
        # Stage 2: decoded HTML documents waiting to be parsed.
        self.html_query = Queue()
        # Stage 3: lists of extracted items waiting to be saved.
        self.content_query = Queue()

    def get_url_list(self):
        """Queue the URL of every page to crawl (pages 1..page_count)."""
        for page in range(1, self.page_count + 1):
            self.url_query.put(self.temp_url.format(page))

    def parse_url(self):
        """Worker loop: download each queued URL and queue its HTML text.

        The response body is decoded as GBK. A failed download or decode is
        reported and skipped so that the worker stays alive.
        """
        while True:
            url = self.url_query.get()
            try:
                response = requests.get(
                    url, headers=self.headers, timeout=self.timeout
                )
                self.html_query.put(response.content.decode("gbk"))
            except (requests.RequestException, UnicodeDecodeError) as exc:
                print("failed to fetch {0}: {1}".format(url, exc))
            finally:
                # Always acknowledge the task; otherwise url_query.join()
                # in run() would wait forever for this URL.
                self.url_query.task_done()

    def get_content_list(self):
        """Worker loop: parse each queued HTML document into a list of items.

        A document that cannot be parsed is reported and skipped so that the
        worker stays alive.
        """
        while True:
            html_str = self.html_query.get()
            try:
                self.content_query.put(self._extract_items(html_str))
            except (etree.LxmlError, ValueError) as exc:
                print("failed to parse page: {0}".format(exc))
            finally:
                # Always acknowledge the task so html_query.join() can finish.
                self.html_query.task_done()

    @staticmethod
    def _extract_items(html_str):
        """Return one dict per ``<h3>`` found under ``div#footzoon``.

        Each dict has the keys ``title`` and ``title_href`` (the raw XPath
        result lists for the heading's link text and href) and ``content``
        (the text nodes of the following sibling ``<div>`` elements with
        ideographic spaces removed).
        """
        # Drop <br /> tags so the text is not split around them, then trim
        # surrounding whitespace.
        cleaned = html_str.replace("<br />", "").strip()
        # etree.HTML turns the markup into an element tree.
        tree = etree.HTML(cleaned)
        if tree is None:
            # Presumably only happens for an empty document; nothing to extract.
            return []
        items = []
        for h3 in tree.xpath('//div[@id="footzoon"]/h3'):
            # NOTE(review): this selects the text of *every* following
            # sibling div, not just the nearest one; confirm that is intended
            # against the real page layout.
            texts = h3.xpath('./following-sibling::div/text()')
            items.append({
                "title": h3.xpath("./a/text()"),
                "title_href": h3.xpath("./a/@href"),
                "content": [text.replace("\u3000", "") for text in texts],
            })
        return items

    def save_content_list(self):
        """Worker loop: print every extracted item list."""
        while True:
            cons = self.content_query.get()
            try:
                print(cons)
            finally:
                self.content_query.task_done()

    def run(self):
        """Run the whole pipeline and return when all pages are processed.

        All threads are daemon threads: the workers loop forever, so
        non-daemon threads would keep the interpreter alive after the work
        is finished.
        """
        producer = threading.Thread(
            target=self.get_url_list, name="quishi-urls", daemon=True
        )
        workers = [
            threading.Thread(
                target=self.parse_url,
                name="quishi-fetch-{0}".format(index),
                daemon=True,
            )
            for index in range(1, self.fetcher_count + 1)
        ]
        workers.append(threading.Thread(
            target=self.get_content_list, name="quishi-parse", daemon=True
        ))
        workers.append(threading.Thread(
            target=self.save_content_list, name="quishi-save", daemon=True
        ))

        producer.start()
        for worker in workers:
            worker.start()

        # Wait until every URL has been queued; joining an still-empty queue
        # would return immediately and end the run before it started.
        producer.join()
        # Drain the stages in pipeline order: each worker enqueues its output
        # before acknowledging its input, so once a stage is drained all of
        # its output is already counted in the next queue.
        for stage in (self.url_query, self.html_query, self.content_query):
            stage.join()

if __name__ == '__main__':
    # Run one complete crawl and print how many wall-clock seconds it took.
    started_at = time.time()
    QuiShi().run()
    elapsed = time.time() - started_at
    print(elapsed)
爬虫多线程模板，xpath，etree

猜你喜欢