Qiushi Scraper: Single-Threaded + Multi-Threaded

Single-threaded version:

```python
import json

import requests
from lxml import etree


class Qiushi:
    def __init__(self):
        self.url = "http://www.yicommunity.com/remen/index_{}.html"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"}

    def get_url_list(self):
        return [self.url.format(i) for i in range(2, 10)]

    def parse_url(self, url):
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):  # extract the data
        html = etree.HTML(html_str)
        content_list = []
        div_list = html.xpath("//div[@class='col1']/div")  # one group (div) per post
        for div in div_list:
            item = {}
            # 'auther' is the class name the site itself uses in its markup
            item["auther"] = div.xpath(".//div[@class='auther']/img/@alt")
            item["content"] = div.xpath(".//div[@class='content']/text()")
            item["auther_img"] = div.xpath(".//div[@class='auther']/img/@src")
            # item["dianzhan"] = div.xpath(".//a[@class='voted disable']/text()")
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):  # save the data
        # utf-8 rather than gbk, so the ensure_ascii=False output always encodes cleanly
        with open("qiushi.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")

    def run(self):
        # 1. build the list of page URLs
        url_list = self.get_url_list()
        # 2. iterate: send each request, get the response
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(html_str)
            # 4. save the data (inside the loop, so every page is saved, not just the last one)
            self.save_content_list(content_list)


if __name__ == "__main__":
    qiushi = Qiushi()
    qiushi.run()
```
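The XPath expressions are the fragile part of this pipeline, so it is worth checking them against a small fragment before crawling. A minimal sketch, assuming invented sample markup (the snippet below is made up to mirror the `col1`/`auther`/`content` class names, not fetched from yicommunity.com):

```python
# Hypothetical sample markup for testing the selectors offline;
# the real page layout may differ, but the class names match the XPath above.
from lxml import etree

sample = """
<div class="col1">
  <div>
    <div class="auther"><img alt="user1" src="/img/u1.png"/></div>
    <div class="content">first post text</div>
  </div>
  <div>
    <div class="auther"><img alt="user2" src="/img/u2.png"/></div>
    <div class="content">second post text</div>
  </div>
</div>
"""

html = etree.HTML(sample)
for div in html.xpath("//div[@class='col1']/div"):
    print(div.xpath(".//div[@class='auther']/img/@alt"),
          div.xpath(".//div[@class='content']/text()"))
# -> ['user1'] ['first post text']
# -> ['user2'] ['second post text']
```

Note that each `xpath()` call returns a list, which is why the saved items hold lists rather than bare strings.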

Multi-threaded version (note the infinite loops in the worker functions): page URLs, raw HTML, and parsed items each flow through their own Queue, with daemon threads acting as producers and consumers.

```python
import json
import threading
from queue import Queue

import requests
from lxml import etree


class Qiushi:
    def __init__(self):
        self.url = "http://www.yicommunity.com/remen/index_{}.html"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"}
        self.url_queue = Queue()      # page URLs waiting to be fetched
        self.html_queue = Queue()     # raw HTML waiting to be parsed
        self.content_queue = Queue()  # parsed items waiting to be saved

    def get_url_list(self):
        for i in range(2, 3):
            self.url_queue.put(self.url.format(i))

    def parse_url(self):
        while True:  # consumer loop: never returns, so the thread must be a daemon
            url = self.url_queue.get()
            print(url)
            response = requests.get(url, headers=self.headers)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()  # pairs with get(): decrements the unfinished-task count

    def get_content_list(self):  # extract the data
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            content_list = []
            div_list = html.xpath("//div[@class='col1']/div")  # one group (div) per post
            for div in div_list:
                item = {}
                item["auther"] = div.xpath(".//div[@class='auther']/img/@alt")
                item["content"] = div.xpath(".//div[@class='content']/text()")
                item["auther_img"] = div.xpath(".//div[@class='auther']/img/@src")
                # item["dianzhan"] = div.xpath(".//a[@class='voted disable']/text()")
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self):  # save the data
        while True:  # this loop was missing; without it the thread saved one batch and died
            content_list = self.content_queue.get()
            with open("qiushi.txt", "a", encoding="utf-8") as f:
                for content in content_list:
                    f.write(json.dumps(content, ensure_ascii=False))
                    f.write("\n")
            self.content_queue.task_done()

    def run(self):
        thread_list = []
        # 1. produce the page URLs
        thread_list.append(threading.Thread(target=self.get_url_list))  # pass the function, don't call it
        # 2. iterate: send requests, get responses (20 fetcher threads)
        for i in range(20):
            thread_list.append(threading.Thread(target=self.parse_url))
        # 3. extract the data
        thread_list.append(threading.Thread(target=self.get_content_list))
        # 4. save the data
        thread_list.append(threading.Thread(target=self.save_content_list))

        for t in thread_list:
            t.daemon = True  # daemon threads die with the main thread; must be set before start()
            t.start()

        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()  # block the main thread until every task in the queue is done

        print("main thread done")


if __name__ == "__main__":
    qiushi = Qiushi()
    qiushi.run()
```

Reposted from blog.csdn.net/zhanlong11/article/details/104849929