单线程:
```python
import requests
import json
from lxml import etree
class Qiushi:
    """Single-threaded scraper: fetch joke pages from yicommunity.com,
    extract author/content fields, and append them to qiushi.txt."""

    def __init__(self):
        # {} placeholder is filled with the page number by get_url_list()
        self.url = "http://www.yicommunity.com/remen/index_{}.html"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"}

    def get_url_list(self):
        """Return the list of page URLs to crawl (pages 2 through 9)."""
        return [self.url.format(i) for i in range(2, 10)]

    def parase_url(self, url):
        """Fetch *url* and return the decoded HTML body.

        NOTE(review): method name kept as `parase_url` (sic) so existing
        callers keep working.
        """
        print(url)
        # BUG FIX: a timeout so a stalled server cannot hang the crawl forever
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode()

    def get_content_list(self, html_str):
        """Extract one dict per post from a page's HTML.

        Each dict has xpath-result lists under the keys
        "auther", "content" and "auther_img".
        """
        html = etree.HTML(html_str)
        content_list = []
        div_list = html.xpath("//div[@class='col1']/div")  # one div per post
        for div in div_list:
            item = {}
            item["auther"] = div.xpath(".//div[@class='auther']/img/@alt")
            item["content"] = div.xpath(".//div[@class='content']/text()")
            item["auther_img"] = div.xpath(".//div[@class='auther']/img/@src")
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):
        """Append each item to qiushi.txt as one JSON object per line."""
        # BUG FIX: was encoding="gbk" — with ensure_ascii=False any character
        # outside the GBK repertoire (emoji, rare CJK) raised
        # UnicodeEncodeError; utf-8 can encode everything.
        with open("qiushi.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")

    def run(self):
        """Crawl every page: fetch, parse, persist."""
        # 1. build the page URLs
        url_list = self.get_url_list()
        # 2. request each page and get the response
        for url in url_list:
            html_str = self.parase_url(url)
            # 3. extract the data
            content_list = self.get_content_list(html_str)
            # 4. save the data
            self.save_content_list(content_list)
if __name__ == "__main__":
    # Entry point: build the spider and crawl every page.
    spider = Qiushi()
    spider.run()
多线程(注意死循环):
import requests
import json
from lxml import etree
from queue import Queue
import threading
class Qiushi:
    """Multi-threaded scraper: a producer/consumer pipeline over three
    queues (url_queue -> html_queue -> content_queue), with daemon worker
    threads and Queue.join() used to wait for the pipeline to drain."""

    def __init__(self):
        # {} placeholder is filled with the page number by get_url_list()
        self.url = "http://www.yicommunity.com/remen/index_{}.html"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"}
        # pipeline stages
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()

    def get_url_list(self):
        """Seed url_queue with the page URLs to crawl."""
        # return [self.url.format(i)for i in range(2,10)]
        for i in range(2, 3):
            self.url_queue.put(self.url.format(i))

    def parase_url(self):
        """Worker: take URLs off url_queue, fetch them, push HTML to html_queue.

        Runs forever; the thread is a daemon and dies with the main thread.
        NOTE(review): name kept as `parase_url` (sic) for compatibility.
        """
        while True:
            url = self.url_queue.get()
            print(url)
            # timeout so one dead server cannot stall a fetcher thread forever
            response = requests.get(url, headers=self.headers, timeout=10)
            self.html_queue.put(response.content.decode())
            # task_done() pairs with get() so url_queue.join() can unblock
            self.url_queue.task_done()

    def get_content_list(self):
        """Worker: parse HTML from html_queue, push item lists to content_queue."""
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            content_list = []
            div_list = html.xpath("//div[@class='col1']/div")  # one div per post
            for div in div_list:
                item = {}
                item["auther"] = div.xpath(".//div[@class='auther']/img/@alt")
                item["content"] = div.xpath(".//div[@class='content']/text()")
                item["auther_img"] = div.xpath(".//div[@class='auther']/img/@src")
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self):
        """Worker: append batches from content_queue to qiushi.txt, one JSON
        object per line."""
        # BUG FIX: original had no loop here, so the saver thread exited
        # after the first batch and later batches were never written
        # (content_queue.join() then blocked forever).
        while True:
            content_list = self.content_queue.get()
            with open("qiushi.txt", "a", encoding="utf-8") as f:
                for content in content_list:
                    f.write(json.dumps(content, ensure_ascii=False))
                    f.write("\n")
            self.content_queue.task_done()

    def run(self):
        """Seed the URL queue, start all workers, wait for the queues to drain."""
        thread_list = []
        # 1. seed the start URLs
        # BUG FIX: original used target=self.get_url_list() (and likewise for
        # every worker below) — that CALLS the method in the main thread, so
        # the while-True workers dead-locked the program before any thread
        # started. Pass the bound method itself, without parentheses.
        thread_list.append(threading.Thread(target=self.get_url_list))
        # 2. fetchers — I/O bound, so run several in parallel
        for _ in range(20):
            thread_list.append(threading.Thread(target=self.parase_url))
        # 3. parser
        thread_list.append(threading.Thread(target=self.get_content_list))
        # 4. saver
        thread_list.append(threading.Thread(target=self.save_content_list))
        for t in thread_list:
            # BUG FIX: daemon must be set BEFORE start() — the original called
            # setDaemon(True) after start(), which raises RuntimeError.
            # Daemon threads die when the main thread exits.
            t.daemon = True
            t.start()
        # block until every queued task has been marked done
        for q in [self.url_queue, self.content_queue, self.html_queue]:
            q.join()
        print("主线程结束")
if __name__ == "__main__":
    # Entry point: build the threaded spider and run the pipeline.
    spider = Qiushi()
    spider.run()