Article Directory
How multithreading works
Multi-threaded diagram
Queue (Queue Object)
-
definition
Queue is part of Python's standard library and can be imported directly with `from queue import Queue`; a queue is the most common way to exchange data between threads.
-
Thinking of multithreading under python
When multiple threads share resources, locking is an essential concern. Queue is thread-safe, so a queue is recommended whenever it fits the use case.
-
Create a "queue" object
pageQueue = Queue(10)
-
Put a value in the queue
for page in range(1, 11): pageQueue.put(page)
-
Remove a value from the queue
pageQueue.get()
Common methods of the Queue class
- put()
- get(block)
- empty()
- full()
- qsize()
Queue usage example
from queue import Queue
import threading
import time
# Number of tasks produced so far; starts at 5 because the main thread
# pre-loads tasks 0-4 before the producer thread begins.
num = 5
# Set by the producer once all tasks have been assigned; the consumer
# loop in the main thread polls it to know when to stop.
exit_flag = False
def assign_task(q: Queue):
    """Producer: push numbered tasks into *q* until ``num`` reaches 10,
    then raise the global ``exit_flag`` so the consumer can stop."""
    global num, exit_flag
    # The endless loop must have an exit condition.
    while num != 10:
        q.put('任务%d' % num)
        print('+++++++++新的任务%d分配了+++++++++++' % num)
        num += 1
        time.sleep(2)
    exit_flag = True
if __name__ == '__main__':
    from queue import Empty  # needed for the non-blocking get() below

    q = Queue(100)
    # Pre-load a few tasks before the producer thread starts.
    for i in range(5):
        q.put(item='task%d' % (i))
    # Start the producer thread that assigns new tasks.
    t = threading.Thread(target=assign_task, args=(q,))
    t.start()
    # Consume tasks until the producer signals it is done.
    while True:
        if exit_flag:
            print('任务完成下班回家!')
            break
        try:
            print('-------------------', q.get(block=False))
        except Empty:
            # FIX: the original bare `except: pass` swallowed every
            # exception; only the expected queue.Empty is ignored here,
            # and the loop keeps polling until exit_flag is set.
            pass
Queue lock and thread lock
A lock protects methods and code blocks: while a method or code block holds a lock, at most one thread executes that code at a time. When several threads reach the locked method/block of the same object, only one runs at a time; the others must wait for the current thread to finish before entering the locked section. Threads may still freely execute the object's unlocked code.
import threading
from queue import Queue
import time
# Tells the sail() worker threads to stop once all tickets are sold.
exit_flag = False

def sail(q: Queue):
    """Worker: repeatedly take a ticket number from *q* and "sell" it.

    Exits when the main thread sets the global ``exit_flag``.

    FIX: the original called a plain blocking ``q.get()``; once the queue
    drained, a thread that had already passed the exit_flag check blocked
    forever inside get() and the process never terminated. Using a timeout
    lets each thread periodically re-check exit_flag.
    """
    from queue import Empty  # local import: the module only imports Queue
    while not exit_flag:
        try:
            ticket = q.get(timeout=1)
        except Empty:
            continue  # re-check exit_flag instead of blocking indefinitely
        print('----------线程%s售出票:%d-----------'
              % (threading.current_thread().getName(), ticket))
        q.task_done()  # mark the task done so tickets.join() can return
        time.sleep(1)
if __name__ == '__main__':
    # Fill the queue with 1000 ticket numbers.
    tickets = Queue(1000)
    for ticket_no in range(1000):
        tickets.put(ticket_no)

    # Launch 100 seller threads.
    threads = []
    for _ in range(100):
        worker = threading.Thread(target=sail, args=(tickets,))
        worker.start()
        threads.append(worker)

    # Blocks until every queued ticket has been matched by a task_done();
    # only then does the code below run.
    tickets.join()
    # All tasks finished: flip the exit condition so workers stop.
    exit_flag = True
    print('子线程任务全部结束!当前线程是:', threading.current_thread().getName())
Implementation of multi-threaded crawler
-
Guide package
import requests import json from lxml import etree from queue import Queue import threading import time
-
Define variables
# Declare the shared state for the crawler example.
# Thread-safe queue of page numbers for the crawler threads.
# FIX: the original declared this as `crawlQueue`, but every later snippet
# in this example refers to it as `requestQueue`, so that name is used.
requestQueue = Queue()
# Queue of (html, page) tuples produced by crawlers, awaiting parsing.
parseQueue = Queue()
# Flag telling the parser threads to exit.
parseExitFlag = False
# Page-URL template; %d is the page number.
base_url = 'http://www.qiushibaike.com/8hr/page/%d/'
-
Create a crawler thread and start
# Crawl the first 10 pages with multiple threads.
# (The original line contained garbled fragments "url p = 1 p = 2" left
# over from the article extraction; they had no effect and are dropped.)
# Enqueue the page numbers 1-10 as tasks.
for i in range(1, 11):
    requestQueue.put(i)
# Start three crawler threads.
for i in range(1, 4):
    threadCrawl = ThreadCrawl(requestQueue, i)
    threadCrawl.start()
-
Crawler thread
class ThreadCrawl(threading.Thread):
    """Crawler thread: takes page numbers from a request queue, downloads
    each page, and hands the HTML to the global parseQueue.

    queue -- Queue of page numbers to download.
    id    -- numeric id used only in log messages.
    """

    def __init__(self, queue, id):
        super().__init__()
        self.queue = queue
        self.id = id

    def run(self):
        super().run()
        print('爬虫线程:--------%d--------开始工作!' % (self.id))
        self.getHtml()
        print('爬虫线程:--------%d--------结束工作!' % (self.id))

    def getHtml(self):
        """Keep downloading until the request queue is empty."""
        from queue import Empty  # for the non-blocking get() below
        while True:
            if self.queue.empty():
                break
            try:
                p = self.queue.get(block=False)
            except Empty:
                # Another thread drained the queue between empty() and get().
                continue
            try:
                # FIX: the original formatted with an undefined `url_base`;
                # the template declared with the other globals is `base_url`.
                url = base_url % (p)
                # NOTE(review): `headers` is not declared anywhere in this
                # example's snippets — confirm it is defined with the globals.
                response = requests.get(url=url, headers=headers, verify=False)
                # Hand the downloaded page (with its page number) to the parsers.
                parseQueue.put((response.text, p))
                print('爬虫线程:-------%d-------下载页码:-------%d------的数据' % (self.id, p))
            except Exception:
                pass  # best-effort: a failed download is skipped
            finally:
                # FIX: always balance the get(); the original skipped
                # task_done() when the request raised, which left
                # requestQueue.join() blocked forever.
                self.queue.task_done()
-
Create a parsing thread and start
# Start the parser threads, sharing one output file.
fp = open('./糗事百科.txt', 'w', encoding='utf-8')
for i in range(1, 4):
    # FIX: ThreadParse.__init__ takes (threadId, parseQueue, file); the
    # original passed the queue first, swapping the id and the queue.
    threadParese = ThreadParse(i, parseQueue, fp)
    threadParese.start()
-
Parsing thread
class ThreadParse(threading.Thread):
    """Parser thread: takes (html, page) items off parseQueue, extracts the
    posts with XPath, and appends each one as a JSON line to the shared file.

    threadId   -- numeric id used only in log messages.
    parseQueue -- queue of (html, page) tuples produced by the crawlers.
    file       -- shared, already-open output file.
    """

    def __init__(self, threadId, parseQueue, file):
        super().__init__()
        self.threadId = threadId
        self.parseQueue = parseQueue
        self.file = file

    def run(self):
        super().run()
        print('--------------------------------------starting parse ', self.threadId)
        global parseExitFlag
        from queue import Empty  # for the non-blocking get() below
        while not parseExitFlag:
            try:
                # FIX: the queue holds (html, page) tuples; the original
                # passed the whole tuple to etree.HTML(), which always
                # failed and was silently swallowed by the bare except.
                html, page = self.parseQueue.get(False)
            except Empty:
                continue  # nothing to parse yet; re-check the exit flag
            try:
                self.parse(html)
            finally:
                # FIX: always balance the get() so parseQueue.join() can
                # return, and call task_done() on self.parseQueue rather
                # than the bare module-level name.
                self.parseQueue.task_done()
        print('-------------------------------------exiting parse ', self.threadId)

    def parse(self, item):
        """Extract every post from one page of HTML and write it as JSON."""
        # NOTE(review): parsePage is never initialised anywhere in this
        # example — it must be declared (e.g. parsePage = 0) with the
        # other globals, or the first use below raises NameError.
        global parsePage
        try:
            xml = etree.HTML(item)
            result = xml.xpath('//div[contains(@id,"qiushi_tag")]')
            print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', parsePage, len(result))
            for site in result:
                img = site.xpath('.//img/@src')[0]
                user_name = site.xpath('.//h2')[0].text
                content = site.xpath('.//div[@class = "content"]/span')[0].text.strip()
                vote = site.xpath('.//i')[0].text
                comments = site.xpath('.//i')[1].text
                duanzi = {
                    'img': img,
                    'user_name': user_name,
                    'content': content,
                    'vote': vote,
                    'comments': comments}
                self.file.write(json.dumps(duanzi, ensure_ascii=False) + '\n')
            print('---------------------------------------parse done Thread = ', self.threadId, 'parsePage = ', parsePage)
            parsePage += 1
        except Exception:
            pass  # best-effort: a page that fails to parse is skipped

# Note on Queue.get(): the optional `block` argument defaults to True.
# If the queue is empty and block is True, get() suspends the calling
# thread until an item becomes available; if block is False, get()
# raises the Empty exception immediately.
-
Queue.join() blocks the calling thread until every item placed on the queue has been marked complete with task_done()
# Queue "locks": each join() blocks the main thread until every task in
# that queue has been matched by a task_done() call.
requestQueue.join()
parseQueue.join()
-
End task
# The parser threads may now exit.
# FIX: the flag the parser threads actually check is `parseExitFlag`;
# the original assigned to a misspelled `exitParseFlag`, so the parser
# threads never saw the signal and never exited.
parseExitFlag = True
# Close the shared output file.
fp.close()
Multi-threaded crawler code implementation
import requests
from bs4 import BeautifulSoup
from queue import Queue
import threading
from threading import Lock
# Page-URL template; %d is the 1-based page number.
url = 'https://www.dushu.com/book/1175_%d.html'
# Thread-safe queue of page numbers for the crawler threads (max 100).
task_queue = Queue(100)
# Queue of (html, taskId) tuples awaiting parsing (max 100).
parse_queue = Queue(100)
# Browser-like request headers sent with every page download.
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.9',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'Cookie':'Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d=1572418328; Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d=1572418390',
'Host':'www.dushu.com',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'none',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',}
# Flag that tells the parser threads to exit.
exit_flag = False
# The thread classes below act like a small, fixed-size thread pool.
class CrawlThread(threading.Thread):
    """Crawler thread: takes page numbers from q_task, downloads each page,
    and puts (html, taskId) onto q_parse for the parser threads."""

    def __init__(self, q_task: Queue, q_parse: Queue) -> None:
        super().__init__()
        self.q_task = q_task    # queue of page numbers to fetch
        self.q_parse = q_parse  # queue receiving (html, taskId) results

    def run(self) -> None:
        super().run()
        self.spider()

    def spider(self):
        """Work until the task queue is drained."""
        from queue import Empty  # local import: the module imports only Queue
        while True:
            # FIX: the original checked empty() and then called a blocking
            # get(); with five workers another thread can drain the queue
            # between the two calls, leaving this thread blocked forever.
            # A non-blocking get() closes that race.
            try:
                taskId = self.q_task.get(block=False)
            except Empty:
                print('+++++++爬虫线程%s执行任务结束+++++++'
                      % (threading.current_thread().getName()))
                break
            response = requests.get(url % (taskId), headers=headers)
            response.encoding = 'utf-8'
            html = response.text
            self.q_parse.put((html, taskId))
            self.q_task.task_done()  # balance the get() so q.join() can return
            print('------爬虫线程:%s-----执行任务:%d-------'
                  % (threading.current_thread().getName(), taskId))
# Dedicated to crawling only.
def crawl():
    """Fill the task queue with page numbers 1-100 and start five crawler
    threads that share the module-level task and parse queues."""
    for page in range(1, 101):
        task_queue.put(page)
    for _ in range(5):
        worker = CrawlThread(task_queue, parse_queue)
        worker.start()
class ParseThread(threading.Thread):
    """Parser thread: takes (html, taskId) items off q_parse, extracts the
    book list with BeautifulSoup, and appends one tab-separated line per
    book to the shared output file.

    q_parse -- queue of (html, taskId) tuples from the crawler threads.
    lock    -- Lock serialising writes so lines do not interleave.
    fp      -- shared, already-open output file.
    """

    def __init__(self, q_parse: Queue, lock: Lock, fp):
        super().__init__()
        self.q_parse = q_parse
        self.lock = lock
        self.fp = fp

    def run(self):
        super().run()
        self.parse()

    def parse(self):
        from queue import Empty  # local import: the module imports only Queue
        while True:
            if exit_flag:
                print('-----------解析线程:%s完成任务退出------------'
                      % (threading.current_thread().getName()))
                break
            try:
                html, taskId = self.q_parse.get(block=False)
            except Empty:
                continue  # nothing to parse yet; re-check exit_flag
            try:
                soup = BeautifulSoup(html, 'lxml')
                books = soup.select('div[class="bookslist"] > ul > li')
                print('----------------', len(books))
                for book in books:
                    # `with` guarantees the lock is released even if a
                    # selector below raises (the original's manual
                    # acquire/release could leak the lock on error).
                    with self.lock:
                        book_url = book.find('img').attrs['src']
                        book_title = book.select('h3 a')[0]['title']
                        book_author = book.select('p')[0].get_text()
                        book_describe = book.select('p')[1].get_text()
                        # FIX: write through self.fp (the file passed to the
                        # constructor); the original wrote to the global `fp`,
                        # which only worked by accident in the __main__ script.
                        self.fp.write('%s\t%s\t%s\t%s\n'
                                      % (book_url, book_title, book_author, book_describe))
                print('**********解析线程:%s完成了第%d页解析任务***********'
                      % (threading.current_thread().getName(), taskId))
            except Exception:
                pass  # best-effort: a page that fails to parse is skipped
            finally:
                # FIX: always balance the get(); the original skipped
                # task_done() when parsing raised, which left
                # parse_queue.join() blocked forever.
                self.q_parse.task_done()
# Dedicated to page parsing and saving only.
def parse(fp):
    """Start five parser threads sharing one write lock and one output file."""
    write_lock = Lock()
    for _ in range(5):
        ParseThread(parse_queue, write_lock, fp).start()
if __name__ == '__main__':
    crawl()
    fp = open('./book.txt', 'a', encoding='utf-8')
    parse(fp)
    # Queue joins: each returns only once every queued task has been
    # matched by a task_done(); only then does the code below run.
    task_queue.join()
    parse_queue.join()
    # FIX: signal the parser threads to exit BEFORE closing the shared
    # file; the original closed fp first, leaving a window in which a
    # still-running parser thread could write to a closed file.
    exit_flag = True
    fp.close()
    print('代码执行到这里!!!!!!!!!!!!!!')