Multi-threaded web crawler

How multithreading works

(Figure: multi-threading diagram; image not available)
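
Since the diagram is not available, here is a minimal sketch (not from the original post, assuming the diagram showed the usual producer/consumer layout) of several worker threads draining a shared queue of page numbers:

import threading
from queue import Queue, Empty

page_queue = Queue()
for page in range(1, 11):           # pages 1..10 to crawl
    page_queue.put(page)

def worker(worker_id):
    # Each thread repeatedly takes a page number until the queue is drained.
    while True:
        try:
            page = page_queue.get(block=False)
        except Empty:
            break                   # another thread took the last item
        print('worker %d handles page %d' % (worker_id, page))
        page_queue.task_done()

threads = [threading.Thread(target=worker, args=(i,)) for i in range(3)]
for t in threads:
    t.start()
page_queue.join()                   # returns once every page has been processed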

Queue (Queue Object)
  • Definition

    Queue is part of Python's standard library and can be imported directly with from queue import Queue. Queues are the most common way to exchange data between threads.

  • Thoughts on multithreading in Python

    Locking shared resources correctly is a key concern in multithreaded code. Queue handles its own locking internally (it is thread-safe), so a queue is recommended whenever it fits the use case.

  • Create a "queue" object

    pageQueue = Queue(10)
    
  • Put a value in the queue

    for page in range(1, 11):
        pageQueue.put(page)
    
  • Remove a value from the queue

    pageQueue.get()
    
Common Queue methods
  • put(item): adds an item to the queue
  • get(block): removes and returns an item; with block=False it raises Empty when the queue is empty
  • empty(): returns True if the queue is empty
  • full(): returns True if the queue holds maxsize items
  • qsize(): returns the approximate number of items in the queue
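A minimal, runnable demonstration of these methods, reusing the pageQueue from above:

    from queue import Queue, Empty

    pageQueue = Queue(10)               # queue with maxsize 10
    for page in range(1, 11):
        pageQueue.put(page)             # fill it with pages 1..10

    print(pageQueue.full())             # True: 10 items in a maxsize-10 queue
    print(pageQueue.qsize())            # 10 (approximate size)
    print(pageQueue.get())              # 1: FIFO order
    print(pageQueue.empty())            # False: 9 items remain

    while True:
        try:
            pageQueue.get(block=False)  # non-blocking get
        except Empty:                   # raised once the queue is drained
            break
    print(pageQueue.empty())            # True
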
Queue usage example
from queue import Queue, Empty
import threading
import time


num = 5
exit_flag = False


def assign_task(q: Queue):
    # Producer: keep adding tasks until num reaches 10, then signal exit.
    global num
    global exit_flag
    while True:  # an infinite loop must have an exit condition
        if num == 10:
            exit_flag = True
            break
        q.put('task %d' % num)
        print('+++++++++ new task %d assigned +++++++++++' % num)
        num += 1
        time.sleep(2)


if __name__ == '__main__':
    q = Queue(100)
    for i in range(5):
        q.put('task %d' % i)
    # Start a child thread to assign tasks
    t = threading.Thread(target=assign_task, args=(q,))
    t.start()
    while True:
        # if q.empty():
        #     print('------- tasks done, exiting -------')
        #     break
        if exit_flag:
            print('All tasks done, heading home!')
            break
        try:
            print('-------------------', q.get(block=False))
        except Empty:  # queue momentarily empty; keep polling
            pass

Queue lock and thread lock

Locks are mainly used to protect methods and code blocks. While a method or code block holds a lock, at most one thread executes that code at a time; when multiple threads try to enter the locked method or block of the same object, only one runs and the rest wait for it to finish before entering. The waiting threads can still execute any unlocked code in the object.
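
A minimal sketch (not from the original post) of threading.Lock protecting a critical section:

import threading

counter = 0
lock = threading.Lock()

def increment():
    global counter
    for _ in range(100000):
        with lock:           # at most one thread at a time runs this block
            counter += 1     # the read-modify-write must not interleave

threads = [threading.Thread(target=increment) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(counter)               # always 400000 with the lock; unreliable without it

The ticket-selling example below uses a thread-safe Queue instead of an explicit lock, plus Queue.join() to wait for completion: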

import threading
from queue import Queue, Empty
import time

exit_flag = False

def sell(q: Queue):
    # Worker: keep selling tickets until the exit flag is set.
    while True:
        # if q.empty():
        #     break
        if exit_flag:
            break
        try:
            ticket = q.get(block=False)  # non-blocking, so a thread never hangs on an empty queue
        except Empty:
            continue
        print('---------- thread %s sold ticket: %d -----------'
              % (threading.current_thread().name, ticket))
        q.task_done()  # mark this task as done
        time.sleep(1)

if __name__ == '__main__':
    tickets = Queue(1000)
    for i in range(1000):
        tickets.put(i)
    threads = []
    for i in range(100):
        t = threading.Thread(target=sell, args=(tickets,))
        t.start()
        threads.append(t)
    # for t in threads:
    #     t.join()
    tickets.join()  # waits until every task is done; only then does the code below run
    # All tasks finished: flip the exit condition
    exit_flag = True
    print('All child-thread tasks finished! Current thread is:', threading.current_thread().name)

Implementation of multi-threaded crawler

  • Import packages

    import requests
    import json
    from lxml import etree
    from queue import Queue
    import threading
    import time
    
  • Define variables

    # Declare variables
    # Queue of page numbers for the crawler threads (thread-safe)
    requestQueue = Queue()

    # Queue holding downloaded pages waiting to be parsed
    parseQueue = Queue()

    # Whether the parsing threads should exit
    parseExitFlag = False

    # Counter of parsed pages, used by the parsing threads below
    parsePage = 1

    # Request headers (the User-Agent reused from the dushu.com example below)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'}

    # Base url (%d is the page number)
    base_url = 'http://www.qiushibaike.com/8hr/page/%d/'
    
  • Create a crawler thread and start

    # Crawl the first 10 pages with multiple threads (url with p = 1, p = 2, ...)
    # Enqueue the tasks
    for i in range(1, 11):
        requestQueue.put(i)

    # Crawler threads
    for i in range(1, 4):
        threadCrawl = ThreadCrawl(requestQueue, i)
        threadCrawl.start()
    
  • Crawler thread

    class ThreadCrawl(threading.Thread):
        def __init__(self, queue, id):
            super().__init__()
            self.queue = queue
            self.id = id

        def run(self):
            super().run()

            print('Crawler thread: --------%d-------- starting work!' % (self.id))

            self.getHtml()

            print('Crawler thread: --------%d-------- finished work!' % (self.id))

        def getHtml(self):
            # Keep running until the request queue is empty
            while True:
                if self.queue.empty():
                    break
                try:
                    p = self.queue.get(block=False)

                    # Got a page number; make the network request
                    url = base_url % (p)

                    response = requests.get(url=url, headers=headers, verify=False)
                    html = response.text

                    # Hand the downloaded data to the parse queue
                    parseQueue.put((html, p))

                    self.queue.task_done()
                    print('Crawler thread: -------%d------- downloaded data for page: -------%d------' % (self.id, p))
                except Exception:
                    pass
    
  • Create a parsing thread and start

    # Parsing threads
    fp = open('./糗事百科.txt', 'w', encoding='utf-8')
    for i in range(1, 4):
        threadParse = ThreadParse(i, parseQueue, fp)
        threadParse.start()
    
  • Parsing thread

    class ThreadParse(threading.Thread):
        def __init__(self, threadId, parseQueue, file):
            super().__init__()
            self.threadId = threadId
            self.parseQueue = parseQueue
            self.file = file

        def run(self):
            super().run()
            print('-------------------------------------- starting parse ', self.threadId)
            global parseExitFlag
            while not parseExitFlag:
                try:
                    # Each queue item is a (html, page) tuple put there by a crawler thread
                    html, page = self.parseQueue.get(False)
                    self.parse(html, page)
                    self.parseQueue.task_done()
                except Exception:
                    pass
            print('------------------------------------- exiting parse ', self.threadId)

        def parse(self, html, page):
            global parsePage
            try:
                xml = etree.HTML(html)
                result = xml.xpath('//div[contains(@id,"qiushi_tag")]')
                print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', page, len(result))
                for site in result:
                    img = site.xpath('.//img/@src')[0]
                    user_name = site.xpath('.//h2')[0].text
                    content = site.xpath('.//div[@class = "content"]/span')[0].text.strip()
                    vote = site.xpath('.//i')[0].text
                    comments = site.xpath('.//i')[1].text
                    duanzi = {'img': img,
                              'user_name': user_name,
                              'content': content,
                              'vote': vote,
                              'comments': comments}
                    self.file.write(json.dumps(duanzi, ensure_ascii=False) + '\n')
                print('--------------------------------------- parse done Thread = ', self.threadId, 'parsePage = ', parsePage)
                parsePage += 1
            except Exception:
                pass


    '''
    Calling the queue's get() method removes and returns an item from the head of the queue.
    The optional block argument defaults to True. If the queue is empty and block is True,
    get() suspends the calling thread until an item becomes available. If the queue is empty
    and block is False, get() raises the Empty exception.
    '''
    
  • Queue.join() blocks until every queued task has been processed, ensuring all work finishes before the shutdown steps below (a consolidated sketch follows this list)

    # Wait on both queues
    requestQueue.join()
    parseQueue.join()
    
  • End task

    # The parsing threads may now exit
    parseExitFlag = True

    # Close the output file
    fp.close()
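
Putting the fragments above together, a minimal end-to-end sketch (same names as in the bullets; the ThreadCrawl and ThreadParse definitions are assumed to be in scope):

    if __name__ == '__main__':
        # 1. Enqueue the 10 page numbers
        for i in range(1, 11):
            requestQueue.put(i)

        # 2. Start three crawler threads
        for i in range(1, 4):
            ThreadCrawl(requestQueue, i).start()

        # 3. Start three parsing threads sharing one output file
        fp = open('./糗事百科.txt', 'w', encoding='utf-8')
        for i in range(1, 4):
            ThreadParse(i, parseQueue, fp).start()

        # 4. Wait until every download and every parse has been marked done
        requestQueue.join()
        parseQueue.join()

        # 5. Signal the parsing threads to exit, then close the file
        parseExitFlag = True
        fp.close()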
    

Multi-threaded crawler code implementation

import requests
from bs4 import BeautifulSoup
from queue import Queue
import threading
from threading import Lock
url = 'https://www.dushu.com/book/1175_%d.html'
task_queue = Queue(100)
parse_queue = Queue(100)
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d=1572418328; Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d=1572418390',
    'Host': 'www.dushu.com',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}
# Flag telling the parsing threads to exit
exit_flag = False

# Acts like a small thread pool
class CrawlThread(threading.Thread):
    def __init__(self, q_task: Queue, q_parse: Queue) -> None:
        super().__init__()
        self.q_task = q_task
        self.q_parse = q_parse

    def run(self) -> None:
        super().run()
        self.spider()

    # Keep working until the task queue is drained
    def spider(self):
        while True:
            if self.q_task.empty():
                print('+++++++ crawler thread %s finished its tasks +++++++' % (threading.current_thread().name))
                break
            taskId = self.q_task.get()
            response = requests.get(url % (taskId), headers=headers)
            response.encoding = 'utf-8'
            html = response.text
            self.q_parse.put((html, taskId))
            self.q_task.task_done()
            print('------ crawler thread: %s ----- executed task: %d -------'
                  % (threading.current_thread().name, taskId))
# Dedicated to crawling
def crawl():
    for i in range(1, 101):
        task_queue.put(i)
    for i in range(5):
        t = CrawlThread(task_queue, parse_queue)
        t.start()

class ParseThread(threading.Thread):
    def __init__(self, q_parse: Queue, lock: Lock, fp):
        super().__init__()
        self.q_parse = q_parse
        self.lock = lock
        self.fp = fp

    def run(self):
        super().run()
        self.parse()

    def parse(self):
        while True:
            if exit_flag:
                print('----------- parsing thread: %s finished and is exiting ------------'
                      % (threading.current_thread().name))
                break
            try:
                html, taskId = self.q_parse.get(block=False)
                soup = BeautifulSoup(html, 'lxml')
                books = soup.select('div[class="bookslist"] > ul > li')
                print('----------------', len(books))
                for book in books:
                    self.lock.acquire()
                    book_url = book.find('img').attrs['src']
                    book_title = book.select('h3 a')[0]['title']
                    book_author = book.select('p')[0].get_text()
                    book_describe = book.select('p')[1].get_text()
                    # Write through the shared file handle; the lock keeps lines intact
                    self.fp.write('%s\t%s\t%s\t%s\n' % (book_url, book_title, book_author, book_describe))
                    self.lock.release()
                self.q_parse.task_done()
                print('********** parsing thread: %s finished parsing page %d ***********'
                      % (threading.current_thread().name, taskId))
            except Exception:
                pass
# Dedicated to parsing and saving pages
def parse(fp):
    lock = Lock()
    for i in range(5):
        t = ParseThread(parse_queue, lock, fp)
        t.start()

if __name__ == '__main__':
    crawl()
    fp = open('./book.txt', 'a', encoding='utf-8')
    parse(fp)
    # Queue join: the code below only runs once every task in the queue is finished
    task_queue.join()
    parse_queue.join()
    # Let the parsing threads exit before closing the shared file
    exit_flag = True
    fp.close()
    print('Execution reached this point!!!!!!!!!!!!!!')

Origin: blog.csdn.net/qq_42546127/article/details/106406052