[Crawler] U28_Python3 Multi-threaded Crawling of Doutula Meme Images

1. Requirements

The site to crawl is Doutula (斗图啦), a meme-image site: https://www.doutula.com/photo/list/ (screenshot of the listing page omitted).

The task is to crawl the meme images from the first two pages of the listing. The code follows.

2. Code in Action

2.1 Single-threaded version


from urllib import request
import requests
from lxml import etree
import re
import os


# Send a browser-like User-Agent so the site serves the normal page
HEADERS = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}

def parse_url(url):
    response = requests.get(url, headers=HEADERS)
    text = response.text
    html_str = etree.HTML(text)
    # The real image URL lives in data-original (the site lazy-loads images);
    # the @class!="gif" predicate skips the gif placeholder images.
    imgs = html_str.xpath('//div[@class="page-content text-center"]//a/img[@class!="gif"]')
    for img in imgs:
        img_url = img.get('data-original')
        alt = img.get('alt')
        # Strip characters from alt that are not safe in filenames
        alt = re.sub(r'[?？.。,，!！/]', '', alt)
        # Take the file extension from the image URL
        suffix = os.path.splitext(img_url)[1]
        filename = alt + suffix
        print("Downloading: " + filename)
        request.urlretrieve(img_url, 'image/' + filename)

def main():
    os.makedirs('image', exist_ok=True)  # make sure the output directory exists
    for i in range(1, 3):  # pages 1 and 2
        base_url = 'https://www.doutula.com/photo/list/?page={}'.format(i)
        parse_url(base_url)

if __name__ == '__main__':
    main()
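
The XPath above depends on two details of Doutula's markup: the real image URL is stored in the data-original attribute (lazy loading), and each link also contains a placeholder img with class "gif", which the @class!="gif" predicate filters out. Here is a minimal sketch against a hand-written fragment of that shape (the fragment is illustrative, not copied from the site):

from lxml import etree

# Illustrative fragment shaped like the listing markup the XPath assumes
SAMPLE = '''
<div class="page-content text-center">
  <a href="#">
    <img class="gif" src="placeholder.gif"/>
    <img class="lazy" data-original="http://example.com/funny.jpg" alt="funny"/>
  </a>
</div>
'''

html = etree.HTML(SAMPLE)
for img in html.xpath('//div[@class="page-content text-center"]//a/img[@class!="gif"]'):
    print(img.get('data-original'), img.get('alt'))  # http://example.com/funny.jpg funny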

The single-threaded version has an obvious problem: it downloads the images one at a time, which is slow. Using multiple threads solves this.
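
Before the producer-consumer rewrite below, note that the standard library's concurrent.futures gives a much shorter multi-threaded variant. A minimal sketch, reusing parse_url (and its imports) from the single-threaded script above; main_pooled and the pool size of 5 are illustrative choices, and this parallelizes per listing page only:

from concurrent.futures import ThreadPoolExecutor

def main_pooled():
    os.makedirs('image', exist_ok=True)  # parse_url writes into image/
    urls = ['https://www.doutula.com/photo/list/?page={}'.format(i)
            for i in range(1, 3)]
    # Each listing page is fetched, parsed, and downloaded in its own worker.
    with ThreadPoolExecutor(max_workers=5) as pool:
        pool.map(parse_url, urls)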

2.2 Multi-threaded version

# Author:Logan
from urllib import request
import requests
from lxml import etree
import re
import os
from queue import Queue
import threading


# Producer: pulls listing-page URLs from page_queue, parses them,
# and pushes (filename, image URL) pairs onto img_queue
class Producer(threading.Thread):
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            url = self.page_queue.get()  # blocks until a page URL is available
            self.parse_url(url)

    def parse_url(self, url):
        response = requests.get(url, headers=self.HEADERS)
        text = response.text
        html_str = etree.HTML(text)
        imgs = html_str.xpath('//div[@class="page-content text-center"]//a/img[@class!="gif"]')
        for img in imgs:
            img_url = img.get('data-original')
            alt = img.get('alt')
            # Strip characters from alt that are not safe in filenames
            alt = re.sub(r'[?？.。,，!！/*]', '', alt)
            # Take the file extension from the image URL
            suffix = os.path.splitext(img_url)[1]
            filename = alt + suffix
            self.img_queue.put((filename, img_url))
        self.page_queue.task_done()  # decrement the queue's unfinished-task count

# Consumer: pulls (filename, image URL) pairs from img_queue and downloads them
class Consumer(threading.Thread):
    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            filename, img_url = self.img_queue.get()  # blocks until an image is queued
            request.urlretrieve(img_url, 'image/' + filename)
            print("Downloaded: %s" % filename)
            self.img_queue.task_done()

def main():
    os.makedirs('image', exist_ok=True)  # make sure the output directory exists
    page_queue = Queue(100)
    img_queue = Queue(1000)
    for i in range(1, 3):  # pages 1 and 2, matching the single-threaded version
        base_url = 'https://www.doutula.com/photo/list/?page={}'.format(i)
        page_queue.put(base_url)

    # Start the workers as daemon threads: they block on get() when idle,
    # and they must not keep the process alive once the queues are drained.
    for i in range(5):
        Producer(page_queue, img_queue, daemon=True).start()

    for i in range(5):
        Consumer(page_queue, img_queue, daemon=True).start()

    page_queue.join()  # block until every page has been parsed
    img_queue.join()   # block until every image has been downloaded

if __name__ == '__main__':
    main()

If a thread takes an item from a queue but never calls task_done(), the queue has no way to know that the work for that item ever finished, so a final queue.join() would block forever. Each task_done() call decrements the queue's count of unfinished tasks; when that count reaches zero, join() returns. This is why main() above only needs to join the two queues: once both counts drop to zero, every page has been parsed and every image downloaded, and the main thread can exit (taking the daemon workers with it).
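
A minimal, self-contained sketch of this task_done()/join() contract (the queue contents and worker here are made up for illustration):

import threading
from queue import Queue

q = Queue()
for n in range(10):
    q.put(n)

def worker():
    while True:
        item = q.get()   # blocks until an item is available
        print('processed', item)
        q.task_done()    # decrement the unfinished-task count

# Daemon thread: it may block on get() forever once the queue is drained,
# but it will not keep the process alive.
threading.Thread(target=worker, daemon=True).start()

q.join()  # returns only once task_done() has been called for all ten items
print('all tasks finished')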


Origin: www.cnblogs.com/OliverQin/p/12636681.html