[python] Crawler notes (9) Asynchronous crawler

Purpose

Use asynchronous techniques in crawlers to achieve high-performance data crawling.

Asynchronous crawler

  • Multi-thread, multi-process (not recommended)
    • Benefit: a separate thread or process can be opened for each blocking operation, so blocking operations run asynchronously.
    • Disadvantage: threads and processes cannot be created without limit.
  • Thread pool, process pool (use appropriately)
    • Benefit: reduces how often the system creates and destroys threads or processes, thereby reducing system overhead.
    • Disadvantage: the number of threads or processes in the pool has an upper limit.

Basic use:

import time
from multiprocessing.dummy import Pool

def get(name):
    print("downloading:", name)
    time.sleep(2)
    print("downloaded:", name)

name_list = ['sh', 'ss', 'hh', 'sg']
start_time = time.time()
# instantiate a thread pool object
pool = Pool(4)

# pass each element of the list to get() for processing
pool.map(get, name_list)
end_time = time.time()

print(end_time - start_time)
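multiprocessing.dummy.Pool is the thread-based twin of multiprocessing.Pool and exposes the same map() API. Below is a minimal sketch of the process-pool variant, assuming the same get() function; note the __main__ guard that spawning worker processes requires.

import time
from multiprocessing import Pool  # process pool instead of the thread pool above

def get(name):
    print("downloading:", name)
    time.sleep(2)
    print("downloaded:", name)

if __name__ == "__main__":  # required when worker processes are spawned
    name_list = ['sh', 'ss', 'hh', 'sg']
    start_time = time.time()
    with Pool(4) as pool:
        pool.map(get, name_list)
    print(time.time() - start_time)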

However, in a real crawler not every operation should be handed to the thread pool.
The thread pool should only handle the blocking, time-consuming operations (such as downloading).

import requests
from lxml import etree
from multiprocessing.dummy import Pool
import time

def get(url):
    print("start crawling:", url)
    # simulate a time-consuming download
    time.sleep(1)
    print("crawled:", url)

if __name__ == "__main__":
    ua = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400"

    headers = {
        "User-Agent": ua,
    }

    url = "https://www.pearvideo.com/category_5"

    response = requests.get(url=url, headers=headers).text
    tree = etree.HTML(response)
    li_list = tree.xpath('//*[@id="categoryList"]/li')
    url_2 = []
    for li in li_list:
        # use a relative XPath so each li yields its own link
        url_2.append("https://www.pearvideo.com/" + li.xpath('./div/a/@href')[0])
    pool = Pool(9)
    pool.map(get, url_2)
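One housekeeping detail the snippet above skips: after map() returns, the pool should be shut down so the worker threads exit cleanly. A minimal sketch of the closing lines, assuming the pool object created above:

pool.close()  # stop accepting new tasks
pool.join()   # block until all worker threads have finished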
  • Single thread + asynchronous coroutines (recommended)
    async
    A function defined with async returns a coroutine object when it is called.

Basic use:

import asyncio

async def request(url):
    print("sh")
    return url

# calling a function decorated with async returns a coroutine object
c = request(url='www.baidu.com')

# create an event loop object
# loop = asyncio.get_event_loop()
#
# # register the coroutine object with the loop, then start the loop
# loop.run_until_complete(c)

# using a task
# create a task object based on the loop
# loop = asyncio.get_event_loop()
# task = loop.create_task(c)
# print(task)
#
# loop.run_until_complete(task)
# print(task)

# using a future
# loop = asyncio.get_event_loop()
# task = asyncio.ensure_future(c)
# print(task)
# loop.run_until_complete(task)
# print(task)

# binding a callback
def callback_func(task):
    # result() returns the return value of the coroutine wrapped in the task object
    print(task.result())

loop = asyncio.get_event_loop()
task = asyncio.ensure_future(c)
# add the callback function
task.add_done_callback(callback_func)
loop.run_until_complete(task)
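get_event_loop()/run_until_complete() still works, but on Python 3.7+ the same flow is usually written with asyncio.run(), which creates and closes the event loop for you. A minimal sketch of the callback example in that style, reusing the request coroutine defined above:

import asyncio

async def request(url):
    print("sh")
    return url

def callback_func(task):
    # result() is the return value of the coroutine wrapped in the task
    print(task.result())

async def main():
    task = asyncio.ensure_future(request(url='www.baidu.com'))
    task.add_done_callback(callback_func)
    await task

asyncio.run(main())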

Multi-task asynchronous coroutine

import asyncio
import time

# multi-task asynchronous coroutines

async def request(url):
    print("downloading:", url)
    # if synchronous, blocking code (e.g. time.sleep) appears inside a coroutine,
    # the whole flow stops being asynchronous, so use asyncio's awaitable sleep instead
    # time.sleep(2)
    await asyncio.sleep(2)
    print("downloaded:", url)

urls = ['www.baidu.com', 'www.4399.com', 'www.12345.com']

# the task list holds the multiple tasks
stasks = []
for url in urls:
    c = request(url)
    task = asyncio.ensure_future(c)
    stasks.append(task)  # store the task

# register the tasks and start the loop
loop = asyncio.get_event_loop()
# the task list must be wrapped in asyncio.wait()
loop.run_until_complete(asyncio.wait(stasks))
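asyncio.gather() is a common alternative to asyncio.wait() when you also want the coroutines' return values, in submission order. A sketch of the same multi-task example using gather and asyncio.run:

import asyncio

async def request(url):
    print("downloading:", url)
    await asyncio.sleep(2)   # stand-in for a real network request
    print("downloaded:", url)
    return url

async def main():
    urls = ['www.baidu.com', 'www.4399.com', 'www.12345.com']
    # gather awaits every coroutine and returns their results as a list
    results = await asyncio.gather(*(request(u) for u in urls))
    print(results)

asyncio.run(main())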

It should be noted that requests is synchronous, so an asynchronous network-request module (aiohttp) has to be used to fetch the specified URLs.

import requests
from lxml import etree
import asyncio
# the ClientSession object in this module is used to send network requests
import aiohttp

async def get(url):
    async with aiohttp.ClientSession() as session:
        # session supports get(), post(), headers, etc.
        async with session.get(url) as response:
            # the response data must be awaited before it can be read
            page_text = await response.text()
            print(page_text)

if __name__ == "__main__":
    ua = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400"

    headers = {
        "User-Agent": ua,
    }

    url = "https://www.pearvideo.com/category_5"

    # this requests call could also be replaced with aiohttp
    response = requests.get(url=url, headers=headers).text

    tree = etree.HTML(response)
    li_list = tree.xpath('//*[@id="categoryList"]/li')
    tasks = []
    for li in li_list:
        # build each video's detail-page URL with a relative XPath
        detail_url = "https://www.pearvideo.com/" + li.xpath('./div/a/@href')[0]
        c = get(detail_url)
        task = asyncio.ensure_future(c)
        tasks.append(task)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
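In a real crawl the number of simultaneous connections usually needs a cap. Below is a hedged sketch that wraps the aiohttp request in an asyncio.Semaphore and reuses one ClientSession; the limit of 5 and the example URL are assumptions for illustration, not part of the original code.

import asyncio
import aiohttp

async def fetch(session, semaphore, url):
    # the semaphore caps how many requests are in flight at the same time
    async with semaphore:
        async with session.get(url) as response:
            return await response.text()

async def main(urls):
    semaphore = asyncio.Semaphore(5)  # assumed limit of 5 concurrent requests
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, semaphore, u) for u in urls]
        pages = await asyncio.gather(*tasks)
    print([len(p) for p in pages])

if __name__ == "__main__":
    # assumed example: fetch the category page used earlier in this post
    asyncio.run(main(["https://www.pearvideo.com/category_5"]))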

Origin: blog.csdn.net/Sgmple/article/details/112168271