High-Concurrency Crawlers with Asynchronous IO

1. The asyncio module

import asyncio

@asyncio.coroutine
def task():
    print('start...')
    yield from asyncio.sleep(5) #asyncio has no built-in HTTP client, only TCP
    #but HTTP is just a protocol layered on top of TCP, so we can send
    #HTTP requests over a raw TCP connection ourselves (next example)
    print('end')

tasks=[task(),task(),task()]

loop=asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
Basic usage
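The `@asyncio.coroutine` / `yield from` style used above is the legacy, pre-3.5 syntax and was removed in Python 3.11. A minimal sketch of the same program in the native `async`/`await` style, assuming Python 3.7+ for `asyncio.run`:

import asyncio

async def task():
    print('start...')
    await asyncio.sleep(5)   # non-blocking sleep; other tasks run in the meantime
    print('end')

async def main():
    # run three tasks concurrently on one event loop
    await asyncio.gather(task(), task(), task())

asyncio.run(main())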
import asyncio

@asyncio.coroutine
def task(host,url='/'):
    #open a raw TCP connection to port 80
    reader,writer=yield from asyncio.open_connection(host,80)

    #hand-craft a minimal HTTP/1.0 request
    request_header_content="GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" %(url,host)
    request_header_content=bytes(request_header_content,encoding='utf-8')

    writer.write(request_header_content)
    yield from writer.drain()
    text=yield from reader.read() #read until the server closes the connection
    print(host,url,text)
    writer.close()
tasks=[task('www.baidu.com'),task('www.cnblogs.com')] #pass bare hostnames, not full URLs


loop=asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
Sending an HTTP request over raw TCP
import asyncio
import requests

@asyncio.coroutine
def task(fun,*args):
    print(fun,args)
    loop=asyncio.get_event_loop()
    #requests is blocking, so run it in the default thread-pool executor
    #and await the resulting future
    future=loop.run_in_executor(None,fun,*args)
    response=yield from future
    print(response.url,response.content)


tasks=[
    task(requests.get,'http://bing.com'),
    task(requests.get,'http://cnblogs.com')
    ]
loop=asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
asyncio+requests
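On Python 3.9+ the executor dance above can be written with `asyncio.to_thread`, which wraps the blocking call in the default thread pool for you. A minimal sketch under that assumption:

import asyncio
import requests

async def task(url):
    # requests blocks, so push it into a worker thread and await the result
    response = await asyncio.to_thread(requests.get, url)
    print(response.url, len(response.content))

async def main():
    await asyncio.gather(task('http://bing.com'), task('http://cnblogs.com'))

asyncio.run(main())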
#pip install aiohttp
#aiohttp + asyncio
import asyncio
import aiohttp

@asyncio.coroutine
def task(url):
    #note: this is the old aiohttp API; aiohttp 2.x/3.x uses ClientSession
    #with async with (see the sketch below)
    response=yield from aiohttp.request("GET",url)
    print(response)

tasks=[task('http://bing.com'),task('http://cnblogs.com')]

loop=asyncio.get_event_loop()
result=loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
asyncio+aiohttp
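The `aiohttp.request(...)` call above is the old aiohttp interface; in aiohttp 2.x/3.x requests go through a `ClientSession` used as an async context manager. A minimal sketch, assuming aiohttp 3.x and Python 3.7+:

import asyncio
import aiohttp

async def task(session, url):
    # the session is reused for all requests; the body is read inside the context
    async with session.get(url) as response:
        body = await response.text()
        print(url, response.status, len(body))

async def main():
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(task(session, 'http://bing.com'),
                             task(session, 'http://cnblogs.com'))

asyncio.run(main())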

2. The tornado module

from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop
COUNT=None #total number of requests to send
count=0    #number of responses received so far


def handle_response(response):
    #called once per completed request; stop the IOLoop after the last one
    global count
    count+=1
    if response.error:
        print('error')
    else:
        body=response.body
        print(body)
    global COUNT
    if count==COUNT:
        ioloop.IOLoop.instance().stop()


def fun():
    url_list=['http://www.baidu.com','http://www.cnblogs.com']
    global COUNT
    COUNT=len(url_list)
    for url in url_list:
        #callback-style fetch (the callback argument was removed in Tornado 6;
        #see the sketch below for the current API)
        http_client=AsyncHTTPClient()
        http_client.fetch(HTTPRequest(url),handle_response)


ioloop.IOLoop.current().add_callback(fun)
ioloop.IOLoop.current().start()  #blocks here, driving the event loop
Tornado asynchronous IO
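The callback argument to `fetch` that the example relies on was removed in Tornado 6; on current Tornado you simply await the Future that `fetch` returns. A sketch of the same crawl, assuming Tornado 5+ (where the IOLoop is backed by asyncio):

import asyncio
from tornado import ioloop
from tornado.httpclient import AsyncHTTPClient

async def fetch_all():
    client = AsyncHTTPClient()
    url_list = ['http://www.baidu.com', 'http://www.cnblogs.com']
    # fetch() returns a Future, so all requests can be awaited together
    responses = await asyncio.gather(*[client.fetch(url) for url in url_list],
                                     return_exceptions=True)
    for url, resp in zip(url_list, responses):
        if isinstance(resp, Exception):
            print(url, 'error:', resp)
        else:
            print(url, len(resp.body))

ioloop.IOLoop.current().run_sync(fetch_all)  # runs the loop until fetch_all finishes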

3. The Twisted module

#asynchronous crawling to increase concurrency
from twisted.web.client import getPage
from twisted.internet import reactor,defer


def one_done(args):
    print(args)
    print(type(args))


def all_done(args):
    print(args)
    print(type(args))
    reactor.stop()


@defer.inlineCallbacks
def tasks(url):
    #getPage returns a Deferred; one_done fires when the page arrives
    #(getPage is deprecated in newer Twisted -- see the Agent sketch below)
    res=getPage(bytes(url,'utf-8'))
    res.addCallback(one_done)
    yield res

url_list=['http://www.baidu.com','http://www.cnblogs.com']
def_list=[]
for i in url_list:
    v=tasks(i)
    def_list.append(v)

d=defer.DeferredList(def_list)
d.addBoth(all_done)
reactor.run()  #blocks here, driving the reactor loop
Twisted asynchronous IO
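`getPage` has been deprecated and later removed from Twisted; the maintained client is `twisted.web.client.Agent`. A rough equivalent of the example above, assuming a recent Twisted release:

from twisted.internet import reactor, defer
from twisted.web.client import Agent, readBody

agent = Agent(reactor)

@defer.inlineCallbacks
def tasks(url):
    # both request() and readBody() return Deferreds we wait on
    response = yield agent.request(b'GET', bytes(url, 'utf-8'))
    body = yield readBody(response)
    print(url, response.code, len(body))

def all_done(results):
    print(results)
    reactor.stop()

url_list = ['http://www.baidu.com', 'http://www.cnblogs.com']
d = defer.DeferredList([tasks(u) for u in url_list])
d.addBoth(all_done)
reactor.run()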

4. The gevent module

#pip install greenlet  #coroutine primitive + asynchronous IO
#pip install gevent    #depends on greenlet

from gevent import monkey
monkey.patch_all() #patch blocking stdlib calls to cooperative ones; do this before importing requests

import gevent
import requests
from gevent.pool import Pool  #greenlet (coroutine) pool

pool=Pool(3) #cap the number of concurrent requests at 3

def task(method,url,req_kwargs):
    print(method,url,req_kwargs)
    #with monkey.patch_all() applied, this blocking call yields to other greenlets
    response=requests.request(method,url,**req_kwargs)
    print(response.url)
    print(response.content)


# gevent.joinall([
#     gevent.spawn(task,method="GET",url="http://cnblogs.com",req_kwargs={}),
#     gevent.spawn(task,method="GET",url='http://bing.com',req_kwargs={}),
# ])

gevent.joinall([
    pool.spawn(task,method="GET",url="http://cnblogs.com",req_kwargs={}),
    pool.spawn(task,method="GET",url='http://bing.com',req_kwargs={}),
])
gevent+requests
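The pool can also drive a whole URL list directly via `imap_unordered` instead of spawning greenlets one by one. A small sketch, assuming the monkey patch is applied before `requests` is imported:

from gevent import monkey
monkey.patch_all()           # must run before requests is imported

import requests
from gevent.pool import Pool

def fetch(url):
    # the patched socket layer lets this blocking call yield to other greenlets
    response = requests.get(url)
    return url, response.status_code

pool = Pool(3)  # at most three requests in flight at once
urls = ['http://cnblogs.com', 'http://bing.com', 'http://www.baidu.com']
for url, status in pool.imap_unordered(fetch, urls):
    print(url, status)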
#grequests = gevent + requests rolled into one library
import grequests

requests_list=[
    grequests.get('http://cnblogs.com'),
    grequests.get('http://bing.com'),
    grequests.get('http://che.com')
]

response_list=grequests.map(requests_list)
print(response_list)
grequests

Summary:

When building your own asynchronous crawler IO:

  #gevent -> Twisted -> Tornado -> asyncio


Reposted from www.cnblogs.com/lujiacheng-Python/p/10255902.html