aiohttp 异步爬虫

示例

import aiohttp, asyncio
async def main():
    """Fetch the Baidu home page once and print its status code and body.

    A single ClientSession is opened for the request and closed again when
    the ``async with`` block exits.
    """
    async with aiohttp.ClientSession() as client:
        async with client.get('http://www.baidu.com') as response:
            status_code = response.status
            print(status_code)
            # text() is a coroutine: it reads and decodes the whole body.
            page_text = await response.text()
            print(page_text)

if __name__ == "__main__":
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards. Calling asyncio.get_event_loop() when no
    # loop is running is deprecated in recent Python versions, and the loop
    # it returned was never closed here.
    asyncio.run(main())

aiohttp.ClientSession参数

version: HttpVersion=http.HttpVersion11   #http版本默认是1.1

session.get参数

url:网站
params:get请求的参数
headers:请求头
timeout:超时设置，单位s
proxy:代理
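stream:流（注意：这是requests库的参数，aiohttp的session.get并没有stream参数；aiohttp的响应体本身就是流，爬视频等大文件时用resp.content分块读取即可）
verify_ssl:ssl检验，一般改为False（aiohttp 3.0起该参数已弃用，推荐改用ssl=False）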

session.post参数

url:网站
data:post请求的参数（注意：如果要发送json格式的数据，要先用json.dumps()转为json字符串再传给data，或者直接把dict传给json=参数）
headers:请求头
timeout:超时设置，单位s
proxy:代理
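stream:流（注意：这是requests库的参数，aiohttp的session.post并没有stream参数；aiohttp的响应体本身就是流，爬视频等大文件时用resp.content分块读取即可）
verify_ssl:ssl检验，一般改为False（aiohttp 3.0起该参数已弃用，推荐改用ssl=False）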

response属性与方法

status：响应状态码
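json()：如果response是json数据，可直接用该方法转换为python的dict（协程，需要await resp.json()）
text(encoding='utf-8')：网页字符串数据,可指定编码格式（协程，需要await resp.text()）
read()：原始数据bytes,gzip和deflate传输编码同样会自动解码（协程，需要await resp.read()）。
content：响应体的流对象(aiohttp.StreamReader)，而不是bytes；可通过await resp.content.read(n)分块读取，gzip和deflate传输编码同样会自动解码。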
'''
read()与content的区别：read()会一次性把整个响应体读入内存；content本身是一个流，可以使用content.read(chunk_size)按块读取里面的内容，适合下载大文件，例如：
with open(filename, 'wb') as fd:
    while True:
        chunk = await resp.content.read(chunk_size)
        if not chunk:
            break
        fd.write(chunk)
'''

不要为每次的连接都创建一次session,一般情况下只需要创建一个session，然后使用这个session执行所有的请求。

aiohttp+aiomysql

import aiohttp, asyncio,aiomysql
import time

async def to_mysql(pool, data):
    """Insert one value into the ``mydata`` table.

    Args:
        pool: Connection pool; ``pool.acquire()`` must return an async
            context manager yielding a connection with ``cursor()``.
        data: The single value bound to the ``%s`` placeholder.

    No explicit commit is issued here, so the insert is only persisted when
    the connection runs in autocommit mode (``main`` creates the pool with
    ``autocommit=True``).
    """
    sql = "insert into mydata(data) VALUES (%s)"
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            # The parameters must be a real one-element tuple: ``(data)`` is
            # just ``data`` in parentheses, so a list/tuple payload would be
            # expanded into several SQL parameters.
            await cur.execute(sql, (data,))
            print('入库成功')
async def _fetch_and_store(url, session, pool):
    """Request *url*, print the HTTP status code and store it via to_mysql."""
    async with session.get(url) as resp:
        data = resp.status
        print(data)
    # The response is released before the insert. to_mysql is awaited
    # directly; the original author noted that wrapping it in
    # asyncio.gather() at this point noticeably hurt throughput.
    await to_mysql(pool, data)


async def get_html(url, session, pool, semaphore=None):
    """Fetch *url* and store its status code, optionally rate-limited.

    Args:
        url: Address to request.
        session: Shared client session providing ``get(url)``.
        pool: Database pool handed through to ``to_mysql``.
        semaphore: Optional ``asyncio.Semaphore`` bounding how many fetches
            run at once. It is passed in explicitly because the semaphore is
            created inside ``main`` and is therefore not visible here as a
            global name (the original lookup raised NameError). When omitted,
            the request runs without any concurrency limit.
    """
    if semaphore is None:
        await _fetch_and_store(url, session, pool)
        return
    async with semaphore:
        await _fetch_and_store(url, session, pool)


async def main(loop):
    """Fetch the Baidu home page 100 times and record each status in MySQL.

    Args:
        loop: Event loop handed to ``aiomysql.create_pool``.
    """
    # Cap concurrency at 500. The original author reports errors once more
    # than roughly 510 tasks run at the same time without a limit.
    semaphore = asyncio.Semaphore(500)
    pool = await aiomysql.create_pool(
        host='127.0.0.1', port=3306, user='root', password='***',
        db='aiomysql', loop=loop, charset='utf8', autocommit=True,
    )
    try:
        # One session is shared by every request.
        async with aiohttp.ClientSession() as session:
            tasks = [
                get_html('http://www.baidu.com', session, pool, semaphore)
                for _ in range(100)
            ]
            await asyncio.gather(*tasks)
    finally:
        # close() only marks the pool as closing; wait_closed() waits until
        # the connections are really shut down. Doing this in ``finally``
        # releases the pool even when a task fails.
        pool.close()
        await pool.wait_closed()


if __name__ == "__main__":
    # perf_counter() is a monotonic clock intended for measuring elapsed time.
    start = time.perf_counter()
    # main() needs the loop object for aiomysql, so the loop is created
    # explicitly instead of via asyncio.get_event_loop(), which is deprecated
    # when no loop is running. The loop is closed once the run finishes.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(main(loop))
    finally:
        loop.close()
    end = time.perf_counter()
    print(end - start)

示例

aiohttp.ClientSession参数

session.get参数

session.post参数

response属性与方法

aiohttp+aiomysql

猜你喜欢