Introduction
aiohttp requires Python 3.5.3 or later. It can be used not only as a client (for example, for crawlers) but also on the server side; built on asyncio and coroutines, it is highly efficient.
Official documentation: https://docs.aiohttp.org/
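For orientation, here is a minimal sketch of the core client API that the template below builds on (the URL is only a placeholder):

import asyncio
from aiohttp import ClientSession

async def main():
    # One session can be reused across many requests
    async with ClientSession() as session:
        async with session.get('https://example.com') as response:
            print(response.status)
            print(await response.text())

asyncio.get_event_loop().run_until_complete(main())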
Crawler template
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author:happy_code
@email: [email protected]
@file: aiocrawl.py
@time: 2019/03/27
@desc:
"""
import asyncio
import logging
import time
from aiohttp import ClientSession, ClientTimeout
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] - %(levelname)s in %(filename)s.%(funcName)s: %(message)s')
# Default request headers
HEADERS = {
'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/69.0.3497.100 Safari/537.36',
}
# Default timeout (seconds)
TIMEOUT = 15
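# The class below is organized in three layers: fetch() is the per-URL
# coroutine, crawl_batch_urls() drives one batch through the event loop,
# and crawl() slices the url list into batches of the requested concurrency.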
class AioCrawl:
def __init__(self):
self.logger = logging.getLogger(__name__)
async def fetch(self, url, method='GET', headers=None, timeout=None, cookies=None, data=None):
"""采集纤程"""
method = 'POST' if method.upper() == 'POST' else 'GET'
headers = headers if headers else HEADERS
        timeout = ClientTimeout(total=timeout) if timeout and isinstance(timeout, (int, float)) else ClientTimeout(total=TIMEOUT)
cookies = cookies if cookies else None
data = data if data and isinstance(data, dict) else {}
async with ClientSession(headers=headers, timeout=timeout, cookies=cookies) as session:
try:
if method == 'GET':
async with session.get(url) as response:
return await response.read()
else:
async with session.post(url, data=data) as response:
return await response.read()
            except Exception as e:
                # Log failures instead of swallowing them silently; callers get None
                self.logger.error('fetch failed: {} {}'.format(url, e))
                return None
def prepare_fetch(self, urls):
"""准备future_list
:param urls:
:return: list: future
"""
return [asyncio.ensure_future(self.fetch(url)) for url in urls]
def crawl_batch_urls(self, urls):
"""执行采集
:param urls: url列表
:return: result 列表
"""
future_list = self.prepare_fetch(urls)
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(future_list))
        self.logger.info('Finished a batch of {} urls'.format(len(urls)))
return [future.result() for future in future_list]
def crawl(self, urls, concurrent: int):
"""循环采集,指定并发,结束返回
:param urls: url列表
:param concurrent: 并发数
:return: result 列表
"""
start = time.time()
        self.logger.info('Task started, {} urls in total'.format(len(urls)))
result_list = []
for index in range(0, len(urls), concurrent):
result_list.extend(self.crawl_batch_urls(urls[index:index+concurrent]))
ended = time.time()
        self.logger.info('Task finished in {:.2f}s'.format(ended - start))
return result_list
if __name__ == '__main__':
a = AioCrawl()
    # takes roughly 3-4 seconds
a.crawl(['https://www.sina.com.cn' for _ in range(10)], 10)
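Note that fetch returns None for failed requests, so the result list may contain None entries that callers should filter out. Each batch also waits for all of its requests to finish before the next batch starts, so a single slow URL stalls the whole batch; a semaphore-based concurrency limit would avoid this, at the cost of a slightly more involved template.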
Server side
To be continued...
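Until that section is written, here is a minimal server sketch, assuming aiohttp 3.x's web API (the route and handler names are only placeholders):

from aiohttp import web

async def index(request):
    # Handlers are coroutines that take a Request and return a Response
    return web.Response(text='hello, aiohttp')

app = web.Application()
app.add_routes([web.get('/', index)])

if __name__ == '__main__':
    web.run_app(app, host='127.0.0.1', port=8080)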