aiohttp Notes

Introduction

aiohttp requires Python 3.5.3 or later. It works not only as a client for crawling but also as a server. Built on asyncio and coroutines, it is highly efficient.
Official documentation
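
Before the full template, a minimal aiohttp client request looks roughly like the sketch below (assuming Python 3.7+ for asyncio.run; the URL is only an example):

import asyncio
from aiohttp import ClientSession


async def main():
    # A single ClientSession manages the connection pool for all requests
    async with ClientSession() as session:
        async with session.get('https://www.example.com') as response:
            print(response.status)
            print(await response.text())


asyncio.run(main())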

Crawler template

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author:happy_code
@email: [email protected]
@file:  aiocrawl.py
@time:  2019/03/27
@desc:  aiohttp crawler template
"""
import asyncio
import logging
import time

from aiohttp import ClientSession, ClientTimeout


logging.basicConfig(level=logging.INFO, format='[%(asctime)s] - %(levelname)s in %(filename)s.%(funcName)s: %(message)s')

# Default request headers
HEADERS = {
    'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/69.0.3497.100 Safari/537.36',
}

# Default timeout (seconds)
TIMEOUT = 15


class AioCrawl:

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    async def fetch(self, url, method='GET', headers=None, timeout=None, cookies=None, data=None):
        """Fetch a single URL (coroutine)."""

        method = 'POST' if method.upper() == 'POST' else 'GET'
        headers = headers if headers else HEADERS
        timeout = ClientTimeout(total=timeout) if timeout and isinstance(timeout, int) else ClientTimeout(total=TIMEOUT)
        cookies = cookies if cookies else None
        data = data if data and isinstance(data, dict) else {}

        async with ClientSession(headers=headers, timeout=timeout, cookies=cookies) as session:
            try:
                if method == 'GET':
                    async with session.get(url) as response:
                        return await response.read()
                else:
                    async with session.post(url, data=data) as response:
                        return await response.read()
            except Exception as e:
                # Log the failure instead of silently swallowing it; the caller gets None
                self.logger.error('fetch failed: {} - {}'.format(url, e))
                return None

    def prepare_fetch(self, urls):
        """Wrap each URL into a fetch Future.
        :param urls: list of URLs
        :return: list of futures
        """
        return [asyncio.ensure_future(self.fetch(url)) for url in urls]

    def crawl_batch_urls(self, urls):
        """Run one batch of fetches to completion.
        :param urls: list of URLs
        :return: list of results
        """
        future_list = self.prepare_fetch(urls)

        # Drive this batch of futures on the shared event loop until all finish
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.wait(future_list))

        self.logger.info('finished a batch: {}'.format(len(urls)))
        return [future.result() for future in future_list]

    def crawl(self, urls, concurrent: int):
        """Crawl the URLs batch by batch with the given concurrency, return when done.
        :param urls: list of URLs
        :param concurrent: number of concurrent requests per batch
        :return: list of results
        """
        start = time.time()
        self.logger.info('task started, number of URLs: {}'.format(len(urls)))

        result_list = []
        for index in range(0, len(urls), concurrent):
            result_list.extend(self.crawl_batch_urls(urls[index:index + concurrent]))

        ended = time.time()
        self.logger.info('task finished, elapsed: {}'.format(ended - start))
        return result_list


if __name__ == '__main__':
    a = AioCrawl()
    # 10 URLs with concurrency 10: takes roughly 3-4 seconds
    a.crawl(['https://www.sina.com.cn' for _ in range(10)], 10)
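
Note that crawl() works batch by batch: the URL list is split into chunks of size concurrent, and each chunk must finish completely before the next one starts, so concurrent acts as both the batch size and the upper bound on in-flight requests.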

Server side

To be continued...
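
Until that part is written, a minimal aiohttp server built on the aiohttp.web module might look like the sketch below; the route and handler names are placeholders:

from aiohttp import web


async def handle(request):
    # Reply with a plain-text greeting; match_info carries path parameters
    name = request.match_info.get('name', 'world')
    return web.Response(text='Hello, {}!'.format(name))


app = web.Application()
app.add_routes([web.get('/', handle), web.get('/{name}', handle)])

if __name__ == '__main__':
    web.run_app(app)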

Reposted from www.cnblogs.com/haoabcd2010/p/10615364.html