Python / Async Programming / AIOHTTP Framework - Multiprocessing + Coroutines (Simple Crawler)

Copyright notice: this is an original post by the author and may not be reproduced without permission. https://blog.csdn.net/qq_39591494/article/details/89324739

AIOHTTP


A simple GET request:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import asyncio
import aiohttp


async def main():
    async with aiohttp.ClientSession() as session:
        async with session.get('http://www.httpbin.org/get') as resp:
            print(resp.status)
            print(await resp.json())
            print(await resp.text())

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    # print(loop.run_until_complete(main()))
200
{'args': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Host': 'www.httpbin.org', 'User-Agent': 'Python/3.6 aiohttp/3.5.4'}, 'origin': '221.218.215.124, 221.218.215.124', 'url': 'https://www.httpbin.org/get'}
{
  "args": {},
  "headers": {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "Host": "www.httpbin.org",
    "User-Agent": "Python/3.6 aiohttp/3.5.4"
  },
  "origin": "221.218.215.124, 221.218.215.124",
  "url": "https://www.httpbin.org/get"
}
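
One ClientSession can also serve several requests at once. The sketch below (an assumption, not part of the original post) reuses a single session and collects three status codes with asyncio.gather:

# A minimal sketch (not from the original post): issue the same GET three times
# concurrently, reusing a single ClientSession and collecting the status codes.
import asyncio
import aiohttp


async def fetch(session, url):
    async with session.get(url) as resp:
        return resp.status


async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, 'http://www.httpbin.org/get') for _ in range(3)]
        print(await asyncio.gather(*tasks))   # e.g. [200, 200, 200]

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())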

Multiprocessing + coroutines, downloading images:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import os
import asyncio
import aiohttp
import string
import requests
import time
import random
import logging
from multiprocessing import Process, Pool, get_logger, log_to_stderr
from concurrent.futures import ProcessPoolExecutor

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

logging.basicConfig(
    level=logging.DEBUG,
    format='%(threadName)-10s:%(message)s'
)

TEST_URLS = [
    'https://source.unsplash.com/random',
    'https://source.unsplash.com/user/erondu/1600x900',
    'http://via.placeholder.com/350x150',
    'http://via.placeholder.com/350x150/1c2b3c/999',
]

DOWNLOAD_DIR = os.path.join(BASE_DIR, "download")


def make_temp_name(count=5, f='.jpg'):
    logging.debug('make_temp_name is start....')
    return ''.join([random.choice(string.ascii_letters + string.digits) for _ in range(count)]) + f


async def download_image(url):
    logging.debug('download_image start.....')
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            filename = os.path.join(DOWNLOAD_DIR, make_temp_name(f='-a.jpg'))
            with open(filename, "wb") as f:
                while True:
                    image = await resp.content.read(1024)
                    if not image:
                        break
                    else:
                        f.write(image)


def main():
    start_time = time.time()
    loop = asyncio.get_event_loop()
    # p = Pool(3)
    # for url in TEST_URLS:
    #     result = p.apply_async(loop.run_until_complete(download_image(url)))

    try:
        with ProcessPoolExecutor() as executor:
            for url in TEST_URLS:
                loop.run_until_complete(download_image(url))
    except Exception as e:
        print(e)
    print(f"爬取完成,用时时间:{time.time() - start_time}秒....")

if __name__ == "__main__":
    log_to_stderr()
    get_logger()
    main()
>>>
MainThread:Using selector: SelectSelector
[DEBUG/MainProcess] created semlock with handle 460
[DEBUG/MainProcess] created semlock with handle 420
[DEBUG/MainProcess] Queue._after_fork()
[DEBUG/MainProcess] created semlock with handle 756
MainThread:download_image start.....
MainThread:make_temp_name is start....
MainThread:download_image start.....
MainThread:make_temp_name is start....
MainThread:download_image start.....
MainThread:make_temp_name is start....
MainThread:download_image start.....
MainThread:make_temp_name is start....
爬取完成,用时时间:9.381403923034668....
[INFO/MainProcess] process shutting down
[DEBUG/MainProcess] running all "atexit" finalizers with priority >= 0
[DEBUG/MainProcess] running the remaining "atexit" finalizers
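
Note that in the script above the ProcessPoolExecutor is created but never actually used: each URL is awaited one after another on the main process's event loop, which is why only MainThread/MainProcess shows up in the log. A minimal sketch of actually handing each URL to a worker process (an assumption, not the author's code; every child process spins up its own event loop) might look like this:

# A minimal sketch (an assumption, not the author's code): each URL is sent to a
# worker process, and every worker runs its own event loop to download with aiohttp.
import asyncio
import aiohttp
from concurrent.futures import ProcessPoolExecutor

URLS = [
    'http://via.placeholder.com/350x150',
    'http://via.placeholder.com/350x150/1c2b3c/999',
]


async def fetch(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return len(await resp.read())


def worker(url):
    # A fresh event loop per child process (Python 3.6 style, no asyncio.run).
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(fetch(url))
    finally:
        loop.close()


if __name__ == "__main__":
    with ProcessPoolExecutor(2) as executor:
        for url, size in zip(URLS, executor.map(worker, URLS)):
            print(url, size)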

Multiprocessing + coroutines (Mark 2):

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import asyncio
import aiohttp
import string
import requests
import time
import random
import logging
from multiprocessing import Process, Pool, get_logger, log_to_stderr
from concurrent.futures import ProcessPoolExecutor

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

logging.basicConfig(
    level=logging.DEBUG,
    format='%(threadName)-10s:%(message)s'
)

TEST_URLS = [
    'https://source.unsplash.com/random',
    'https://source.unsplash.com/user/erondu/1600x900',
    'http://via.placeholder.com/350x150',
    'http://via.placeholder.com/350x150/1c2b3c/999',
    'http://img5.imgtn.bdimg.com/it/u=796460492,3306564261&fm=26&gp=0.jpg',
    'http://img2.imgtn.bdimg.com/it/u=1782917320,24227842&fm=26&gp=0.jpg',
    'http://img1.imgtn.bdimg.com/it/u=2665441243,1857925582&fm=26&gp=0.jpg',
    'http://img2.imgtn.bdimg.com/it/u=2931291472,233235010&fm=26&gp=0.jpg',
    'http://img5.imgtn.bdimg.com/it/u=744077169,3705624624&fm=26&gp=0.jpg',
    'http://img2.imgtn.bdimg.com/it/u=872389273,3559301897&fm=26&gp=0.jpg',
    'http://img0.imgtn.bdimg.com/it/u=1404919429,1733398877&fm=26&gp=0.jpg',
    'http://img1.imgtn.bdimg.com/it/u=2082542552,294635837&fm=26&gp=0.jpg',
    'https://www.baidu.com/img/bd_logo1.png'
]

DOWNLOAD_DIR = os.path.join(BASE_DIR, "download")


def make_temp_name(count=5, f='.jpg'):
    logging.debug('make_temp_name is start....')
    return ''.join([random.choice(string.ascii_letters + string.digits) for _ in range(count)]) + f


def afetch_url():
    return TEST_URLS


async def download_image(url, loop):
    logging.debug('download_image start.....')
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            filename = os.path.join(DOWNLOAD_DIR, make_temp_name(f='-a.jpg'))
            with open(filename, "wb") as f:
                while True:
                    image = await resp.content.read(1024)
                    if not image:
                        break
                    else:
                        f.write(image)


async def acrawler(loop):
    logging.debug('starting async crawler...')
    urls = afetch_url()
    tasks = [download_image(url, loop) for url in urls]
    await asyncio.gather(*tasks)


def download_images_2():
    start_time = time.time()
    for i in TEST_URLS:
        image_name = os.path.join(DOWNLOAD_DIR, make_temp_name(f='-a.jpg'))
        url_data = requests.get(i).content
        if url_data:
            with open(image_name, 'wb') as f:
                f.write(url_data)
    print(f"爬取完成,用时时间:{time.time() - start_time}秒....")


# def run_async_crawler():
#     loop = asyncio.get_event_loop()
#     loop.run_until_complete(acrawler(loop))


def get_download():
    start_time = time.time()
    loop = asyncio.get_event_loop()
    try:
        with ProcessPoolExecutor(4) as executor:
            loop.run_until_complete(acrawler(loop))
    except Exception as e:
        print(e)
    print(f"爬取完成,用时时间:{time.time() - start_time}秒....")

    # for url in TEST_URLS:
    #     look = asyncio.get_event_loop()
    #     look.run_until_complete(download_image(url))

    # try:
    #     with ProcessPoolExecutor(4) as executor:
    #         for url in TEST_URLS:
    #             look.run_until_complete(download_image(url))
    # except Exception as e:
    #     print(e)


if __name__ == "__main__":
    log_to_stderr()
    get_logger()
    get_download()
    download_images_2()
>>>
MainThread:Using selector: SelectSelector
[DEBUG/MainProcess] created semlock with handle 568
[DEBUG/MainProcess] created semlock with handle 848
[DEBUG/MainProcess] Queue._after_fork()
[DEBUG/MainProcess] created semlock with handle 476
MainThread:starting async crawler...
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:download_image start.....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
MainThread:make_temp_name is start....
爬取完成,用时时间:6.389447927474976....
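
In get_download() the thirteen downloads overlap because asyncio.gather starts all the coroutines at once, while download_images_2() fetches each URL sequentially with requests. When gathering many more URLs it is usually worth capping the concurrency; below is a minimal sketch (an assumption, not from the original post) using asyncio.Semaphore:

# A minimal sketch (an assumption, not from the original post): cap the number of
# requests in flight with asyncio.Semaphore while still gathering everything.
import asyncio
import aiohttp


async def fetch(session, sem, url):
    async with sem:                       # at most `limit` downloads at once
        async with session.get(url) as resp:
            return len(await resp.read())


async def crawl(urls, limit=4):
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, sem, u) for u in urls))


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    sizes = loop.run_until_complete(crawl(['http://www.httpbin.org/get'] * 8))
    print(sizes)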

Stronger encapsulation with a class:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import asyncio
import aiohttp
import sys
import string
import requests
import time
import random
import logging
from multiprocessing import Process, Pool, get_logger, log_to_stderr
from concurrent.futures import ProcessPoolExecutor

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

logging.basicConfig(
    level=logging.DEBUG,
    format='%(threadName)-10s:%(message)s'
)

TEST_URLS = [
    'https://source.unsplash.com/random',
    'https://source.unsplash.com/user/erondu/1600x900',
    'http://via.placeholder.com/350x150',
    'http://via.placeholder.com/350x150/1c2b3c/999',
    'http://img5.imgtn.bdimg.com/it/u=796460492,3306564261&fm=26&gp=0.jpg',
    'http://img2.imgtn.bdimg.com/it/u=1782917320,24227842&fm=26&gp=0.jpg',
    'http://img1.imgtn.bdimg.com/it/u=2665441243,1857925582&fm=26&gp=0.jpg',
    'http://img2.imgtn.bdimg.com/it/u=2931291472,233235010&fm=26&gp=0.jpg',
    'http://img5.imgtn.bdimg.com/it/u=744077169,3705624624&fm=26&gp=0.jpg',
    'http://img2.imgtn.bdimg.com/it/u=872389273,3559301897&fm=26&gp=0.jpg',
    'http://img0.imgtn.bdimg.com/it/u=1404919429,1733398877&fm=26&gp=0.jpg',
    'http://img1.imgtn.bdimg.com/it/u=2082542552,294635837&fm=26&gp=0.jpg',
    'https://www.baidu.com/img/bd_logo1.png'
]

DOWNLOAD_DIR = os.path.join(BASE_DIR, "download")


class Async:
    def __init__(self, count, filem):
        self.count = count
        self.filem = filem
        self.loop = asyncio.get_event_loop()

    def make_temp_name(self):
        logging.debug('make_temp_name is start....')
        return ''.join([random.choice(string.ascii_letters + string.digits) for _ in range(self.count)]) + self.filem

    def afetch_url(self):
        return TEST_URLS

    async def download_image(self, url):
        logging.debug('download_image start.....')
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                filename = os.path.join(DOWNLOAD_DIR, self.make_temp_name())  # make_temp_name() already appends self.filem
                with open(filename, "wb") as f:
                    while True:
                        image = await resp.content.read(1024)
                        if not image:
                            break
                        else:
                            f.write(image)

    async def acrawler(self, loop):
        logging.debug('starting async crawler...')
        urls = self.afetch_url()
        tasks = [self.download_image(url) for url in urls]
        await asyncio.gather(*tasks)

    def get_download(self):
        start_time = time.time()
        try:
            with ProcessPoolExecutor(4) as executor:
                self.loop.run_until_complete(self.acrawler(self.loop))
        except Exception as e:
            print(e)
        print(f"爬取完成,用时时间:{time.time() - start_time}秒....")

    def download_images_2(self):
        start_time = time.time()
        for i in TEST_URLS:
            image_name = os.path.join(DOWNLOAD_DIR, self.make_temp_name())
            url_data = requests.get(i).content
            if url_data:
                with open(image_name, 'wb') as f:
                    f.write(url_data)

        print(f"爬取完成,用时时间:{time.time() - start_time}秒....")


def main():
    log_to_stderr()
    get_logger()
    while True:
        menu = {
            "1" : "普通爬取",
            "2" : "多进程+协程爬取",
            "Q" : "退出"
        }
        for k, v in menu.items():
            print(f"{k} : {v}")
        
        Your = input("请您输入:").strip().upper()
        if Your == "1":
            A = Async(5, '.jpg')
            A.download_images_2()
        elif Your == "2":
            A = Async(5, '.jpg')
            A.get_download()
        elif Your == "Q":
            sys.exit()

if __name__ == "__main__":
    main()


# TODO: test notes (Mark)
#-------------------------------------
# def download_images_2():
#     start_time = time.time()
#     for i in TEST_URLS:
#         image_name = os.path.join(DOWNLOAD_DIR, make_temp_name(f='-a.jpg'))
#         url_data = requests.get(i).content
#         if url_data:
#             with open(image_name, 'wb') as f:
#                 f.write(url_data)
#     print(f"爬取完成,用时时间:{time.time() - start_time}秒....")


# def run_async_crawler():
#     loop = asyncio.get_event_loop()
#     loop.run_until_complete(acrawler(loop))



# TODO: test notes (Mark)
# ------------------------------
# for url in TEST_URLS:
#     look = asyncio.get_event_loop()
#     look.run_until_complete(download_image(url))

# try:
#     with ProcessPoolExecutor(4) as executor:
#         for url in TEST_URLS:
#             look.run_until_complete(download_image(url))
# except Exception as e:
#     print(e)

Mark (fetching a page's HTML):

#!/usr/bin/env python
# -*- coding:utf-8 -*-


import asyncio
import aiohttp
import logging
from bs4 import BeautifulSoup
from multiprocessing import Process, Pool, get_logger, log_to_stderr
from concurrent.futures import ProcessPoolExecutor

logging.basicConfig(
    level=logging.DEBUG,
    format='%(threadName)-10s:%(message)s'
)


class Async:
    def __init__(self, URL, file_name):
        self.URL = URL
        self.file_name = file_name
        self.loop = asyncio.get_event_loop()

    async def get_url(self):
        logging.debug('get_url start....')
        async with aiohttp.ClientSession() as session:
            async with session.get(self.URL) as resp:
                data = await resp.text()
                self.save_html(data)
    
    def save_html(self, data):
        logging.debug('save_html function start.....')
        if not data:
            return False
        with open(self.file_name, 'w', encoding='utf-8') as f:
            f.write(data)

    def run_url(self):
        logging.debug('run_url function start.....')
        try:
            with ProcessPoolExecutor(4) as executor:
                self.loop.run_until_complete(self.get_url())
        except Exception as e:
            print(e)
                

if __name__ == "__main__":
    log_to_stderr()
    get_logger()
    A = Async('https://edu.51cto.com/t/user/ianswer/id-3096.html', 'yankai.txt')
    A.run_url()
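
BeautifulSoup is imported above but never used; the class only writes the raw HTML to yankai.txt. A minimal follow-up sketch (an assumption, not part of the original script) that parses the saved file:

# A minimal sketch (an assumption, not part of the original script): parse the
# HTML that save_html() wrote to yankai.txt with the already-imported BeautifulSoup.
from bs4 import BeautifulSoup

with open('yankai.txt', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

print(soup.title.string if soup.title else 'no <title> found')
for link in soup.find_all('a', href=True)[:10]:   # first ten links on the page
    print(link['href'])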
