Downloading girl pics asynchronously with asyncio + aiohttp + aiofiles

Since I'm still not familiar with asyncio, I keep feeling this code isn't written well; it reads like async code mixed in with sync code. I'd appreciate any pointers (one possible restructuring is sketched after the code).

import asyncio
import os
import re

import aiofiles
import aiohttp
from lxml import etree


# Send a request and fetch the HTML text
async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()


# Parse the HTML and yield each gallery's page link plus its save directory
async def parser(html):
    tree = etree.HTML(html)
    pic_href_list = tree.xpath('//*[@class="listbox"]/a/@href')
    pic_title_list = tree.xpath('//*[@class="listbox"]/a/@title')
    for href, title in zip(pic_href_list, pic_title_list):
        path_id = re.findall(r'\d+', href)[0]
        dir_path = os.path.join(os.getcwd(), 'zdqx', f"{path_id}_{title}")
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        # hrefs are protocol-relative ("//..."), so swap in an explicit scheme
        yield 'http://' + href[2:], dir_path


# Collect all of the image links in one gallery
async def detail_parser(html):
    tree = etree.HTML(html)
    src_list = tree.xpath('//div[@class="img-box"]/div/a/img/@src')
    return src_list[:-1]  # drop the trailing entry (presumably not a gallery image)


# Download an image and save it with the async file library aiofiles
async def content(session, url, file_path):
    async with session.get(url) as response:
        img = await response.read()
        # the "async with" block closes the file automatically; no explicit close() needed
        async with aiofiles.open(file_path, mode='wb') as f:
            await f.write(img)


async def download(url):
    async with aiohttp.ClientSession() as session:
        html_text = await fetch(session, url)
        async for detail_url, dir_path in parser(html_text):
            detail_text = await fetch(session, detail_url)
            src_list = await detail_parser(detail_text)
            for index, src in enumerate(src_list):
                file_path = os.path.join(dir_path, f"{index}.jpg")
                if not os.path.exists(file_path):
                    try:
                        await content(session, src, file_path)
                    except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                        # a failed request raises a client or timeout error
                        print(e)
                    finally:
                        print(src)


if __name__ == '__main__':
    urls = ['http://www.zdqx.com/qingchun/index.html']
    urls += [f'http://www.zdqx.com/qingchun/index_{i}.html' for i in range(2, 41)]

    loop = asyncio.get_event_loop()
    tasks = [asyncio.ensure_future(download(url)) for url in urls]
    loop.run_until_complete(asyncio.gather(*tasks))
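
One source of the "async mixed with sync" feeling: each gallery's images are downloaded one at a time with sequential awaits, and blocking calls such as os.path.exists run directly on the event loop. Below is a minimal sketch of a more concurrent structure. It assumes Python 3.9+ (for asyncio.to_thread and asyncio.run) and reuses fetch, parser, detail_parser, content and urls from above; the cap of 10 simultaneous downloads is an arbitrary choice.

async def save_image(session, sem, src, file_path):
    # os.path.exists blocks, so run it in a worker thread
    if await asyncio.to_thread(os.path.exists, file_path):
        return
    async with sem:  # limit the number of simultaneous downloads
        try:
            await content(session, src, file_path)
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            print(e)
        finally:
            print(src)


async def download(session, sem, url):
    html_text = await fetch(session, url)
    async for detail_url, dir_path in parser(html_text):
        detail_text = await fetch(session, detail_url)
        src_list = await detail_parser(detail_text)
        # start every image in the gallery at once instead of awaiting one by one
        await asyncio.gather(*(
            save_image(session, sem, src, os.path.join(dir_path, f"{index}.jpg"))
            for index, src in enumerate(src_list)
        ))


async def main(urls):
    sem = asyncio.Semaphore(10)
    # share one session across all list pages instead of creating one per page
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(download(session, sem, url) for url in urls))


asyncio.run(main(urls))  # replaces the get_event_loop()/run_until_complete() boilerplate

Note that parser() still calls os.makedirs synchronously; it could be wrapped in asyncio.to_thread the same way.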


Reposted from: https://juejin.im/post/5d073e6ce51d45777621bb82
