Asynchronously downloading girl-picture galleries with asyncio + aiohttp + aiofiles

Since I'm not very familiar with asyncio yet, I always feel this is badly written — asynchronous and synchronous code seem mingled together. I'm hoping for some advice.
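Part of what I mean by mingling: calls like os.makedirs inside the parser coroutine below run synchronously and block the event loop while they execute. A minimal sketch of one way to offload such a call to a worker thread (ensure_dir is a hypothetical helper of mine; asyncio.to_thread needs Python 3.9+, on older versions loop.run_in_executor(None, ...) does the same job):

import asyncio
import os

async def ensure_dir(dir_path):
    # hypothetical helper: run the blocking mkdir in a worker thread
    # exist_ok=True also removes the need for a separate os.path.exists check
    await asyncio.to_thread(os.makedirs, dir_path, exist_ok=True)

Here is the full script: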

import os
import re

import aiofiles
import aiohttp
import asyncio
from lxml import etree


# Fetch a URL and return its HTML text
async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()


# Parse the index HTML and yield (list-page URL, directory path) for each gallery
async def parser(html):
    tree = etree.HTML(html)
    pic_href_list = tree.xpath('//*[@class="listbox"]/a/@href')
    pic_title_list = tree.xpath('//*[@class="listbox"]/a/@title')
    for href, title in zip(pic_href_list, pic_title_list):
        path_id = re.findall(r'\d+', href)[0]  # raw string avoids an invalid-escape warning
        dir_path = os.path.join(os.getcwd(), 'zdqx', f"{path_id}_{title}")
        # note: os.path.exists / os.makedirs are blocking calls inside a coroutine
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        # the hrefs are protocol-relative ("//..."), so strip the slashes and add a scheme
        yield 'http://' + href[2:], dir_path


# Parse a gallery page and collect all of its image links
# (this contains no await, so it needn't be a coroutine at all)
async def detail_parser(html):
    tree = etree.HTML(html)
    src_list = tree.xpath('//div[@class="img-box"]/div/a/img/@src')
    return src_list[:-1]  # the last src on the page is skipped


# Download one image and write it to disk with the async file library aiofiles
async def content(session, url, file_path):
    async with session.get(url) as response:
        img = await response.read()
        async with aiofiles.open(file_path, mode='wb') as f:
            await f.write(img)  # the async context manager closes the file


async def download(url):
    async with aiohttp.ClientSession() as session:
        html_text = await fetch(session, url)
        async for detail_url, dir_path in parser(html_text):
            detail_text = await fetch(session, detail_url)
            src_list = await detail_parser(detail_text)
            for index, src in enumerate(src_list):
                file_path = os.path.join(dir_path, f"{index}.jpg")
                if not os.path.exists(file_path):
                    try:
                        await content(session, src, file_path)
                    except (AssertionError, aiohttp.ClientError) as e:  # network/HTTP failures
                        print(e)
                    finally:
                        print(src)


if __name__ == '__main__':
    urls = ['http://www.zdqx.com/qingchun/index.html']
    urls += [f'http://www.zdqx.com/qingchun/index_{i}.html' for i in range(2, 41)]

    loop = asyncio.get_event_loop()
    tasks = [asyncio.ensure_future(download(url)) for url in urls]
    loop.run_until_complete(asyncio.gather(*tasks))
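
One improvement along the lines I was asking about: download awaits content for one image at a time, so within a gallery there is no real concurrency. A minimal sketch of downloading a whole src_list concurrently while capping the number of simultaneous requests (download_all, bounded, and the limit of 10 are my own names and choices, not from the original code; it reuses the content coroutine above):

import asyncio
import os

async def download_all(session, src_list, dir_path, limit=10):
    sem = asyncio.Semaphore(limit)  # cap the number of simultaneous requests

    async def bounded(index, src):
        file_path = os.path.join(dir_path, f"{index}.jpg")
        async with sem:
            await content(session, src, file_path)  # reuses content() from above

    # schedule every image at once; gather waits until all have finished
    await asyncio.gather(*(bounded(i, s) for i, s in enumerate(src_list)))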


Reproduced from: https://juejin.im/post/5d073e6ce51d45777621bb82


Origin: blog.csdn.net/weixin_34186128/article/details/93179228