concurrent.futures

  The list of images to download (girl photos from meizitu):

These are all the images from one page (http://www.meizitu.com/a/5593.html); the list itself was scraped with Python (a sketch of one way to do that follows the list).

urlList = [
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/01.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/02.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/03.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/04.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/05.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/06.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/07.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/08.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/09.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/10.jpg',
    'http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/11.jpg',
]
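For reference, here is a minimal sketch of how such a list could be scraped, assuming the gallery page embeds the photos as plain `<img src="...jpg">` tags (the pattern is an assumption about the page markup, not taken from the original post):

# Hypothetical scraping sketch; assumes the gallery page exposes the photos
# as plain <img src="...jpg"> tags. Adjust the pattern to the real markup.
import re
import requests

def scrape_image_urls(page_url):
    html = requests.get(page_url, headers={'Referer': 'http://www.meizitu.com/'}).text
    return re.findall(r'src="(http://[^"]+\.jpg)"', html)

urlList = scrape_image_urls('http://www.meizitu.com/a/5593.html')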

  To download these images, the following approaches are available:

  • Multithreading
import requests
from concurrent import futures
import time

headers = {
    "Referer": "http://www.meizitu.com/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv: 11.0) like Gecko'
}
# Same 11 image URLs as above (01.jpg through 11.jpg).
urlList = ['http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/{:02d}.jpg'.format(i)
           for i in range(1, 12)]

def download_one(url):
    """Fetch one image and save it under its original file name."""
    resp = requests.get(url, headers=headers)
    file_name = url.split("/")[-1]
    with open(file_name, 'wb') as file:
        file.write(resp.content)

start = time.time()
# One thread per URL: the work is I/O-bound, so the threads overlap the waiting.
with futures.ThreadPoolExecutor(len(urlList)) as executor:
    executor.map(download_one, urlList)
print(time.time() - start)

# output: 20.165

  It took about twenty seconds.
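Note that `executor.map` yields results lazily, so any exception in a worker only surfaces when the results are consumed; since they aren't consumed here, failures pass silently. A sketch of the same download using `submit`/`as_completed` (reusing `download_one` and `urlList` from the block above) makes each task's outcome explicit:

# Sketch: same download, but with per-task status via submit/as_completed.
with futures.ThreadPoolExecutor(len(urlList)) as executor:
    task_map = {executor.submit(download_one, url): url for url in urlList}
    for task in futures.as_completed(task_map):
        url = task_map[task]
        try:
            task.result()  # re-raises any exception from the worker thread
            print("done:", url)
        except Exception as e:
            print("failed:", url, repr(e))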

  • Multiprocessing
import requests
from concurrent import futures
import time

headers = {
    "Referer": "http://www.meizitu.com/",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv: 11.0) like Gecko'
}
# Same 11 image URLs as above.
urlList = ['http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/{:02d}.jpg'.format(i)
           for i in range(1, 12)]

def download_one(url):
    resp = requests.get(url, headers=headers)
    file_name = url.split("/")[-1]
    with open(file_name, 'wb') as file:
        file.write(resp.content)

# The guard is required on Windows, where each worker process re-imports this module.
if __name__ == '__main__':
    start = time.time()
    with futures.ProcessPoolExecutor(4) as executor:
        executor.map(download_one, urlList)
    print(time.time() - start)
# output: 36.158297061920166

Since the system has only four cores, the pool was set to four processes. This result is from Linux; on Windows the original script failed, because ProcessPoolExecutor needs the `if __name__ == '__main__':` guard added above.

The timing varies between roughly twenty-five and forty seconds; this is a dual-boot machine and the hardware is old, so it runs a bit sluggishly.
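Rather than hard-coding 4, the pool size can be derived from the machine. A small sketch, reusing `download_one` and `urlList` from the block above and keeping the `__main__` guard for Windows:

import os
from concurrent import futures

# Derive the pool size from the actual core count instead of hard-coding 4.
# os.cpu_count() can return None, hence the fallback.
if __name__ == '__main__':
    workers = os.cpu_count() or 4
    with futures.ProcessPoolExecutor(max_workers=workers) as executor:
        executor.map(download_one, urlList)

For I/O-bound downloads like these, though, threads usually beat processes anyway, since worker processes add serialization and startup overhead.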

  • Coroutines
import requests
import asyncio
import aiohttp
import random
import time

ua_list = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv: 11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'
]
headers = {
    "Referer": "http://www.meizitu.com/",
    'User-Agent': random.choice(ua_list)
}
# Same 11 image URLs as above.
urlList = ['http://mm.chinasareview.com/wp-content/uploads/2018a/01/01/{:02d}.jpg'.format(i)
           for i in range(1, 12)]

async def download_one(urlList):
    async with aiohttp.ClientSession(headers=headers) as session:
        for url in urlList:
            for i in range(15):
                try:
                    print("request {} for {}".format(i + 1, url))
                    # get_proxy() assumes a local proxy-pool service on
                    # 127.0.0.1:5010; note that its blocking requests.get
                    # stalls the event loop (see the non-blocking sketch below).
                    proxy = 'http://' + get_proxy()
                    resp = await session.get(url, proxy=proxy, timeout=5)
                    if resp.status != 200:
                        print("bad status code: {}".format(resp.status))
                        continue
                    content = await resp.read()
                    file_name = url.split("/")[-1]
                    with open(file_name, 'wb') as file:
                        file.write(content)
                    print("{} downloaded successfully".format(url))
                    break
                except asyncio.TimeoutError:
                    print("request {} for {} timed out".format(i + 1, url))
                except Exception as e:
                    print("request {} for {} failed".format(i + 1, url))
                    print("unexpected exception: " + repr(e))




def get_proxy():
    # Assumes a local proxy-pool service (e.g. the proxy_pool project)
    # serving fresh proxies at this endpoint.
    resp = requests.get('http://127.0.0.1:5010/get/')
    return resp.text

start = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(download_one(urlList))
loop.close()
print(time.time() - start)
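One caveat: `get_proxy()` calls `requests.get` synchronously, which blocks the event loop on every retry. A non-blocking variant could fetch the proxy through aiohttp instead; a sketch, assuming the same local proxy-pool endpoint:

# Sketch: fetch a proxy without blocking the event loop. Assumes the same
# local proxy-pool service at 127.0.0.1:5010 as above.
async def get_proxy_async(session):
    async with session.get('http://127.0.0.1:5010/get/') as resp:
        return 'http://' + (await resp.text()).strip()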


A second coroutine example: downloading country-flag GIFs for twenty country codes from flupy.org.

import sys

urlList = 'cn in us id br pk ng bd ru jp mx ph vn et eg de ir tr cd fr'.split(" ")
base_url = 'http://flupy.org/data/flags/{}/{}'

async def get_flag(url):
    async with aiohttp.ClientSession(headers=headers) as session:
        resp = await session.get(url)
        content = await resp.read()
        return content

def show(text):
    # Print progress on one line as each flag finishes.
    print(text, end=' ')
    sys.stdout.flush()

def save_flag(img, filename):
    with open(filename, 'wb') as file:
        file.write(img)

async def download_one(cc):
    # Build the URL here so the saved file is named cc.gif, not the full URL.
    url = base_url.format(cc, cc + '.gif')
    img = await get_flag(url)
    show(cc)
    save_flag(img, cc + '.gif')
    return cc

loop = asyncio.get_event_loop()
to_do = [download_one(cc) for cc in urlList]
# asyncio.wait returns two sets of futures: the finished ones and the pending ones.
res, _ = loop.run_until_complete(asyncio.wait(to_do))
loop.close()
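Opening a new ClientSession per flag and running all requests unbounded both scale poorly. A sketch that shares one session and caps concurrency with `asyncio.Semaphore` (the limit of 5 is an arbitrary illustrative value, not from the original post):

# Sketch: share one session and cap concurrent downloads with a semaphore.
async def download_all(cc_list, limit=5):
    semaphore = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession(headers=headers) as session:
        async def bounded(cc):
            async with semaphore:  # at most `limit` requests in flight
                async with session.get(base_url.format(cc, cc + '.gif')) as resp:
                    save_flag(await resp.read(), cc + '.gif')
                return cc
        return await asyncio.gather(*[bounded(cc) for cc in cc_list])

# usage (with a fresh event loop):
# asyncio.new_event_loop().run_until_complete(download_all(urlList))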

Reprinted from blog.csdn.net/hsc_1/article/details/81271481