# Python笔记——多进程与多线程

import concurrent
import functools
import threading
import time
from concurrent import futures
from multiprocessing import Pool

import pandas as pd
import requests

# NOTE(review): no module named ``current`` ships with Python; this looks like
# a typo for ``concurrent`` (imported above) -- confirm and remove.
import current

# Decorator that prints how long the wrapped function takes to run.
def gettime(func):
    """Wrap ``func`` so every call prints a banner and its elapsed time.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed wall-clock seconds, and returns whatever
        ``func`` returned.
    """
    # functools.wraps keeps the wrapped function's __name__ and __doc__.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print("=" * 50)
        print(func.__name__, 'Strat...')
        starttime = time.time()
        # Forward keyword arguments too, and keep the result for the caller.
        result = func(*args, **kwargs)
        spendtime = time.time() - starttime
        print(func.__name__, "End...")
        print("Spend", spendtime, "s totally")
        print("=" * 50)
        return result
    return wrapper

# Load the first n test URLs from the CSV file.
def get_urls_from_files(n):
    """Return the first ``n`` values of the ``url`` column in TestUrls.csv.

    Args:
        n: Upper bound on how many URLs to return (applied as a slice end).

    Returns:
        A list with at most ``n`` entries taken from the top of the column.
    """
    url_column = pd.read_csv('TestUrls.csv')['url']
    return url_column.tolist()[:n]

# Request a page and return its body text.
def getdata(url, retries=3):
    """Download ``url`` and return the response body as text.

    A 5XX status is a server-side error, so the request is repeated while
    ``retries`` is still positive. Any other response (including 4XX) has its
    text returned as-is.

    Args:
        url: Address to request.
        retries: Number of additional attempts allowed after a 5XX response.

    Returns:
        The response text, or ``None`` when the connection failed or the
        server still answered with a 5XX status after all retries.
    """
    headers = {}
    try:
        html = requests.get(url, headers=headers)
    except requests.exceptions.ConnectionError as e:
        print('下载出错，错误原因：', e)
        return None

    # 5XX means the server failed; the request can be issued again.
    if 500 <= html.status_code < 600:
        # ``> 0`` (not plain truthiness) so a negative count cannot loop forever.
        if retries > 0:
            print("服务器出错正在重试...")
            # Return the retry's outcome instead of discarding it.
            return getdata(url, retries - 1)
        return None

    return html.text

# Serial baseline
@gettime
def Mynormal():
    """Fetch each entry of the module-level ``urls`` one after another."""
    for target in urls:
        getdata(target)

# Process pool
@gettime
def MyprocessPool(num=10):
    """Fetch every entry of the module-level ``urls`` with a process pool.

    Args:
        num: Number of worker processes in the pool.

    Returns:
        The list produced by ``pool.map(getdata, urls)``. The value only
        reaches the caller if the ``gettime`` decorator forwards return values.
    """
    # The context manager releases the pool even when map() raises. map()
    # blocks until all results are in, so no task is cut short on exit.
    with Pool(num) as pool:
        return pool.map(getdata, urls)

# Thread pool
@gettime
def Myfutures(num_of_max_works=10):
    """Fetch every entry of the module-level ``urls`` with a thread pool.

    Args:
        num_of_max_works: Maximum number of worker threads.

    Returns:
        The list of ``getdata`` results in the order of ``urls``. The value
        only reaches the caller if the ``gettime`` decorator forwards return
        values.
    """
    # The executor class is spelled ThreadPoolExecutor (capital T).
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_of_max_works) as executor:
        # executor.map is lazy: consuming it here surfaces any exception
        # raised inside a worker instead of silently dropping it.
        return list(executor.map(getdata, urls))

if __name__ == '__main__':
    # ``urls`` is read as a module-level global by the three runners below.
    # The loader is defined as ``get_urls_from_files`` (plural).
    urls = get_urls_from_files(100)
    # Serial baseline
    Mynormal()
    # Process pool
    MyprocessPool(10)
    # Thread pool
    Myfutures(100)
# Python笔记——多进程与多线程

# 猜你喜欢