# Python 多线程案例

from threading import Thread
import time
import requests,re
from urllib.parse import urljoin

# Name of the text encoding used for pages.
# NOTE(review): not referenced by name anywhere in the visible code.
encoding = 'utf-8'

# Captures whatever sits between <title> and </title>. Matching ignores case
# and '.' also matches newlines, so multi-line titles are captured.
title_re = re.compile(
    r'<title>(.*?)</title>',
    flags=re.IGNORECASE | re.DOTALL,
)

# Captures the quoted href value of an <a ...> tag. The value may not contain
# quotes, ';' or '#', so anchors carrying a fragment do not match.
link_re = re.compile(
    r'<a[^>]+href\s*=\s*["\']([^"\';#]+)["\'][^>]*>',
    flags=re.IGNORECASE,
)
def down(url, revers=3):
    """Download *url* and return the response body as text.

    Args:
        url: Address fetched with ``requests.get`` (5 second timeout).
        revers: Number of additional attempts made after a timeout.

    Returns:
        The response text decoded as UTF-8, or ``None`` when every attempt
        timed out or the request failed for any other reason.
    """
    # One initial attempt plus `revers` retries. A loop keeps the retry
    # count bounded and lets a successful retry actually return its result.
    for _ in range(max(revers, 0) + 1):
        try:
            r = requests.get(url, timeout=5)
        except requests.exceptions.Timeout:
            # requests raises its own Timeout type, not the builtin
            # TimeoutError, so this is the exception that must be caught.
            continue
        except requests.exceptions.RequestException as err:
            # Any other request failure is not retried.
            print(f'错误url{url},错误原因{err}')
            return None
        r.encoding = 'utf-8'
        return r.text
    return None

def get_title(html):
    """Return the title text found in *html*.

    Every ``<title>...</title>`` match of ``title_re`` is concatenated into
    a single string (an empty string when there is no match). Returns
    ``None`` when *html* is not a ``str``.
    """
    if isinstance(html, str):
        titles = title_re.findall(html)
        return ''.join(titles)
    return None


def link_all(url, html):
    """Collect the on-site links contained in *html*.

    Args:
        url: Address of the page *html* was downloaded from; relative hrefs
            are resolved against it.
        html: Page source. Anything that is not a ``str`` is rejected.

    Returns:
        A set of absolute URLs that start with
        ``http://www.xiaoxingyun.net/``, or ``None`` when *html* is not a
        ``str``.
    """
    if not isinstance(html, str):
        return None

    site_prefix = 'http://www.xiaoxingyun.net/'
    # Resolve first, filter second: hrefs may be relative, and checking the
    # prefix before urljoin() would discard every relative link.
    resolved = (urljoin(url, link) for link in link_re.findall(html))
    return {link for link in resolved if link.startswith(site_prefix)}

def main(url):
    """Download *url* and print the title extracted from the page.

    Prints ``None`` when ``get_title`` returns ``None``.
    """
    print(get_title(down(url)))

def run(url):
    """Crawl *url* and process every link found on it in worker threads.

    The page at *url* is downloaded, its links are extracted with
    ``link_all`` and ``main`` is run for each link in its own thread. At
    most ``pool_num`` workers are kept in flight, and the function returns
    only after every worker has finished.

    Args:
        url: Address of the start page.
    """
    pool_num = 30  # maximum number of worker threads in flight
    html = down(url)
    # link_all() returns None when the download failed; treat it as no links.
    links = link_all(url, html) or ()

    pool = []  # threads that have been started and may still be running
    for link in links:
        if len(pool) >= pool_num:
            # Pool is full: block until the oldest worker is done, then drop
            # every finished thread so new slots open up.
            pool[0].join()
            pool = [t for t in pool if t.is_alive()]
        worker = Thread(target=main, args=(link,))
        worker.start()
        pool.append(worker)

    # Wait for the remaining workers so callers can time the whole crawl.
    for worker in pool:
        worker.join()

if __name__ == '__main__':
    # Script entry point: crawl the start page and report how long the
    # call to run() took, in seconds.
    start_time = time.time()
    run('http://www.xiaoxingyun.net/')
    end_time = time.time()
    # Elapsed time is end minus start; the reverse order printed a
    # negative duration.
    print(f'{end_time - start_time:.2f}')

猜你喜欢

转载自blog.csdn.net/haohaomax1/article/details/109732685
今日推荐