from threading import Thread
import time
import requests,re
from urllib.parse import urljoin
# Text encoding name for fetched pages (not referenced by the visible code).
encoding = 'utf-8'

# Captures the text between <title> and </title>. Matching ignores case, and
# '.' also matches newlines so a title spanning several lines is captured.
title_re = re.compile(
    r'<title>(.*?)</title>',
    flags=re.IGNORECASE | re.DOTALL,
)

# Captures the quoted href value of an <a> tag. A value containing a quote,
# ';' or '#' does not match at all, so fragment links are skipped.
link_re = re.compile(
    r'<a[^>]+href\s*=\s*["\']([^"\';#]+)["\'][^>]*>',
    flags=re.IGNORECASE,
)
def down(url, revers=3):
    """Download ``url`` and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        revers: Number of extra attempts made after the first request fails.

    Returns:
        The response text, or ``None`` when every attempt failed.
    """
    last_err = None
    # One initial attempt plus ``revers`` retries; a negative value still
    # yields a single attempt.
    for _ in range(max(revers, 0) + 1):
        try:
            r = requests.get(url, timeout=5)
        except requests.RequestException as err:
            # requests reports timeouts and connection failures through its
            # own exception hierarchy, not the built-in TimeoutError.
            last_err = err
            continue
        r.encoding = 'utf-8'
        return r.text
    # Every attempt failed: report the last error and signal it with None.
    print(f'错误url{url},错误原因{last_err}')
    return None
def get_title(html):
    """Return the text of every ``<title>`` element in ``html``, concatenated.

    Returns ``None`` when ``html`` is not a string.
    """
    if isinstance(html, str):
        titles = title_re.findall(html)
        return ''.join(titles)
    return None
def link_all(url, html, prefix='http://www.xiaoxingyun.net/'):
    """Collect the links in ``html`` that point into the site ``prefix``.

    Args:
        url: Address the page was fetched from; relative hrefs are resolved
            against it.
        html: Page source to scan.
        prefix: Only resolved links starting with this string are kept.

    Returns:
        A set of absolute URLs, or ``None`` when ``html`` is not a string.
    """
    if not isinstance(html, str):
        return None
    # Resolve first, filter second: hrefs may be relative, and a relative
    # href can only match the prefix after it has been made absolute.
    resolved = (urljoin(url, href) for href in link_re.findall(html))
    return {link for link in resolved if link.startswith(prefix)}
def main(url):
    """Worker entry point: download ``url`` and print its page title."""
    page = down(url)
    print(get_title(page))
def run(url, pool_num=30):
    """Fetch ``url`` and process each link found on it in a worker thread.

    Every link returned by ``link_all`` is handed to ``main`` on its own
    thread. The call returns only after all workers have finished.

    Args:
        url: Start page whose links are extracted.
        pool_num: Maximum number of worker threads alive at the same time.
    """
    html = down(url)
    links = link_all(url, html)
    if not links:
        # Download failed (None) or the page has no matching links.
        return

    limit = max(pool_num, 1)  # always allow at least one worker
    workers = []
    for link in links:
        if len(workers) >= limit:
            # Pool is full: block on the oldest worker instead of dropping
            # the link, then forget every worker that has finished.
            workers[0].join()
            workers = [w for w in workers if w.is_alive()]
        worker = Thread(target=main, args=(link,))
        worker.start()
        workers.append(worker)

    # Wait for the remaining workers so callers can time the whole crawl.
    for worker in workers:
        worker.join()
if __name__ == '__main__':
    # Time the crawl of the start page and print the elapsed seconds.
    start_time = time.time()
    run('http://www.xiaoxingyun.net/')
    end_time = time.time()
    # Elapsed time is end minus start, so the printed figure is non-negative.
    print(f'{end_time - start_time:.2f}')
# Python multithreading example
# You may also like
# Reposted from blog.csdn.net/haohaomax1/article/details/109732685
# Today's recommendations
# Weekly ranking