An aiohttp crawler template

import aiohttp
import asyncio
import async_timeout
from urllib.parse import urljoin, urldefrag

root_url = 'http://python.org/'  # seed URL to start crawling from
crawled_urls, url_hub = [], [root_url]
headers = {'user-agent': 'Opera/9.80 (X11; Linux x86_64; U; en) Presto/2.2.15 Version/10.10'}

async def get_body(url):
    async with aiohttp.ClientSession() as session:
        try:
            async with async_timeout.timeout(10):  # set the request timeout
                async with session.get(url, headers=headers) as response:
                    if response.status == 200:
                        html = await response.text()
                        return {'error': '', 'html': html, 'url': url}
                    else:
                        return {'error': response.status, 'html': '', 'url': url}
        except Exception as err:
            # `response` does not exist here if the request itself failed,
            # so report the exception rather than response.status
            return {'error': str(err), 'html': '', 'url': url}

async def handle_task(task_id, work_queue):
    while not work_queue.empty():
        queue_url = await work_queue.get()
        if queue_url not in crawled_urls:
            body = await get_body(queue_url)
            if not body['error']:
                crawled_urls.append(queue_url)
                parse(body)
            else:
                print('failed to crawl {}'.format(queue_url))

# parse the fetched page (left as a stub in this template)
def parse(body):
    pass

def remove_fragment(url):
    pure_url, frag = urldefrag(url)
    return pure_url

# extract href values from the HTML and resolve them against the root URL
def get_urls(html):
    new_urls = [url.split('"')[0] for url in str(html).replace("'", '"').split('href="')[1:]]
    return [urljoin(root_url, remove_fragment(new_url)) for new_url in new_urls]

if __name__ == '__main__':
    q = asyncio.Queue()  # initialize an asyncio queue
    for url in url_hub:  # seed the queue with the starting URLs
        q.put_nowait(url)
    loop = asyncio.get_event_loop()
    tasks = [handle_task(task_id, q) for task_id in range(3)]  # 3 concurrent workers
    loop.run_until_complete(asyncio.gather(*tasks))  # gather, since asyncio.wait() no longer accepts bare coroutines on newer Python
    loop.close()
    for u in crawled_urls:
        print(u)
    print('-' * 30)
    print(len(crawled_urls))
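As written, parse is a stub and get_urls is never called, so the three workers drain the seed URL and exit. A minimal sketch of how the two could be wired together, assuming q = asyncio.Queue() is moved to module level (the name q matching the one in __main__) so that parse can push newly discovered links back to the workers:

# A possible parse(): feed links found in the page back into the work queue.
# Assumes q is created at module level instead of inside __main__,
# so it is visible here.
def parse(body):
    for new_url in get_urls(body['html']):
        if new_url not in crawled_urls:
            q.put_nowait(new_url)  # workers pick these up on their next loop

With this change the "while not work_queue.empty()" loop keeps running for as long as pages yield unseen links, though a worker can still exit early if the queue is momentarily empty while the others are mid-download.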
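Separately, on aiohttp 3.3 and later the external async_timeout dependency can be dropped in favor of the client's built-in timeout; a sketch of the equivalent opening of get_body, under that version assumption:

async def get_body(url):
    timeout = aiohttp.ClientTimeout(total=10)  # overall limit for the whole request
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url, headers=headers) as response:
            ...

This removes one dependency and applies the limit to the entire request, connection setup included.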

Reposted from www.cnblogs.com/zhongshuiping/p/10172362.html