9.3.2 网页爬虫

　　网页爬虫常用来在互联网上爬取感兴趣的页面或文件，结合数据处理与分析技术可以得到更深层次的信息。下面的代码实现了网页爬虫，可以抓取指定网页中的所有链接，并且可以指定关键字和抓取深度。

 1 import sys
 2 import multiprocessing
 3 import re
 4 import os
 5 import urllib.request as lib
 6 
 7 def craw_links(url,depth,keywords,processed):
 8     '''
 9     :param url:       要爬取的网址
10     :param depth:     爬取深度
11     :param keywords:  要爬取的关键字组成的元组
12     :param procdssed: 进程池
13     :return:
14     '''
15 
16     contents = []
17 
18     if url.startswith(('http://','https://')):
19         if url not in processed:
20             #make this url as processed
21             processed.append(url)
22         else:
23             #avoid processing the same url again
24             return
25 
26         print('Crawing ' + url + '...')
27         fp = lib.urlopen(url)                           #向url 发出请求
28 
29         #Python3 returns bytes,so need to decode
30         contents_decoded = fp.read().decode('utf-8')
31         fp.close()                                      #至此已经读取爬取的网页文本内容
32 
33         pattern = '|'.join(keywords)
34 
35         #if this page contains certain keywords,save it to a file
36         flag = False
37         if pattern:
38             searched = re.search(pattern,contents_decoded)              #用正则表达式去返回的网页文本中匹配关键字
39         else:
40             #if the keywords to filter is not given,save current page
41             flag = True
42 
43         if flag or searched:
44             with open('craw\\' + url.replace(':','_').replace('/','_'),'w') as fp:
45                 fp.writelines(contents)
46 
47         #find all the links in the current page
48         links = re.findall('href="(.*?)"',contents_decoded)
49 
50         #craw all links in the current page
51         for link in links:
52             #consider the relative path
53             if not link.startswith(('http://','https://')):
54                 try:
55                     index = url.rindex('/')
56                     link = url[0:index+1] + link
57                 except:
58                     pass
59             if depth > 0 and link.endswith(('.htm','.html')):
60                 craw_links(link,depth-1,keywords,processed)
61 
if __name__ == '__main__':
    # Script entry point: crawl the Python library index one level deep and
    # keep the pages that mention any of the keywords.
    start_url = r'https://docs.python.org/3/library/index.html'
    keywords = ('datetime','KeyWord2')
    processed = []

    # Saved pages go into ./craw, so create it first when it is not
    # already a directory.
    if not os.path.isdir('craw'):
        os.mkdir('craw')

    craw_links(start_url,1,keywords,processed)

猜你喜欢