A Simple Website Crawler in 30 Lines of Code


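The script below walks a site breadth-first from a single seed URL: each page it downloads is saved to a local .txt file, up to five new links are pulled from the page source with a regular expression, and the crawl recurses at most three levels deep before stopping.
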
# encoding=utf8
import urllib.request
import urllib.error
import re
import os

seed = "http://idea.lanyus.com/"   # initial seed: the site the crawl starts from
depth = 3                          # recurse at most `depth` levels, to keep the call stack shallow
count = 5                          # take at most `count` URLs from each page as new seeds
href_re = re.compile(r'href\s*=\s*"(https?://[^"]*)"')  # matches absolute URLs in the page source
word_re = re.compile(r'\w+')       # matches word characters, used to build safe file names
pages = set()                      # URLs that have already been crawled
path_dir = "."                     # directory the pages are saved into

def get_path(url):
    # Join the word characters of the URL into a file name, capped at 30 characters
    name = '_'.join(word_re.findall(url))[:30]
    return os.path.join(path_dir, "%s.txt" % name)

def fetch(que=None, dep=0):
    if que is None:
        que = [seed]
    nxt_que = []
    for url in que:
        print("depth:%d fetch:%s..." % (dep, url))
        try:
            html = urllib.request.urlopen(url).read().decode('utf-8', errors='ignore')
        except urllib.error.URLError:
            continue               # skip pages that fail to download
        with open(get_path(url), 'w') as f:
            f.write(html)          # save the page content
        cnt = 0
        for new_url in href_re.findall(html):
            if new_url in pages:   # skip URLs that have already been crawled
                continue
            pages.add(new_url)
            cnt += 1
            nxt_que.append(new_url)
            if cnt >= count:
                break
    if dep < depth:
        fetch(nxt_que, dep + 1)

if __name__ == "__main__":
    fetch()
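Run the file directly (e.g. python crawler.py, if that is the name you saved it under) and each fetched page lands in the current directory under a name built from the word characters of its URL, such as http_idea_lanyus_com.txt for the seed.

Matching href attributes with a regular expression keeps the listing short, but it only sees double-quoted attribute values. As a rough alternative sketch, the standard library's html.parser can collect the same links regardless of quoting; the LinkCollector class name below is illustrative, not part of the original post.

from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    """Collect absolute http(s) links from the href attributes of <a> tags."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs with quoting already stripped
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value and value.startswith(("http://", "https://")):
                    self.links.append(value)

parser = LinkCollector()
parser.feed('<a href="https://example.com/">example</a>')
print(parser.links)   # ['https://example.com/']

Swapping a collector like this in for href_re.findall(html) would also pick up single-quoted and unquoted href values, which the regex above misses.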

Reposted from www.cnblogs.com/qdzj/p/8974745.html