A simple website crawler in 30 lines of code


# encoding=utf8
import urllib.request
import re
import os

seed = "http://idea.lanyus.com/"  # The first seed: the site to start crawling from
depth = 3   # Maximum recursion depth, to keep the recursion stack from growing too deep
count = 5   # Grab at most `count` URLs from each page as new seeds
href_re = re.compile(r'href\s*=\s*"(https?://[^"]*)"')  # Matches URLs in the page source
http_re = re.compile(r'\w+')  # Matches word characters; used to turn a URL into a file name
pages = set()   # URLs that have already been crawled
path_dir = "."  # Save path

def get_path(url):
    name = '_'.join(http_re.findall(url))[:30]
    return os.path.join(path_dir, "%s.txt" % name)

def fetch(que=[seed], dep=0):
    nxt_que = []
    for url in que:
        print("depth:%d fetch:%s..." % (dep, url))
        html = urllib.request.urlopen(url).read().decode("utf-8", errors="ignore")
        print(html)
        with open(get_path(url), 'w+') as f:
            f.write(html)  # Save the page content
        cnt = 0
        for new_url in href_re.findall(html):
            if new_url in pages:  # Skip URLs that have already been crawled
                continue
            pages.add(new_url)
            cnt += 1
            nxt_que.append(new_url)
            if cnt >= count:
                break
    if dep < depth:
        fetch(nxt_que, dep + 1)

if __name__ == "__main__":
    fetch()
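As a quick sanity check of the two regular expressions, here is a minimal sketch (assuming the reconstructed pattern href\s*=\s*"(https?://[^"]*)" shown above; the exact original pattern was not fully legible). It shows the URL that href_re extracts from an anchor tag and the file name that get_path would derive from the seed URL.

import re

href_re = re.compile(r'href\s*=\s*"(https?://[^"]*)"')
http_re = re.compile(r'\w+')
seed = "http://idea.lanyus.com/"

# Extract the target URL from a small HTML fragment
print(href_re.findall('<a href="http://idea.lanyus.com/about">about</a>'))
# -> ['http://idea.lanyus.com/about']

# Build the file name the crawler would save the seed page under
print('_'.join(http_re.findall(seed))[:30])
# -> http_idea_lanyus_com  (saved as ./http_idea_lanyus_com.txt)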

 
