The first step is to read a single page:
import urllib.request

url = "http://www.badtom.cn"
data = urllib.request.urlopen(url).read()
data = data.decode('UTF-8')
print(data)
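Note that decode('UTF-8') raises a UnicodeDecodeError on pages served in another encoding. As a minimal sketch (the charset fallback is an addition, not part of the original code), the charset declared in the response's Content-Type header can be consulted before decoding:

import urllib.request

url = "http://www.badtom.cn"
with urllib.request.urlopen(url, timeout=2) as urlop:
    # Prefer the charset the server declares; fall back to UTF-8
    charset = urlop.headers.get_content_charset() or 'utf-8'
    data = urlop.read().decode(charset, errors='replace')
print(data)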
The second step is a simple implementation of the pseudo-code from the earlier stand-alone version: a breadth-first crawl that pops URLs from a deque and records already-seen ones in a set:
from collections import deque
import re
import urllib.request

queue = deque()   # URLs waiting to be crawled (FIFO gives breadth-first order)
visited = set()   # URLs already seen, so nothing is enqueued twice

init_url = "http://www.badtom.cn"
queue.append(init_url)
visited.add(init_url)

count = 0
while queue:
    url = queue.popleft()
    print('Already fetched: ' + str(count) + ', fetching --> ' + url)
    count += 1
    try:
        urlop = urllib.request.urlopen(url, timeout=2)
        data = urlop.read().decode('utf-8')
    except Exception:
        # Skip pages that time out, return errors, or fail to decode
        continue

    # Pull every href="..." attribute out of the page and enqueue new links
    linkre = re.compile('href="(.+?)"')
    for next_url in linkre.findall(data):
        if 'http' in next_url and next_url not in visited:
            queue.append(next_url)
            visited.add(next_url)
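The regular expression href="(.+?)" is a quick hack: it also matches the href of <link> tags and breaks on single-quoted attributes. A sturdier alternative, sketched below with the standard library's html.parser (the LinkParser class is an addition, not from the original code), collects only the href values of <a> tags:

from html.parser import HTMLParser

class LinkParser(HTMLParser):
    """Collect the href attribute of every <a> tag."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

parser = LinkParser()
parser.feed(data)      # 'data' is the decoded page from the loop above
print(parser.links)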
The third step is to disguise the crawler as a Firefox browser via the request headers and save each crawled page to disk:
from collections import deque
import re
import urllib.request

# Store a crawled web page on disk
def saveToFile(filePath, data):
    with open(filePath, 'w', encoding='utf-8') as fileop:
        fileop.write(data)

queue = deque()
visited = set()

init_url = "http://www.badtom.cn"
queue.append(init_url)
visited.add(init_url)

# Disguise the crawler as Firefox through the header information
headinfo = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html, application/xhtml+xml, */*',
    'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

filePath = 'E:/spider/'
count = 0
while queue:
    url = queue.popleft()
    print('Already fetched: ' + str(count) + ', fetching --> ' + url)
    count += 1
    try:
        req = urllib.request.Request(url, headers=headinfo)
        urlop = urllib.request.urlopen(req, timeout=2)
        data = urlop.read().decode('utf-8')
        saveToFile(filePath + str(count) + '.html', data)
    except Exception:
        continue

    linkre = re.compile('href="(.+?)"')
    for next_url in linkre.findall(data):
        if 'http' in next_url and 'github' not in next_url and next_url not in visited:
            queue.append(next_url)
            visited.add(next_url)
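One remaining weakness is the 'http' in next_url filter: it discards relative links entirely and accepts any string that merely contains "http". A minimal sketch of a fix, using the standard library's urljoin (the normalize helper is hypothetical), resolves each href against the page it came from and keeps only http/https results:

from urllib.parse import urljoin, urlparse

def normalize(base_url, href):
    """Resolve a possibly-relative href against its page's URL
    and keep only http/https results."""
    absolute = urljoin(base_url, href)
    if urlparse(absolute).scheme in ('http', 'https'):
        return absolute
    return None

# Example: a relative link on the start page becomes an absolute URL
print(normalize("http://www.badtom.cn", "/archives/1.html"))
# -> http://www.badtom.cn/archives/1.html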