This crawler downloads website pages and stores them in a MongoDB database. It is modeled on the code from chapters 1 and 3 of *Web Scraping with Python*; the original was written in Python 2 and is rewritten here in Python 3, with appropriate changes on top: the crawler is fleshed out, error tolerance is strengthened, and robots.txt checking, proxies, rate limiting, and a configurable crawl depth are supported.
from urllib.request import Request, build_opener, ProxyHandler
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse, urljoin
from urllib.robotparser import RobotFileParser
from datetime import datetime, timedelta
import time
import random
import re
from pymongo import MongoClient
# Database-backed cache: stores download results in MongoDB with a TTL index
class MongoCache:
def __init__(self, client=None, expires=timedelta(days=30)):
self.client = MongoClient('localhost', 27017) if client is None else client
self.db = self.client.cache
        # TTL index: MongoDB drops records whose 'timestamp' is older than `expires`
        # (the TTL monitor runs about once a minute, so expiry is approximate)
        self.db.webpage.create_index('timestamp', expireAfterSeconds=int(expires.total_seconds()))
def __getitem__(self, url):
record = self.db.webpage.find_one({'_id': url})
if record:
return record['result']
else:
raise KeyError(f'{url} does not exist')
def __setitem__(self, url, result):
record = {'result': result, 'timestamp': datetime.now()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)
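# Usage sketch of the dict-style interface (an illustration, not part of the
# crawl flow; assumes a MongoDB server listening on localhost:27017):
#   cache = MongoCache(expires=timedelta(days=1))
#   cache['http://example.com/'] = {'html': '<html>...</html>', 'code': 200}
#   cache['http://example.com/']        # -> {'html': '<html>...</html>', 'code': 200}
#   cache['http://example.com/other']   # -> raises KeyError until it is stored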
# Rate limiter: enforce a minimum delay between requests to the same domain
class Throttle:
def __init__(self, delay):
self.delay = delay
self.domains = {}
def wait(self, url):
domain = urlparse(url).netloc
last_accessed = self.domains.get(domain)
if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).total_seconds()
if sleep_secs > 0:
time.sleep(sleep_secs)
self.domains[domain] = datetime.now()
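# Usage sketch: consecutive requests to the same domain are spaced out by
# `delay` seconds, while distinct domains are never throttled against each other
# (example.com/example.org are placeholders):
#   throttle = Throttle(delay=3)
#   throttle.wait('http://example.com/a')   # first visit, returns immediately
#   throttle.wait('http://example.com/b')   # sleeps up to 3 seconds
#   throttle.wait('http://example.org/')    # different domain, returns immediately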
# Downloader: return the cached copy of a URL when available, otherwise download and cache it
class Downloader:
def __init__(self, delay=3, user_agent=None, proxies=None, num_retries=1, cache=None):
self.throttle = Throttle(delay)
self.user_agent = user_agent if user_agent else 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
self.proxies = proxies
self.num_retries = num_retries
self.cache = cache
def __call__(self, url):
result = None
if self.cache:
try:
result = self.cache[url]
except KeyError:
pass
            else:
                # A cached server error (5xx) is worth retrying if retries remain
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    result = None
if result is None:
self.throttle.wait(url)
result = self.download(url, self.num_retries)
if self.cache:
self.cache[url] = result
return result
def download(self, url, num_retries):
print('Downloading', url)
headers = {'User-Agent': self.user_agent}
proxy = random.choice(self.proxies) if self.proxies else None
request = Request(url, headers=headers)
opener = build_opener()
if proxy:
opener.add_handler(ProxyHandler({urlparse(url).scheme: proxy}))
try:
opened_web = opener.open(request)
code = opened_web.code
html = opened_web.read().decode()
except (URLError, HTTPError, UnicodeDecodeError, UnicodeEncodeError) as e:
print('Download error:', e.reason)
html = None
            # Record the real HTTP status so cached 5xx results can be retried later
            code = e.code if hasattr(e, 'code') else 0
            if num_retries > 0:
                # Retry only on 5xx server errors; 4xx client errors will not recover
                if hasattr(e, 'code') and 500 <= e.code < 600:
                    return self.download(url, num_retries-1)
return {'html': html, 'code': code}
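# Usage sketch: the Downloader can also be called standalone without a cache;
# each entry in `proxies` is a proxy URL string handed to ProxyHandler (the
# target URL below is only an example):
#   d = Downloader(delay=3, num_retries=2)
#   result = d('http://example.com/')
#   print(result['code'], len(result['html'] or ''))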
def get_robots(rp, url):
    # Resolve against the site root so seed URLs with paths still find robots.txt
    rp.set_url(urljoin(url, '/robots.txt'))
rp.read()
return rp
def get_link(html):
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
try:
return webpage_regex.findall(html)
    except TypeError:
        # html is None when the download failed
        return []
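# Example: the regex captures href values from both single- and double-quoted
# anchors, case-insensitively, and a failed download yields an empty list:
#   get_link("<a href='/about'>About</a> <A HREF='http://example.com/'>Home</A>")
#   # -> ['/about', 'http://example.com/']
#   get_link(None)   # -> []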
# Crawl links and record them in link_save.
# The first argument is the URL to crawl, the second a regular expression that
# links must match, and the third the maximum crawl depth.
def link_crawler(seed_url, link_regex, max_depth=2, cache=None):
crawl_queue = [seed_url]
link_save = {seed_url: 0}
rp = RobotFileParser()
robots_check = get_robots(rp, seed_url)
downloader = Downloader(cache=cache)
while crawl_queue:
url = crawl_queue.pop()
depth = link_save[url]
if robots_check.can_fetch(downloader.user_agent, url):
html = downloader(url)
            # Only expand links on pages that have not passed the maximum depth
            if depth <= max_depth:
for link in get_link(html['html']):
                    # For relative links, resolve against the seed URL first:
                    # link = urljoin(seed_url, link)
                    if re.match(link_regex, link) and link not in link_save:
                        crawl_queue.append(link)
                        link_save[link] = depth + 1
        else:
            print('Blocked by robots.txt:', url)
    # Close the database connection once the crawl queue is exhausted
    if downloader.cache is not None:
        downloader.cache.client.close()
if __name__ == "__main__":
link_crawler("http://cncarehk.com/", "http://cncarehk.com/.*?", cache=MongoCache(expires=timedelta(seconds=60)))