The purpose of this crawler is to collect the links contained in a website and save them to a file, but it is not limited to that: with suitable modifications it can scrape other content as well. The prototype comes from the code in chapter 1 of the book "Web Scraping with Python". The original code was written in Python 2; here it is rewritten in Python 3, with some appropriate changes on top: better error tolerance, plus support for proxies, rate limiting, and a configurable crawl depth.
To reuse this crawler you should adapt it to the target site; for example, the site's a tags may use relative links. You should also check the site's robots.txt rules before crawling and avoid anything the rules disallow; a minimal version of that check is sketched below.
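A minimal sketch of the robots.txt check, using the standard library's urllib.robotparser (the helper name can_fetch and the default user agent here are only illustrative, not part of the crawler below):

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def can_fetch(url, user_agent='*'):
    # Fetch the site's robots.txt and ask whether user_agent may crawl url.
    rp = RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))
    rp.read()
    return rp.can_fetch(user_agent, url)

Calling something like can_fetch(seed_url) before starting, and skipping disallowed URLs, keeps the crawler polite. The full crawler follows.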
from urllib.request import Request, build_opener, ProxyHandler
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse, urljoin
import re
from datetime import datetime
import time
# Download a single page.
# url: the URL to fetch; proxy: optional proxy address; num_retries: how many
# times to retry on server (5xx) errors.
def download(url, proxy=None, num_retries=2):
    print('Downloading', url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
    request = Request(url, headers=headers)
    opener = build_opener()
    if proxy:
        # Route requests for this URL's scheme through the given proxy.
        opener.add_handler(ProxyHandler({urlparse(url).scheme: proxy}))
    try:
        html = opener.open(request).read().decode()
    except (URLError, HTTPError, UnicodeDecodeError, UnicodeEncodeError) as e:
        # Unicode errors have no .reason attribute, so fall back to the exception itself.
        print('Download error:', getattr(e, 'reason', e))
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # Retry server errors, keeping the same proxy setting.
                return download(url, proxy, num_retries - 1)
    return html
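# Example calls (hypothetical addresses, not from the original post):
#   html = download('http://example.com/')
#   html = download('http://example.com/', proxy='http://127.0.0.1:8080')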
def get_link(html):
    # Extract every href value from the page's <a> tags.
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    try:
        return webpage_regex.findall(html)
    except TypeError:
        # download() returns None on failure; treat that as "no links".
        return []
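# For example, against a small snippet (illustrative HTML) the regex picks up
# absolute and relative href values alike:
#   get_link('<a href="http://example.com/">x</a> <a href="/about">y</a>')
#   returns ['http://example.com/', '/about']; relative values come back as-is,
#   which is why urljoin may be needed in link_crawler below.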
# Rate limiting: enforce a minimum delay between requests to the same domain.
class Throttle:
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}   # domain -> time of last access

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).total_seconds()
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
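# Example: with Throttle(2), two wait() calls for the same domain are spaced at
# least two seconds apart, while the first request to a new domain is not delayed:
#   throttle = Throttle(2)
#   throttle.wait('http://example.com/a')
#   throttle.wait('http://example.com/b')   # sleeps until ~2s after the first call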
# Crawl links and save them in link_save.
# seed_url: the URL to start from; link_regex: regular expression a link must
# match to be followed; max_depth: how deep to crawl.
def link_crawler(seed_url, link_regex, max_depth=2):
    crawl_queue = [seed_url]
    link_save = {seed_url: 0}   # maps each saved link to the depth it was found at
    throttle = Throttle(2)      # at least 2 seconds between requests to the same domain
    while crawl_queue:
        url = crawl_queue.pop()
        depth = link_save[url]
        throttle.wait(url)
        html = download(url)
        if depth <= max_depth:
            for link in get_link(html):
                # If the site uses relative links, resolve them against the seed URL
                # (see the urljoin illustration after the script):
                # link = urljoin(seed_url, link)
                if re.match(link_regex, link) and link not in link_save:
                    crawl_queue.append(link)
                    link_save[link] = depth + 1
    return link_save
if __name__ == "__main__":
    result = link_crawler("http://www.xxxx.com/", "http://.*?")
    with open('xxxx_com.txt', 'w') as f:
        for link in result:
            f.write(f'{link}\n')
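If the target site uses relative links, uncomment the urljoin line inside link_crawler so they are resolved against the seed URL before being matched and queued. A quick illustration of what urljoin does (placeholder URLs):

from urllib.parse import urljoin

print(urljoin('http://www.xxxx.com/index.html', '/about'))            # http://www.xxxx.com/about
print(urljoin('http://www.xxxx.com/index.html', 'news/today'))        # http://www.xxxx.com/news/today
print(urljoin('http://www.xxxx.com/index.html', 'http://other.com/')) # absolute links pass through unchanged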