The purpose of this crawler is to collect the links contained in a website and save them to a file, but it is not limited to that: with suitable modifications it can scrape other content as well. The prototype comes from the code in chapter 1 of the book "Web Scraping with Python". The original code was written in Python 2; here it is rewritten in Python 3, with some appropriate changes on top: better error tolerance, plus support for proxies, rate limiting, and a configurable crawl depth.
To reuse this crawler you should adapt it to the target site; for example, the site's a tags may use relative links. You should also check the site's robots.txt rules before crawling and avoid anything the rules disallow; a minimal version of that check is sketched below.
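A minimal sketch of the robots.txt check, using the standard library's urllib.robotparser (the helper name can_fetch and the default user agent here are only illustrative, not part of the crawler below):

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def can_fetch(url, user_agent='*'):
    # Fetch the site's robots.txt and ask whether user_agent may crawl url.
    rp = RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))
    rp.read()
    return rp.can_fetch(user_agent, url)

Calling something like can_fetch(seed_url) before starting, and skipping disallowed URLs, keeps the crawler polite. The full crawler follows.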
from urllib.request import Request, build_opener, ProxyHandler
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse, urljoin
import re
from datetime import datetime
import time
# Download a single page.
# url: the URL to fetch; proxy: optional proxy address; num_retries: how many
# times to retry on server (5xx) errors.
def download(url, proxy=None, num_retries=2):
    print('Downloading', url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
    request = Request(url, headers=headers)
    opener = build_opener()
    if proxy:
        # Route requests for this URL's scheme through the given proxy.
        opener.add_handler(ProxyHandler({urlparse(url).scheme: proxy}))
    try:
        html = opener.open(request).read().decode()
    except (URLError, HTTPError, UnicodeDecodeError, UnicodeEncodeError) as e:
        # Unicode errors have no .reason attribute, so fall back to the exception itself.
        print('Download error:', getattr(e, 'reason', e))
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # Retry server errors, keeping the same proxy setting.
                return download(url, proxy, num_retries - 1)
    return html
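# Example calls (hypothetical addresses, not from the original post):
#   html = download('http://example.com/')
#   html = download('http://example.com/', proxy='http://127.0.0.1:8080')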
def get_link(html):
    # Extract every href value from the page's <a> tags.
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    try:
        return webpage_regex.findall(html)
    except TypeError:
        # download() returns None on failure; treat that as "no links".
        return []
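# For example, against a small snippet (illustrative HTML) the regex picks up
# absolute and relative href values alike:
#   get_link('<a href="http://example.com/">x</a> <a href="/about">y</a>')
#   returns ['http://example.com/', '/about']; relative values come back as-is,
#   which is why urljoin may be needed in link_crawler below.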
# Rate limiting: enforce a minimum delay between requests to the same domain.
class Throttle:
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}   # domain -> time of last access

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).total_seconds()
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
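# Example: with Throttle(2), two wait() calls for the same domain are spaced at
# least two seconds apart, while the first request to a new domain is not delayed:
#   throttle = Throttle(2)
#   throttle.wait('http://example.com/a')
#   throttle.wait('http://example.com/b')   # sleeps until ~2s after the first call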
# Crawl links and save them in link_save.
# seed_url: the URL to start from; link_regex: regular expression a link must
# match to be followed; max_depth: how deep to crawl.
def link_crawler(seed_url, link_regex, max_depth=2):
    crawl_queue = [seed_url]
    link_save = {seed_url: 0}   # maps each saved link to the depth it was found at
    throttle = Throttle(2)      # at least 2 seconds between requests to the same domain
    while crawl_queue:
        url = crawl_queue.pop()
        depth = link_save[url]
        throttle.wait(url)
        html = download(url)
        if depth <= max_depth:
            for link in get_link(html):
                # If the site uses relative links, resolve them against the seed URL
                # (see the urljoin illustration after the script):
                # link = urljoin(seed_url, link)
                if re.match(link_regex, link) and link not in link_save:
                    crawl_queue.append(link)
                    link_save[link] = depth + 1
    return link_save
if __name__ == "__main__":
    result = link_crawler("http://www.xxxx.com/", "http://.*?")
    with open('xxxx_com.txt', 'w') as f:
        for link in result:
            f.write(f'{link}\n')
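If the target site uses relative links, uncomment the urljoin line inside link_crawler so they are resolved against the seed URL before being matched and queued. A quick illustration of what urljoin does (placeholder URLs):

from urllib.parse import urljoin

print(urljoin('http://www.xxxx.com/index.html', '/about'))            # http://www.xxxx.com/about
print(urljoin('http://www.xxxx.com/index.html', 'news/today'))        # http://www.xxxx.com/news/today
print(urljoin('http://www.xxxx.com/index.html', 'http://other.com/')) # absolute links pass through unchanged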