Python爬虫学习日记三缓存支持

Python爬虫学习日记三
冰冠 2018年06月15日14:22:06
1、为链接爬虫添加缓存支持
修改第一天中的download函数，在url下载之前先进行缓存检查。另外，需要把限速功能移至函数内部，这样只有在真正发生下载时才会触发限速，而在加载缓存时不会触发。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-15 下午3:31"""

import re
import urllib.parse
import urllib.request
import urllib.robotparser

from day03_cache.downloader import Downloader


def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, user_agent='wswp',
                 proxies=None, num_retries=1, scrape_callback=None, cache=None):
    """Crawl outward from seed_url, following links that match link_regex.

    :param seed_url: first URL to fetch; also the base used to resolve relative
        links and to decide which links stay on the same domain
    :param link_regex: pattern passed to ``re.match`` for every link found in a
        page; when falsy no page links are followed
    :param delay: forwarded unchanged to ``Downloader``
    :param max_depth: pages at exactly this depth are not expanded further; the
        default -1 never equals a real depth, so depth is unlimited
    :param max_urls: stop after this many pages were processed; the default -1
        is never reached, so there is no limit
    :param user_agent: used for the robots.txt check and forwarded to ``Downloader``
    :param proxies: forwarded unchanged to ``Downloader``
    :param num_retries: forwarded unchanged to ``Downloader``
    :param scrape_callback: optional callable invoked as ``scrape_callback(url, html)``;
        a truthy return value is treated as an iterable of extra links to follow
    :param cache: forwarded unchanged to ``Downloader``
    :return: None
    """
    # URLs waiting to be fetched; taken from the end of the list (LIFO)
    pending = [seed_url]
    # every URL discovered so far, mapped to the depth at which it was found
    seen = {seed_url: 0}
    # number of pages processed so far
    downloaded = 0
    robots = get_robots(seed_url)
    fetch = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                       num_retries=num_retries, cache=cache)

    while pending:
        url = pending.pop()
        depth = seen[url]

        # skip anything robots.txt forbids for this user agent
        if not robots.can_fetch(user_agent, url):
            print('Blocked by robots.txt:', url)
            continue

        html = fetch(url)
        found = []
        if scrape_callback:
            found.extend(scrape_callback(url, html) or [])

        if depth != max_depth:
            # depth limit not reached yet, so this page may be expanded
            if link_regex:
                found.extend(link for link in get_links(html) if re.match(link_regex, link))

            for raw_link in found:
                target = normalize(seed_url, raw_link)
                if target in seen:
                    continue
                # record the link even when it is off-domain so it is only examined once
                seen[target] = depth + 1
                if same_domain(seed_url, target):
                    pending.append(target)

        # stop once the requested number of pages has been processed
        downloaded += 1
        if downloaded == max_urls:
            break


def normalize(seed_url, link):
    """Return link as an absolute URL without its #fragment.

    The fragment is dropped first so that URLs differing only in their
    fragment collapse to one entry; the remainder is then resolved
    against seed_url.
    """
    without_fragment = urllib.parse.urldefrag(link)[0]
    return urllib.parse.urljoin(seed_url, without_fragment)


def same_domain(url1, url2):
    """Tell whether two URLs have an identical network location (host[:port])."""
    first, second = (urllib.parse.urlparse(candidate).netloc for candidate in (url1, url2))
    return first == second


def get_robots(url):
    """Build a robots.txt parser for the site that url belongs to.

    The robots.txt location is derived by joining '/robots.txt' onto url,
    and the file is fetched immediately via ``read()``.

    :return: the populated ``urllib.robotparser.RobotFileParser``
    """
    robots_url = urllib.parse.urljoin(url, '/robots.txt')
    parser = urllib.robotparser.RobotFileParser()
    parser.set_url(robots_url)
    parser.read()
    return parser


def get_links(html):
    """Return the href values of all ``<a>`` tags found in html.

    :param html: page content as UTF-8 ``bytes`` or as an already decoded
        ``str``; ``None`` or empty content yields no links
    :return: list of href attribute values, in document order
    """
    if not html:
        # nothing was downloaded, so there is nothing to extract
        return []
    if isinstance(html, (bytes, bytearray)):
        # undecodable bytes are replaced instead of aborting the whole crawl
        html = html.decode('utf-8', errors='replace')
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)


if __name__ == '__main__':
    # Run the same crawl twice; only the user agent and the depth limit differ.
    for overrides in ({'user_agent': 'BadCrawler'},
                      {'max_depth': 1, 'user_agent': 'GoodCrawler'}):
        link_crawler('http://example.webscraping.com', '(.*?)/(index|view)', delay=0,
                     num_retries=1, **overrides)

2、磁盘缓存
   缓存下载结果到文件系统中，首先需要将URL安全的映射为跨平台的文件名

   操作系统           文件系统           非法文件名字符           文件名最大长度
   Linux           ext3/ext4       / 和 \0                    255字节
   OS X               HFS Plus           : 和 \0                   255个UTF-16编码单元
   Windows           NTFS           / \ ? : * " > < |           255个字符

   (1)为了保证缓存文件名在不同的文件系统中都是安全的，需要限制文件名只能包含字母、数字和基本符号，并将其他字符替换为_，代码如下

import re

# Example URL to turn into a file-system-safe name.
url = 'http://example.webscraping.com/default/view/1'
# Keep letters, digits, '/' and a few basic symbols; everything else becomes '_'.
# The pattern is a raw string so that '\-' is not treated as an (invalid)
# string escape sequence.
filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', url)

此外，文件名及其父目录的长度需要限制在255个字符内，代码如下

# Cap every '/'-separated segment of the name at 255 characters.
filename = '/'.join([part[:255] for part in filename.split('/')])

边界处理：URL路径可能以斜杠/结尾，此时斜杠后边的空字符串会成为一个非法的文件名。为了解析URL，我们使用urllib.parse.urlsplit()函数将url分割成几部分

# A URL whose path ends in '/', i.e. whose last path segment is empty.
new_url = 'http://example.webscraping.com/default/view/'
components = urllib.parse.urlsplit(new_url)
print(components)
# SplitResult(scheme='http', netloc='example.webscraping.com', path='/default/view/', query='', fragment='')

print(components.path)
# /default/view/

# An empty path, or one ending in '/', gets 'index.html' as its file name so
# that the final segment is never an empty string.
path = components.path or '/index.html'
if path.endswith('/'):
    path += 'index.html'
new_filename = ''.join((components.netloc, path, components.query))
print(new_filename)

   (2)在__setitem__()中，我们使用url_2_path方法将url映射为安全文件名，并在必要时创建父目录。这里使用的pickle模块会把输入序列化为字节串并保存在磁盘中。
   在__getitem__()中，首先将url映射为安全文件名，然后如果文件存在，则加载其内容并执行反序列化，恢复原始数据类型；如果文件不存在，则说明缓存中还没有该url的数据，此时抛出KeyError异常

   代码如下

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-18 上午11:07"""

import os
import re
import urllib.parse
import pickle


class DiskCache:
    """Dictionary-style cache that stores one pickled result per URL on disk.

    ``cache[url] = result`` writes the result to a file whose path is derived
    from the URL; ``cache[url]`` reads it back or raises ``KeyError``.

    NOTE: entries are read with ``pickle``, which can execute arbitrary code
    while loading. Only point the cache at a directory you trust.
    """

    def __init__(self, cache_dir='cahce', max_length=255):
        '''
        :param cache_dir: root directory under which cache files are created
        :param max_length: maximum number of characters kept for each
            '/'-separated segment of a cache file path
        '''
        # NOTE(review): the default 'cahce' looks like a typo of 'cache'; it is
        # kept unchanged so that the default location stays the same.
        self.cache_dir = cache_dir
        self.max_length = max_length

    def url_2_path(self, url):
        '''
        Create file system path for this url
        :param url: URL to map
        :return: path below cache_dir built from the URL's netloc, path and query
        '''
        components = urllib.parse.urlsplit(url)
        # append index.html to empty paths and to paths ending in '/', so the
        # last segment is never an empty file name
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters (raw string: '\-' is a regex escape here)
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', filename)
        # restrict every segment to the configured maximum number of characters
        filename = '/'.join(segment[:self.max_length] for segment in filename.split('/'))

        return os.path.join(self.cache_dir, filename)

    def __getitem__(self, url):
        '''
        Load data from disk for this url
        :param url: URL whose cached result is wanted
        :return: the object previously stored for this URL
        :raises KeyError: if nothing is cached for this URL
        '''
        path = self.url_2_path(url)
        try:
            with open(path, 'rb') as fp:
                return pickle.load(fp)
        except (FileNotFoundError, NotADirectoryError):
            # url has not been cached
            raise KeyError(url + ' does not exist') from None

    def __setitem__(self, url, result):
        '''
        Save data to disk for this url
        :param url: URL the result belongs to
        :param result: any picklable object
        :return: None
        '''
        path = self.url_2_path(url)
        folder = os.path.dirname(path)
        if folder:
            # exist_ok avoids failing when the directory already exists or is
            # created between a check and the call
            os.makedirs(folder, exist_ok=True)
        with open(path, 'wb') as fp:
            pickle.dump(result, fp)

          (3)节省磁盘空间
             为了最小化缓存所需的磁盘空间,我们可以对下载的html文件进行压缩处理.只需在保存到磁盘之前进行zlib压缩序列化字符即可
             代码如下
             # before writing to disk: compress the pickled data
             data = zlib.compress(pickle.dumps(result))
             # after reading from disk: decompress first, then unpickle
             result = pickle.loads(zlib.decompress(data))

         (4) 清理过期数据
                存储在缓存中的数据存在过期风险,需要对其设置过期时间,以让爬虫可以重新下载页面

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-18 上午11:07"""

import os
import re
import urllib.parse
import pickle
from datetime import datetime, timedelta
import zlib


class DiskCache:
    """Dictionary-style disk cache with zlib compression and expiry.

    ``cache[url] = result`` stores ``(result, timestamp)`` pickled and
    zlib-compressed in a file whose path is derived from the URL;
    ``cache[url]`` returns the result or raises ``KeyError`` when the entry
    is missing or older than ``expires``.

    NOTE: entries are read with ``pickle``, which can execute arbitrary code
    while loading. Only point the cache at a directory you trust.
    """

    def __init__(self, cache_dir='cahce', max_length=255, expires=timedelta(days=30)):
        '''
        :param cache_dir: root directory under which cache files are created
        :param max_length: maximum number of characters kept for each
            '/'-separated segment of a cache file path
        :param expires: timedelta after which a stored entry counts as expired
        '''
        # NOTE(review): the default 'cahce' looks like a typo of 'cache'; it is
        # kept unchanged so that the default location stays the same.
        self.cache_dir = cache_dir
        self.max_length = max_length
        self.expires = expires

    def url_2_path(self, url):
        '''
        Create file system path for this url
        :param url: URL to map
        :return: path below cache_dir built from the URL's netloc, path and query
        '''
        components = urllib.parse.urlsplit(url)
        # append index.html to empty paths and to paths ending in '/', so the
        # last segment is never an empty file name
        path = components.path
        if not path:
            path = '/index.html'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        # replace invalid characters (raw string: '\-' is a regex escape here)
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', filename)
        # restrict every segment to the configured maximum number of characters
        filename = '/'.join(segment[:self.max_length] for segment in filename.split('/'))

        return os.path.join(self.cache_dir, filename)

    def __getitem__(self, url):
        '''
        Load data from disk for this url
        :param url: URL whose cached result is wanted
        :return: the object previously stored for this URL
        :raises KeyError: if nothing is cached for this URL or the entry has expired
        '''
        path = self.url_2_path(url)
        try:
            with open(path, 'rb') as fp:
                # read() must be called; the payload is the compressed pickle
                raw = fp.read()
        except (FileNotFoundError, NotADirectoryError):
            # url has not been cached
            raise KeyError(url + ' does not exist') from None
        result, timestamp = pickle.loads(zlib.decompress(raw))
        if self.has_expired(timestamp):
            raise KeyError(url + ' has expired')
        return result

    def __setitem__(self, url, result):
        '''
        Save data to disk for this url
        :param url: URL the result belongs to
        :param result: any picklable object
        :return: None
        '''
        path = self.url_2_path(url)
        folder = os.path.dirname(path)
        if folder:
            # exist_ok avoids failing when the directory already exists or is
            # created between a check and the call
            os.makedirs(folder, exist_ok=True)
        # store the write time next to the result so has_expired() can judge it
        data = pickle.dumps((result, datetime.utcnow()))
        with open(path, 'wb') as fp:
            # write the compressed bytes directly; pickling them a second time
            # would make the file unreadable for __getitem__
            fp.write(zlib.compress(data))

    def has_expired(self, timestamp):
        '''
        :param timestamp: naive UTC datetime at which an entry was stored
        :return: boolean whether this timestamp has expired
        '''
        return datetime.utcnow() > timestamp + self.expires

        (5) 缺点
               ①基于磁盘的缓存系统比较容易实现,但是存在一个缺点,受制于本地文件系统的限制,一些url会映射为相同的文件名,比如 .../?a+b   ../?a*b   等
               解决方案使用url的哈希值作为文件名
               ②每个卷和每个目录下的文件数量是有限制的,文件系统可存储的文件总数也是有限制的
               解决方案将多个缓存网页合并到一个文件中,并使用类似B+树的算法进行索引,或使用实现该类算法的数据库

3、数据库缓存
    为了避免磁盘缓存方案的已知限制,我们在现有数据库系统之上创建缓存,在此选用NoSQL数据库,这种数据库更易于扩展

    (1)NoSQL是什么
        NoSQL全称为Not only SQL,是一种相对较新的数据库设计方式.传统的关系模型使用的是固定模式,而NoSQL数据库通常是无模式的,从设计之初就考虑了跨服务器无缝分片的问题.在NoSQL中,有多种方式可以实现该目标,分别是列数据存储(如：HBase)、键值对存储(如：Redis)、面向文档的数据库(如：MongoDB)、以及图形数据库(如：Neo4j)

    (2)安装MongoDB
        MongoDB可以从http://www.mongodb.org/downloads下载，在此我们使用如下命令额外安装python封装库
        pip install pymongo

    (3)MongoDB缓存实现

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-19 上午9:07"""

from datetime import datetime, timedelta
from pymongo import MongoClient


class MongoCache:
    """Dictionary-style cache that keeps one MongoDB document per URL.

    Documents live in the ``webpage`` collection of the ``cache`` database,
    use the URL as ``_id`` and carry a ``result`` and a ``timestamp`` field.
    """

    def __init__(self, client=None, expires=timedelta(days=30)):
        """
        :param client: MongoClient to use; when None a client for
            localhost:27017 is created
        :param expires: timedelta handed to the index on 'timestamp' as
            ``expireAfterSeconds``
        """
        # honour a client supplied by the caller instead of always reconnecting
        self.client = MongoClient('localhost', 27017) if client is None else client
        self.db = self.client.cache
        # index on 'timestamp' with expireAfterSeconds (a MongoDB TTL index),
        # so expiry is left to the server rather than checked in this class
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())

    def __getitem__(self, url):
        """Return the cached result for url.

        :raises KeyError: if no document with this url as ``_id`` exists
        """
        record = self.db.webpage.find_one({'_id': url})
        if not record:
            raise KeyError(url + ' does not exist')
        return record['result']

    def __setitem__(self, url, result):
        """Insert or overwrite the cached result for url, stamped with the current UTC time."""
        record = {'result': result, 'timestamp': datetime.utcnow()}
        # update_one replaces Collection.update, which PyMongo 4 removed
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

(4)压缩（与磁盘缓存类似，序列化数据后使用zlib库进行压缩）

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" @author [email protected]
    @function:
    @create 18-6-19 上午9:07"""
import pickle
import zlib
from datetime import datetime, timedelta
from pymongo import MongoClient
from bson.binary import Binary


class MongoCache:
    """Dictionary-style MongoDB cache that stores results pickled and zlib-compressed.

    Documents live in the ``webpage`` collection of the ``cache`` database,
    use the URL as ``_id`` and carry a binary ``result`` and a ``timestamp``.

    NOTE: results are read with ``pickle``, which can execute arbitrary code
    while loading. Only use a database you trust.
    """

    def __init__(self, client=None, expires=timedelta(days=30)):
        """
        :param client: MongoClient to use; when None a client for
            localhost:27017 is created
        :param expires: timedelta handed to the index on 'timestamp' as
            ``expireAfterSeconds``
        """
        # honour a client supplied by the caller instead of always reconnecting
        self.client = MongoClient('localhost', 27017) if client is None else client
        self.db = self.client.cache
        # index on 'timestamp' with expireAfterSeconds (a MongoDB TTL index),
        # so expiry is left to the server rather than checked in this class
        self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())

    def __getitem__(self, url):
        """Return the decompressed, unpickled result cached for url.

        :raises KeyError: if no document with this url as ``_id`` exists
        """
        record = self.db.webpage.find_one({'_id': url})
        if not record:
            raise KeyError(url + ' does not exist')
        return pickle.loads(zlib.decompress(record['result']))

    def __setitem__(self, url, result):
        """Pickle and compress result, then insert or overwrite the document for url."""
        record = {'result': Binary(zlib.compress(pickle.dumps(result))),
                  'timestamp': datetime.utcnow()}
        # update_one replaces Collection.update, which PyMongo 4 removed
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

https://github.com/ice1995/python_web_crawler-/tree/master/day03_cache

Python爬虫学习日记三 缓存支持

猜你喜欢

Python爬虫学习日记三缓存支持