day070 CrawlSpider

Basic usage of CrawlSpider, plus Request, Response, DownloadMiddlewares, and Settings

CrawlSpider

  • All custom spider classes inherit from scrapy.Spider; Spider is the most basic spider class
  • CrawlSpider is an extension of the Spider class
  • Compared with Spider, CrawlSpider adds the Rule class, which is used to match and extract links from a page

The Rule class

class scrapy.spiders.Rule(
        link_extractor, 
        callback = None, 
        cb_kwargs = None, 
        follow = None, 
        process_links = None, 
        process_request = None
)
  • The most commonly used parameters (a minimal skeleton is sketched right after this list):
    • link_extractor: a LinkExtractor object describing the links to match, containing the re (regex) matching rules
    • callback: the callback function, which receives a response as its first argument. Note: when writing crawling rules, avoid using parse as the callback, because CrawlSpider uses the parse() method to implement its own logic; if parse() is overridden, the CrawlSpider stops working
    • follow: whether to keep following links from the matched pages. None means use the default: True (follow) when callback is None, otherwise False (do not follow)
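
As a quick illustration of how a Rule ties a LinkExtractor to a (non-parse) callback, here is a minimal sketch; the spider name, domain, URLs and regex are placeholders and not part of the original example:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleSpider(CrawlSpider):
    # hypothetical spider: name, domain, URLs and regex are placeholders
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/list?start=0']

    rules = (
        # follow every link whose URL matches the regex and pass the response
        # to parse_page (note: the callback is not named "parse", which
        # CrawlSpider reserves for its own logic)
        Rule(LinkExtractor(allow=r'start=\d+'), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        # minimal callback: yield the page URL and its title
        yield {
            'url': response.url,
            'title': response.xpath('//title/text()').extract_first(),
        }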

The LinkExtractor class

  • LinkExtractor is the class of the link_extractor object passed as the first argument when constructing a Rule; it does exactly one thing: extract the links from a page
class scrapy.linkextractors.LinkExtractor(
    allow = (),
    deny = (),
    allow_domains = (),
    deny_domains = (),
    deny_extensions = None,
    restrict_xpaths = (),
    tags = ('a','area'),
    attrs = ('href',),
    canonicalize = True,
    unique = True,
    process_value = None
)

The most commonly used parameter is allow: the matching condition that extracted links must satisfy; write the re rules inside the ().
Other, less commonly used parameters:

deny: URLs matching the "regular expression" in this parameter are never extracted (takes precedence over allow).

allow_domains: domains from which links will be extracted.

deny_domains: domains from which links will never be extracted.

restrict_xpaths: XPath expressions that work together with allow to filter links.

Steps for using the LinkExtractor class:

  1. Create a link extractor object, initialized with a matching rule: link = LinkExtractor(allow=r'start=\d+')
  2. Call the object's extract_links(response) method, which returns the list of matched links: result = link.extract_links(response)
  3. Notice how similar this is to using the re module:
re1 = re.compile('regular expression')
result = re1.findall('string to match against')
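
A minimal sketch of these two steps, for example inside scrapy shell (the regex is illustrative, and `response` is the object the shell provides):

from scrapy.linkextractors import LinkExtractor

# step 1: build the extractor with a matching rule
link = LinkExtractor(allow=r'start=\d+')

# step 2: extract_links() returns a list of Link objects for the given response
result = link.extract_links(response)

# each Link object carries the matched URL and its anchor text
for lnk in result:
    print(lnk.url, lnk.text)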

Example: crawling the Tencent recruitment site

  • Goal: scrape the list-page fields and the detail-page fields

tencent.py

# -*- coding: utf-8 -*-
import scrapy

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from TestCrawlSpider.items import TestcrawlspiderListItem, TestcrawlspiderDetailItem

class TencentSpider(CrawlSpider):
    name = 'tencent3'
    allowed_domains = ['tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?keywords=python&tid=87&lid=2156&start=0']

    # rules: link-extraction rules
    rules = (
        # extract the list pages first
        Rule(LinkExtractor(allow=r'start=\d+'), callback='parse_list', follow=True),

        # extract the detail pages
        Rule(LinkExtractor(allow=r'position_detail.php'), callback='parse_detail', follow=False),
    )

    def parse_list(self, response):
        # parse out the tr rows
        tr_list = response.xpath('//tr[@class="even"]|//tr[@class="odd"]')
        for tr in tr_list:
            item = TestcrawlspiderListItem()
            item['work_name'] = tr.xpath('./td[1]/a/text()').extract_first()
            item['work_type'] = tr.xpath('./td[2]/text()').extract_first()
            item['work_count'] = tr.xpath('./td[3]/text()').extract_first()
            item['work_place'] = tr.xpath('./td[4]/text()').extract_first()
            item['work_time'] = tr.xpath('./td[5]/text()').extract_first()

            # parsed data -> engine -> pipeline
            yield item

    def parse_detail(self, response):
        # parse out the detail-page data
        ul_list = response.xpath('//ul[@class="squareli"]')
        item = TestcrawlspiderDetailItem()
        item['work_duty'] = ''.join(ul_list[0].xpath('./li/text()').extract())
        item['work_requir'] = ''.join(ul_list[1].xpath('./li/text()').extract())

        # parsed data -> engine -> pipeline
        yield item

items.py


import scrapy


class TestcrawlspiderListItem(scrapy.Item):
    # target fields for the list page
    work_name = scrapy.Field()
    work_type = scrapy.Field()
    work_count = scrapy.Field()
    work_place = scrapy.Field()
    work_time = scrapy.Field()


class TestcrawlspiderDetailItem(scrapy.Item):
    # target fields for the detail page
    work_duty = scrapy.Field()
    work_requir = scrapy.Field()

pipelines.py

import json
from TestCrawlSpider.items import TestcrawlspiderDetailItem, TestcrawlspiderListItem


class TestcrawlspiderListPipeline(object):
    def open_spider(self, spider):
        self.file = open('list.json', 'w')

    def process_item(self, item, spider):
        # only store the item if it comes from a list page
        if isinstance(item, TestcrawlspiderListItem):
            dict_item = dict(item)
            str_item = json.dumps(dict_item) + '\n'
            self.file.write(str_item)
        return item

    def close_spider(self, spider):
        self.file.close()


class TestcrawlspiderDetailPipeline(object):
    def open_spider(self, spider):
        self.file = open('detail.json', 'w')

    def process_item(self, item, spider):
        # only store the item if it comes from a detail page
        if isinstance(item, TestcrawlspiderDetailItem):
            dict_item = dict(item)
            str_item = json.dumps(dict_item) + '\n'
            self.file.write(str_item)
        return item

    def close_spider(self, spider):
        self.file.close()

middlewares.py

import scrapy
from TestCrawlSpider.settings import USER_AGENT_LIST
import random


# downloader middleware that sets a random User-Agent
class UserAgentMiddleWares(object):
    def process_request(self, request, spider):
        # pick a random User-Agent
        user_agent = random.choice(USER_AGENT_LIST)

        # overwrite the User-Agent header on the request
        request.headers['User-Agent'] = user_agent
        print('***' * 30)
        print(user_agent)


# downloader middleware that sets a random proxy
class ProxyMiddleWares(object):
    def process_request(self, request, spider):
        # pick a proxy (paid or free)
        proxy = 'http://162.138.3.1:8888'
        # set the proxy via the request's meta
        request.meta['proxy'] = proxy

        print('----' * 30)
        print(proxy)
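
Note that process_request() in both middlewares implicitly returns None, which tells Scrapy to keep processing the request through the remaining downloader middlewares and on to the downloader. The middlewares must also be enabled in DOWNLOADER_MIDDLEWARES, as shown in settings.py below.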

settings.py

BOT_NAME = 'TestCrawlSpider'

SPIDER_MODULES = ['TestCrawlSpider.spiders']
NEWSPIDER_MODULE = 'TestCrawlSpider.spiders'

DOWNLOADER_MIDDLEWARES = {
    'TestCrawlSpider.middlewares.UserAgentMiddleWares': 543,
    'TestCrawlSpider.middlewares.ProxyMiddleWares': 888,
}

ITEM_PIPELINES = {
    'TestCrawlSpider.pipelines.TestcrawlspiderListPipeline': 300,
    'TestCrawlSpider.pipelines.TestcrawlspiderDetailPipeline': 301,
}

USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
]
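
With these files in place, the spider is started from the project root with scrapy crawl tencent3; the list pipeline writes list.json and, with the detail pipeline registered in ITEM_PIPELINES as above, detail.json is written as well.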

Request&Response

  • A request object is an instance of the Request class; it bundles everything needed to issue a request.
  • Commonly used parameters:

url: the URL to request and then process further

callback: specifies which function will handle the Response returned by this request.

method: usually does not need to be set; defaults to GET. It can be "GET", "POST", "PUT", etc., and the string must be uppercase.

headers: the headers to include with the request. Usually not needed. Typical contents look like this:
        # anyone who has written a crawler will recognize these
        Host: media.readthedocs.org
        User-Agent: Mozilla/5.0 (Windows NT 6.2; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0
        Accept: text/css,*/*;q=0.1
        Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3
        Accept-Encoding: gzip, deflate
        Referer: http://scrapy-chs.readthedocs.org/zh_CN/0.24/
        Cookie: _ga=GA1.2.1612165614.1415584110;
        Connection: keep-alive
        If-Modified-Since: Mon, 25 Aug 2014 21:59:35 GMT
        Cache-Control: max-age=0

meta: quite commonly used; a dict for passing data between requests (a small sketch follows the Response attributes below).

        request_with_cookies = Request(
            url="http://www.example.com",
            cookies={'currency': 'USD', 'country': 'UY'},
            meta={'dont_merge_cookies': True}
        )

encoding: just use the default 'utf-8'.
errback: specifies the error-handling function
  • A response object is an instance of the Response class and contains the information the server sends back.

  • Commonly used attributes/information:

status: the HTTP status code
body: the response body
url: the URL of the response
request: the Request object that produced this response
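
To tie the two together, here is a minimal sketch of a hypothetical spider that passes data between callbacks via meta and reads the common Response attributes; the URLs and field names are placeholders:

import scrapy


class MetaDemoSpider(scrapy.Spider):
    # hypothetical spider used only to illustrate Request/Response usage
    name = 'meta_demo'
    start_urls = ['http://example.com/list']

    def parse(self, response):
        # read the common Response attributes
        self.logger.info('got %s with status %s', response.url, response.status)

        # pass data to the next callback through meta
        item = {'list_title': response.xpath('//title/text()').extract_first()}
        yield scrapy.Request(
            url='http://example.com/detail/1',   # placeholder detail URL
            callback=self.parse_detail,
            meta={'item': item},
        )

    def parse_detail(self, response):
        # pick up the data carried in meta, then complete the item
        item = response.meta['item']
        item['detail_size'] = len(response.body)
        yield item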

DownloadMiddlewares

  • Recall the Scrapy architecture diagram

# (image: scrapy框架图.png, the Scrapy architecture diagram)

  • Where there are crawlers there are anti-crawling measures, and then counter-measures to those... the fight between crawlers and anti-crawling never ends.
  • Scrapy's built-in features are enough for basic crawling tasks; with CrawlSpider you can automatically crawl large numbers of list and detail pages just by setting a filter condition and follow=True. Most sites, however, are not that easy to crawl, and then extra work is needed on top of the basic configuration, namely middleware.
  • Scrapy has two kinds of middleware, spider middleware and downloader middleware. Downloader middleware is the more commonly used one; it lets you adjust a request object before it is actually sent.
  • Common anti-anti-crawling techniques:

    • simulate a user login (obtain cookies automatically) / disable cookies
    • set a random User-Agent
    • set a random proxy IP
    • throttle the request frequency
    • drive a browser with selenium and extract data from the rendered page
    • ...
  • Example: simulating a user login to the GitHub home page

# -*- coding: utf-8 -*-
import scrapy


class GithubSpider(scrapy.Spider):
    name = 'github2'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    # simulate login: Scrapy automatically keeps the current session cookies
    def parse(self, response):
        login_url = 'https://github.com/session'

        formdata = {
            "login": '账户名',
            'password': '密码',
            'commit': 'Sign in',
            'utf8': '✓',
            'authenticity_token': response.xpath('//*[@id="login"]/form/input[2]/@value').extract_first()
        }

        # send the login request (POST)
        yield scrapy.FormRequest(url=login_url, formdata=formdata, callback=self.parse_logined)

    def parse_logined(self, response):

        with open('222githublogin.html', 'wb') as f:
            f.write(response.body)
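
Assuming cookies are left enabled (the Scrapy default), running scrapy crawl github2 sends the login POST with the session cookies and saves the resulting logged-in page to 222githublogin.html.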

Settings

  • The settings module configures the whole crawler project; the available options include:
# -*- coding: utf-8 -*-

# Scrapy settings for GitHub project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'GitHub'

SPIDER_MODULES = ['GitHub.spiders']
NEWSPIDER_MODULE = 'GitHub.spiders'

# set the log file
# LOG_FILE = 'github.log'

# log level: there are five levels, DEBUG, INFO, WARNING, ERROR and CRITICAL

LOG_LEVEL = 'ERROR'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"


# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# set the number of concurrent requests (limited mainly by your hardware)
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

# set a download delay between batches of requests
# DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# disable cookies
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'GitHub.middlewares.GithubSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'GitHub.middlewares.GithubDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'GitHub.pipelines.GithubPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Reposted from blog.csdn.net/michael_cool/article/details/80045240