Using Scrapy Middleware

1. The workflow of Scrapy's five core components

  (Architecture diagram: engine, scheduler, downloader, spiders, and item pipeline, with the downloader middleware and spider middleware sitting between them.)

2. Applying the middlewares

  As the diagram above shows, Scrapy's workflow contains two kinds of middleware: the spider middleware and the downloader middleware.

  Here we start with the downloader middleware.

    Spider file (middle.py)

import scrapy

class MiddleSpider(scrapy.Spider):
    name = 'middle'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.baidu.com/s?wd=ip']

    def parse(self, response):
        page_text = response.text
        # Just save the page locally to check the result; no need to write it to a database here
        with open('./ip.html', 'w', encoding='utf-8') as f:
            f.write(page_text)

  middlewares.py

from scrapy import signals
import random


class MiddleproDownloaderMiddleware(object):

    # User-Agent pool covering the major browsers
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    # Proxy pools, split by request scheme
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Intercepts every request that has not raised an exception
    # The spider argument is the instance of the spider class
    def process_request(self, request, spider):
        print('this is process_request()')
        # UA spoofing: assign a random User-Agent to every request
        request.headers['User-Agent'] = random.choice(self.user_agent_list)

        # Test: check whether the proxy setting takes effect
        request.meta['proxy'] = 'https://218.60.8.83:3129'
        return None

    # Intercepts every response
    def process_response(self, request, response, spider):
        return response

    # Intercepts requests that raised an exception (e.g. a dead proxy)
    def process_exception(self, request, exception, spider):
        if request.url.split(':')[0] == 'https':
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        else:
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        # Return the request so it is rescheduled and retried with the new proxy
        return request

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
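  For this middleware to actually run, it has to be registered in the project's settings.py. A minimal sketch, assuming the project is named middlePro (adjust the dotted path to your own project and class name):

# settings.py (project name middlePro is an assumption)
ROBOTSTXT_OBEY = False      # do not filter requests based on robots.txt
LOG_LEVEL = 'ERROR'         # optional: keep the console output readable

DOWNLOADER_MIDDLEWARES = {
    'middlePro.middlewares.MiddleproDownloaderMiddleware': 543,
}

  With this in place, running scrapy crawl middle should print 'this is process_request()' for each request and save ip.html; if the saved page reports the proxy's IP rather than your own, the proxy assignment in process_request took effect.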

  Using Selenium together with the downloader middleware to handle dynamically generated response data

    Spider file (news.py)

# -*- coding: utf-8 -*-
import scrapy
from ..items import MiddleWareItem
from selenium import webdriver


class NewsSpider(scrapy.Spider):
    # Use Selenium inside Scrapy to fetch dynamically loaded data;
    # the browser is created once and shared through the spider instance
    browser = webdriver.Chrome(r'D:\spider\chromedriver.exe')

    name = 'news'
    sort_urls = []  # holds the URLs of the two news sections
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']

    def newsContentParse(self, response):
        item = response.meta['item']
        # Parse the article body and store it directly in the item
        content_list = response.xpath('//div[@id="endText"]//text()').extract()
        item['news_content'] = ''.join(content_list)

        yield item

    # Parses the news list on each section page
    def parse_detail(self, response):
        div_list = response.xpath('//div[@class="ndi_main"]/div')
        for div in div_list:
            news_title = div.xpath('.//h3/a/text()').extract_first()
            detail_news_url = div.xpath('.//h3/a/@href').extract_first()
            item = MiddleWareItem()
            item['news_title'] = news_title
            # Request the article page and pass the item along to the next callback via meta
            yield scrapy.Request(url=detail_news_url, callback=self.newsContentParse, meta={'item': item})

    def parse(self, response):
        # Extract the URLs of the two sections; using one parse function for both
        # requires that the two sections share the same page layout
        li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        # Indexes of the li tags for the domestic and international news sections
        indexs = [3, 4]
        sort_li_list = []  # the li tags of the two selected sections
        for index in indexs:
            li = li_list[index]
            sort_li_list.append(li)
        # Extract the URL of each section
        for li in sort_li_list:
            sort_url = li.xpath('./a/@href').extract_first()
            self.sort_urls.append(sort_url)
            # Request each section URL to get the detail page's HTML
            yield scrapy.Request(url=sort_url, callback=self.parse_detail)
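  One detail the post leaves out: the shared browser is never shut down, so chromedriver keeps running after the crawl. A small addition to NewsSpider (Scrapy calls a spider's closed method automatically when it finishes) fixes that:

    # add to NewsSpider: called automatically when the spider closes
    def closed(self, reason):
        self.browser.quit()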

  middlewares.py

from scrapy import signals
from time import sleep
from scrapy.http import HtmlResponse

class MiddleWareDownloaderMiddleware(object):

    def process_request(self, request, spider):
        return None

    # This method intercepts every response object; here we only need to
    # tamper with the responses of a few specific requests
    def process_response(self, request, response, spider):
        """
        1. Find the specific responses that need special handling
        2. A response can be located through its request
        3. A request can be located through its URL
        4. The target URLs are available as spider.sort_urls
        """
        sort_urls = spider.sort_urls
        browser = spider.browser
        if request.url in sort_urls:
            """
            1. The URL identifies the target request
            2. The request identifies the target response (which lacks the dynamic data)
            3. Manually build two new responses that do contain it
            4. Return the new response in place of the original one
            """
            browser.get(request.url)  # load the section page in the real browser
            sleep(2)  # give the page time to render
            js = 'window.scrollTo(0,document.body.scrollHeight)'
            browser.execute_script(js)  # scroll down to trigger lazy loading
            sleep(2)
            # The page source now contains the dynamically loaded news data
            page_text = browser.page_source

            # Build a new response object with page_text as the response body
            # and return it in place of the original response
            return HtmlResponse(url=browser.current_url, body=page_text, encoding='utf-8', request=request)
        return response
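  As in the first example, the middleware only runs once it is registered in settings.py, and the item pipeline has to be enabled as well so the yielded items get processed. A minimal sketch, assuming the project is named middleWare and the pipeline class is MiddleWarePipeline (both names are assumptions made to match the sketches here):

# settings.py (project and pipeline names are assumptions)
ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
    'middleWare.middlewares.MiddleWareDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'middleWare.pipelines.MiddleWarePipeline': 300,
}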

  items.py

import scrapy

class MiddleWareItem(scrapy.Item):
    
    news_title = scrapy.Field()
    news_content = scrapy.Field()
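
  The post stops at the item definition. For completeness, a minimal pipelines.py sketch that simply dumps the items to a text file (the class name MiddleWarePipeline is the hypothetical one referenced in the settings above):

class MiddleWarePipeline(object):

    def open_spider(self, spider):
        # open the output file once, when the spider starts
        self.f = open('./news.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # write one news article per block
        self.f.write('%s\n%s\n\n' % (item['news_title'], item['news_content']))
        return item

    def close_spider(self, spider):
        self.f.close()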

 


Reposted from www.cnblogs.com/attila/p/10920516.html