Scrapy + Selenium in practice

I wanted to crawl Baidu Images, but the site has switched to dynamically rendered pages that plain Scrapy cannot fetch, so I tried Scrapy + Selenium with PhantomJS instead.

Downloader middleware

A downloader middleware has to be defined so that Selenium can render the page; for the Selenium specifics, refer to its documentation. When process_request returns an HtmlResponse, Scrapy skips its own downloader and hands the rendered response straight to the spider.

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import HtmlResponse
from logging import getLogger
import time

class SeleniumMiddleware:
    def __init__(self, timeout=None, service_args=None):
        self.logger = getLogger(__name__)
        self.timeout = timeout
        # Avoid a mutable default argument; fall back to an empty list.
        self.browser = webdriver.PhantomJS(service_args=service_args or [])
        self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        # quit() (unlike close()) also terminates the PhantomJS process.
        self.browser.quit()

    def process_request(self, request, spider):
        """
        用PhantomJS抓取页面
        :param request: Request对象
        :param spider: Spider对象
        :return: HtmlResponse
        """
        if 'image.baidu.com/search/flip' in request.url:
            self.logger.debug('image.baidu.com/search/flip PhantomJS is Starting')
            try:
                self.browser.get(request.url)
                time.sleep(3)
                self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '.imglink')))

                return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8', status=200)
            except TimeoutException:
                return HtmlResponse(url=request.url, status=500, request=request)
        elif 'image.baidu.com/search/detail' in request.url:
            self.logger.debug('image.baidu.com/search/detail PhantomJS is Starting')
            try:
                self.browser.get(request.url)
                time.sleep(3)
                self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#currentImg')))

                return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8', status=200)
            except TimeoutException:
                return HtmlResponse(url=request.url, status=500, request=request)
            
    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
                   service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))

settings

Add the following to settings.py:

ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
    'getBaiduImg.middlewares.SeleniumMiddleware': 543,
    'getBaiduImg.middlewares.GetbaiduimgDownloaderMiddleware': 550,
}

# Selenium / PhantomJS configuration
SELENIUM_TIMEOUT = 20
PHANTOMJS_SERVICE_ARGS = ['--disk-cache=true']

This is where Selenium + PhantomJS is configured.
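
Other PhantomJS command-line switches can be passed through PHANTOMJS_SERVICE_ARGS in the same way. A sketch; the extra switches below are standard PhantomJS options, not part of the original setup:

PHANTOMJS_SERVICE_ARGS = [
    '--disk-cache=true',         # on-disk cache, as in the post
    '--load-images=false',       # skip rendering images to speed up page loads
    '--ignore-ssl-errors=true',  # tolerate certificate errors
]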

Downloading images from the dynamic pages

settings

ITEM_PIPELINES = {
   'getBaiduImg.pipelines.GetbaiduimgPipeline': 300,
   'getBaiduImg.pipelines.MyImagesPipeline': 100
}

# Directory where downloaded images are stored
IMAGES_STORE = 'C:\\Users\\20668\\Desktop\\img\\test2'
BOT_NAME = 'MyImagesPipeline'

pipelines

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class GetbaiduimgPipeline(object):
    def process_item(self, item, spider):
        return item

class MyImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            # print(image_url)
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        # print(image_paths)
        return item
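
For context, results in item_completed is a list of two-element tuples, one per image request; on success the second element is a dict carrying the keys the code above reads. A made-up illustration (the values are invented, not real output):

# Illustrative shape of `results`; all values here are made up.
results = [
    (True, {'url': 'http://img.example.com/a.jpg',                # source URL
            'path': 'full/0a79c461a4062ac383dc4fade7bc09f1.jpg',  # path under IMAGES_STORE
            'checksum': 'b9628c4ab9b595f72f280b90c4fd093d'}),     # md5 of the file
    # Failed downloads appear as (False, <Failure>) and are filtered out
    # by the `if ok` in the list comprehension above.
]
image_paths = [x['path'] for ok, x in results if ok]
print(image_paths)  # ['full/0a79c461a4062ac383dc4fade7bc09f1.jpg']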

item

import scrapy

class GetbaiduimgItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = scrapy.Field()
    image_paths = scrapy.Field()
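
The spider itself is not shown here. A minimal sketch of what it could look like, assuming the project layout above; the spider name, start URL, and CSS selector are illustrative guesses, not taken from the original:

import scrapy
from getBaiduImg.items import GetbaiduimgItem

class BaiduImgSpider(scrapy.Spider):
    # Hypothetical spider; name, start URL and selector are assumptions.
    name = 'baiduimg'
    start_urls = ['https://image.baidu.com/search/flip?tn=baiduimage&word=cat']

    def parse(self, response):
        # By the time parse() runs, the middleware has already rendered the
        # page with PhantomJS and waited for the '.imglink' elements.
        item = GetbaiduimgItem()
        item['image_urls'] = response.css('.imglink img::attr(src)').getall()
        yield item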

Finally

Some images on the Baidu Images site are served through a caching layer (Cache-Control: max-age=0), and those could not be crawled.
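
One thing that might be worth trying (untested here, purely an assumption) is sending explicit no-cache headers with each image request from the pipeline:

import scrapy
from scrapy.pipelines.images import ImagesPipeline

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            # Hypothetical workaround: ask the server for a fresh copy
            # instead of a cached one.
            yield scrapy.Request(image_url,
                                 headers={'Cache-Control': 'no-cache',
                                          'Pragma': 'no-cache'})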

For reference, here is the Scrapy architecture overview. Downloader middlewares execute one after another in order of their priority value: the smaller the value, the closer the middleware is to the engine, and the larger the value, the closer it is to the downloader. With the values above, SeleniumMiddleware (543) therefore handles each outgoing request before GetbaiduimgDownloaderMiddleware (550) would.

(Figure: Scrapy architecture overview)
