Using Scrapy with Selenium in practice
I wanted to crawl Baidu Images, but found that it has become a dynamically rendered site that plain Scrapy cannot scrape, so I tried using Selenium + PhantomJS instead.
Download middleware
A downloader middleware needs to be defined; for details on the Selenium calls used here, see the relevant Selenium documentation.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import HtmlResponse
from logging import getLogger
import time
class SeleniumMiddleware():
    """Downloader middleware that renders selected Baidu image pages in PhantomJS.

    Requests whose URL contains one of the fragments in ``RENDERED_PAGES`` are
    loaded in a PhantomJS browser and answered directly with the rendered
    page source. All other requests are left untouched (``process_request``
    returns ``None`` for them).
    """

    # Seconds to pause after navigation before polling for the target element.
    # NOTE(review): this is a blocking sleep inside process_request; confirm
    # the delay is still needed given the explicit wait that follows it.
    RENDER_DELAY = 3

    # (URL fragment, CSS selector) pairs, checked in order. The selector is
    # the element whose presence marks the page as rendered.
    RENDERED_PAGES = (
        ('image.baidu.com/search/flip', '.imglink'),
        ('image.baidu.com/search/detail', '#currentImg'),
    )

    def __init__(self, timeout=None, service_args=None):
        """Start a PhantomJS browser.

        :param timeout: seconds used for both the page-load timeout and the
            explicit element wait.
            NOTE(review): the default ``None`` is passed straight through to
            ``set_page_load_timeout`` and ``WebDriverWait`` -- confirm that
            SELENIUM_TIMEOUT is always configured.
        :param service_args: list of PhantomJS command-line arguments;
            ``None`` means no extra arguments.
        """
        self.logger = getLogger(__name__)
        self.timeout = timeout
        # Copy into a fresh list so no list object is shared between
        # instances (the previous signature used a mutable ``[]`` default).
        self.browser = webdriver.PhantomJS(service_args=list(service_args or []))
        self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        """Shut the browser down when the middleware is garbage-collected."""
        # ``browser`` is missing if __init__ failed before assigning it.
        browser = getattr(self, 'browser', None)
        if browser is not None:
            # quit() ends the whole WebDriver session; close() would only
            # close the current window.
            browser.quit()

    def process_request(self, request, spider):
        """Fetch matching pages with PhantomJS.

        :param request: Request object
        :param spider: Spider object
        :return: HtmlResponse for URLs listed in ``RENDERED_PAGES``
            (status 200 with the rendered source, or status 500 on timeout);
            ``None`` for every other URL.
        """
        for fragment, selector in self.RENDERED_PAGES:
            if fragment in request.url:
                return self._render(request, fragment, selector)
        return None

    def _render(self, request, fragment, selector):
        """Load ``request.url`` in the browser and wait for ``selector`` to appear."""
        self.logger.debug('%s PhantomJS is Starting', fragment)
        try:
            self.browser.get(request.url)
            time.sleep(self.RENDER_DELAY)
            self.wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            self.logger.warning('Timed out rendering %s', request.url)
            return HtmlResponse(url=request.url, status=500, request=request)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the SELENIUM_TIMEOUT and PHANTOMJS_SERVICE_ARGS settings."""
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
                   service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))
Settings
Add the following to settings.py:
# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False

# Downloader middlewares enabled for this project, keyed by import path.
# The integer is the middleware's order value.
DOWNLOADER_MIDDLEWARES = {
    'getBaiduImg.middlewares.SeleniumMiddleware': 543,
    'getBaiduImg.middlewares.GetbaiduimgDownloaderMiddleware': 550,
}

# Selenium and PhantomJS configuration, read by SeleniumMiddleware.from_crawler.
SELENIUM_TIMEOUT = 20
PHANTOMJS_SERVICE_ARGS = ['--disk-cache=true']
The last two settings configure Selenium + PhantomJS.
Dynamic Web Images Download
setting
# Item pipelines enabled for this project, keyed by import path.
# The integer is the pipeline's order value.
ITEM_PIPELINES = {
    'getBaiduImg.pipelines.GetbaiduimgPipeline': 300,
    'getBaiduImg.pipelines.MyImagesPipeline': 100,
}

# Directory where downloaded images are stored.
IMAGES_STORE = 'C:\\Users\\20668\\Desktop\\img\\test2'

# NOTE(review): the bot name is set to a pipeline class name -- confirm this is intended.
BOT_NAME = 'MyImagesPipeline'
pipelines
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
class GetbaiduimgPipeline(object):
    """Pass-through item pipeline: hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        return item
class MyImagesPipeline(ImagesPipeline):
    """Image pipeline that downloads every URL listed on an item.

    Reads ``item['image_urls']`` and records the stored file paths in
    ``item['image_paths']``.
    """

    def get_media_requests(self, item, info):
        """Yield one ``scrapy.Request`` per URL in ``item['image_urls']``."""
        urls = item['image_urls']
        for url in urls:
            yield scrapy.Request(url)

    def item_completed(self, results, item, info):
        """Attach the paths of the successful downloads to the item.

        :param results: iterable of ``(ok, value)`` pairs; ``value['path']``
            is read for every pair whose ``ok`` is truthy.
        :param item: the item being processed; gains an ``image_paths`` key.
        :param info: not used here.
        :return: the same item with ``image_paths`` set.
        :raises DropItem: when no download succeeded.
        """
        stored_paths = []
        for ok, outcome in results:
            if ok:
                stored_paths.append(outcome['path'])
        if not stored_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = stored_paths
        return item
item
class GetbaiduimgItem(scrapy.Item):
    """Scraped item holding the image URLs to download and their stored paths."""

    # URLs of the images to download; MyImagesPipeline.get_media_requests
    # reads a key of this name from the items it receives.
    image_urls = scrapy.Field()

    # Stored paths of the downloaded images; MyImagesPipeline.item_completed
    # writes a key of this name.
    image_paths = scrapy.Field()
Final notes
Some images on the Baidu Images site are loaded using caching (Cache-Control: max-age=0) and cannot be crawled this way.
See the Scrapy architecture overview for reference: downloader middlewares are executed one by one in order of their configured value. The smaller the value, the closer the middleware is to the engine; the larger the value, the closer it is to the downloader.