在爬取网页时,经常会遇到动态网页,而scrapy本身无法直接爬取动态网页,这时就需要借助其他爬虫技术。爬取动态网页的技术有很多,这里将selenium框架集成到scrapy中。
middlewares.py
class SeleniumMiddleware(object):
    """
    Downloader middleware that fetches dynamic pages with Selenium.

    Returns an HtmlResponse built from the browser-rendered page source,
    so the spider can parse JavaScript-generated content that a plain
    Scrapy download would miss.
    """

    def process_request(self, request, spider):
        # Only intercept requests from tmailSpider. Compare the spider's
        # name — the original compared the spider *object* to a string,
        # which is always False, so the middleware never fired.
        if spider.name == "tmailSpider":
            # Let the browser load the page.
            spider.driver.get(url=request.url)
            # Implicit wait: give dynamic content up to 3 seconds to render.
            spider.driver.implicitly_wait(3)
            # Return the rendered HTML; returning a Response here makes
            # Scrapy skip its own download of this request.
            # NOTE: the correct Selenium attribute is `current_url`
            # (the original referenced the nonexistent `driver.current`).
            return HtmlResponse(
                url=spider.driver.current_url,
                body=spider.driver.page_source,
                encoding='utf-8',
                request=request,
            )
spider.py
import scrapy
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
class TmailspiderSpider(scrapy.Spider):
    """
    Spider for a Tmall product detail page.

    Creates a shared headless Chrome webdriver that SeleniumMiddleware
    uses to render the dynamic page, and quits it when the spider closes.
    """

    name = 'tmailSpider'
    allowed_domains = ['detail.tmall.com']
    start_urls = ['https://detail.tmall.com/item.htm?spm=875.7931836/B.20161011.18.66144265ixEZiP&pvid=d0ea629e-d89d-4ac2-b1f2-2a388cfd08c3&pos=18&acm=201509290.1003.1.1286473&id=575626668913&scm=1007.12710.81708.100200300000000']

    def __init__(self):
        super(TmailspiderSpider, self).__init__()
        # Configure Chrome: value 2 disables image loading to speed up
        # page rendering.
        options = ChromeOptions()
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option("prefs", prefs)
        # Run headless. `options.set_headless()` was deprecated and is
        # removed in Selenium 4; the --headless argument is the
        # supported replacement.
        options.add_argument('--headless')
        # Create the webdriver shared with SeleniumMiddleware.
        self.driver = webdriver.Chrome(options=options)
        # Quit the driver when the spider finishes.
        # NOTE(review): `scrapy.xlib.pydispatch` was removed in newer
        # Scrapy releases — on Scrapy >= 1.1 prefer wiring
        # `signals.spider_closed` via `from_crawler`.
        dispatcher.connect(self.quitDriver, signals.spider_closed)

    def quitDriver(self, spider):
        """Signal handler for spider_closed: shut down the webdriver."""
        self.driver.quit()
        print('driver quit')