import time

from scrapy.http.response.html import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class CustomMiddleware(object):
    """Scrapy downloader middleware that renders pages with headless Chrome.

    Intercepts each request, loads it in a Selenium-driven headless Chrome,
    scrolls down the page in steps to trigger lazily-loaded (dynamic)
    content, and returns the rendered DOM as an ``HtmlResponse`` so Scrapy
    never performs its own download for this request.
    """

    def process_request(self, request, spider):
        """Render ``request.url`` in headless Chrome and return the response.

        :param request: the Scrapy ``Request`` being downloaded.
        :param spider: the spider that issued the request (unused here).
        :returns: ``HtmlResponse`` built from the fully rendered page source;
            returning a response short-circuits Scrapy's downloader.
        """
        opt = Options()
        opt.add_argument('--headless')
        # NOTE(review): a fresh browser is launched per request, which is
        # expensive; consider creating one driver per spider instead.
        # ``chrome_options=`` is the older Selenium keyword this file
        # targets (renamed to ``options=`` in Selenium 4) — kept for
        # compatibility with the project's pinned version.
        driver = webdriver.Chrome(chrome_options=opt)
        try:
            driver.get(request.url)
            # Scroll down in 20% increments so lazy-loaded content fires.
            # Fixes vs. original: 'scrolTop'/'scrollHeigent' typos made the
            # script a no-op, and range(1, 11, 2) topped out at 90% of the
            # page; range(2, 11, 2) ends exactly at the bottom (fraction 1.0).
            for step in range(2, 11, 2):
                fraction = step / 10
                js = ('document.documentElement.scrollTop = '
                      'document.documentElement.scrollHeight * %f' % fraction)
                driver.execute_script(js)
                # Brief pause so content triggered by the scroll can load.
                time.sleep(0.2)
            page_source = driver.page_source
        finally:
            # Always release the browser process, even if rendering failed.
            driver.quit()
        # Body is text, so the character encoding must be given explicitly.
        return HtmlResponse(request.url, body=page_source,
                            encoding='utf-8', request=request)
scrapy 中间件重写,与selenium结合爬取动态页面
猜你喜欢
转载自blog.csdn.net/a10090492/article/details/79646601
今日推荐
周排行