版权声明:转载请注明来源谢谢! https://blog.csdn.net/qq_43004728/article/details/84636468
智联爬取中,页码的数字和url是不匹配的,因此盲目的拼接url会造成错误,因此可以采用模拟浏览器爬取网页
要模拟浏览器需要知道scrapy流程,简图如下:
这里只是简单地写一些伪码,涉及的数据清洗部分请看scrapy数据清洗
import time

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
class SeleniumMiddleware(object):
    """Downloader middleware that renders pages with a real Chrome browser.

    Zhilian's page numbers do not map cleanly onto URL parameters, so rather
    than building page URLs by hand this middleware drives the site's own
    pager.  ``request.meta['page']`` selects the action:

    * ``'0'`` -- initial load: navigate the browser to ``request.url``.
    * ``'2'`` -- pagination: scroll to the bottom and click the
      "下一页" (next page) button inside the ``.soupager`` element.

    In both cases the browser's current DOM is returned as an
    ``HtmlResponse``, so scrapy's real downloader is bypassed entirely.
    """

    def __init__(self):
        # One shared Chrome instance for the whole crawl.
        self.options = Options()
        self.browser = webdriver.Chrome(chrome_options=self.options)

    def process_request(self, request, spider):
        page = int(request.meta['page'])
        if page == 2:
            # Scroll to the bottom so the pager is rendered and clickable.
            self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(1)
            pager = self.browser.find_element_by_css_selector('.soupager')
            for button in pager.find_elements_by_tag_name('button'):
                if button.text == "下一页":
                    button.click()
                    # Stop after the first click: the page reloads and the
                    # remaining button references go stale.
                    break
        elif page == 0:
            try:
                print('url is :::', request.url)
                self.browser.get(request.url)
            except TimeoutException as e:
                print('超时')
        # Give the page time to finish loading before snapshotting the DOM.
        # NOTE(review): if the page keeps spinning, the original author
        # suggests self.browser.execute_script('window.stop()') here.
        time.sleep(5)
        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                            encoding='utf-8', request=request)
#爬取过程中有时候会出现一种情况就是网页一直在加载,右上角有一个小圆圈一直在转,此时把圆圈叉掉,内容就可以加载出来
#browser.execute_script('window.stop()') 用这个方法.
# -*- coding: utf-8 -*-
import time
import lxml.html
import scrapy
from lxml import etree
from scrapy import Request
class Jobparse():
    """Placeholder container for job-parsing helpers (not yet implemented)."""

    def __init__(self):
        """Nothing to initialise yet."""

    def parse_div_content(html_str):
        """Stub: parse one listing ``<div>`` fragment; body intentionally empty.

        NOTE(review): declared without ``self`` -- presumably intended as a
        plain helper rather than an instance method; confirm before calling
        it through an instance.
        """
def parse_lxml_zhilian(html_str):
    """Extract job-detail links from one listing-page HTML fragment.

    Returns the list of href values; the matching titles are only printed
    for debugging, exactly as in the original.
    """
    doc = lxml.html.fromstring(html_str)
    anchor = '//a[@class="contentpile__content__wrapper__item__info__boxle"]'
    job_url = doc.xpath(anchor + '/@href')
    job_name = doc.xpath(anchor + '/@title')
    print(job_url)
    print(job_name)
    return job_url
# Global counter balancing pagination speed against download speed:
# parse() adds 60 per listing page, parse_detail() subtracts 1 per item.
count = 0
class ZhilianSpider(scrapy.Spider):
    """Spider for Zhilian python-job search results.

    Pagination is delegated to ``SeleniumMiddleware``: every request carries
    ``meta['page']`` telling the middleware whether to do the initial browser
    load ('0'), click the next-page button ('2'), or fetch a detail page ('3').
    """
    name = 'zhilian'
    # allowed_domains = ['ts.zhaopin.com']
    # start_urls = ['http://ts.zhaopin.com/']

    def start_requests(self):
        # page '0' => the middleware performs the initial browser.get().
        url_str = "https://sou.zhaopin.com/?pageSize=60&jl=489&kw=python&kt=3"
        yield Request(url_str, callback=self.parse, dont_filter=True, meta={'page': '0'})

    def parse(self, response):
        """Extract detail-page links from a listing page and turn the page."""
        rs = response.css('div.contentpile_content_wrapper:nth-child(2)').extract()
        page_next = response.css('.soupager').extract()
        # Assume 60 items per page: each page turn adds 60 pending downloads.
        global count
        count += 60
        for r in rs:
            # BUG FIX: parse_lxml_zhilian returns a LIST of urls; the original
            # passed the whole list as Request(url=...), which fails at
            # runtime.  Schedule one request per extracted url instead.
            for job_url in parse_lxml_zhilian(r):
                yield Request(url=job_url, callback=self.parse_detail,
                              meta={'page': '3'}, dont_filter=True)
        if len(page_next) > 0:
            # With more than 300 items pending, pause briefly so detail
            # downloads can catch up with pagination.
            if count > 300:
                time.sleep(0.5)
            # Re-issue the same url with page '2': selenium clicks "next
            # page" and no real download takes place.
            yield Request(url=response.url, callback=self.parse, meta={'page': '2'}, dont_filter=True)

    def parse_detail(self, response):
        """Decrement the pending-download counter, one per detail page."""
        global count
        count -= 1
#实现浏览器模拟与非模拟 只要中间件中不return 该请求就会自动留到downloader
#控制翻页速度和下载速度同步: