Scraping Zhilian job listings with the Scrapy framework plus a simulated browser

Zhilian's pages are rendered dynamically with JavaScript, so an ordinary request only returns the page as it is before the JS runs. To get the fully loaded page, we drive a simulated browser and hand its rendered source back to Scrapy.
The code below is only a minimal implementation covering part of the Zhilian crawl; implement the rest as you need it.
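For the middleware below to take effect at all, it has to be registered in the project's settings.py. A minimal sketch; the dotted path assumes a Scrapy project package named zhilian, so adjust it to your own layout:

# settings.py: enable the Selenium downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'zhilian.middlewares.SeleniumMiddleware': 543,
}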

Middleware (middlewares.py) code:


from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
import time
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.firefox.options import Options


class SeleniumMiddleware(object):
    def __init__(self):
        self.options = Options()
        # self.options.add_argument('-headless')  # uncomment to run headless
        # self.browser = webdriver.Chrome(executable_path=r"D:\python_others\Spider\code\day06\tools\chromedriver.exe", chrome_options=self.options)
        self.browser = webdriver.Firefox(executable_path=r"D:\python_others\Spider\code\day06\tools\geckodriver.exe",
                                         firefox_options=self.options)

    def process_request(self, request, spider):
        if int(request.meta['page']) == 2:
            # Run JavaScript to scroll to the bottom so lazy-loaded items render
            self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(3)
            div = self.browser.find_element_by_css_selector(".soupager")
            # find_elements (plural): the pager holds several buttons,
            # and the second one is "next page"
            buttons = div.find_elements_by_tag_name("button")
            buttons[1].click()
            # page = self.browser.find_element_by_xpath('//*[@id="pagination_content"]/div/button[2]')
            # page.click()
            time.sleep(3)
            # Hand the freshly rendered page back to the spider
            return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                                encoding="utf-8", request=request)
        else:
            if int(request.meta['page']) == 0:
                try:
                    print("url is ::::", request.url)
                    self.browser.get(request.url)
                except TimeoutException:
                    print("page load timed out")
                time.sleep(5)

                return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, encoding="utf-8",
                                    request=request)

# If you still want the downloader to fetch the request itself after the
# browser work, simply don't return a response from the middleware.
# If the page hangs and keeps showing the loading spinner, stopping the load
# (the same as clicking the little spinner away) lets the page render:
# browser.execute_script('window.stop()')
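One way to apply that window.stop() trick: set a page-load timeout on the driver and stop the load when it fires. A minimal sketch (the 10-second timeout is an assumed value, not from the original post):

# Give up on a hung page load instead of waiting forever
from selenium.common.exceptions import TimeoutException

browser.set_page_load_timeout(10)  # seconds; tune to your network
try:
    browser.get(request.url)
except TimeoutException:
    # Stop the spinner; whatever has rendered so far stays usable
    browser.execute_script('window.stop()')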

Spider (spider.py) code:

# -*- coding: utf-8 -*-
import time
import scrapy
import lxml.html
from scrapy import Request


class JobDes(object):
    def __init__(self):
        self.detail_url = ""
        self.title = ""


def parse_lxml_zhilian(html_str):
    # Parse a rendered listing block and return (url, title) pairs
    tree = lxml.html.fromstring(html_str)
    job_urls = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@href')
    job_names = tree.xpath('//a[@class="contentpile__content__wrapper__item__info__boxle"]/@title')
    return list(zip(job_urls, job_names))
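The parser can be sanity-checked on its own, for example in a REPL, with a hand-written fragment; the sample below is hypothetical, not real Zhilian markup:

sample = ('<a class="contentpile__content__wrapper__item__info__boxle" '
          'href="https://jobs.example.com/1" title="python工程师"></a>')
print(parse_lxml_zhilian(sample))
# -> [('https://jobs.example.com/1', 'python工程师')]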

# Global counter: used to check whether page turning outruns the detail downloads
count = 0
class ZhaopinSpider(scrapy.Spider):
    name = 'zhaopin'

    # allowed_domains = ['ts.zhaopin.com']
    # start_urls = ['http://ts.zhaopin.com/']

    def start_requests(self):
        url_str = 'https://sou.zhaopin.com/?jl=489&kw=python&kt=3'
        yield Request(url=url_str, callback=self.parse, meta={"page": "0"})

    def parse(self, response):
        # Use the simulated browser to turn pages and load the Ajax content
        # Work out the selectors from the page the simulated browser pops up
        # Markup is not set in stone: be careful with numbered CSS selectors
        # such as nth-child(1); debug against a simple, stable page
        # Selenium is also useful for simulated (browser) testing
        rs = response.css('#listContent > div:nth-child(1)')
        page_next = response.xpath('//*[@id="pagination_content"]/div/button[2]')
        print("rs is :::::", rs)
        print("page_next is :::::", page_next)
        # Alternative selectors noted while debugging:
        # listContent > div:nth-child(1)
        # pagination_content > div > button:nth-child(7)
        # button.btn:nth-child(8)
        # Each page yields 60 detail downloads, so every page turn adds 60 tasks
        global count
        count += 60
        for r in rs:
            # r is a Selector; hand its HTML string to the lxml parser
            for job_url, job_name in parse_lxml_zhilian(r.extract()):
                yield Request(url=job_url, callback=self.parse_detail, meta={"page": "3"}, dont_filter=True)
        if len(page_next) > 0:
            # Pause paging while more than 300 detail downloads are pending.
            # Note: time.sleep() blocks Scrapy's reactor, so this is only a
            # very crude throttle
            while count > 300:
                time.sleep(0.5)
            yield Request(url=response.url, callback=self.parse, meta={"page": "2"}, dont_filter=True)

    def parse_detail(self, response):
        # A finished detail download frees up one slot for the pager
        global count
        count -= 1
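parse_detail above only decrements the pending-download counter. If you also want to store fields from the detail page, a hedged sketch follows; the CSS selectors are placeholders, since the original post never parsed the detail page:

    def parse_detail(self, response):
        global count
        count -= 1
        # Placeholder selectors: replace with the real detail-page markup
        yield {
            "title": response.css('h1::text').extract_first(),
            "url": response.url,
        }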

Reprinted from blog.csdn.net/qq_42827960/article/details/84637180