Web crawler --------- Scrapy ------ browser-based scraping

# -*- coding: utf-8 -*-
import scrapy
from Zhilian.items import ZhilianItem


class ZhilianSpider(scrapy.Spider):
    name = 'zhilian'
    allowed_domains = ['zhaopin.com']
    # One search-results URL per page: city jl=北京 (Beijing), keyword kw=python;
    # the page range is read from user input when the spider starts.
    start_urls = ["https://sou.zhaopin.com/?pageSize=60&jl=北京" + "&kw=python" + "&kt=3&p=" + str(i)
                  for i in range(int(input("start page: ")), int(input("end page: ")))]

    def parse(self, response):
        # Each child div of #listContent is one job card on the search-results page.
        job_list = response.xpath("//div[@id='listContent']/div")
        for job in job_list:
            item = ZhilianItem()
            item["name"] = job.xpath(".//span/@title").extract_first()
            item["salary"] = job.xpath(".//p/text()").extract_first()
            item["fuli"] = job.xpath(".//div[contains(@class,'welfare')]/text()").extract()  # benefits
            item["address"] = job.xpath(".//ul/li[1]/text()").extract_first()
            item["jingyan"] = job.xpath(".//li[contains(@class,'demand')][2]/text()").extract_first()  # experience
            item["company"] = job.xpath(".//div/a/@title").extract_first()
            # Follow the link to the job's detail page, carrying the partly-filled item along in meta.
            detail_url = job.xpath(".//div[contains(@class,'jobName')]//a/@href").extract_first()
            yield scrapy.Request(url=response.urljoin(detail_url), callback=self.parse_next, meta={"item": item})

    def parse_next(self, response):
        item = response.meta["item"]
        # Join the detail-page paragraphs into plain-text blocks.
        item["job_info"] = "\n".join(response.xpath("//div[@class='pos-ul']/p//text()").extract())
        item["company_info"] = "\n".join(response.xpath("//div[@class='intro-content']/p/text()").extract())
        yield item
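
The spider imports ZhilianItem from Zhilian.items, but the post does not show items.py. A minimal sketch of it, assuming it declares exactly the fields the spider fills in, would be:

# items.py -- hypothetical sketch; the field names mirror the keys used in the spider above
import scrapy

class ZhilianItem(scrapy.Item):
    name = scrapy.Field()          # job title
    salary = scrapy.Field()
    fuli = scrapy.Field()          # welfare / benefits
    address = scrapy.Field()
    jingyan = scrapy.Field()       # experience requirement
    company = scrapy.Field()
    job_info = scrapy.Field()      # job description from the detail page
    company_info = scrapy.Field()  # company introduction from the detail page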




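The text scraped straight off the page usually carries stray whitespace, and fuli comes back as a list; a small item pipeline can tidy the fields before export. This is an assumed sketch, not part of the original post:

# pipelines.py -- hypothetical cleanup pipeline
class ZhilianPipeline:
    def process_item(self, item, spider):
        # Strip whitespace from the single-string fields and join the benefits list.
        for key in ("name", "salary", "address", "jingyan", "company"):
            if item.get(key):
                item[key] = item[key].strip()
        if item.get("fuli"):
            item["fuli"] = " / ".join(s.strip() for s in item["fuli"])
        return item

Enable it with ITEM_PIPELINES = {"Zhilian.pipelines.ZhilianPipeline": 300} in settings.py.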


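The title mentions scraping with a browser, but the post does not show how the browser is wired in. The job list on sou.zhaopin.com is typically built by JavaScript, so the #listContent divs may be missing from the HTML that Scrapy downloads on its own; one common way to pair Scrapy with a real browser is a Selenium downloader middleware that renders each page and hands the finished HTML back to the spider. The sketch below assumes that approach; the class name, file location and priority are illustrative:

# middlewares.py -- hypothetical browser-rendering middleware
from scrapy.http import HtmlResponse
from selenium import webdriver

class SeleniumMiddleware:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=options)

    def process_request(self, request, spider):
        # Let Chrome execute the page's JavaScript, then return the rendered
        # HTML so parse() and parse_next() see the fully built DOM.
        self.driver.get(request.url)
        return HtmlResponse(url=request.url, body=self.driver.page_source,
                            encoding="utf-8", request=request)

Enable it with DOWNLOADER_MIDDLEWARES = {"Zhilian.middlewares.SeleniumMiddleware": 543} in settings.py, then run the spider and export the items, e.g. scrapy crawl zhilian -o jobs.json.
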
Reposted from blog.csdn.net/qq_42817166/article/details/83313140