猎聘

import scrapy
from zhaopin_project.items import LagouItem


class LiepinSpider(scrapy.Spider):
    """Crawl job postings reachable from the liepin.com IT category page.

    Crawl flow: category index (``/it/``) -> job list pages -> job detail
    pages. Each detail page produces exactly one ``LagouItem``.
    """

    name = 'liepin'
    allowed_domains = ['liepin.com']

    def start_requests(self):
        """Yield the request for the IT category index.

        The request carries a ``Referer`` header pointing at the site's home
        page.
        """
        url = 'https://www.liepin.com/it/'
        referer_url = 'https://www.liepin.com/'
        yield scrapy.Request(
            url=url,
            callback=self.parse,
            headers={'Referer': referer_url},
        )

    def parse(self, response):
        """Follow every link found in the sidebar of the category index.

        Yields one request per sidebar link, handled by ``parse_list``.
        """
        hrefs = response.xpath(
            '//ul[@class="sidebar float-left"]/li/dl/dd/a/@href'
        ).extract()

        for href in hrefs:
            url = response.urljoin(href)
            self.logger.debug('category page: %s', url)
            yield scrapy.Request(url=url, callback=self.parse_list)

    def parse_list(self, response):
        """Follow every job link on a list page.

        Yields one request per job link, handled by ``parse_detail``. Links
        are resolved against the page URL so relative hrefs work; links that
        still cannot form a valid request are logged and skipped.
        """
        hrefs = response.xpath(
            '//div[@class="sojob-item-main clearfix"]'
            '/div[@class="job-info"]/h3/a/@href'
        ).extract()

        for href in hrefs:
            # Only the Request construction can fail (ValueError on a URL
            # without a scheme). The yield stays outside the try so that
            # closing the generator is never intercepted by the handler.
            try:
                request = scrapy.Request(
                    url=response.urljoin(href),
                    callback=self.parse_detail,
                )
            except ValueError:
                self.logger.warning(
                    'skipping unusable job link %r on %s', href, response.url
                )
                continue
            yield request

    def parse_detail(self, response):
        """Extract a single ``LagouItem`` from a job detail page.

        Fields whose node is missing from the page are stored as an empty
        string instead of aborting the whole page. All text nodes of the job
        description are merged into one string so that each posting yields
        exactly one item.
        """
        # Posting headline: the title attribute of the h1 in the title block.
        headline = ''.join(
            response.xpath('//div[@class="title-info"]/h1/@title').extract()
        )
        # Salary.
        salary = self._first(response, '//p[@class="job-item-title"]/text()')
        # Location (text of the link inside the basic-info paragraph).
        location = self._first(
            response, '//p[@class="basic-infor"]/span/a/text()'
        )
        # Required experience (second qualification span).
        experience = self._first(
            response, '//div[@class="job-qualifications"]/span[2]/text()'
        )
        # Required education (first qualification span).
        education = self._first(
            response, '//div[@class="job-qualifications"]/span[1]/text()'
        )
        # Publication time.
        published = self._first(
            response, '//p[@class="basic-infor"]/time/@title'
        )
        # Publishing site: the page <title> text nodes, kept as a list.
        page_title = response.xpath('//title/text()').extract()
        # Job description: may be split across several text nodes.
        fragments = response.xpath(
            '//div[@class="content content-word"]/text()'
        ).extract()
        description = '\n'.join(
            text.strip() for text in fragments if text.strip()
        )

        self.logger.debug('parsed job %r from %s', headline, response.url)

        item = LagouItem()
        item['title'] = headline
        item['salary'] = salary
        item['position'] = location
        item['jingyan'] = experience
        item['xueli'] = education
        item['shijian'] = published
        item['fabu'] = page_title
        item['job_bt'] = description
        yield item

    @staticmethod
    def _first(response, query):
        """Return the first string matched by XPath *query*, or '' if none."""
        return response.xpath(query).extract_first(default='')

猜你喜欢

转载自www.cnblogs.com/lxh777/p/9581013.html