import scrapy

from zhaopin_project.items import LagouItem


class LiepinSpider(scrapy.Spider):
    """Spider for IT job postings on liepin.com.

    Crawl flow: category index page -> per-category listing pages ->
    job detail pages, yielding one ``LagouItem`` per posting.
    """

    name = 'liepin'
    allowed_domains = ['liepin.com']
    # start_urls = ['https://www.liepin.com/it/']

    def start_requests(self):
        """Issue the first request with an explicit referer header.

        The referer is set because the site may reject referer-less
        requests (why the original author set it by hand instead of
        using ``start_urls``).
        """
        request = scrapy.Request(url='https://www.liepin.com/it/',
                                 callback=self.parse)
        request.headers['referer'] = 'https://www.liepin.com/'
        yield request

    def parse(self, response):
        """Follow every category link in the sidebar to its listing page."""
        hrefs = response.xpath(
            '//ul[@class="sidebar float-left"]/li/dl/dd/a/@href').extract()
        for href in hrefs:
            url = response.urljoin(href)
            self.logger.debug('category page: %s', url)
            yield scrapy.Request(url=url, callback=self.parse_list)

    def parse_list(self, response):
        """Follow every job link on a listing page to its detail page."""
        hrefs = response.xpath(
            '//div[@class="sojob-item-main clearfix"]'
            '/div[@class="job-info"]/h3/a/@href').extract()
        for href in hrefs:
            # urljoin handles relative links; the original wrapped this in a
            # bare ``except: continue`` that silently hid every error.
            yield scrapy.Request(url=response.urljoin(href),
                                 callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract one job posting into a ``LagouItem``.

        Uses ``extract_first(default='')`` so a missing field produces an
        empty string instead of the IndexError the original unguarded
        ``.extract()[0]`` calls raised on any page-layout variation.
        """
        def first(xp):
            # Safe single-value extraction: '' instead of IndexError.
            return response.xpath(xp).extract_first(default='')

        item = LagouItem()
        # Job title (h1 @title attribute).
        item['title'] = first('//div[@class="title-info"]/h1/@title')
        # Salary.
        item['salary'] = first('//p[@class="job-item-title"]/text()')
        # Work location.
        item['position'] = first('//p[@class="basic-infor"]/span/a/text()')
        # Required experience.
        item['jingyan'] = first(
            '//div[@class="job-qualifications"]/span[2]/text()')
        # Required education.
        item['xueli'] = first(
            '//div[@class="job-qualifications"]/span[1]/text()')
        # Publish time.
        item['shijian'] = first('//p[@class="basic-infor"]/time/@title')
        # Publishing site (kept as a list, as in the original).
        item['fabu'] = response.xpath('//title/text()').extract()
        # Full job description: join all text nodes. The original loop
        # re-assigned one variable per paragraph and therefore kept only
        # the LAST paragraph, discarding the rest of the description.
        item['job_bt'] = ''.join(response.xpath(
            '//div[@class="content content-word"]/text()').extract())
        yield item
猎聘
猜你喜欢
转载自 (reposted from): https://www.cnblogs.com/lxh777/p/9581013.html
今日推荐
周排行