# -*- coding: utf-8 -*-
import scrapy
# from scrapy.linkextractors import LinkExtractor
# from scrapy.spiders import CrawlSpider, Rule
from Zhilian.items import ZhilianItem
class ZhilianSpider(scrapy.Spider):
    """Crawl job listings from sou.zhaopin.com (Beijing, keyword "python").

    ``parse`` walks each search-result page and follows every job's
    detail link; ``parse_next`` completes the item with the detail-page
    text and yields it.
    """
    name = 'zhilian'
    allowed_domains = ['zhaopin.com']
    # Page range is read interactively when the module is imported
    # (kept as-is for backward compatibility); one URL per result page.
    start_urls = [
        "https://sou.zhaopin.com/?pageSize=60&jl=北京" + "&kw=python" + "&kt=3&p=" + str(i)
        for i in range(int(input("起始:")), int(input("终止:")))
    ]

    def parse(self, response):
        """Parse one search-result page; request each job's detail page.

        :param response: the search-result page response
        :yields: ``scrapy.Request`` for each detail page (the partially
            filled item travels in ``meta``), or the partial item itself
            when the detail link is missing.
        """
        job_list = response.xpath("//div[@id='listContent']/div")
        for job in job_list:
            item = ZhilianItem()
            item["name"] = job.xpath(".//span/@title").extract_first()
            item["salary"] = job.xpath(".//p/text()").extract_first()
            item["fuli"] = job.xpath(".//div[contains(@class,'welfare')]/text()").extract()
            item["address"] = job.xpath(".//ul/li[1]/text()").extract_first()
            item["jingyan"] = job.xpath(".//li[contains(@class,'demand')][2]/text()").extract_first()
            item["company"] = job.xpath(".//div/a/@title").extract_first()
            # Link to the job's detail (second-level) page.
            next_url = job.xpath(".//div[contains(@class,'jobName')]//a/@href").extract_first()
            if next_url:
                yield scrapy.Request(url=next_url, callback=self.parse_next,
                                     meta={"item": item})
            else:
                # BUG FIX: the original passed url=None to scrapy.Request
                # when the link was absent, raising ValueError and losing
                # the item; yield the partial item instead.
                yield item

    def parse_next(self, response):
        """Parse a job detail page and finish the item started in ``parse``.

        :param response: the detail-page response; ``response.meta["item"]``
            carries the partially filled item.
        :yields: the completed item.
        """
        item = response.meta["item"]
        # BUG FIX: the original called the misspelled ``.exract()``
        # (AttributeError), joined with the raw string r"\n" (two literal
        # characters, not a newline), and selected <p> elements rather
        # than their text.
        item["job_info"] = "\n".join(
            response.xpath("//div[@class='pos-ul']/p/text()").extract())
        # BUG FIX: the original applied ``.extract()`` to the *joined
        # string* (str has no .extract()); extract the SelectorList
        # first, then join.
        item["company_info"] = "\n".join(
            response.xpath("//div[@class='intro-content']/p/text()").extract())
        yield item
# Adapted from: https://blog.csdn.net/qq_42817166/article/details/83313140