Scraping 51job (前程无忧) job postings with a scrapy-redis distributed crawler

1. Crawling strategy

  • Start from 'https://jobs.51job.com/zhongshan/p1/' and crawl the listing pages one by one; on each page, collect the URL of every job-detail page (a sketch of the URL pattern follows below).
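A minimal sketch of that URL pattern (listing_url is an illustrative helper, not part of the project code; the city slug and page number are taken from the example URL above):

def listing_url(city, page):
    # listing pages follow the pattern https://jobs.51job.com/<city>/p<page>/
    return "https://jobs.51job.com/" + city + "/p" + str(page) + "/"

# listing_url("zhongshan", 1) -> "https://jobs.51job.com/zhongshan/p1/"
# the parse() method in step 4 rebuilds exactly this URL to reach the next page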

2. Create the project

# the project is named zhaoping so that it matches zhaoping.pipelines.ZhaopingPipeline in settings.py below
scrapy startproject zhaoping
cd zhaoping
scrapy genspider ping jobs.51job.com
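After these commands the generated project should look roughly like this (the standard Scrapy layout; __init__.py files omitted):

zhaoping/
├── scrapy.cfg
└── zhaoping/
    ├── items.py          # step 3
    ├── middlewares.py
    ├── pipelines.py      # step 5
    ├── settings.py       # steps 6 and 8
    └── spiders/
        └── ping.py       # step 4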

3. Define the data fields to scrape

import scrapy


class ZhaopingItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # job title
    title = scrapy.Field()
    # company name
    company = scrapy.Field()
    # company size (number of employees)
    companyperson = scrapy.Field()
    # company type
    companycategory = scrapy.Field()
    # company's main business
    companydo = scrapy.Field()
    # work location
    location = scrapy.Field()
    # work address
    address = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # number of openings
    person = scrapy.Field()
    # publish date
    data = scrapy.Field()
    # education requirement
    request = scrapy.Field()
    # work experience required
    experience = scrapy.Field()
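A scrapy.Item instance behaves like a dict, which is what the MongoDB pipeline in step 5 relies on when it calls dict(item). A quick illustration with made-up values:

item = ZhaopingItem()
item['title'] = 'Python developer'   # fields are assigned with dict-style access
item['salary'] = '8-10k'
print(dict(item))                    # -> {'title': 'Python developer', 'salary': '8-10k'}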

4. Write the spider: parse the data and follow further requests

import re

import scrapy

from zhaoping.items import ZhaopingItem


class PingSpider(scrapy.Spider):
    name = 'ping'
    allowed_domains = ['jobs.51job.com']
    start_urls = ['https://jobs.51job.com/zhongshan/p1/']
    def parse(self, response):
        print(response.url)
        # the listing URL looks like https://jobs.51job.com/<city>/p<page>/
        url = response.url.split('/')
        city = url[len(url) - 3]
        page = url[len(url) - 2]
        new_page = re.findall(r'\d+', page)[0]
        # every job-detail link on the current listing page
        urls = response.xpath('.//div[@class="detlist gbox"]/div/p/span/a/@href').extract()
        for url in urls:
            yield scrapy.Request(url, callback=self.parse_info)
        # the pager text contains the total number of pages
        pages = response.xpath('.//div[@class="p_in"]/span[@class="td"]/text()').get()
        pagenum = re.findall(r'\d+', pages)[0]
        if int(new_page) < int(pagenum):
            new_page = int(new_page) + 1
            new_url = "https://jobs.51job.com/" + city + "/p" + str(new_page) + "/"
            yield scrapy.Request(new_url, callback=self.parse)


    def parse_info(self, response):
        # print(response.url)
        item = ZhaopingItem()
        # job title
        item['title'] = response.xpath('//div[@class="in"]/div/h1/@title').get()
        # company name
        item['company'] = response.xpath('//div[@class="in"]/div/p/a[@class="catn"]/@title').get()

        # company info: type / size / main business
        company = self.getPerAndCatAndDo(response)
        item['companycategory'] = company[0]
        item['companyperson'] = company[1]
        item['companydo'] = company[2]
        # salary
        salary = response.xpath('//div[@class="tHeader tHjob"]/div/div/strong/text()').get()
        if salary is not None:
            item['salary'] = salary
        else:
            item['salary'] = ""
        # work address
        address = response.xpath('//div[@class="tBorderTop_box"]/div[@class="bmsg inbox"]/p/text()').get()
        if address is not None:
            item['address'] = address
        else:
            item['address'] = ''
        # recruiting info: location / experience / education / openings / publish date
        recruit = self.getRequestInfo(response)
        item['location'] = recruit[0]
        item['experience'] = recruit[1]
        item['request'] = recruit[2]
        item['person'] = recruit[3]
        item['data'] = recruit[4]
        print(item)
        yield item
    def getPerAndCatAndDo(self, response):
        # the com_tag block holds company type, size and main business; the size is sometimes missing
        result = response.xpath('//div[@class="tBorderTop_box"]/div[@class="com_tag"]/p/@title').extract()
        companycategory = ''
        companyperson = ''
        companydo = ''
        if len(result) >= 1:
            companycategory = result[0]
        if len(result) == 2:
            # no company size: type and business only
            companydo = result[1]
        elif len(result) >= 3:
            numbers = re.findall(r'\d+', result[1])
            companyperson = numbers[0] if numbers else result[1]
            companydo = result[2]
        return [companycategory, companyperson, companydo]
    def getRequestInfo(self, response):
        # the "msg ltype" line holds location / experience / education / openings / publish date,
        # separated by non-breaking spaces; the education requirement is missing on some postings
        result = [r.replace(u'\xa0', u'') for r in
                  response.xpath('//div[@class="in"]/div/p[@class="msg ltype"]/text()').getall()]
        location = experience = request = person = data = ''
        if len(result) >= 5:
            location, experience, request, person, data = result[:5]
        elif len(result) == 4:
            # education requirement missing
            location, experience, person, data = result
        return [location, experience, request, person, data]
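The XPath expressions above can be checked interactively before running the full crawl, e.g. with scrapy shell against a listing page (the selectors below are the same ones used in parse()):

scrapy shell 'https://jobs.51job.com/zhongshan/p1/'
# then, at the interactive prompt:
response.xpath('.//div[@class="detlist gbox"]/div/p/span/a/@href').extract()[:3]   # first few detail URLs
response.xpath('.//div[@class="p_in"]/span[@class="td"]/text()').get()             # pager text with the page count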

5. Save the scraped data to MongoDB

import pymongo


class ZhaopingPipeline:
    def __init__(self):
        # connect to the local MongoDB instance
        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        # select the database
        db = client.ping
        # select the collection (table)
        self.col = db.ping

    def process_item(self, item, spider):
        # write the item into the collection
        self.col.insert_one(dict(item))
        print('inserted one item')
        return item
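To read back what the pipeline wrote, a small pymongo snippet (assuming the same local MongoDB and the ping database/collection used above):

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
col = client.ping.ping
print(col.count_documents({}))   # number of stored job postings
print(col.find_one())            # one sample document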

6. Configure settings.py

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
  'zhaoping.pipelines.ZhaopingPipeline': 300,
}

7. Run the spider

scrapy crawl ping
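Once the spider has run for a while, the results can also be checked from the MongoDB shell (assuming a local mongod; use mongosh on newer installations):

mongo
> use ping
> db.ping.countDocuments({})
> db.ping.find().limit(1).pretty()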

8. Make the crawler distributed

  • Install scrapy-redis:
pip3 install scrapy-redis -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
  • Change the spider's base class from scrapy.Spider to scrapy_redis.spiders.RedisSpider
  • Delete start_urls = ['https://jobs.51job.com/zhongshan/p1/'] and add a redis_key instead
from scrapy_redis.spiders import RedisSpider


class PingSpider(RedisSpider):
    name = 'ping'
    allowed_domains = ['jobs.51job.com']
    # start_urls = ['https://jobs.51job.com/zhongshan/p1/']
    # the start URL is pushed into Redis under this key:
    # lpush ping:start_url https://jobs.51job.com/zhongshan/p1/
    redis_key = "ping:start_url"
  • Add the following configuration to settings.py
# use the scrapy-redis dupefilter instead of Scrapy's default, so URL de-duplication is shared
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# use the scrapy-redis scheduler instead of Scrapy's default
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# allow pausing/resuming; the request records kept in Redis are not lost
SCHEDULER_PERSIST = True

# host/IP of the Redis server that stores the shared queue
REDIS_HOST = '127.0.0.1'  # use the remote server's IP in a real deployment

# port of the Redis server
REDIS_PORT = 6379
  • Add the start URL to Redis
lpush ping:start_url https://jobs.51job.com/zhongshan/p1/
  • Start the spider; the crawl data can then be seen in Redis (a quick redis-cli check is sketched below).
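While the spider runs, the Redis side can be inspected with redis-cli. With the default scrapy-redis settings, a spider named ping uses ping:requests for the shared request queue (a sorted set with the default priority queue) and ping:dupefilter for the seen-request fingerprints:

redis-cli
> keys 'ping:*'          # ping:requests, ping:dupefilter (and ping:start_url until it is consumed)
> zcard ping:requests    # pending requests in the shared queue
> scard ping:dupefilter  # fingerprints of requests already seen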

Reposted from blog.csdn.net/weixin_43950643/article/details/107294470