Scraping Liepin (猎聘网) Job Listings with the pyspider Crawler Framework

Requirements

Scraping requirements:
1. Scrape by region
2. Scrape position name, salary, education requirement, work-experience requirement, publish date, company name, and company industry

Proxy

The comments are detailed, so I won't walk through the code; use with caution if you don't have a proxy.
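Before starting the crawl, it is worth checking that the proxy actually answers. A minimal sketch (the http://localhost:6666 address matches the crawl_config below; requests is a separate dependency):

import requests

# proxy address taken from crawl_config below
proxies = {'http': 'http://localhost:6666', 'https': 'http://localhost:6666'}
resp = requests.get('https://www.liepin.com/zhaopin', proxies=proxies, timeout=10)
print(resp.status_code)  # expect 200 when the proxy is healthy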

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-16 11:04:59
# Project: hunting_recruit

from pyspider.libs.base_handler import *
import re
import datetime
from pymongo import MongoClient

# Connect to the local MongoDB instance
# (host/port assumed: a default local deployment)
client = MongoClient('localhost', 27017)

DB_NAME = 'research'
DB_COL = 'hunting_recruit'
db = client[DB_NAME]
col = db[DB_COL]

class Handler(BaseHandler):
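    # crawl_config (headers, proxy) is merged into every request issued via self.crawl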
    crawl_config = {
        "headers":{"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
                  },
        "proxy": "http://localhost:6666"
    }

    url = 'https://www.liepin.com/zhaopin'


    def format_date(self, date):
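        # parse the compact date string scraped from the page, e.g. '20180816' -> datetime(2018, 8, 16)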
        return datetime.datetime.strptime(date, '%Y%m%d')

    @every(minutes=24 * 60)
    def on_start(self):
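        # pyspider dedupes tasks by URL; @every re-runs on_start daily, and
        # @config(age=60) on the callbacks lets a page be re-fetched after 60 seconds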

        self.crawl(self.url, callback=self.index_page)

    @config(age=60)
    def index_page(self, response):
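        # drill-down chain: industry (this page) -> city -> district -> paginated result lists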
        page = response.etree


        base_url = 'https://www.liepin.com'
        ## industry list
        industry_list = page.xpath("//dd[@data-param='industries']/ul/li")

        for each in industry_list:
            title = each.xpath("./span/text()")[0]
            print('-------',title,'--------')
            ## sub-industry links
            sub_list = each.xpath("./div[@class='sub-industry']/a")
            for sub in sub_list:
                belonging = sub.xpath("./text()")[0]
                print(belonging)
                link_url = base_url + sub.xpath("./@href")[0]
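                # `save` rides along with the request and comes back as response.save in the callback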

                save = {"belonging": belonging}
                self.crawl(link_url, callback=self.parse_city, save=save)

    @config(age=60)
    def parse_city(self, response):
        page = response.etree
        base_url = 'https://www.liepin.com'
        ## city list
        city_list = page.xpath("//dd[@data-param='city']/a")[1:-1]  # skip the first entry (全国, nationwide) and the last (其他, other)

        for each in city_list:
            city = each.xpath("./text()")[0]
            print(city)
            link_url = base_url + each.xpath("./@href")[0]
            save = {"belonging": response.save["belonging"], "city": city}
            self.crawl(link_url, callback=self.parse_district, save=save)

    @config(age=60)
    def parse_district(self, response):
        page = response.etree
        base_url = 'https://www.liepin.com'
        ## district list
        district_list = page.xpath("//dd[@data-param='dqs']/a")

        for each in district_list:
            district = each.xpath("./text()")[0]
            print(district)
            link_url = base_url + each.xpath("./@href")[0]
            save = {"belonging": response.save["belonging"], "city": response.save["city"], "district": district}
            self.crawl(link_url, callback=self.parse_detail, save=save)

    @config(age=60)
    def parse_detail(self, response):
        page = response.etree

        ## pagination: the "末页" (last page) link carries the largest curPage value
        tail_url = page.xpath(u"//a[@title='末页']/@href")[0]
        print(tail_url)
        page_num = int(re.findall(r'&curPage=(\d+)', tail_url)[0])
        print(page_num)

        for each in range(page_num + 1):  # curPage is 0-indexed, so include the last page as well
            page_url = response.url + '&curPage={}'.format(each)
            self.crawl(page_url, callback=self.parse_page, save=response.save)



    def parse_page(self, response):
        page = response.etree


        ## job listing entries
        contents = page.xpath("//ul[@class='sojob-list']/li")

        for each in contents:
            try:
                ## position name
                position_name = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/h3/a/text()")[0].strip()
                print(position_name)
                ## salary
                salary = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='condition clearfix']/span[@class='text-warning']/text()")[0]
                print(salary)
                ## education requirement
                education = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='condition clearfix']/span[@class='edu']/text()")[0]
                print(education)
                ## work-experience requirement
                experience = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='condition clearfix']/span[last()]/text()")[0]
                print(experience)
                ## publish date
                public_time = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='time-info clearfix']/time/@title")[0]
                public_time = ''.join(re.findall(r'\d+', public_time))  # keep digits only, e.g. '20180816'
                print(public_time)
                ## company name
                company = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='company-info nohover']/p[@class='company-name']/a/@title")[0]
                print(company)

                ## company industry
                company_belong = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='company-info nohover']/p[@class='field-financing']/span/a/text()")[0]
                print(company_belong)

                ## feedback time (how quickly the employer responds)
                time_delay = each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='job-info']/p[@class='time-info clearfix']/span/text()")[0]
                print(time_delay)

                ## benefits
                welfare = '-'.join(each.xpath("./div[@class='sojob-item-main clearfix']/div[@class='company-info nohover']/p[@class='temptation clearfix']/span/text()"))
                print(welfare)

                print('------------------------------')

                result = {"belonging": response.save["belonging"],
                        "city": response.save["city"],
                        "district": response.save["district"],
                        "position_name": position_name,
                        "salary": salary,
                        "education": education,
                        "experience": experience,
                        "public_time": self.format_date(public_time),
                        "company": company,
                        "update_time": datetime.datetime.now(),
                        "company_belong": company_belong,
                        "time_delay": time_delay,
                        "welfare": welfare
                       }

                yield result

            except IndexError:
                # a listing may be missing one of the fields above; skip it
                continue



    def on_result(self, result):
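        # pyspider calls on_result for every result yielded by a callback;
        # upserting on a compound key keeps re-crawls from inserting duplicates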
        if result is None:
            return

        update_key = {
                'position_name': result['position_name'],
                'public_time': result['public_time'],
                'city': result['city'],
                'district': result['district'],
                'company': result['company'],
                'belonging': result['belonging']
            }

        # pymongo's update() is deprecated; update_one() is the current equivalent
        col.update_one(update_key, {'$set': result}, upsert=True)
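To run it, start MongoDB and the proxy, then launch pyspider and open the web UI (a default install is assumed):

pyspider all    # starts the scheduler, fetcher, processor and webui
# then open http://localhost:5000 and run the hunting_recruit project from the dashboard

Once a few pages have been processed, a quick sanity check against MongoDB (assuming pymongo 3.7+ for count_documents):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
print(client['research']['hunting_recruit'].count_documents({}))  # stored postings so far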


Reposted from blog.csdn.net/qq_36653505/article/details/81781642