Scraping Lagou Job Postings with the pyspider Crawler Framework

Requirements
  1. Traverse the full directory of job categories.
  2. Enter each category and crawl, by city and district: position name, publish date, salary, required work experience, required education, hiring company, company industry, and funding round.
  3. Open each position's detail page and grab the HR stats: willingness to chat (response time), resume handling, and active hours.
Code

The code is annotated in detail, so I won't walk through it step by step. One caveat: if you don't have a proxy, use this with care, as your IP is easily banned.
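
Before launching the spider it is worth a quick check that the proxy is actually alive. Here is a minimal sketch using requests, assuming the same localhost:6666 proxy that crawl_config points at below:

import requests

## route one request through the proxy; any HTTP response at all means it is reachable
proxies = {"http": "http://localhost:6666", "https": "http://localhost:6666"}
try:
    resp = requests.get("https://www.lagou.com/", proxies=proxies, timeout=10)
    print("proxy ok, status:", resp.status_code)
except requests.RequestException as exc:
    print("proxy unreachable:", exc)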

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-17 14:49:15
# Project: lagou

from pyspider.libs.base_handler import *
import re
import datetime
from pymongo import MongoClient

DB_NAME = 'research'
DB_COL = 'lagou_recruit'
client = MongoClient()  ## connect to the local MongoDB instance
db = client[DB_NAME]
col = db[DB_COL]
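## every parsed posting is upserted into this collection by on_result at the bottom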


class Handler(BaseHandler):

    crawl_config = {
        "headers": {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
                  },
        "proxy": "http://localhost:6666"  ## 搭建的代理服务
    }
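    ## crawl_config is merged into every self.crawl request, so the browser
    ## UA and the proxy apply to the whole project.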

    url = 'https://www.lagou.com/'

    def format_date(self, date):
        return datetime.datetime.strptime(date, '%Y-%m-%d')

    def get_today(self):
        return datetime.datetime.strptime(datetime.datetime.now().strftime('%Y-%m-%d'), '%Y-%m-%d')

    @every(minutes=24 * 60)
    def on_start(self):
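        ## @every re-queues this seed once a day; @config(age=60) on the parsers
        ## below treats fetched pages as stale after 60 seconds, so they re-fetch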
        self.crawl(self.url, callback=self.index_page)



    @config(age=60)
    def index_page(self, response):
        page = response.etree

        ## job-category blocks
        cat_list = page.xpath("//div[@class='mainNavs']/div[@class='menu_box']/div[@class='menu_sub dn']/dl")
        ## iterate over each category block
        for each in cat_list:
            ## top-level category name
            b_title = each.xpath("./dt/span/text()")[0]
            print('-----------',b_title,'------------')

            ## sub-category links
            sub_list = each.xpath("./dd/a")
            for sub in sub_list:
                sub_title = sub.xpath("./text()")[0]
                link_url = sub.xpath("./@href")[0]
                print(sub_title,' ',link_url)
                save = {"belonging": sub_title}
                self.crawl(link_url, callback=self.parse_categories, save=save)


    @config(age=60)
    def parse_categories(self, response):
        page = response.etree

        base_url = "https://www.lagou.com/jobs/list_"

        ## city list
        city_list = page.xpath("//div[@class='details']/div[@class='has-more']/div[@class='more more-positions workPosition']/li/a")[1:-1] ## skip the first entry (全国, nationwide) and the last one

        ## iterate over the cities
        for each in city_list:
            city = each.xpath("./text()")[0]
            print(city)
            link_url = base_url + response.save["belonging"]
            params = {"px": "default",
                        "city": city
                     }

            save = {"belonging": response.save["belonging"], "city": city}

            self.crawl(link_url, callback=self.parse_city, params=params, save=save)


    @config(age=60)
    def parse_city(self, response):
        page = response.etree

        ## district list
        district_list = page.xpath("//div[@class='contents' and @data-type='district']/a")[1:] ## skip the leading "不限" (no preference) entry
        print(response.url)
        ## iterate over the districts
        for num,each in enumerate(district_list):
            district = each.xpath("./text()")[0]
            print(district)
            params = {
                        "district": district
                     }
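            ## pyspider derives the task ID from the URL, so identical URLs are
            ## deduplicated; the "#<num>" fragment makes each district a distinct
            ## task while never being sent to the server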
            link_url = response.url + "#%s" % num

            save = {"belonging": response.save["belonging"],
                    "city": response.save["city"],
                    "district": district
                   }

            self.crawl(link_url, callback=self.parse_district, params=params, save=save)


    @config(age=60)
    def parse_district(self, response):
        page = response.etree

        headers = {"Host": "www.lagou.com",          #### 不加这些请求头参数,数据请求不到
                    "Origin": "https://www.lagou.com",
                   "Referer": response.url,
                    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
                  }

        base_url = 'https://www.lagou.com/jobs/positionAjax.json'
        ## pagination: read the total page count from the listing page
        try:
            page_num = int(page.xpath("//div[@class='page-number']/span[@class='span totalNum']/text()")[0])
            print(page_num)
        except (IndexError, ValueError):
            return  ## no pagination block means no postings for this district

        for each in range(1,page_num+1):
            data = {"first": "false",
                    "pn": each,
                    "kd": response.save["belonging"]
                   }

            params = {"px": "default",
                        "city": response.save["city"],
                        "district": response.save["district"],
                        "needAddtionalResult": "false"
                     }

            ## same fragment trick as in parse_city: "#<page>" keeps each POSTed page a distinct task
            link_url = base_url + "#%s" % each

            self.crawl(link_url, callback=self.parse_page, params=params, method='POST', data=data, save=response.save, headers=headers)

    @config(age=60)        
    def parse_page(self, response):
        page = response.json

        base_url = 'https://www.lagou.com/jobs/{}.html'
        ## postings on this page of results
        contents = page["content"]["positionResult"]["result"]
        print(contents)

        ## iterate over the postings
        for each in contents:
            ## position name
            position_name = each["positionName"]
            print(position_name)
            ## publish time, possibly relative: "HH:MM" means today, "N天前" N days ago, "昨天" yesterday
            public_time = each["formatCreateTime"]
            print(public_time)
            if re.findall(r'\d+:\d+', public_time):
                public_time = datetime.datetime.now().strftime('%Y-%m-%d')
                print(public_time)
            if re.findall(r'(\d+)天前', public_time):
                delta = int(re.findall(r'(\d+)天前', public_time)[0])
                public_time = (datetime.datetime.now()+datetime.timedelta(days=-delta)).strftime('%Y-%m-%d')
                print(public_time)

            if '昨天' in public_time:
                public_time = (datetime.datetime.now()+datetime.timedelta(days=-1)).strftime('%Y-%m-%d')
                print(public_time)

            ## salary range
            salary = each["salary"]
            print(salary)
            ## required work experience
            experience = each["workYear"]
            print(experience)
            ## required education
            education = each["education"]
            print(education)
            ## hiring company
            company = each["companyFullName"]
            print(company)
            ## company industry
            company_belong = each["industryField"]
            print(company_belong)
            ## funding round
            rounds = each["financeStage"]
            print(rounds)
            ## benefits
            welfare = '-'.join(each["companyLabelList"])
            print(welfare)
            print('----------------------------------------')

            save = {"belonging": response.save["belonging"],
                    "city": response.save["city"],
                    "district": response.save["district"],
                     "position_name": position_name,
                     "public_time": public_time,
                     "salary": salary,
                     "experience": experience,
                     "education": education,
                     "company": company,
                     "company_belong": company_belong,
                     "rounds": rounds,
                     "welfare": welfare
                   }

            position_id = each["positionId"]
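            ## the numeric positionId maps straight onto the detail-page URL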
            link_url = base_url.format(position_id)

            self.crawl(link_url, callback=self.parse_detail, save=save)


    def parse_detail(self, response):
        page = response.etree

        try:
            ## HR responsiveness block on the detail page
            hr_info = page.xpath("//dd[@class='jd_publisher']/div/div[@class='publisher_data']")[0]

            ## willingness to chat (response time)
            chat_will = hr_info.xpath("./div[1]/span[@class='data']/text()")[0]
            print(chat_will)
            ## resume handling rate
            resume_processing = hr_info.xpath("./div[2]/span[@class='data']/text()")[0]
            print(resume_processing)
            ## active hours
            active_time = hr_info.xpath("./div[3]/span[@class='data']/text()")[0]
            print(active_time)

        except IndexError:
            ## the detail page has no publisher block; fall back to empty fields
            chat_will = ''
            resume_processing = ''
            active_time = ''


        result = {"belonging": response.save["belonging"],
                "city": response.save["city"],
                "district": response.save["district"],
                "position_name": response.save["position_name"],
                "public_time": self.format_date(response.save["public_time"]),
                "salary": response.save["salary"],
                "experience": response.save["experience"],
                "education": response.save["education"],
                "company": response.save["company"],
                "company_belong": response.save["company_belong"],
                "rounds": response.save["rounds"],
                "welfare": response.save["welfare"],
                  "chat_will": chat_will,
                  "resume_processing": resume_processing,
                  "active_time": active_time,
                  "update_time": datetime.datetime.now(),
                  "date": self.get_today()
               }

        yield result
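        ## pyspider iterates a generator callback and passes each yielded dict to on_result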




    def on_result(self, result):
        if result is None:
            return

        ## compound key: one MongoDB document per posting, updated in place on re-crawls
        update_key = {
                'position_name': result['position_name'],
                'public_time': result['public_time'],
                'city': result['city'],
                'district': result['district'],
                'company': result['company'],
                'belonging': result['belonging']
            }

        col.update_one(update_key, {'$set': result}, upsert=True)  ## update() is deprecated in pymongo 3+
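
Once the spider has run for a while, the data sits in MongoDB. A quick way to eyeball what was captured (a minimal sketch: the database and collection names match the constants at the top, and "北京" is just an example city):

from pymongo import MongoClient

client = MongoClient()
col = client['research']['lagou_recruit']

## ten most recently updated postings for one example city
for doc in col.find({"city": "北京"}).sort("update_time", -1).limit(10):
    print(doc["position_name"], doc["salary"], doc["company"])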

Reposted from blog.csdn.net/qq_36653505/article/details/81781211