# One-click keyword search of 51job job-posting details (Python regular expressions)

# -*- coding: utf-8 -*-
__author__ = '木之易'
__date__ = '2018/8/9 10:17'

import re
from urllib import request, parse


class JobSpider(object):
    """Crawl 51job search results for a user-supplied keyword.

    ``run()`` reads a keyword from stdin, downloads every result page and
    extracts job rows (link, company, location, salary, date) with regular
    expressions, printing each row as it is found.
    """

    def __init__(self):
        # URL of the first results page; filled in by get_url().
        self.url = ''
        # Desktop-browser User-Agent so the site serves the normal HTML page.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}
        # Raw source of the most recently downloaded page.
        self.html = ''
        # <title> text of the first results page.
        self.title = ''
        # Total number of result pages reported by the site.
        self.total = 0
        # Search keyword typed by the user.
        self.job_type = ''
        # NOTE(review): never assigned after __init__; kept for compatibility.
        self.job_page = ''

    def get_url(self):
        """Build the first-page search URL from self.job_type into self.url."""
        # quote_plus() is exactly the transformation urlencode() applies to a
        # value; this replaces the old urlencode({...}).split('=')[-1] hack.
        keyword = parse.quote_plus(self.job_type)
        # NOTE(review): this base URL uses location code 170200 only, while
        # the paging URL in run() uses 170200%252C020000%252C030200 — confirm
        # whether the mismatch is intentional.
        self.url = 'https://search.51job.com/list/170200,000000,0000,00,9,99,{},2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format(keyword)

    def get_html(self, url):
        """Download *url* and store the decoded page source in self.html.

        :param url: absolute URL of the page to fetch
        """
        req = request.Request(url=url, headers=self.headers)
        # Use a context manager so the connection is closed deterministically
        # instead of being leaked (the original never closed the response).
        with request.urlopen(req) as response:
            # The site serves GBK-family text; gb18030 is its superset and
            # 'ignore' drops the occasional undecodable byte.
            self.html = response.read().decode('gb18030', 'ignore')

    def get_title_total(self):
        """Extract the page <title> and the total page count from self.html.

        Leaves self.title / self.total untouched when a pattern does not match
        (so self.total stays 0 and run() crawls nothing).
        """
        # Page title.
        tit_res = re.search(r'<title>(.*?)</title>', self.html, re.S)
        if tit_res:
            self.title = tit_res.group(1)
        # Total page count ("共N页" in the pager span).
        page_res = re.search(r'<span.*?class="td">共(.*?)页', self.html, re.S)
        if page_res:
            self.total = int(page_res.group(1))

    def parse_html(self):
        """Extract job rows from self.html, print each one and return them.

        :return: list of (link, company, location, salary, date) tuples
        """
        pattern = re.compile(r'<p.*?class="t1.*?<a.*?_blank.*?="(.*?)".*?class="t2".*?title="(.*?)".*?t3">(.*?)<.*?t4">(.*?)<.*?t5">(.*?)<', re.S)
        results = pattern.findall(self.html)
        for row in results:
            print(row)
        # Returning the rows is new (was None) and backward-compatible.
        return results

    def run(self):
        """Interactive entry point: ask for a keyword, then crawl every page."""
        print('启动爬虫')
        self.job_type = input("请输入编程语言:")
        self.get_url()

        # First page: establishes the title and the total page count.
        self.get_html(self.url)
        self.get_title_total()
        print('正在爬取:{},共{}页,请稍后.....'.format(self.title, self.total))
        # Quote the keyword once, outside the loop (it is loop-invariant).
        keyword = parse.quote_plus(self.job_type)
        # Crawl every result page.
        for job_page in range(1, self.total + 1):
            print('****************************************')
            print('...正在下载第{}页...请稍后...'.format(job_page))
            print('****************************************')
            # Build the full URL of this page.
            url = "https://search.51job.com/list/170200%252C020000%252C030200,000000,0000,00,9,99,{},2,{}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=".format(
                keyword, job_page)
            # Download the page source.
            self.get_html(url=url)

            # Extract and print the job rows.
            self.parse_html()
        print('信息采集完毕')

if __name__ == '__main__':
    # Script entry point: build the spider and start the interactive crawl.
    spider = JobSpider()
    spider.run()

# Source: https://blog.csdn.net/A_fool_Program_ape/article/details/81569788