# -*- coding: utf-8 -*-
__author__ = '木之易'
__date__ = '2018/8/9 10:17'
import re
from urllib import request, parse
class JobSpider(object):
    """Interactive crawler that prints job listings from 51job search pages.

    The spider asks for a keyword, downloads the matching search-result
    pages with ``urllib`` and extracts the listing rows with regular
    expressions.

    Attributes:
        url: URL of the first results page, set by :meth:`get_url`.
        headers: HTTP headers sent with every request.
        html: Decoded source of the most recently downloaded page.
        title: Contents of the ``<title>`` tag of the last analysed page.
        total: Number of result pages reported by the last analysed page.
        job_type: Search keyword entered by the user.
        job_page: Unused by the methods of this class; kept for
            compatibility with code that may read it.
    """

    # Area filter placed in the first path segment of the search URL.
    # ``%252C`` is a double-encoded comma and is kept exactly as it appeared
    # in the original paging URL. The same value is used for the page-count
    # request and for the paging requests so that ``total`` describes the
    # query that is actually crawled.
    AREA_CODES = '170200%252C020000%252C030200'

    # Single template shared by every request. The query parameter is spelled
    # ``&degreefrom``: the previous literal contained a degree sign
    # (``&deg`` rendered as an HTML entity), which is not ASCII and therefore
    # cannot be sent in a request line by ``urllib``.
    _SEARCH_URL = (
        'https://search.51job.com/list/{area},000000,0000,00,9,99,'
        '{keyword},2,{page}.html'
        '?lang=c&stype=&postchannel=0000&workyear=99&cotype=99'
        '&degreefrom=99&jobterm=99&companysize=99&providesalary=99'
        '&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType='
        '&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    )

    # Patterns are compiled once instead of on every call.
    _TITLE_RE = re.compile(r'<title>(.*?)</title>', re.S)
    # Capture digits only so that int() below can never fail on stray markup.
    _TOTAL_RE = re.compile(r'<span.*?class="td">共\s*(\d+)\s*页', re.S)
    _ROW_RE = re.compile(
        r'<p.*?class="t1.*?<a.*?_blank.*?="(.*?)".*?class="t2".*?'
        r'title="(.*?)".*?t3">(.*?)<.*?t4">(.*?)<.*?t5">(.*?)<',
        re.S,
    )

    def __init__(self):
        self.url = ''
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}
        self.html = ''
        self.title = ''
        self.total = 0
        self.job_type = ''
        self.job_page = ''

    def _build_url(self, page):
        """Return the search URL for results page ``page`` of ``job_type``."""
        return self._SEARCH_URL.format(
            area=self.AREA_CODES,
            # quote_plus is what urlencode applies to each value, so the
            # encoded keyword is the same as before without the split hack.
            keyword=parse.quote_plus(str(self.job_type)),
            page=page,
        )

    def get_url(self):
        """Store the URL of the first results page in ``self.url``."""
        self.url = self._build_url(1)

    def get_html(self, url):
        """Download ``url`` and store the decoded page in ``self.html``.

        :param url: Address of the page to fetch.
        :return: None
        :raises urllib.error.URLError: propagated from ``urlopen`` when the
            request fails.
        """
        req = request.Request(url=url, headers=self.headers)
        # The context manager closes the connection even if read() raises.
        with request.urlopen(req) as response:
            raw = response.read()
        # Bytes that are not valid gb18030 are dropped rather than raising.
        self.html = raw.decode('gb18030', 'ignore')

    def get_title_total(self):
        """Extract the page title and the page count from ``self.html``.

        ``self.title`` and ``self.total`` are reset to ``''`` and ``0`` when
        the corresponding pattern is not found, so values left over from an
        earlier page are never reused.

        :return: None
        """
        title_match = self._TITLE_RE.search(self.html)
        self.title = title_match.group(1) if title_match else ''

        total_match = self._TOTAL_RE.search(self.html)
        self.total = int(total_match.group(1)) if total_match else 0

    def parse_html(self):
        """Print every listing row found in ``self.html``.

        :return: The list of matched rows; each row is a tuple holding the
            five captured groups of the listing pattern, in match order.
        """
        results = self._ROW_RE.findall(self.html)
        for row in results:
            print(row)
        return results

    def run(self):
        """Ask for a keyword, then download and print every results page.

        The first page is fetched once to read the title and page count and
        is fetched again as page 1 of the paging loop.
        """
        print('启动爬虫')
        self.job_type = input("请输入编程语言:")
        self.get_url()
        self.get_html(self.url)
        self.get_title_total()
        print('正在爬取:{},共{}页,请稍后.....'.format(self.title, self.total))
        # Walk through every page reported by the first request.
        for job_page in range(1, self.total + 1):
            print('****************************************')
            print('...正在下载第{}页...请稍后...'.format(job_page))
            print('****************************************')
            self.get_html(url=self._build_url(job_page))
            self.parse_html()
        print('信息采集完毕')
if __name__ == '__main__':
    # Start the interactive crawl only when this file is executed as a
    # script, not when it is imported as a module.
    spider = JobSpider()
    spider.run()
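# Source article: "One-click keyword search for 51job job posting details (Python regular expressions)"
# [blog page residue] You may also like
# Reposted from blog.csdn.net/A_fool_Program_ape/article/details/81569788
# [blog page residue] Today's recommendations
# [blog page residue] Weekly ranking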