# 爬取招聘网站 (Scraping a recruitment website)

# coding: utf-8

import requests
import re
import xlwt



class ZLZP(object):
    """Crawler for Zhaopin (sou.zhaopin.com) job-search result pages.

    Prompts for a job keyword and up to five cities, walks every result page
    by following the "next page" link, and writes the extracted rows (job
    title, company, minimum/maximum salary, location) to an ``.xls`` workbook.

    Example search URL:
    http://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京%2B上海%2B广州%2B深圳%2B杭州&kw=python

    The class is written to run unchanged on both Python 2 and Python 3.

    Attributes:
        html: Decoded source of the most recently downloaded page.
        nextLink: URL of the next page to download, or ``None`` when the
            last page has been reached.
        filename: Unused placeholder kept for backward compatibility.
        headers: HTTP headers sent with every request.
        result_data: One list of ``(title, company, salary, location)``
            tuples per crawled page.
        kw: Job keyword typed by the user (set by ``get_index_url``).
    """

    # Template of the first result page: (cities joined by %2B, keyword).
    SEARCH_URL = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%s&kw=%s'
    # Upper bound on the number of cities; the prompt text also says 5.
    MAX_CITIES = 5
    # Seconds to wait for the server before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    # Compiled once at class creation instead of once per page / per row.
    NEXT_LINK_PATTERN = re.compile(
        r'<li class="pagesDown.*?<a.*?href="(.*?)" class="next-page">',
        re.S)
    JOB_PATTERN = re.compile(
        r'<table.*?class="newlist.*?<td class="zwmc.*?<a.*?>(.*?)</a>'
        r'.*?<td class="gsmc.*?<a.*?>(.*?)</a>'
        r'.*?<td class="zwyx">(.*?)</td>'
        r'.*?<td class="gzdd">(.*?)</td>',
        re.S)
    # Matches markup left inside a job title: "&nbsp;" and any HTML tag.
    TAG_PATTERN = re.compile(r'&nbsp;|<.*?>')

    def __init__(self):
        self.html = ''
        self.nextLink = ''
        self.filename = ''
        # Browser-like request header sent with every request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
        self.result_data = []

    @staticmethod
    def _prompt(message):
        """Show *message* and return the line typed by the user.

        Uses ``raw_input`` on Python 2 and ``input`` on Python 3.
        """
        try:
            reader = raw_input  # Python 2
        except NameError:
            reader = input  # Python 3
        return reader(message)

    @staticmethod
    def _to_text(value, encoding='utf-8'):
        """Return *value* as text, decoding it first when it is a byte string."""
        if isinstance(value, bytes):
            return value.decode(encoding)
        return value

    @staticmethod
    def _split_salary(zwyx):
        """Split a ``"min-max"`` salary string into ``(min, max)``.

        A value without ``-`` is reported as negotiable ('面议') for both
        bounds.
        """
        if '-' in zwyx:
            parts = zwyx.split('-')
            return parts[0], parts[1]
        return '面议', '面议'

    def get_html(self):
        """Download ``self.nextLink`` and store the page text in ``self.html``.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the server does not answer within ``REQUEST_TIMEOUT``.
        """
        response = requests.get(self.nextLink, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        # Decode once at the I/O boundary so the str regexes below work on
        # Python 3 as well. The rest of the class already treats the scraped
        # data as UTF-8 (see the workbook encoding in save_data).
        self.html = response.content.decode('utf-8', 'replace')

    def get_index_url(self):
        """Ask for the keyword and cities and build the first-page URL.

        Stores the keyword in ``self.kw`` and the URL in ``self.nextLink``.
        City input stops at ``MAX_CITIES`` entries or when the user types 0.
        """
        self.kw = self._prompt('请输入要爬取的职位名称：')
        city_list = []
        while len(city_list) < self.MAX_CITIES:
            city = self._prompt('请输入要查询的城市(最多5个，输入0结束：)：')
            if city == '0':
                break
            city_list.append(city)
        # The site expects the cities separated by an encoded "+" (%2B).
        citys = '%2B'.join(city_list)
        self.nextLink = self.SEARCH_URL % (citys, self.kw)

    def get_nextLink(self):
        """Set ``self.nextLink`` from ``self.html``.

        It becomes the "next page" href found in the page, or ``None`` when
        the page has no such link (i.e. the last page was reached).
        """
        found = self.NEXT_LINK_PATTERN.search(self.html)
        self.nextLink = found.group(1) if found else None

    def get_result_data(self):
        """Crawl every result page, starting at ``self.nextLink``.

        For each page the matching job rows are appended to
        ``self.result_data`` as one list, then the next-page link is
        followed until none is left.
        """
        count = 0
        while self.nextLink:
            count += 1
            # Parenthesised single argument prints identically on 2 and 3.
            print('正在爬取第%s页数据.....' % count)
            self.get_html()
            self.result_data.append(self.JOB_PATTERN.findall(self.html))
            self.get_nextLink()

    def save_data(self):
        """Write all collected rows to ``<kw>职位爬取数据.xls``.

        Row 0 holds the column titles; job rows follow from row 1. HTML
        remnants are stripped from the job title only, and the salary is
        split into minimum and maximum columns.
        """
        workbook = xlwt.Workbook(encoding='utf-8')
        # The keyword is a byte string on Python 2 and must be decoded
        # before it is mixed into the unicode sheet/file names.
        keyword = self._to_text(self.kw)
        sheet = workbook.add_sheet(u'%s职位表' % keyword)

        titles = ('职位名称', '公司名称', '职位月薪(最低)', '职位月薪(最高)', '工作地点')
        for col, title in enumerate(titles):
            sheet.write(0, col, title)

        row = 1
        for page in self.result_data:
            for detail in page:
                zwmc = self.TAG_PATTERN.sub('', detail[0])
                min_money, max_money = self._split_salary(detail[2])
                cells = (zwmc, detail[1], min_money, max_money, detail[3])
                for col, value in enumerate(cells):
                    sheet.write(row, col, value)
                row += 1

        workbook.save(u'%s职位爬取数据.xls' % keyword)

    def start(self):
        """Run the whole pipeline: prompt, crawl all pages, save the workbook."""
        self.get_index_url()
        # get_result_data() downloads every page itself, the first one
        # included, so a separate get_html() call here would only fetch the
        # first page twice.
        self.get_result_data()
        self.save_data()

# Script entry point: create the crawler and run it. start() prompts for the
# keyword and cities, crawls the result pages and saves the workbook.
zlzp = ZLZP()
zlzp.start()
# 猜你喜欢 (You may also like)