# Crawler (5): scraping job data from Lagou (lagou.com)

import json
import urllib
from urllib import parse, request
import math

# HTTP request headers that mimic a browser AJAX call to Lagou.
# NOTE(review): the Cookie / SEARCH_ID values are session-specific and expire;
# they presumably must be refreshed from a live browser session — confirm.
# 'Content-Length' is deliberately NOT set here: the original hardcoded '25',
# which only matches the form body for kd='python' with a one-digit page
# number; urllib computes the correct length for each request automatically.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'user_trace_token=20180702155921-d3d20412-7dcd-11e8-bccb-525400f775ce; LGUID=20180702155921-d3d2078c-7dcd-11e8-bccb-525400f775ce; index_location_city=%E5%B9%BF%E5%B7%9E; JSESSIONID=ABAAABAAAIAACBI79C85F71B2CEC5CEF072374DD0B0E6BF; _gat=1; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1530518359,1530523033,1530578881; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1530578881; LGSID=20180703084805-bf1518ef-7e5a-11e8-98e2-5254005c3644; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_Java%3Fcity%3D%25E5%25B9%25BF%25E5%25B7%259E%26cl%3Dfalse%26fromSearch%3Dtrue%26labelWords%3D%26suginput%3D; LGRID=20180703084805-bf151b78-7e5a-11e8-98e2-5254005c3644; _ga=GA1.2.947372204.1530518359; _gid=GA1.2.1007997539.1530519627; SEARCH_ID=05d0d1e544af4a5e9c0dfe21533df3f9',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%B9%BF%E5%B7%9E&cl=false&fromSearch=true&labelWords=&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4482.400 QQBrowser/9.7.13001.400',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest'
}


def getPageNum(kw):
    """Return the number of result pages for search keyword *kw*.

    POSTs the keyword to Lagou's position-search JSON endpoint (city is
    fixed to Guangzhou via the URL query string) and derives the page
    count from the reported total job count and per-page size.

    Raises KeyError if the response JSON lacks the expected structure
    (e.g. when Lagou's anti-crawler page is returned instead of data).
    """
    # Search endpoint; city=%E5%B9%BF%E5%B7%9E is URL-encoded "广州" (Guangzhou).
    url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&city=%E5%B9%BF%E5%B7%9E&needAddtionalResult=false'

    # Form payload expected by the endpoint; pn is the page number.
    form = {
        'first': 'true',
        'pn': '1',
        'kd': kw
    }
    body = parse.urlencode(form).encode('utf-8')
    # Supplying data= makes this a POST request.
    req = request.Request(url, data=body, headers=headers)
    raw = request.urlopen(req).read().decode('utf-8')
    result = json.loads(raw)

    # Total matching positions, as reported by the API.
    jobnum = result['content']['positionResult']['totalCount']
    print(jobnum)
    # Positions per page, as reported by the API.
    pagesize = result['content']['pageSize']
    print(pagesize)
    # math.ceil already returns an int in Python 3, so no int() cast needed.
    totalpage = math.ceil(jobnum / pagesize)
    print(totalpage)
    return totalpage


def getJobInfo(kw, pagenum):
    """Fetch job postings for keyword *kw* over pages 1..pagenum.

    Each posting is printed to stdout and appended as a stringified tuple
    (one per line) to 'pythonJob.txt' in the current directory.
    """
    # Same search endpoint as getPageNum (city fixed to Guangzhou).
    url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&city=%E5%B9%BF%E5%B7%9E&needAddtionalResult=false'
    for page in range(1, pagenum + 1):
        form = {
            'first': 'true',
            'pn': page,
            'kd': kw
        }

        body = parse.urlencode(form).encode('utf-8')

        req = request.Request(url, data=body, headers=headers)  # POST request
        raw = request.urlopen(req).read().decode('utf-8')

        data = json.loads(raw)

        joblist = data['content']['positionResult']['result']

        # Open the output file once per page rather than once per job;
        # the with-block closes (and flushes) it automatically.
        with open('pythonJob.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            for job in joblist:
                city = job['city']
                companyFullName = job['companyFullName']
                # BUG FIX: the original assigned the literal list
                # ['companyLabelList'] instead of reading the field.
                companyLabelList = job['companyLabelList']
                companySize = job['companySize']
                district = job['district']
                education = job['education']
                firstType = job['firstType']
                hitags = job['hitags']
                positionAdvantage = job['positionAdvantage']
                # 'positionLables' is the (misspelled) key the API actually uses.
                positionLables = job['positionLables']

                record = (city, companyFullName, companyLabelList, companySize,
                          district, education, firstType, hitags,
                          positionAdvantage, positionLables)
                print(*record)
                f.write(str(record) + '\n')


if __name__ == '__main__':
    # Find out how many result pages exist for the keyword, then scrape them all.
    keyword = 'python'
    pages = getPageNum(keyword)
    getJobInfo(keyword, pages)

# Reposted from blog.csdn.net/yx1179109710/article/details/80899944