用python爬取拉勾网招聘信息并以CSV文件存储

爬取拉勾网招聘信息

1、在网页源代码中搜索招聘信息,并没有搜到,由此判断网页信息是通过 Ajax 动态加载的
在这里插入图片描述

2、查看网页中所需的数据信息,返回的是JSON数据;
在这里插入图片描述

3、条件为北京+数据分析师的公司一共40087家,而实际拉勾网展示的数据只有
15条/页 * 30页 = 450条,所以需要判断想要展示的数据是否能在30页内展示完成,超过30页则只获取前30页数据。
在这里插入图片描述

4、获取请求头与Form Data数据
在这里插入图片描述

5、将数据以CSV文件存储(首先创建文件,可以最后将表头信息写入)

with open('lagou.csv', 'w', newline = '', encoding='utf-8') as csvfile:
    fieldnames = ['businessZones', 'companyFullName', 'companyLabelList', 'companyShortName', 'companySize', 'district',
                  'education', 'financeStage', 'firstType', 'industryField', 'industryLables', 'linestaion',
                  'positionAdvantage', 'positionName', 'publisherId', 'salary', 'secondType', 'stationname', 'workYear']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

6、源代码展示

import json
import requests
import math
import time
import csv

# Request headers copied from a logged-in browser session.  Lagou's Ajax
# endpoint rejects requests that lack a valid Cookie / Referer /
# X-Requested-With combination, so all of these must be sent verbatim.
# NOTE: the original source had a stray duplicated "labelWords=..." fragment
# after the Referer string (a copy/paste error that made the file a syntax
# error); the Referer was also missing the '?' before its query string.
headers = {
    'Cookie': 'LGUID=20160325221916-8e713da1-f294-11e5-baa9-5254005c3644; __guid=237742470.2209186392686119200.1542463319285.1892; WEBTJ-ID=20181117220200-16721fa777529f-063fc0ee1ebcef-5768397b-1049088-16721fa77761fb; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542463323; _ga=GA1.2.1018218803.1542463324; _gid=GA1.2.545935771.1542463324; user_trace_token=20181117220345-9917d84f-ea71-11e8-892e-5254005c3644; LGSID=20181117220345-9917db98-ea71-11e8-892e-5254005c3644; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D1%26rsv_idx%3D1%26tn%3Dbaidu%26wd%3Dlagouwang%26oq%3D%252526lt%25253BSDN%2525E5%25258F%252591%2525E5%2525B8%252583%2525E7%25259A%252584%2525E5%25258D%25259A%2525E5%2525AE%2525A2%2525E5%25258F%2525AF%2525E4%2525BB%2525A5%2525E4%2525BF%2525AE%2525E6%252594%2525B9%26rsv_pq%3Ded43d71700033d83%26rsv_t%3D43b1GYsCSHSQp1N%252FIp1eR1J3VXskMjt44RcbJkSNM8%252BbE%252Fc4aKUjcI%252FhflA%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_sug3%3D10%26rsv_sug1%3D3%26rsv_sug7%3D100%26rsv_sug2%3D0%26inputT%3D25206156%26rsv_sug4%3D25206157; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; JSESSIONID=ABAAABAAADEAAFI6E3B7886F4C8194B687AAD66C7925F67; index_location_city=%E5%85%A8%E5%9B%BD; SEARCH_ID=96d9378ed73e4c278dc3c4b140ecebaf; LGRID=20181117220435-b693f2da-ea71-11e8-a49f-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1542463375; monitor_count=7',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88?labelWords=sug&fromSearch=true&suginput=shujufenxishi',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    # Lagou really does spell these headers "Anit" — do not "fix" the typo.
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest',
}


# Create lagou.csv and write the header row once; scraped rows are appended
# later by write_to_file().  newline='' is required by the csv module docs —
# without it every data row is followed by a blank line on Windows.  The
# utf-8 encoding must match the append-mode writes so the file stays in a
# single consistent encoding.
fieldnames = ['businessZones', 'companyFullName', 'companyLabelList', 'companyShortName', 'companySize', 'district',
              'education', 'financeStage', 'firstType', 'industryField', 'industryLables', 'linestaion',
              'positionAdvantage', 'positionName', 'publisherId', 'salary', 'secondType', 'stationname', 'workYear']
with open('lagou.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

# Ask the Ajax endpoint how many postings match, then crawl the pages.
def get_page(url, params):
    """Query the total hit count for *params* and crawl the result pages.

    Lagou paginates at 15 postings per page but only lets visitors browse
    the first 30 pages, so the page count is capped at 30.
    """
    response = requests.post(url, data=params, headers=headers)
    payload = json.loads(response.text)
    total_count = payload['content']['positionResult']['totalCount']
    pages = min(math.ceil(total_count / 15), 30)
    get_info(url, pages)


def get_info(url, page):
    """Fetch job postings page by page and append each record to lagou.csv.

    url  -- the positionAjax.json endpoint (city fixed in its query string)
    page -- number of result pages to request, starting from page 1

    Fixes over the original: the except clause only caught
    requests.RequestException, so the ValueError raised by json.loads when
    Lagou's anti-crawler returns an HTML error page — the most common failure
    — crashed the whole run, as did a KeyError for postings missing optional
    fields; failures were also silently swallowed with `pass`.
    """
    # Keep in sync with the header row written at module start.
    fields = ['businessZones', 'companyFullName', 'companyLabelList',
              'companyShortName', 'companySize', 'district', 'education',
              'financeStage', 'firstType', 'industryField', 'industryLables',
              'linestaion', 'positionAdvantage', 'positionName', 'publisherId',
              'salary', 'secondType', 'stationname', 'workYear']
    for pn in range(1, page + 1):
        params = {
            'first': 'true' if pn == 1 else 'false',  # only page 1 sends true
            'pn': str(pn),          # page number, sent as a string
            'kd': '数据分析师'       # search keyword (job title)
        }
        try:
            html = requests.post(url, data=params, headers=headers)
            json_data = json.loads(html.text)
            results = json_data['content']['positionResult']['result']
            for result in results:
                # .get() tolerates postings that omit optional fields
                # (e.g. businessZones is sometimes absent).
                infos = {field: result.get(field) for field in fields}
                print('-------------')
                print(infos)
                write_to_file(infos)  # append this record to lagou.csv
            time.sleep(2)  # throttle requests to reduce anti-crawler blocking
        except (requests.RequestException, ValueError, KeyError) as err:
            # Best-effort: report the failed page and continue with the next.
            print('page {} failed: {}'.format(pn, err))


# Append one scraped record to the lagou.csv created at module start.
def write_to_file(content):
    """Append a single job record to lagou.csv.

    content -- dict whose keys match the CSV header fields.

    Fixes over the original: the append-mode open omitted encoding='utf-8',
    so on platforms with a non-UTF-8 default codec the data rows were written
    in a different encoding than the utf-8 header (corrupting the file); the
    explicit csvfile.close() inside the `with` block was redundant; and the
    field list is now local instead of relying on a module-level global.
    """
    fieldnames = ['businessZones', 'companyFullName', 'companyLabelList',
                  'companyShortName', 'companySize', 'district', 'education',
                  'financeStage', 'firstType', 'industryField',
                  'industryLables', 'linestaion', 'positionAdvantage',
                  'positionName', 'publisherId', 'salary', 'secondType',
                  'stationname', 'workYear']
    with open('lagou.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow(content)

# Entry point: crawl the Beijing "数据分析师" postings.  The city filter is
# encoded in the URL query string (city=%E5%8C%97%E4%BA%AC is "北京");
# the form data below describes the first search request.
if __name__ == '__main__':
    ajax_url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false'
    first_request = {
        'first': 'true',
        'pn': '1',
        'kd': '数据分析师',
    }
    get_page(ajax_url, first_request)

猜你喜欢

转载自blog.csdn.net/qq_43033513/article/details/84197358