# coding=utf-8
"""Print Python job postings scraped from Lagou's job-search AJAX endpoint.

Lagou loads its search results via AJAX. Watching the browser's
Network > XHR panel shows that paging is driven by the ``pn`` field and that
the request goes to https://www.lagou.com/jobs/positionAjax.json, so this
script builds one parameter set per page, fetches the JSON, and prints a few
fields of every posting.

The server answers with a "you are operating too frequently, please try again
later" message when it detects a crawler. Using an IP proxy alone did not fix
that; sending a complete set of browser headers did.
"""
import requests
import json
import time

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Pause (seconds) used to throttle the crawl.
REQUEST_DELAY = 2

# Browser-like headers. The Cookie value is a captured browser session and
# will need to be refreshed once it expires.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Cookie': 'user_trace_token=20180512170100-8f629b1f-6c80-4493-b647-9ba5f2de2ba6; _ga=GA1.2.1959989744.1526115547; '
              'LGUID=20180512170100-fe0ecd62-55c2-11e8-8221-5254005c3644; '
              'JSESSIONID=ABAAABAAAGGABCBD9F9956003C9898101464EA7EC566ABC; _gid=GA1.2.1865548297.1526385869; '
              'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526115548,1526385869; index_location_city=%E5%8C%97%E4%BA%AC; '
              'TG-TRACK-CODE=search_code; _gat=1; LGSID=20180516134636-7f1dbdf3-58cc-11e8-aaa5-525400f775ce; '
              'PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2F; '
              'PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue'
              '%26suginput%3D; LGRID=20180516134636-7f1dbf88-58cc-11e8-aaa5-525400f775ce; '
              'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526449493; SEARCH_ID=96e947ca34b649c2a8aa83901fb16c49',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
}

# IP proxy.
# NOTE(review): only the 'http' scheme is mapped. requests picks a proxy by
# the URL's scheme, so the https:// endpoint used below is not routed through
# this proxy - add an 'https' entry if proxying is actually wanted.
proxie = {
    'http': 'http://61.135.217.7:80',
}


def get_info(url, params):
    """Fetch one page of search results and print each posting.

    Args:
        url: Address of the position-search JSON endpoint.
        params: Query parameters for the request; the ``pn`` entry selects
            the page.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        RuntimeError: The JSON body does not contain
            ``content.positionResult.result`` (for example when the server
            returns its "operating too frequently" message instead of data).
    """
    response = requests.get(
        url,
        proxies=proxie,
        headers=headers,
        params=params,
        timeout=REQUEST_TIMEOUT,  # never block forever on a stalled connection
    )
    response.raise_for_status()
    json_data = json.loads(response.text)

    # The postings live where the browser's Network > XHR > Preview shows them.
    try:
        results = json_data['content']['positionResult']['result']
    except (KeyError, TypeError):
        # Surface what the server actually said instead of a bare KeyError.
        raise RuntimeError(
            'Unexpected response for params {!r}: {!r}'.format(params, json_data)
        )

    for result in results:
        infos = {
            'companyFullName': result['companyFullName'],  # company name
            'district': result['district'],  # district
            'linestaion': result['linestaion'],  # detailed location (key spelled as in the API response)
            'firstType': result['firstType'],  # job category
            'salary': result['salary'],  # salary
            'workYear': result['workYear'],  # required years of experience
        }
        print(infos)
        # NOTE(review): the pause follows each printed record, matching the
        # statement order of the script as published; confirm whether a
        # single pause per page was intended.
        time.sleep(REQUEST_DELAY)


if __name__ == '__main__':
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    # Crawl result pages 1 through 9.
    for pn in range(1, 10):
        params = {
            'first': 'true',
            'pn': str(pn),
            'kd': 'Python',
        }
        get_info(url, params)
Python爬取拉勾网
猜你喜欢
转载自blog.csdn.net/qq_18525247/article/details/80342609
今日推荐
周排行