1.信息获取,所需工具:拉勾网、Python3。 原来课程地址:python拉勾网爬虫
反爬:伪造浏览器请求
'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1'
本质是http请求,json也是请求,找到正确url(小心post,get请求)
多观察,对获取数据检查是否符合要求。
编码问题: 小心写入方式(追加,覆盖)
with open ('lagou.json','ab+') as fp: fp.write(line.encode('utf-8'))
哈哈,发现大家挺喜欢的:上代码
getdata.py
# Required libraries
import requests  # HTTP requests
from bs4 import BeautifulSoup  # HTML parsing
import json
import time
import numpy as np
import pandas as pd
import openpyxl

# Browser-mimicking request headers (anti-scraping countermeasure).
# NOTE(review): the Cookie value is a captured logged-in session and will expire.
headers = {
    'Cookie': '_ga=GA1.2.2046537735.1519346482; user_trace_token=20180223084123-462157d6-1832-11e8-8df7-525400f775ce; LGUID=20180223084123-46215dc1-1832-11e8-8df7-525400f775ce; LG_LOGIN_USER_ID=a74eb645299f49ec2b1f0f98d8f27071b23ad1b8c3e4a22f; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=%E4%B8%8A%E6%B5%B7; WEBTJ-ID=20180417084813-162d112cf2b4d9-0300e63ea642e6-4545092c-2073600-162d112cf2c306; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1523407416,1523407491,1523926094; _gid=GA1.2.1279679635.1523926094; LGSID=20180417084813-0203736a-41d9-11e8-b8a4-5254005c3644; PRE_UTM=m_cf_cpt_sogou_pc; PRE_HOST=www.sogou.com; PRE_SITE=https%3A%2F%2Fwww.sogou.com%2Fsogou%3Fquery%3D%25C0%25AD%25B9%25B4%26_asf%3Dwww.sogou.com%26_ast%3D1523926090%26w%3D01019900%26p%3D40040100%26pid%3Dsogou-site-c02d0450cdd75ce7%26sut%3D1050%26sst0%3D1523926089994%26lkt%3D0%252C0%252C0; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_sogou_pc; _putrc=6EA73BBBB51DEF7E; JSESSIONID=ABAAABAAADEAAFI11A5943D5AD8FA9FDAB240DFAD660213; login=true; unick=%E5%86%AF%E7%AB%B9%E5%90%9B; hasDeliver=134; gate_login_token=cc1c59fd3fc91706eb01534899470d38000e54a63b0db428; TG-TRACK-CODE=index_search; LGRID=20180417084823-0861f794-41d9-11e8-88dc-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1523926106; SEARCH_ID=6be8a0349e5e4993a98387489af5cc6c',
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
    'Origin': 'https://www.lagou.com',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1',
}

# Accumulates job postings collected across all pages.
positions = []
# Position listing crawler
def main():
    """Scrape the Lagou job-listing AJAX endpoint page by page.

    For each result page: POST the search form, extract the result list
    from the JSON payload, append it to ``lagou.json`` (one JSON document
    per line) and accumulate the records in the module-level ``positions``
    list. Uses the module-level ``headers`` for the request.
    """
    url = ('https://www.lagou.com/jobs/positionAjax.json'
           '?px=default&city=%E6%88%90%E9%83%BD&needAddtionalResult=false')
    for page in range(1, 3):  # how many pages to crawl
        # POST form parameters expected by the endpoint.
        data = {
            'first': 'true',
            'pn': page,
            'kd': '数据分析',
        }
        result = requests.post(url, headers=headers, data=data)
        json_result = result.json()  # parse JSON body into a dict
        page_position = json_result['content']['positionResult']['result']
        print(page_position)
        # Persist each page as one JSON document per line (JSON Lines).
        # The original appended raw bytes with no separator ('ab+'), which
        # produced a concatenated file no JSON parser could read back.
        line = json.dumps(page_position, ensure_ascii=False)
        with open('lagou.json', 'a', encoding='utf-8') as fp:
            fp.write(line + '\n')
        positions.extend(page_position)  # flatten all pages into one list
        time.sleep(3)  # be polite: throttle requests between pages
    # Optional: export the accumulated records to Excel.
    # df = pd.DataFrame(data=positions)
    # df.to_excel('拉勾网数据分析职位.xlsx', index=False)


if __name__ == '__main__':  # script entry point
    main()
get_detail.py
# Required libraries
import requests  # HTTP requests
from bs4 import BeautifulSoup  # HTML parsing
import json
import time
import numpy as np
import pandas as pd
import openpyxl

# Browser-mimicking request headers (anti-scraping countermeasure).
# NOTE(review): the Cookie value is a captured logged-in session and will expire.
headers = {
    'Cookie': '_ga=GA1.2.2046537735.1519346482; user_trace_token=20180223084123-462157d6-1832-11e8-8df7-525400f775ce; LGUID=20180223084123-46215dc1-1832-11e8-8df7-525400f775ce; LG_LOGIN_USER_ID=a74eb645299f49ec2b1f0f98d8f27071b23ad1b8c3e4a22f; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=%E4%B8%8A%E6%B5%B7; WEBTJ-ID=20180417084813-162d112cf2b4d9-0300e63ea642e6-4545092c-2073600-162d112cf2c306; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1523407416,1523407491,1523926094; _gid=GA1.2.1279679635.1523926094; LGSID=20180417084813-0203736a-41d9-11e8-b8a4-5254005c3644; PRE_UTM=m_cf_cpt_sogou_pc; PRE_HOST=www.sogou.com; PRE_SITE=https%3A%2F%2Fwww.sogou.com%2Fsogou%3Fquery%3D%25C0%25AD%25B9%25B4%26_asf%3Dwww.sogou.com%26_ast%3D1523926090%26w%3D01019900%26p%3D40040100%26pid%3Dsogou-site-c02d0450cdd75ce7%26sut%3D1050%26sst0%3D1523926089994%26lkt%3D0%252C0%252C0; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_sogou_pc; _putrc=6EA73BBBB51DEF7E; JSESSIONID=ABAAABAAADEAAFI11A5943D5AD8FA9FDAB240DFAD660213; login=true; unick=%E5%86%AF%E7%AB%B9%E5%90%9B; hasDeliver=134; gate_login_token=cc1c59fd3fc91706eb01534899470d38000e54a63b0db428; TG-TRACK-CODE=index_search; LGRID=20180417084823-0861f794-41d9-11e8-88dc-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1523926106; SEARCH_ID=6be8a0349e5e4993a98387489af5cc6c',
    'Host': 'www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
    'Origin': 'https://www.lagou.com',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1',
}

# Accumulates job postings collected across all pages.
positions = []
# Position detail crawler
def page_detail(id):
    """Fetch one job-posting detail page and print its text sections.

    Prints the "job advantage" blurb (or ``None`` when absent) and the
    job description, then sleeps 1 second to throttle requests.

    :param id: Lagou position id, interpolated into the detail-page URL.
        (Name kept for caller compatibility even though it shadows the
        ``id`` builtin.)
    """
    url = 'https://www.lagou.com/jobs/%s.html' % id
    # Desktop UA for the HTML detail page (the listing API uses a mobile UA).
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city=%E6%88%90%E9%83%BD',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    }
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, 'lxml')
    advantage = soup.select('#job_detail > dd.job-advantage > p')
    description = soup.select('#job_detail > dd.job_bt > div')
    # Guard against empty selector results before indexing [0]. The
    # original did `if googs[0]==''` which raised IndexError on pages
    # with no matching node (and the == '' against a Tag was always False).
    if advantage:
        print(advantage[0].get_text())
    else:
        print(None)
    if description:
        print(description[0].get_text())
    time.sleep(1)


def main():
    """Crawl listing pages and fetch the detail page of every position."""
    for page in range(1, 2):  # how many pages to crawl
        # POST form parameters expected by the listing endpoint.
        data = {
            'first': 'true',
            'pn': page,
            'kd': '数据分析',
        }
        result = requests.post(
            'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E6%88%90%E9%83%BD&needAddtionalResult=false',
            headers=headers,
            data=data,
        )
        json_result = result.json()
        page_position = json_result['content']['positionResult']['result']
        for position in page_position:
            # Summary of the fields of interest (kept from the original;
            # currently built but not persisted).
            position_dict = {
                'position_name': position["positionName"],
                'position_salary': position["salary"],
                'position_year': position["workYear"],
                'position_companyname': position["companyShortName"],
            }
            position_number = position["positionId"]
            print(position_number)
            page_detail(position_number)
        positions.extend(page_position)  # flatten all pages into one list
        time.sleep(3)  # be polite: throttle requests between pages


if __name__ == '__main__':  # script entry point
    main()

# Author's note (original, translated): the empty-result case in the detail
# scraper used to raise "IndexError: list index out of range"; fixed above
# by checking the selector result lists before indexing.
总结:发现对py的字典不熟悉,json搞混,再就是if a==b容易搞成a=b,以后还得多加练习。