import json

import requests
from bs4 import BeautifulSoup

# Browser-like header values sent with both the search and the detail request.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/63.0.3239.132 Safari/537.36'
)
_REFERER = (
    'https://www.lagou.com/jobs/list_Python?'
    'px=default&city=%E6%88%90%E9%83%BD'
)
_SEARCH_URL = (
    'https://www.lagou.com/jobs/positionAjax.json?'
    'px=default&city=%E6%88%90%E9%83%BD&'
    'needAddtionalResult=false'
)

# Seconds to wait on each HTTP request; without a timeout requests may block
# forever on an unresponsive server.
_REQUEST_TIMEOUT = 10

# Fields copied verbatim from each search result into the output record.
_POSITION_FIELDS = (
    'workYear',
    'positionName',
    'salary',
    'district',
    'companyFullName',
)


def crawl_detail(id):
    """Fetch a job detail page and return the text of its description.

    Args:
        id: Position identifier interpolated into the detail page URL.
            (The name shadows the builtin but is kept for compatibility.)

    Returns:
        The text content of the ``<dd class="job_bt">`` element, or an empty
        string when the page contains no such element.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    url = 'https://www.lagou.com/jobs/%s.html' % id
    headers = {
        'Host': 'www.lagou.com',
        'Referer': _REFERER,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': _USER_AGENT,
    }
    req = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    req.raise_for_status()

    soup = BeautifulSoup(req.text, 'lxml')
    job_bt = soup.find('dd', attrs={'class': 'job_bt'})
    if job_bt is None:
        # find() returns None when nothing matches; avoid AttributeError on
        # .text and report "no description" instead.
        return ''
    return job_bt.text


def main():
    """Collect job postings from the search endpoint and save them as JSON.

    Posts the search form for each page in the loop range, copies the fields
    listed in ``_POSITION_FIELDS`` from every result, adds the description
    fetched by ``crawl_detail`` under ``'position_detail'``, and writes the
    collected list to ``lagou.json`` as UTF-8 JSON.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        RuntimeError: If the search response does not contain
            ``content.positionResult.result``.
    """
    headers = {
        'User-Agent': _USER_AGENT,
        'Host': 'www.lagou.com',
        'Referer': _REFERER,
        'X-Anit-Forge-Code': '0',
        # NOTE(review): requests drops headers whose value is None when it
        # merges them with session defaults, so this one is presumably never
        # sent -- confirm whether the literal string 'None' was intended.
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest',
    }

    positions = []
    for x in range(1, 2):
        form_data = {
            'first': 'true',
            'pn': x,
            'kd': 'python',
        }
        result = requests.post(
            _SEARCH_URL,
            headers=headers,
            data=form_data,
            timeout=_REQUEST_TIMEOUT,
        )
        result.raise_for_status()
        json_result = result.json()

        try:
            page_positions = json_result['content']['positionResult']['result']
        except (KeyError, TypeError) as err:
            # Surface the payload so an unexpected response shape is
            # diagnosable rather than a bare KeyError.
            raise RuntimeError(
                'unexpected search response: %r' % (json_result,)
            ) from err

        for position in page_positions:
            position_dict = {key: position[key] for key in _POSITION_FIELDS}
            position_dict['position_detail'] = crawl_detail(
                position['positionId']
            )
            positions.append(position_dict)

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open('lagou.json', 'w', encoding='utf-8') as f:
        json.dump(positions, f, ensure_ascii=False)


if __name__ == '__main__':
    main()
Python爬虫：爬取拉勾网数据
猜你喜欢
转载自blog.csdn.net/yunfeiyang520/article/details/80627440
今日推荐
周排行