# Scraping job listings from Lagou (拉勾网, lagou.com) with Python

#coding=utf-8
import requests
import json
import time
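# Lagou loads its search results via Ajax. Watching the browser dev tools
# (Network -> XHR -> Headers -> Form Data) shows that paging is driven by the
# `pn` field.
# The request goes to the URL shown under Headers:
# https://www.lagou.com/jobs/positionAjax.json
# So the script builds a `params` mapping per page to control which page is
# fetched, then downloads the JSON and parses the data out of it.
# While writing this, the server kept answering "您操作太频繁,请稍后再访问"
# ("You are operating too frequently, please try again later"). Using an IP
# proxy did not help; a web search showed the cause was an incomplete set of
# request headers, which let the server detect the script. Sending the full
# headers fixed it.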

# Browser-like request headers passed to requests.get() in get_info().
# The Cookie value was captured from a browser session (its timestamps date
# to May 2018) -- presumably it has to be refreshed before the script is
# re-run; TODO confirm.
headers = {
    # Adjacent string literals are joined by the parser. A backslash
    # continuation *inside* a literal would instead keep the next line's
    # indentation as part of the header value.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Cookie': 'user_trace_token=20180512170100-8f629b1f-6c80-4493-b647-9ba5f2de2ba6; _ga=GA1.2.1959989744.1526115547; '
              'LGUID=20180512170100-fe0ecd62-55c2-11e8-8221-5254005c3644; '
              'JSESSIONID=ABAAABAAAGGABCBD9F9956003C9898101464EA7EC566ABC; _gid=GA1.2.1865548297.1526385869; '
              'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526115548,1526385869; index_location_city=%E5%8C%97%E4%BA%AC; '
              'TG-TRACK-CODE=search_code; _gat=1; LGSID=20180516134636-7f1dbdf3-58cc-11e8-aaa5-525400f775ce; '
              'PRE_UTM=; PRE_HOST=; PRE_SITE=https%3A%2F%2Fwww.lagou.com%2F; '
              'PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue'
              '%26suginput%3D; LGRID=20180516134636-7f1dbf88-58cc-11e8-aaa5-525400f775ce; '
              'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1526449493; SEARCH_ID=96e947ca34b649c2a8aa83901fb16c49',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
}
# IP proxy settings passed to requests.get() via `proxies=`.
# Only the 'http' scheme has an entry here.
proxie = dict(http='http://61.135.217.7:80')

def get_info(url, params):
    """Fetch one page of search results and print a summary of every job.

    Args:
        url: Address of the JSON endpoint to query.
        params: Query-string parameters for the request (the caller uses
            them to select the page number and the search keyword).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON reply does not contain the expected
            ``content -> positionResult -> result`` list, or a job entry
            lacks one of the summarised fields.
    """
    # Without a timeout a stalled connection would block the script forever.
    html = requests.get(url, proxies=proxie, headers=headers, params=params,
                        timeout=10)
    # Surface HTTP-level failures instead of trying to parse an error page.
    html.raise_for_status()
    json_data = json.loads(html.text)

    try:
        # Path to the job list, as seen in the browser's Network/XHR preview.
        results = json_data['content']['positionResult']['result']
    except KeyError as err:
        # Replies without the job list (for example a throttling notice)
        # previously failed with a bare KeyError; keep the exception type but
        # include the payload so the reason is visible.
        raise KeyError(
            'response contains no job list; server replied: {!r}'.format(json_data)
        ) from err

    # Fields copied from each job entry, in output order:
    # company name, district, detailed address ('linestaion' is spelled
    # exactly as the key is read here), job type, salary, required experience.
    fields = ('companyFullName', 'district', 'linestaion', 'firstType',
              'salary', 'workYear')
    for result in results:
        infos = {field: result[field] for field in fields}
        print(infos)

    # Wait two seconds after each page before returning to the caller.
    time.sleep(2)

if __name__ == '__main__':
    # JSON endpoint queried for every page of results.
    url = 'https://www.lagou.com/jobs/positionAjax.json'
    # Pages 1 through 9, already in the string form sent as the `pn` value.
    page_numbers = [str(number) for number in range(1, 10)]
    for page in page_numbers:
        get_info(url, {'first': 'true', 'pn': page, 'kd': 'Python'})
# (Web-page footer left over from the original post: "猜你喜欢" / "You may also like")