Scraping Data-Analysis Job Listings from Lagou (拉勾网) with Python

Related reading: the differences between Python file open modes a, a+, r+ and w+

Related reading: notes on the Python openpyxl module


1. Data collection. Tools needed: Lagou (拉勾网) and Python 3. Original course: python拉勾网爬虫.

Anti-scraping countermeasure: pretend to be a real browser by forging the request headers, for example this User-Agent:

'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1'
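A minimal sketch of sending that header with the requests library (the URL here is just a placeholder; the real JSON endpoint appears below):

import requests

url = 'https://www.lagou.com/'   # placeholder URL for illustration only
headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1'}
resp = requests.get(url, headers=headers)   # the forged User-Agent is sent with the request
print(resp.status_code)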

Under the hood it is all HTTP: the JSON data is fetched by a request too. Find the correct URL (in the browser's developer tools) and mind whether the endpoint expects a POST or a GET.

Observe carefully, and check whether the data you get back meets your requirements.
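A small sanity-check helper, sketched under the assumption that the endpoint, headers, and parameters are the ones used in the scripts below (the key path content > positionResult > result comes from the actual response):

import requests

def fetch_page(url, headers, data):
    # POST to the JSON endpoint and validate the response shape before parsing
    resp = requests.post(url, headers=headers, data=data)
    resp.raise_for_status()   # fail fast on HTTP-level errors
    payload = resp.json()
    # Lagou may reply with an error message instead of 'content' when it
    # decides a request looks automated, so check before indexing
    if 'content' not in payload:
        raise RuntimeError('unexpected response: %s' % payload)
    return payload['content']['positionResult']['result']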

Encoding issues: be careful about the write mode (append vs. overwrite):

with open('lagou.json', 'ab+') as fp:
    fp.write(line.encode('utf-8'))
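For reference, a quick sketch of the open modes from the related-reading link above: 'w' truncates the file each time, 'a' appends, and opening in text mode with encoding='utf-8' is an alternative to binary 'ab+' plus a manual .encode():

line = '{"example": "record"}'   # stands in for the JSON string built above
# 'w'  overwrites (truncates on open); 'a' appends to existing content;
# 'r+' reads/writes without truncating; 'a+' appends and also allows reading
with open('lagou.json', 'a', encoding='utf-8') as fp:
    fp.write(line + '\n')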

Ha, it turns out readers quite like this post, so here is the code:

getdata.py

# import the required libraries
import requests                  # fetch web pages
from bs4 import BeautifulSoup    # HTML parsing (used in get_detail.py)
import json
import time
import numpy as np
import pandas as pd
import openpyxl                  # Excel engine for pandas.to_excel

headers={
         'Cookie':'_ga=GA1.2.2046537735.1519346482; user_trace_token=20180223084123-462157d6-1832-11e8-8df7-525400f775ce; LGUID=20180223084123-46215dc1-1832-11e8-8df7-525400f775ce; LG_LOGIN_USER_ID=a74eb645299f49ec2b1f0f98d8f27071b23ad1b8c3e4a22f; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=%E4%B8%8A%E6%B5%B7; WEBTJ-ID=20180417084813-162d112cf2b4d9-0300e63ea642e6-4545092c-2073600-162d112cf2c306; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1523407416,1523407491,1523926094; _gid=GA1.2.1279679635.1523926094; LGSID=20180417084813-0203736a-41d9-11e8-b8a4-5254005c3644; PRE_UTM=m_cf_cpt_sogou_pc; PRE_HOST=www.sogou.com; PRE_SITE=https%3A%2F%2Fwww.sogou.com%2Fsogou%3Fquery%3D%25C0%25AD%25B9%25B4%26_asf%3Dwww.sogou.com%26_ast%3D1523926090%26w%3D01019900%26p%3D40040100%26pid%3Dsogou-site-c02d0450cdd75ce7%26sut%3D1050%26sst0%3D1523926089994%26lkt%3D0%252C0%252C0; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_sogou_pc; _putrc=6EA73BBBB51DEF7E; JSESSIONID=ABAAABAAADEAAFI11A5943D5AD8FA9FDAB240DFAD660213; login=true; unick=%E5%86%AF%E7%AB%B9%E5%90%9B; hasDeliver=134; gate_login_token=cc1c59fd3fc91706eb01534899470d38000e54a63b0db428; TG-TRACK-CODE=index_search; LGRID=20180417084823-0861f794-41d9-11e8-88dc-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1523926106; SEARCH_ID=6be8a0349e5e4993a98387489af5cc6c',
         'Host':'www.lagou.com',
         'Referer':'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
         'Origin':'https://www.lagou.com',
         'X-Anit-Forge-Code':'0',      # sic: "Anit" is how Lagou spells this header
         'X-Anit-Forge-Token':'None',
         'X-Requested-With':'XMLHttpRequest',
         'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1'
        }     # forged request headers

positions = []  # accumulated job records

def main():
  for x in range(1, 3):   # number of pages to scrape
    # build the POST form parameters
    data = {
          'first': 'true',
          'pn': x,           # page number
          'kd': '数据分析'    # search keyword
          }
    result = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E6%88%90%E9%83%BD&needAddtionalResult=false', headers=headers, data=data)  # request the JSON endpoint
    # print(result.content)
    json_result = result.json()   # result.json() parses the response into a dict
    # print(json_result)
    # print(type(json_result))
    page_position = json_result['content']['positionResult']['result']

    print(page_position)
    # inside the loop: save as JSON, appending on each iteration
    line = json.dumps(page_position, ensure_ascii=False)
    # print(line)  # if the write were moved outside the for loop, append mode would be unnecessary
    with open('lagou.json', 'ab+') as fp:
      fp.write(line.encode('utf-8'))

    positions.extend(page_position)   # extend the list, so every record ends up in one flat list
    # increase the delay to send fewer requests
    time.sleep(3)   # sleep for 3 seconds
    # print(positions)

  # outside the loop: finally dump the accumulated list to Excel
  # positions = pd.DataFrame(data=positions)
  # positions.to_excel('拉勾网数据分析职位.xlsx', index=False)

if __name__ == '__main__':  # entry point
  main()
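To actually get the spreadsheet, enable the two commented-out lines above; a hedged sketch of the export step (the field names are assumed from the keys used in get_detail.py below):

import pandas as pd

# stand-in for the positions list accumulated by main()
positions = [{'positionName': '数据分析', 'salary': '10k-15k',
              'workYear': '1-3年', 'companyShortName': '示例公司'}]
df = pd.DataFrame(data=positions)
df.to_excel('拉勾网数据分析职位.xlsx', index=False)   # needs openpyxl installed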

get_detail.py

# import the required libraries
import requests                  # fetch web pages
from bs4 import BeautifulSoup    # parse the job-detail HTML pages
import json
import time
import numpy as np
import pandas as pd
import openpyxl

headers={
         'Cookie':'_ga=GA1.2.2046537735.1519346482; user_trace_token=20180223084123-462157d6-1832-11e8-8df7-525400f775ce; LGUID=20180223084123-46215dc1-1832-11e8-8df7-525400f775ce; LG_LOGIN_USER_ID=a74eb645299f49ec2b1f0f98d8f27071b23ad1b8c3e4a22f; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; index_location_city=%E4%B8%8A%E6%B5%B7; WEBTJ-ID=20180417084813-162d112cf2b4d9-0300e63ea642e6-4545092c-2073600-162d112cf2c306; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1523407416,1523407491,1523926094; _gid=GA1.2.1279679635.1523926094; LGSID=20180417084813-0203736a-41d9-11e8-b8a4-5254005c3644; PRE_UTM=m_cf_cpt_sogou_pc; PRE_HOST=www.sogou.com; PRE_SITE=https%3A%2F%2Fwww.sogou.com%2Fsogou%3Fquery%3D%25C0%25AD%25B9%25B4%26_asf%3Dwww.sogou.com%26_ast%3D1523926090%26w%3D01019900%26p%3D40040100%26pid%3Dsogou-site-c02d0450cdd75ce7%26sut%3D1050%26sst0%3D1523926089994%26lkt%3D0%252C0%252C0; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_sogou_pc; _putrc=6EA73BBBB51DEF7E; JSESSIONID=ABAAABAAADEAAFI11A5943D5AD8FA9FDAB240DFAD660213; login=true; unick=%E5%86%AF%E7%AB%B9%E5%90%9B; hasDeliver=134; gate_login_token=cc1c59fd3fc91706eb01534899470d38000e54a63b0db428; TG-TRACK-CODE=index_search; LGRID=20180417084823-0861f794-41d9-11e8-88dc-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1523926106; SEARCH_ID=6be8a0349e5e4993a98387489af5cc6c',
         'Host':'www.lagou.com',
         'Referer':'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
         'Origin':'https://www.lagou.com',
         'X-Anit-Forge-Code':'0',
         'X-Anit-Forge-Token':'None',
         'X-Requested-With':'XMLHttpRequest',
         'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1'
        }     # forged request headers

positions = []  # accumulated job records


def page_detail(position_id):
  # fetch one job-detail page and print its perks and job description
  url = 'https://www.lagou.com/jobs/%s.html' % position_id
  headers = {
  'Host':'www.lagou.com',
  'Referer':'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?px=default&city=%E6%88%90%E9%83%BD',
  'Upgrade-Insecure-Requests':'1',
  'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
  }
  req = requests.get(url, headers=headers)
  soup = BeautifulSoup(req.content, 'lxml')
  advantages = soup.select('#job_detail > dd.job-advantage > p')    # the "perks" block
  job_description = soup.select('#job_detail > dd.job_bt > div')    # the job description
  if not advantages:   # select() returned an empty list, so indexing [0] would raise IndexError
    print(None)
  else:
    print(advantages[0].get_text())
    print(job_description[0].get_text())
    time.sleep(1)



def main():
  for x in range(1, 2):   # number of pages to scrape
    # build the POST form parameters
    data = {
          'first': 'true',
          'pn': x,
          'kd': '数据分析'
          }
    result = requests.post('https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E6%88%90%E9%83%BD&needAddtionalResult=false', headers=headers, data=data)  # request the JSON endpoint

    json_result = result.json()
    page_position = json_result['content']['positionResult']['result']
    for position in page_position:
      # pull the fields of interest out of each position record
      position_dict = {
      'position_name': position['positionName'],
      'position_salary': position['salary'],
      'position_year': position['workYear'],
      'position_companyname': position['companyShortName'],
      }
      position_number = position['positionId']
      print(position_number)
      page_detail(position_number)



    # print(page_position)
    # the accumulated records could also be saved as JSON here, as in getdata.py
    positions.extend(page_position)   # extend the list, so every record ends up in one flat list
    # increase the delay to send fewer requests
    time.sleep(3)   # sleep for 3 seconds
    # print(positions)


if __name__ == '__main__':  # entry point
  main()
One pitfall in get_detail.py is handling pages where the selectors match nothing: indexing [0] on the empty list raises IndexError: list index out of range, which is why page_detail() checks the list before indexing. I still need to study this kind of error handling more.
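As a general pattern (a minimal standalone sketch, not specific to Lagou), the guard looks like this:

from bs4 import BeautifulSoup

html = '<div id="job_detail"></div>'   # stand-in page where nothing matches
soup = BeautifulSoup(html, 'lxml')
nodes = soup.select('#job_detail > dd.job-advantage > p')
# select() returns a list; index [0] only after confirming it is non-empty
text = nodes[0].get_text() if nodes else None
print(text)   # prints None instead of raising IndexError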

Summary: I realized I am not yet fluent with Python dicts and kept confusing them with JSON; I also tend to type a = b when I mean a == b. More practice needed.




Reposted from blog.csdn.net/sinat_23880167/article/details/80471860