from bs4 import BeautifulSoup
import requests
import re
import pymongo
import json
# MongoDB handles: database "laGou" and collection "sheetLagou" on the
# local server (default port 27017).
client = pymongo.MongoClient(host='localhost', port=27017)
laGou = client.get_database('laGou')
sheetLaGou = laGou.get_collection('sheetLagou')

# Listing page that gets scraped.
url = 'https://www.lagou.com/zhaopin/Python/?labelWords=label'

# Request headers sent with the page fetch.
# NOTE(review): the Cookie appears to be a captured logged-in browser session
# (login=true, gate_login_token, LG_LOGIN_USER_ID). It is a credential that
# will expire; consider loading it from configuration instead of source.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    # Adjacent literals are concatenated into one "name=value; ..." string.
    'Cookie': (
        'JSESSIONID=ABAAABAAAGFABEF6BBD467BA5628B1ED4B4CED513FDC3A3; '
        'user_trace_token=20180822154700-04d36825-377a-47de-b308-a072d89d9d52; '
        '_ga=GA1.2.1422200195.1534924022; '
        'LGSID=20180822154701-8dcf844b-a5df-11e8-9d32-525400f775ce; '
        'PRE_UTM=; '
        'PRE_HOST=; '
        'PRE_SITE=; '
        'PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fzhaopin%2FPython%2F%3FlabelWords%3Dlabel; '
        'LGUID=20180822154701-8dcf8739-a5df-11e8-9d32-525400f775ce; '
        'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1534924022; '
        '_gid=GA1.2.571422913.1534924022; '
        'X_HTTP_TOKEN=99f9237c908140b8bf95703783f315ee; '
        'LG_LOGIN_USER_ID=d5d19028cbc1bc8486708afb288bed37bf822e1f54f85efa; '
        '_putrc=39E114755E2348A0; '
        'login=true; '
        'unick=%E5%BA%84%E5%A9%B7%E5%A9%B7; '
        'showExpriedIndex=1; '
        'showExpriedCompanyHome=1; '
        'showExpriedMyPublish=1; '
        'hasDeliver=205; '
        'gate_login_token=a526a75444357763c74a5a2ea56a671444f5f0d02eb46709; '
        'index_location_city=%E5%8C%97%E4%BA%AC; '
        'TG-TRACK-CODE=search_code; '
        'SEARCH_ID=86f5a937ee734407b97c705fcfb4aee8; '
        'LGRID=20180822160935-b4e6c850-a5e2-11e8-9d33-525400f775ce; '
        'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1534925376'
    ),
}
# Seconds to wait for the listing page; without a timeout requests.get can
# block indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 10

# File that receives the scraped records as a single JSON array.
OUTPUT_PATH = 'data.json'

# Markers inside a listing's "experience / degree" text,
# e.g. "经验3-5年 / 本科".
EXPERIENCE_MARKER = '经验'
FIELD_SEPARATOR = '/'


def parse_exp_and_degree(text):
    """Split a listing's requirement text into (experience, degree).

    The experience is whatever lies between the first ``经验`` marker and the
    first ``/`` after it, stripped of surrounding whitespace. The degree is
    the first whitespace-delimited token after that ``/``.

    Args:
        text: Raw text of the listing's requirement element.

    Returns:
        A ``(experience, degree)`` tuple of strings. A part that cannot be
        found is returned as ``''`` instead of raising, so one malformed
        listing does not abort the whole run.
    """
    _, marker, tail = text.partition(EXPERIENCE_MARKER)
    if not marker:
        return '', ''
    experience, _, rest = tail.partition(FIELD_SEPARATOR)
    # split() tolerates any whitespace (or none) right after the slash; a
    # pattern such as '/(.*?) ' captures nothing when a space follows '/'.
    tokens = rest.split()
    degree = tokens[0] if tokens else ''
    return experience.strip(), degree


def scrape_listings(page_url, request_headers):
    """Fetch the listing page and return one dict per job listing.

    Args:
        page_url: URL of the listing page.
        request_headers: HTTP headers sent with the request.

    Returns:
        A list of dicts with the keys ``area``, ``salary``, ``expe`` and
        ``degree``. The list is empty when no listing elements are found.

    Raises:
        requests.RequestException: On timeout, connection failure or an
            HTTP error status.
    """
    response = requests.get(
        page_url, headers=request_headers, timeout=REQUEST_TIMEOUT_SECONDS
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page into nothing.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    areas = soup.select('span.add em')
    salaries = soup.select('span.money')
    requirements = soup.select('div.p_bot div.li_b_l')

    records = []
    # zip stops at the shortest list, so only complete triples are emitted.
    for area, salary, requirement in zip(areas, salaries, requirements):
        expe, degree = parse_exp_and_degree(requirement.get_text())
        records.append({
            'area': area.get_text(),
            'salary': salary.get_text(),
            'expe': expe,
            'degree': degree,
        })
    return records


def main():
    """Scrape the configured page, print each record and save them as JSON."""
    records = scrape_listings(url, headers)
    for record in records:
        print(record)
        # Optional MongoDB persistence; uncomment to enable. A copy is
        # inserted because insert_one adds an "_id" key to the dict it is
        # given, which would then break JSON serialisation below.
        # sheetLaGou.insert_one(dict(record))

    # Write one JSON array so the file can be read back with json.load;
    # objects written back-to-back without a separator are not valid JSON.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as output_file:
        json.dump(records, output_file, ensure_ascii=False, indent=2)


if __name__ == '__main__':
    main()
# 一点经验:
# 1. 如果网页是动态加载的,注意在请求中带上 headers,不加的话爬出来的是空列表。
# 2. 代码中有一部分是向 MongoDB 数据库写数据的,可根据情况决定是否注释掉。