# Scrapes job listings from liepin.com using requests + BeautifulSoup (bs4):
# builds the search URL from a job keyword and a page count (default region:
# nationwide), then writes the extracted results to a JSON file.
import requests
import json
from bs4 import BeautifulSoup
# url = 'https://www.liepin.com/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&d_sfrom=search_fp&key=python'
def get_page():
    """
    Prompt for a page count and a job keyword, build the search URL for
    each requested page, and hand every page URL to parse() for scraping.

    Reads the module-level ``headers`` dict defined in the ``__main__``
    block; takes no arguments and returns None.
    """
    page = input('请输入页数:')
    job = input('请输入职业:')
    url = 'https://www.liepin.com/zhaopin/?&key={}'.format(job)
    page_url = '&curPage='
    # Single page vs. multiple pages
    if page == '1':
        # The bare search URL is the first results page.
        parse(url, headers)
    else:
        # BUG FIX: parse() must run inside the loop — the original called
        # it once after the loop, so only the last page was ever fetched.
        # Note: liepin's curPage is 0-based, so range(int(page)) covers
        # pages 1..page as the user understands them.
        for i in range(int(page)):
            parse(url + page_url + str(i), headers)
def parse(url, headers):
    """
    Fetch one search-results page, extract the job items, and save them
    as a JSON file under ./liepin/.

    :param url: full search-results URL for one page
    :param headers: dict of HTTP request headers (User-Agent etc.)

    Side effects: creates ./liepin/ if missing and writes one
    <md5-of-timestamp>.json file containing a list of job dicts.
    """
    # Local imports kept function-scoped, matching the original style.
    # ``time`` is aliased to avoid clobbering the per-item 'time' field
    # logic below (the original shadowed a local variable named ``time``
    # with ``import time`` mid-function).
    import hashlib
    import os
    import time as _time

    response = requests.get(url=url, headers=headers)
    res = response.text
    # Build the soup and locate each job listing entry
    soup = BeautifulSoup(res, 'lxml')
    content = soup.select('.sojob-list li')

    items = []
    # Assemble one dict per listing: title, area, education,
    # experience/posting time, and the warning/annual-salary text.
    for c in content:
        item = {}
        item['title'] = c.select('div h3')[0].get_text().strip()
        item['area'] = c.select('p .area')[0].get_text()
        item['edu'] = c.select('p .edu')[0].get_text()
        item['time'] = c.select('.sojob-item-main div p span')[2].get_text()
        item['text_warning'] = c.select('p span')[0].get_text()
        items.append(item)

    # MD5 of the current timestamp gives a (practically) unique file name.
    md = hashlib.md5()
    md.update(str(_time.time()).encode("utf-8"))
    file_name = md.hexdigest()

    # BUG FIX: the output directory was never created (the os.mkdir call
    # was commented out), so the open() below crashed on a fresh checkout.
    os.makedirs('./liepin', exist_ok=True)

    print('正在下载:%s' % file_name)
    # BUG FIX: use a context manager — the original leaked the file
    # handle returned by open() inside the json.dump() call.
    with open('./liepin/' + file_name + '.json', 'w', encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Request headers shared by every fetch. get_page() reads this as a
    # module-level global, so the name ``headers`` must not change.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.16 Safari/537.36',
    }
    # Kick off the interactive prompt / scrape loop.
    get_page()
# If you run into any problems, leave a message below and I will reply.