Crawling job listings from Liepin (liepin.com) with requests + bs4

I prefer to use bs4 for matching; the amount of data is small, so I kept
everything as simple as possible. The URL is built from the search keyword,
the location defaults to the whole country, and the results are finally
written to a JSON file.

import requests
import json
from bs4 import BeautifulSoup
# url = 'https://www.liepin.com/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&d_sfrom=search_fp&key=python'


def get_page():
    """
    Prompt for a page count and a job keyword, then scrape each result page.

    Reads two values from standard input: the number of result pages and
    the job keyword to search for. Each page URL is passed to ``parse``
    together with the module-level ``headers`` dict, which must already be
    defined when this function is called (the ``__main__`` block does so).

    Raises:
        ValueError: if the page count entered is not an integer.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    page = input('请输入页数:')
    job = input('请输入职业:')

    page_count = int(page)

    # Percent-encode the keyword so spaces, '&', '#' and similar characters
    # cannot corrupt the query string.
    url = 'https://www.liepin.com/zhaopin/?&key={}'.format(quote(job, safe=''))
    page_url = '&curPage='

    # Single page: fetch the bare search URL. The original assigned the URL
    # here but never requested it, so asking for one page produced nothing.
    if page_count == 1:
        parse(url, headers)
        return

    # Multiple pages: curPage is numbered from 0, as in the original loop.
    for i in range(page_count):
        parse(url + page_url + str(i), headers)


def parse(url,headers):
    """
    Download one search-result page and save its job entries as JSON.

    The page is fetched with ``requests``, parsed with BeautifulSoup (lxml
    parser), and every element matching ``.sojob-list li`` is turned into a
    dict with the keys ``title``, ``area``, ``edu``, ``time`` and
    ``text_warning``. The resulting list is written to
    ``./liepin/<md5-of-current-timestamp>.json``.

    Args:
        url: address of the result page to download.
        headers: HTTP headers passed through to ``requests.get``.

    Raises:
        IndexError: if a list item lacks one of the expected elements.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    # Function-scope imports, as in the original; hoisted to the top so the
    # ``time`` module name is never confused with a local variable.
    import hashlib
    import os
    import time

    # A timeout prevents the scraper from hanging forever on a stalled
    # connection.
    response = requests.get(url=url, headers=headers, timeout=10)

    res = response.text
    # Build the soup object
    soup = BeautifulSoup(res, 'lxml')
    # Locate the job entries
    content = soup.select('.sojob-list li')

    items = []
    # Build the JSON-serialisable records
    for c in content:
        # In order: job title, area, education, work experience, annual salary
        title = c.select('div h3')[0].get_text().strip()
        area = c.select('p .area')[0].get_text()
        edu = c.select('p .edu')[0].get_text()
        experience = c.select('.sojob-item-main div p span')[2].get_text()
        text_warning = c.select('p span')[0].get_text()

        items.append({
            'title': title,
            'area': area,
            'edu': edu,
            'time': experience,
            'text_warning': text_warning,
        })

    # Use an MD5 digest of the current timestamp as a unique file name
    key = time.time()
    md = hashlib.md5()
    md.update(str(key).encode("utf-8"))
    # Hex digest string
    file_name = md.hexdigest()
    print('正在下载:%s'%file_name)

    # Make sure the output directory exists; otherwise open() fails with
    # FileNotFoundError on the first run.
    os.makedirs('./liepin', exist_ok=True)

    # The context manager guarantees the file is flushed and closed.
    with open('./liepin/'+file_name +'.json', 'w', encoding="utf-8") as fp:
        json.dump(items, fp, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    # Script entry point. ``headers`` is bound at module level because
    # get_page() reads it from module scope; a browser-like User-Agent is
    # sent with every request.
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.16 Safari/537.36'

    get_page()

If you run into any problems, leave a comment below and I will reply.

Guess you like

Origin blog.csdn.net/weixin_44220464/article/details/94759663