Simply use Python to collect recruitment data content and do visual analysis!

Hello everyone! Graduation season is here. Many friends are unhappy because they either can't find a job or are overwhelmed by too many postings. Some are just lazy: it's too hot and they don't want to go out and look around.

So today I will share with you the use of Python to collect recruitment data in batches, perform visual analysis, and easily find your favorite job!

Not much to say, let's start directly~

Preparation

software tools

  • Python 3.8
  • Pycharm
  • Google Chrome
  • Chrome driver (chromedriver)

selenium --> automated test module
to simulate human behavior to operate the browser

manual operation

  1. open browser
  2. enter URL
  3. Find the data content we need
  4. Manually copy and paste into the table file

module use

selenium  # pip install selenium==3.141.0 — automated testing; drives the browser
csv  # saves the data to a CSV file

Press Win + R, type cmd, and in the command prompt run pip install <module name> (if the installation is slow, you can switch to a domestic PyPI mirror).

If you don’t understand the article, I have also prepared a video explanation, packaged together with the code; you can get it yourself via the contact card at the end of the article.

Data acquisition part code

open browser

Browser driver selection and download:

  • Check browser version
  • Select the driver version that is closest to your browser version
  • The driver file does not need to be installed (no need to double-click it); just put it directly in your Python installation directory

main code

# NOTE: this snippet expects `webdriver` (from selenium), `time`, and a
# ready `csv_writer` (a csv.DictWriter using the field names below) to
# already be in scope.
driver = webdriver.Chrome()
# Open the job-search results page.
driver.get(
    'https://www.***.com/zhaopin/?inputFrom=www_index&workYearCode=0&key=python&scene=input&ckId=rn8762mqhk78fi3d9fiqzzoobk7f66u9&dq=')
# The data can only be located once the page elements have loaded; each
# field is then extracted by locating its tag with a CSS selector.
# Implicit wait: element lookups keep retrying for up to 10 seconds.
driver.implicitly_wait(10)
time.sleep(1)  # Fixed pause: always waits the full second.
# Collect the container element of every job posting.
divs = driver.find_elements_by_css_selector('.job-list-box div .job-card-left-box')
# `divs` is a list whose items are selenium element objects.
for div in divs:
    # Extract the individual fields contained in this posting's element.
    # Job title
    title = div.find_element_by_css_selector('.job-title-box div.ellipsis-1').text
    # City
    city = div.find_element_by_css_selector('.job-title-box span.ellipsis-1').text
    # Salary
    salary = div.find_element_by_css_selector('.job-salary').text
    # Texts of all label tags on the card.
    info_list = [i.text for i in div.find_elements_by_css_selector('.job-labels-box .labels-tag')]
    print(info_list)
    # The first two tags are taken as experience and education. Pad with
    # empty strings so a card with fewer than two tags yields blank
    # fields instead of raising IndexError and aborting the whole run.
    exp, edu = (info_list + ['', ''])[:2]
    # Any remaining tags are stored comma-separated under '技术点'.
    labels = ','.join(info_list[2:])
    company = div.find_element_by_css_selector('.company-name').text
    # First span of the company tag box -> company field; last span -> size.
    company_type = div.find_element_by_css_selector('.company-tags-box span').text
    company_num = div.find_element_by_css_selector('.company-tags-box span:last-of-type').text
    # Link to the posting's detail page.
    href = div.find_element_by_css_selector('.job-detail-box a').get_attribute('href')
    # Keys must match the DictWriter field names exactly.
    dit = {
        '职位': title,
        '城市': city,
        '薪资': salary,
        '经验': exp,
        '学历': edu,
        '技术点': labels,
        '公司': company,
        '公司领域': company_type,
        '公司规模': company_num,
        '详情页': href,
    }
    csv_writer.writerow(dit)
    print(dit)

Save to a table file (CSV)

# Output file for the scraped rows. newline='' leaves line endings to the
# csv module. The handle stays open because rows are written later.
f = open('python.csv', 'w', encoding='utf-8', newline='')
# Column order of the CSV; each row dict written must use these keys.
csv_writer = csv.DictWriter(
    f,
    ['职位', '城市', '薪资', '经验', '学历', '技术点', '公司', '公司领域', '公司规模', '详情页'],
)
# Emit the header row once, before any data rows.
csv_writer.writeheader()

Visualization

read file

import pandas as pd

# Load the scraped postings. The collection step writes 'python.csv', so
# read that file (the original 'data.csv' is never produced by the code).
df = pd.read_csv('python.csv')
# Preview the first rows.
df.head()

Distribution of educational requirements for python positions

from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.globals import CurrentConfig, NotebookType
# Tell pyecharts the charts will be shown in JupyterLab.
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
# Count postings per education requirement. These names were previously
# used below without ever being assigned, which raised NameError.
edu_counts = df['学历'].value_counts()
edu_type = edu_counts.index.to_list()
edu_num = edu_counts.to_list()
c = (
    Pie()
    .add(
        "",
        # One [category, count] pair per pie slice.
        [
            list(z)
            for z in zip(
                edu_type,
                edu_num,
            )
        ],
        center=["40%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="python职位学历需求分布"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
    )
    # Slice labels read "<category>: <count>".
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
# NOTE(review): pyecharts' JupyterLab instructions call load_javascript()
# in its own cell before any render_notebook() call; this cell only loads
# the assets and does not draw the chart itself — confirm intended.
c.load_javascript()

Distribution of experience requirements for python positions

# Count postings per experience requirement (computed once, reused twice).
counts = df['经验'].value_counts()
edu_type = counts.index.to_list()
edu_num = counts.to_list()

# Build the pie step by step; each pyecharts call configures the same chart.
c = Pie()
c.add(
    "",
    # One [category, count] pair per pie slice.
    [[name, num] for name, num in zip(edu_type, edu_num)],
    center=["40%", "50%"],
)
c.set_global_opts(
    title_opts=opts.TitleOpts(title="python职位经验需求分布"),
    legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
)
# Slice labels read "<category>: <count>".
c.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
c.render_notebook()

python job city distribution

# Keep only the first two characters of the '城市' value before counting.
counts = df['城市'].str[:2].value_counts()
edu_type = counts.index.to_list()
edu_num = counts.to_list()

# Build the pie step by step; each pyecharts call configures the same chart.
c = Pie()
c.add(
    "",
    # One [city, count] pair per pie slice.
    [[name, num] for name, num in zip(edu_type, edu_num)],
    center=["40%", "50%"],
)
c.set_global_opts(
    title_opts=opts.TitleOpts(title="python职位城市分布"),
    legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
)
# Slice labels read "<city>: <count>".
c.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
c.render_notebook()

Python company field distribution

# Count postings per company field (computed once, reused twice).
counts = df['公司领域'].value_counts()
edu_type = counts.index.to_list()
edu_num = counts.to_list()

# Build the pie step by step; each pyecharts call configures the same chart.
c = Pie()
c.add(
    "",
    # One [field, count] pair per pie slice.
    [[name, num] for name, num in zip(edu_type, edu_num)],
    center=["40%", "50%"],
)
c.set_global_opts(
    title_opts=opts.TitleOpts(title="python公司领域分布"),
    legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
)
# Slice labels read "<field>: <count>".
c.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
c.render_notebook()

Well, that's the end of today's sharing, see you next time!

Guess you like

Origin blog.csdn.net/ooowwq/article/details/131808791