Scraping job listings from 51Job (前程无忧) ~~ beginner here, advice welcome

import urllib.request  # open and request web pages
from bs4 import BeautifulSoup  # parse HTML and extract data
import re  # regular expressions
import xlwt  # save the results as an Excel file

# Regular expressions for extracting job information from the embedded JSON
jobHref = re.compile(r'"job_href":"(.*?)"', re.S)  # job link
jobName = re.compile(r'"job_name":"(.*?)"', re.S)  # job title
comHref = re.compile(r'"company_href":"(.*?)"', re.S)  # company link
comName = re.compile(r'"company_name":"(.*?)"')  # company name
salary = re.compile(r'"providesalary_text":"(.*?)"')  # salary
companytype = re.compile(r'"companytype_text":"(.*?)"')  # company type
attribute = re.compile(r'"attribute_text":\[(.*?)\]', re.S)  # job requirements
workarea = re.compile(r'"workarea_text":"(.*?)"', re.S)  # work location
companysize = re.compile(r'"companysize_text":"(.*?)"', re.S)  # company size
companyind = re.compile(r'"companyind_text":"(.*?)"', re.S)  # main business / industry
jobwelf = re.compile(r'"jobwelf":"(.*?)"', re.S)  # benefits
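
# A minimal illustration of what these patterns match (the sample string is
# hypothetical; the real page embeds a much larger JSON blob):
#   sample = '"job_href":"https://jobs.51job.com/job/1.html","job_name":"Java Developer"'
#   re.findall(jobHref, sample)  # -> ['https://jobs.51job.com/job/1.html']
#   re.findall(jobName, sample)  # -> ['Java Developer']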


def main():
    key = input("Enter the job title you want to search for: ")
    baseurl1 = "https://search.51job.com/list/030200,000000,0000,00,9,99," + key + ",2,"
    baseurl2 = ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    savePath = f".\\{key}_Guangzhou.xls"
    # 1. Crawl the result pages
    datalist = getData(baseurl1, baseurl2)
    # 2. Save the data
    saveData(datalist, savePath)
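
# For example, with key = "java" the URL for page 1 expands to:
#   https://search.51job.com/list/030200,000000,0000,00,9,99,java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
# (030200 appears to be 51Job's area code for Guangzhou, hence the "_Guangzhou" file name.)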


# Pretend to be a browser: wrap the URL in a Request carrying a User-Agent header
def askUrl(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
    }
    request = urllib.request.Request(url=url, headers=head)
    response = urllib.request.urlopen(request)
    html = response.read().decode("gbk")  # the search pages are served GBK-encoded
    return html
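
# If decoding ever fails, a more defensive sketch is to take the charset from
# the response headers and skip undecodable bytes:
#   charset = response.headers.get_content_charset() or "gbk"
#   html = response.read().decode(charset, errors="ignore")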


# Crawl the pages. Note: the data we need here is not inside HTML tags but
# embedded in a JavaScript block as JSON, which is why we parse it with the
# regular expressions defined above.
def getData(baseurl1, baseurl2):
    datalist = []

    for i in range(1, 11):  # fetch result pages 1 through 10
        url = baseurl1 + str(i) + baseurl2  # joining the two base URLs with str(i) in between gives the full URL for page i
        html = askUrl(url)  # fetch and store the HTML

        # Parse the page
        soup = BeautifulSoup(html, "html.parser")

        # Locate the text node(s) containing the embedded job data
        for item in soup.find_all(text=jobHref):
            job_href = re.findall(jobHref, item)
            job_name = re.findall(jobName, item)
            com_href = re.findall(comHref, item)
            com_name = re.findall(comName, item)
            salary_level = re.findall(salary, item)
            company_type = re.findall(companytype, item)
            condition = re.findall(attribute, item)
            work_area = re.findall(workarea, item)
            company_size = re.findall(companysize, item)
            company_service = re.findall(companyind, item)  # main business / industry
            job_welf = re.findall(jobwelf, item)

            # Each list holds one entry per job on this page; stitch them into per-job records
            for j in range(len(job_href)):
                record = [job_href[j], job_name[j], com_href[j], com_name[j], salary_level[j],
                          company_type[j], condition[j], work_area[j], company_size[j],
                          company_service[j], job_welf[j]]
                datalist.append(record)
    return datalist
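
# Ten pages are fetched back to back; to be gentler on the server you could
# pause between requests, e.g. (a sketch using the standard time module):
#   import time
#   time.sleep(1)  # place inside the page loop, after each askUrl(url) call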


# Save the data to an Excel file
def saveData(datalist, savePath):
    # Create the workbook and add a sheet
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = workbook.add_sheet('Latest job listings', cell_overwrite_ok=True)

    # Write the header row
    col = ["Job link", "Job title", "Company link", "Company name", "Salary", "Company type",
           "Requirements", "Location", "Company size", "Main business", "Benefits"]
    for i in range(0, len(col)):
        sheet.write(0, i, col[i])  # write col[i] into row 0, column i

    # Write each record into the sheet
    for i in range(len(datalist)):
        print(f"Writing record {i + 1}")
        data = datalist[i]
        for j in range(0, 11):
            sheet.write(i + 1, j, data[j])  # write data[j] into row i+1, column j
    workbook.save(savePath)
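
# Note: the legacy .xls format written by xlwt caps a sheet at 65,536 rows;
# ten pages of results stay far below that. A quick sanity check with one
# made-up record (hypothetical data, for testing only):
#   saveData([["url", "Dev", "url", "ACME", "10-15k/month", "Private", '"Guangzhou","3-4 yrs"',
#              "Guangzhou", "50-150 people", "Software", "Annual leave"]], "test.xls")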


if __name__ == "__main__":
    main()
    print("Crawler finished!")


Reposted from blog.csdn.net/baidu_41833099/article/details/118764679