2023年 Python 爬取中国大学排名并进行数据分析

目录

保存到csv

柱形图

折线图

饼图

散点图

箱形图 

环形图 


本文使用 Python 爬取 2023 年中国大学排名并进行数据分析。话不多说,代码针对 2023 年最新的排名页面编写,复制即可使用,下面直接上代码。

import requests
from bs4 import BeautifulSoup
import csv
import matplotlib.pyplot as plt


def getHTMLText(url):
    """Download *url* and return the decoded page text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as ``str``, or ``None`` when the request fails
        (connection error, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn 4xx/5xx responses into exceptions so they reach the handler.
        r.raise_for_status()
    except requests.RequestException as e:
        # Catch only request failures: a bare ``except`` would also swallow
        # KeyboardInterrupt and hide genuine programming errors.
        print(f"爬取失败: {e}")
        return None
    # Decode with the encoding guessed from the body, not the HTTP header.
    r.encoding = r.apparent_encoding
    return r.text


def fillUnivList(ulist, html):
    """Parse the ranking table in *html* and append one row per university.

    Every ``<tr>`` inside the ``<tbody>`` of ``<table class="rk-table">``
    contributes a five-item list of stripped strings: the text of cells 0,
    2, 3 and 4, with the text of the first ``<a>`` of cell 1 in second place.

    Args:
        ulist: List that receives the parsed rows; modified in place.
        html: HTML source of the ranking page.

    Returns:
        None. A message is printed and *ulist* is left unchanged when the
        table or its ``<tbody>`` cannot be found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', class_='rk-table')
    if table is None:
        print("未找到排名表格")
        return
    tbody = table.find('tbody')
    if tbody is None:
        print("未找到<tbody>标签")
        return
    for tr in tbody.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) < 5:  # skip incomplete rows
            continue
        link = tds[1].find('a')
        if link is None:  # no name link, so the row cannot be labelled
            continue
        # get_text() always returns a str, whereas .string is None for a tag
        # with several children and would make .strip() raise.
        ulist.append([
            tds[0].get_text().strip(),
            link.get_text().strip(),
            tds[2].get_text().strip(),
            tds[3].get_text().strip(),
            tds[4].get_text().strip(),
        ])


def printUnivList(ulist, num):
    """Write the first *num* rows of *ulist* to ``大学排行.csv`` and print them.

    The file is created (or overwritten) in the current working directory,
    starts with a fixed header row, and is encoded as UTF-8.

    Args:
        ulist: Rows of five items each (indices 0-4 are printed).
        num: Maximum number of rows to export; fewer are written when
            *ulist* is shorter.
    """
    file_name = "大学排行.csv"
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["排名", "大学名称", "省市", "类型", "总分"])
        # A slice never raises, unlike indexing range(num) on a shorter list.
        for u in ulist[:num]:
            writer.writerow(u)
            print(f"排名:{u[0]}\t大学名称:{u[1]}\t省市:{u[2]}\t类型:{u[3]}\t总分:{u[4]}")

# Line chart
def drawLineChart(ulist):
    """Show a line chart of the score (item 4) of every row in *ulist*.

    The x axis carries item 1 of each row (labelled as the university name)
    and each point is annotated with item 0 of the same row.

    Args:
        ulist: Rows whose item 4 can be converted with ``float``.
    """
    names = [u[1] for u in ulist]
    scores = [float(u[4]) for u in ulist]
    rank_labels = [u[0] for u in ulist]
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Create ONE figure with both settings. Two separate plt.figure() calls
    # open an extra blank figure and the dpi never reaches the drawn one.
    plt.figure(figsize=(10, 6), dpi=300)
    plt.plot(names, scores)
    plt.xlabel('大学名称')
    plt.ylabel('总分')
    plt.title('大学排行榜折线图')

    for name, score, rank in zip(names, scores, rank_labels):
        plt.text(name, score, rank, ha='center', va='bottom')

    plt.show()


# Pie chart
def generatePieChart(ulist, num):
    """Show a pie chart of how often each value of item 2 occurs.

    Only the first *num* rows of *ulist* are counted; the chart title labels
    item 2 as the province.

    Args:
        ulist: Rows with at least three items.
        num: Maximum number of rows to include; fewer are used when *ulist*
            is shorter.
    """
    rows = ulist[:num]  # a slice never raises on a short list
    provinces = {}
    for row in rows:
        provinces[row[2]] = provinces.get(row[2], 0) + 1

    # Hand matplotlib real lists rather than dict views, which NumPy does
    # not treat as sequences.
    labels = list(provinces)
    sizes = list(provinces.values())
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Create ONE figure with both settings. Two separate plt.figure() calls
    # open an extra blank figure and the dpi never reaches the drawn one.
    plt.figure(figsize=(8, 6), dpi=300)
    plt.pie(sizes, labels=labels, autopct='%1.1f%%')
    plt.axis('equal')
    # The title reports the number of rows actually counted.
    plt.title(f'大学排行前{len(rows)}名的省份分布')
    plt.show()


# Bar chart
def printUnivList2(ulist, num):
    """Show a bar chart of score (item 4) against rank (item 0).

    Each bar is annotated with item 1 of its row, in a small font.

    Args:
        ulist: Rows whose item 0 converts with ``int`` and item 4 with
            ``float``.
        num: Maximum number of rows to plot; fewer are used when *ulist* is
            shorter.
    """
    rows = ulist[:num]  # a slice never raises on a short list
    ranks = [int(u[0]) for u in rows]
    scores = [float(u[4]) for u in rows]
    names = [u[1] for u in rows]

    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.figure(dpi=300)  # render the figure at 300 dpi
    plt.bar(ranks, scores)
    plt.xlabel('排名')
    plt.ylabel('总分')
    plt.title('大学排名')

    for rank, score, name in zip(ranks, scores, names):
        # Put the university name just above its bar.
        plt.text(rank, score, name, ha='center', va='bottom', fontsize=4)

    plt.show()


# Scatter plot
def generateScatterPlot(ulist, num):
    """Show a scatter plot of score (item 4) against list position.

    The x value is the 1-based position of the row in *ulist*, not item 0.
    Each point is annotated with item 1 of its row.

    Args:
        ulist: Rows whose item 4 can be converted with ``float``.
        num: Maximum number of rows to plot; fewer are used when *ulist* is
            shorter.
    """
    rows = ulist[:num]  # a slice never raises on a short list
    scores = [float(u[4]) for u in rows]
    ranks = list(range(1, len(rows) + 1))
    names = [u[1] for u in rows]
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Create ONE figure with both settings. Two separate plt.figure() calls
    # open an extra blank figure and the dpi never reaches the drawn one.
    plt.figure(figsize=(12, 8), dpi=300)
    plt.scatter(ranks, scores)
    plt.title('大学排名与总分的关系')
    plt.xlabel('排名')
    plt.ylabel('总分')

    for rank, score, name in zip(ranks, scores, names):
        plt.annotate(name, (rank, score), xytext=(5, 5), textcoords='offset points', fontsize=8)

    plt.show()


# Box plot
def generateBoxPlot(ulist, num):
    """Show a box plot of the scores (item 4) of the first *num* rows.

    Args:
        ulist: Rows whose item 4 can be converted with ``float``.
        num: Maximum number of rows to include; fewer are used when *ulist*
            is shorter.
    """
    # A slice never raises, unlike indexing range(num) on a shorter list.
    scores = [float(u[4]) for u in ulist[:num]]
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Create ONE figure with both settings. Two separate plt.figure() calls
    # open an extra blank figure and the dpi never reaches the drawn one.
    plt.figure(figsize=(8, 6), dpi=300)
    plt.boxplot(scores)
    plt.title('大学总分箱形图')
    plt.ylabel('总分')
    plt.show()


# Donut chart
def generateDonutChart(ulist, num):
    """Show a donut chart of how often each value of item 2 occurs.

    Only the first *num* rows of *ulist* are counted; the chart title labels
    item 2 as the province. The ring is a pie whose wedges have width 0.4.

    Args:
        ulist: Rows with at least three items.
        num: Maximum number of rows to include; fewer are used when *ulist*
            is shorter.
    """
    rows = ulist[:num]  # a slice never raises on a short list
    provinces = {}
    for row in rows:
        provinces[row[2]] = provinces.get(row[2], 0) + 1

    # Hand matplotlib real lists rather than dict views, which NumPy does
    # not treat as sequences.
    labels = list(provinces)
    sizes = list(provinces.values())
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Create ONE figure with both settings. Two separate plt.figure() calls
    # open an extra blank figure and the dpi never reaches the drawn one.
    plt.figure(figsize=(8, 6), dpi=300)
    _, _, autotexts = plt.pie(sizes, labels=labels, autopct='%1.1f%%', wedgeprops=dict(width=0.4))
    plt.setp(autotexts, size=8)  # shrink the percentage labels
    # The title reports the number of rows actually counted.
    plt.title(f'大学排行前{len(rows)}名的省份分布(环形图)')
    plt.axis('equal')
    plt.show()


def main():
    """Fetch the 2023 ranking page, export the top rows and draw each chart.

    Up to 30 rows are exported and plotted. Nothing is drawn when the page
    cannot be downloaded or no rows could be parsed from it.
    """
    ulist = []
    url = 'https://www.shanghairanking.cn/rankings/bcur/202311.html'
    html = getHTMLText(url)
    if html is None:
        return
    fillUnivList(ulist, html)
    if not ulist:
        # Parsing produced no rows, so there is nothing to export or plot.
        print("未获取到排名数据")
        return
    # Never ask the helpers for more rows than were actually scraped.
    num = min(30, len(ulist))
    printUnivList(ulist, num)
    printUnivList2(ulist, num)  # bar chart
    drawLineChart(ulist)  # line chart
    generatePieChart(ulist, num)  # pie chart
    generateScatterPlot(ulist, num)  # scatter plot
    generateBoxPlot(ulist, num)  # box plot
    generateDonutChart(ulist, num)  # donut chart


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

下面是运行结果:


保存到csv

# Save to CSV
def printUnivList(ulist, num):
    """Write the first *num* rows of *ulist* to ``大学排行.csv`` and print them.

    The file is created (or overwritten) in the current working directory,
    starts with a fixed header row, and is encoded as UTF-8.

    Args:
        ulist: Rows of five items each (indices 0-4 are printed).
        num: Maximum number of rows to export; fewer are written when
            *ulist* is shorter.
    """
    file_name = "大学排行.csv"
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["排名", "大学名称", "省市", "类型", "总分"])
        # A slice never raises, unlike indexing range(num) on a shorter list.
        for u in ulist[:num]:
            writer.writerow(u)
            print(f"排名:{u[0]}\t大学名称:{u[1]}\t省市:{u[2]}\t类型:{u[3]}\t总分:{u[4]}")

柱形图

# Bar chart
def printUnivList2(ulist, num):
    """Show a bar chart of score (item 4) against rank (item 0).

    Each bar is annotated with item 1 of its row, in a small font.

    Args:
        ulist: Rows whose item 0 converts with ``int`` and item 4 with
            ``float``.
        num: Maximum number of rows to plot; fewer are used when *ulist* is
            shorter.
    """
    rows = ulist[:num]  # a slice never raises on a short list
    ranks = [int(u[0]) for u in rows]
    scores = [float(u[4]) for u in rows]
    names = [u[1] for u in rows]

    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.figure(dpi=300)  # render the figure at 300 dpi
    plt.bar(ranks, scores)
    plt.xlabel('排名')
    plt.ylabel('总分')
    plt.title('大学排名')

    for rank, score, name in zip(ranks, scores, names):
        # Put the university name just above its bar.
        plt.text(rank, score, name, ha='center', va='bottom', fontsize=4)

    plt.show()

折线图

# Line chart
def drawLineChart(ulist):
    """Show a line chart of the score (item 4) of every row in *ulist*.

    The x axis carries item 1 of each row (labelled as the university name)
    and each point is annotated with item 0 of the same row.

    Args:
        ulist: Rows whose item 4 can be converted with ``float``.
    """
    names = [u[1] for u in ulist]
    scores = [float(u[4]) for u in ulist]
    rank_labels = [u[0] for u in ulist]
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Create ONE figure with both settings. Two separate plt.figure() calls
    # open an extra blank figure and the dpi never reaches the drawn one.
    plt.figure(figsize=(10, 6), dpi=300)
    plt.plot(names, scores)
    plt.xlabel('大学名称')
    plt.ylabel('总分')
    plt.title('大学排行榜折线图')

    for name, score, rank in zip(names, scores, rank_labels):
        plt.text(name, score, rank, ha='center', va='bottom')

    plt.show()

饼图

# Pie chart
def generatePieChart(ulist, num):
    """Show a pie chart of how often each value of item 2 occurs.

    Only the first *num* rows of *ulist* are counted; the chart title labels
    item 2 as the province.

    Args:
        ulist: Rows with at least three items.
        num: Maximum number of rows to include; fewer are used when *ulist*
            is shorter.
    """
    rows = ulist[:num]  # a slice never raises on a short list
    provinces = {}
    for row in rows:
        provinces[row[2]] = provinces.get(row[2], 0) + 1

    # Hand matplotlib real lists rather than dict views, which NumPy does
    # not treat as sequences.
    labels = list(provinces)
    sizes = list(provinces.values())
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Create ONE figure with both settings. Two separate plt.figure() calls
    # open an extra blank figure and the dpi never reaches the drawn one.
    plt.figure(figsize=(8, 6), dpi=300)
    plt.pie(sizes, labels=labels, autopct='%1.1f%%')
    plt.axis('equal')
    # The title reports the number of rows actually counted.
    plt.title(f'大学排行前{len(rows)}名的省份分布')
    plt.show()

散点图

# Scatter plot
def generateScatterPlot(ulist, num):
    """Show a scatter plot of score (item 4) against list position.

    The x value is the 1-based position of the row in *ulist*, not item 0.
    Each point is annotated with item 1 of its row.

    Args:
        ulist: Rows whose item 4 can be converted with ``float``.
        num: Maximum number of rows to plot; fewer are used when *ulist* is
            shorter.
    """
    rows = ulist[:num]  # a slice never raises on a short list
    scores = [float(u[4]) for u in rows]
    ranks = list(range(1, len(rows) + 1))
    names = [u[1] for u in rows]
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Create ONE figure with both settings. Two separate plt.figure() calls
    # open an extra blank figure and the dpi never reaches the drawn one.
    plt.figure(figsize=(12, 8), dpi=300)
    plt.scatter(ranks, scores)
    plt.title('大学排名与总分的关系')
    plt.xlabel('排名')
    plt.ylabel('总分')

    for rank, score, name in zip(ranks, scores, names):
        plt.annotate(name, (rank, score), xytext=(5, 5), textcoords='offset points', fontsize=8)

    plt.show()

箱形图
 

# Box plot
def generateBoxPlot(ulist, num):
    """Show a box plot of the scores (item 4) of the first *num* rows.

    Args:
        ulist: Rows whose item 4 can be converted with ``float``.
        num: Maximum number of rows to include; fewer are used when *ulist*
            is shorter.
    """
    # A slice never raises, unlike indexing range(num) on a shorter list.
    scores = [float(u[4]) for u in ulist[:num]]
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Create ONE figure with both settings. Two separate plt.figure() calls
    # open an extra blank figure and the dpi never reaches the drawn one.
    plt.figure(figsize=(8, 6), dpi=300)
    plt.boxplot(scores)
    plt.title('大学总分箱形图')
    plt.ylabel('总分')
    plt.show()

环形图 

# Donut chart
def generateDonutChart(ulist, num):
    """Show a donut chart of how often each value of item 2 occurs.

    Only the first *num* rows of *ulist* are counted; the chart title labels
    item 2 as the province. The ring is a pie whose wedges have width 0.4.

    Args:
        ulist: Rows with at least three items.
        num: Maximum number of rows to include; fewer are used when *ulist*
            is shorter.
    """
    rows = ulist[:num]  # a slice never raises on a short list
    provinces = {}
    for row in rows:
        provinces[row[2]] = provinces.get(row[2], 0) + 1

    # Hand matplotlib real lists rather than dict views, which NumPy does
    # not treat as sequences.
    labels = list(provinces)
    sizes = list(provinces.values())
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Create ONE figure with both settings. Two separate plt.figure() calls
    # open an extra blank figure and the dpi never reaches the drawn one.
    plt.figure(figsize=(8, 6), dpi=300)
    _, _, autotexts = plt.pie(sizes, labels=labels, autopct='%1.1f%%', wedgeprops=dict(width=0.4))
    plt.setp(autotexts, size=8)  # shrink the percentage labels
    # The title reports the number of rows actually counted.
    plt.title(f'大学排行前{len(rows)}名的省份分布(环形图)')
    plt.axis('equal')
    plt.show()

关注博主下篇更精彩

一键三连!!!

一键三连!!!

一键三连!!!
感谢一键三连!!!

猜你喜欢

转载自blog.csdn.net/m0_56073435/article/details/131650604