目录
使用 Python 爬取 2023 年中国大学排名并进行数据分析。话不多说,这是 2023 年最新可用的版本,复制即可运行,直接上代码。
import requests
from bs4 import BeautifulSoup
import csv
import matplotlib.pyplot as plt
def getHTMLText(url):
    """Download *url* and return the response body as text.

    The response encoding is set from ``apparent_encoding`` before the text
    is decoded.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or ``None`` when the request fails (connection
        error, timeout after 30 seconds, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only request failures are treated as "crawl failed"; unrelated
        # errors (and Ctrl-C) are no longer swallowed by a bare ``except``.
        print("爬取失败")
        return None
    r.encoding = r.apparent_encoding
    return r.text
def fillUnivList(ulist, html):
    """Parse the ranking table in *html* and append its rows to *ulist*.

    The table is located by the CSS class ``rk-table``. Every ``<tr>`` of its
    ``<tbody>`` with at least five cells contributes one list of five
    stripped strings, taken from cells 0-4; the CSV header written elsewhere
    in this file labels them rank, name, province, type and total score.

    Args:
        ulist: List that receives the parsed rows (mutated in place).
        html: HTML text of the ranking page.

    Returns:
        None. A message is printed and nothing is appended when the table or
        its ``<tbody>`` cannot be found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', class_='rk-table')
    if table is None:
        print("未找到排名表格")
        return
    tbody = table.find('tbody')
    if tbody is None:
        print("未找到<tbody>标签")
        return
    for tr in tbody.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) < 5:  # skip incomplete rows
            continue
        # ``.string`` is None whenever a tag has more than one child, so the
        # cells are read with ``.text`` instead; the name falls back to the
        # whole cell when it contains no <a> element.
        link = tds[1].find('a')
        name_tag = tds[1] if link is None else link
        ulist.append([
            tds[0].text.strip(),
            name_tag.text.strip(),
            tds[2].text.strip(),
            tds[3].text.strip(),
            tds[4].text.strip(),
        ])
def printUnivList(ulist, num):
    """Write the first *num* rows of *ulist* to a CSV file and print them.

    The file "大学排行.csv" is (re)created in the current working directory
    with a header row followed by one line per university.

    Args:
        ulist: Rows of five values each (rank, name, province, type, score).
        num: Maximum number of rows to output. If *ulist* holds fewer rows,
            only the available rows are written.
    """
    file_name = "大学排行.csv"
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["排名", "大学名称", "省市", "类型", "总分"])
        # Slicing (rather than indexing over range(num)) avoids an IndexError
        # when fewer than ``num`` rows were scraped.
        for u in ulist[:max(num, 0)]:
            writer.writerow(u)
            print(f"排名:{u[0]}\t大学名称:{u[1]}\t省市:{u[2]}\t类型:{u[3]}\t总分:{u[4]}")
# Line chart
def drawLineChart(ulist):
    """Show a line chart of total score per university for all rows of *ulist*.

    The x axis carries the university names (column 1), the y axis the total
    score (column 4, converted to float), and each point is annotated with
    the value of column 0 (the rank).

    Args:
        ulist: Rows of five values each (rank, name, province, type, score).
    """
    names = [u[1] for u in ulist]
    scores = [float(u[4]) for u in ulist]
    rank_labels = [u[0] for u in ulist]
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # One figure carrying both settings; two separate plt.figure() calls
    # would leave an extra empty figure and drop the dpi on the real plot.
    plt.figure(figsize=(10, 6), dpi=300)
    plt.plot(names, scores)
    plt.xlabel('大学名称')
    plt.ylabel('总分')
    plt.title('大学排行榜折线图')
    for name, score, rank in zip(names, scores, rank_labels):
        plt.text(name, score, rank, ha='center', va='bottom')
    plt.show()
# Pie chart
def generatePieChart(ulist, num):
    """Show a pie chart of the province distribution of the first *num* rows.

    Args:
        ulist: Rows of five values each; column 2 is used as the province.
        num: Maximum number of rows to count. If *ulist* holds fewer rows,
            only the available rows are counted.
    """
    # Slicing avoids an IndexError when fewer than ``num`` rows exist.
    rows = ulist[:max(num, 0)]
    provinces = {}
    for row in rows:
        provinces[row[2]] = provinces.get(row[2], 0) + 1
    labels = list(provinces)
    sizes = list(provinces.values())
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # One figure carrying both settings; two separate plt.figure() calls
    # would leave an extra empty figure and drop the dpi on the real plot.
    plt.figure(figsize=(8, 6), dpi=300)
    plt.pie(sizes, labels=labels, autopct='%1.1f%%')
    plt.axis('equal')
    # The title reports the number of rows actually counted instead of a
    # hard-coded 30.
    plt.title(f'大学排行前{len(rows)}名的省份分布')
    plt.show()
# plt.rcParams['font.sans-serif'] = ['SimHei']
# Bar chart
def printUnivList2(ulist, num):
    """Show a bar chart of total score by rank for the first *num* rows.

    Args:
        ulist: Rows of five values each; column 0 must be convertible to int
            and column 4 to float.
        num: Maximum number of rows to plot. If *ulist* holds fewer rows,
            only the available rows are plotted.
    """
    # Slicing avoids an IndexError when fewer than ``num`` rows exist.
    rows = ulist[:max(num, 0)]
    ranks = [int(u[0]) for u in rows]
    scores = [float(u[4]) for u in rows]
    names = [u[1] for u in rows]
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.figure(dpi=300)  # render the figure at 300 dpi
    plt.bar(ranks, scores)
    plt.xlabel('排名')
    plt.ylabel('总分')
    plt.title('大学排名')
    for rank, score, name in zip(ranks, scores, names):
        # Put the university name just above its bar.
        plt.text(rank, score, name, ha='center', va='bottom', fontsize=4)
    plt.show()
# Scatter plot
def generateScatterPlot(ulist, num):
    """Show a scatter plot of total score against position for *num* rows.

    The x value is the 1-based position of the row in *ulist*, not the rank
    text stored in column 0.

    Args:
        ulist: Rows of five values each; column 4 must be convertible to
            float and column 1 is used as the point label.
        num: Maximum number of rows to plot. If *ulist* holds fewer rows,
            only the available rows are plotted.
    """
    # Slicing avoids an IndexError when fewer than ``num`` rows exist.
    rows = ulist[:max(num, 0)]
    scores = [float(u[4]) for u in rows]
    ranks = list(range(1, len(rows) + 1))
    names = [u[1] for u in rows]
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # One figure carrying both settings; two separate plt.figure() calls
    # would leave an extra empty figure and drop the dpi on the real plot.
    plt.figure(figsize=(12, 8), dpi=300)
    plt.scatter(ranks, scores)
    plt.title('大学排名与总分的关系')
    plt.xlabel('排名')
    plt.ylabel('总分')
    for rank, score, name in zip(ranks, scores, names):
        plt.annotate(name, (rank, score), xytext=(5, 5), textcoords='offset points', fontsize=8)
    plt.show()
# Box plot
def generateBoxPlot(ulist, num):
    """Show a box plot of the total scores of the first *num* rows.

    Args:
        ulist: Rows of five values each; column 4 must be convertible to
            float.
        num: Maximum number of rows to include. If *ulist* holds fewer rows,
            only the available rows are included.
    """
    # Slicing avoids an IndexError when fewer than ``num`` rows exist.
    scores = [float(u[4]) for u in ulist[:max(num, 0)]]
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # One figure carrying both settings; two separate plt.figure() calls
    # would leave an extra empty figure and drop the dpi on the real plot.
    plt.figure(figsize=(8, 6), dpi=300)
    plt.boxplot(scores)
    plt.title('大学总分箱形图')
    plt.ylabel('总分')
    plt.show()
# Donut chart
def generateDonutChart(ulist, num):
    """Show a donut chart of the province distribution of the first *num* rows.

    Args:
        ulist: Rows of five values each; column 2 is used as the province.
        num: Maximum number of rows to count. If *ulist* holds fewer rows,
            only the available rows are counted.
    """
    # Slicing avoids an IndexError when fewer than ``num`` rows exist.
    rows = ulist[:max(num, 0)]
    provinces = {}
    for row in rows:
        provinces[row[2]] = provinces.get(row[2], 0) + 1
    labels = list(provinces)
    sizes = list(provinces.values())
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # One figure carrying both settings; two separate plt.figure() calls
    # would leave an extra empty figure and drop the dpi on the real plot.
    plt.figure(figsize=(8, 6), dpi=300)
    # A wedge width below the radius leaves the hole that makes the donut.
    _, _, autotexts = plt.pie(sizes, labels=labels, autopct='%1.1f%%', wedgeprops=dict(width=0.4))
    plt.setp(autotexts, size=8)
    # The title reports the number of rows actually counted instead of a
    # hard-coded 30.
    plt.title(f'大学排行前{len(rows)}名的省份分布(环形图)')
    plt.axis('equal')
    plt.show()
def main():
    """Scrape the ranking page, save the top rows to CSV and draw all charts.

    Nothing is written or plotted when the page cannot be downloaded or no
    table rows could be parsed from it.
    """
    ulist = []
    url = 'https://www.shanghairanking.cn/rankings/bcur/202311.html'
    html = getHTMLText(url)
    if html is None:
        return
    fillUnivList(ulist, html)
    if not ulist:
        # No rows were parsed, so there is nothing to save or plot.
        return
    # Never ask the helpers for more rows than were actually parsed; they
    # index the list by position.
    num = min(30, len(ulist))
    printUnivList(ulist, num)
    printUnivList2(ulist, num)  # bar chart
    drawLineChart(ulist)  # line chart
    generatePieChart(ulist, num)  # pie chart
    generateScatterPlot(ulist, num)  # scatter plot
    generateBoxPlot(ulist, num)  # box plot
    generateDonutChart(ulist, num)  # donut chart
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
下面是运行结果:
保存到csv
# Save to CSV
def printUnivList(ulist, num):
    """Write the first *num* rows of *ulist* to a CSV file and print them.

    The file "大学排行.csv" is (re)created in the current working directory
    with a header row followed by one line per university.

    Args:
        ulist: Rows of five values each (rank, name, province, type, score).
        num: Maximum number of rows to output. If *ulist* holds fewer rows,
            only the available rows are written.
    """
    file_name = "大学排行.csv"
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["排名", "大学名称", "省市", "类型", "总分"])
        # Slicing (rather than indexing over range(num)) avoids an IndexError
        # when fewer than ``num`` rows were scraped.
        for u in ulist[:max(num, 0)]:
            writer.writerow(u)
            print(f"排名:{u[0]}\t大学名称:{u[1]}\t省市:{u[2]}\t类型:{u[3]}\t总分:{u[4]}")
柱形图
# Bar chart
def printUnivList2(ulist, num):
    """Show a bar chart of total score by rank for the first *num* rows.

    Args:
        ulist: Rows of five values each; column 0 must be convertible to int
            and column 4 to float.
        num: Maximum number of rows to plot. If *ulist* holds fewer rows,
            only the available rows are plotted.
    """
    # Slicing avoids an IndexError when fewer than ``num`` rows exist.
    rows = ulist[:max(num, 0)]
    ranks = [int(u[0]) for u in rows]
    scores = [float(u[4]) for u in rows]
    names = [u[1] for u in rows]
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.figure(dpi=300)  # render the figure at 300 dpi
    plt.bar(ranks, scores)
    plt.xlabel('排名')
    plt.ylabel('总分')
    plt.title('大学排名')
    for rank, score, name in zip(ranks, scores, names):
        # Put the university name just above its bar.
        plt.text(rank, score, name, ha='center', va='bottom', fontsize=4)
    plt.show()
折线图
# Line chart
def drawLineChart(ulist):
    """Show a line chart of total score per university for all rows of *ulist*.

    The x axis carries the university names (column 1), the y axis the total
    score (column 4, converted to float), and each point is annotated with
    the value of column 0 (the rank).

    Args:
        ulist: Rows of five values each (rank, name, province, type, score).
    """
    names = [u[1] for u in ulist]
    scores = [float(u[4]) for u in ulist]
    rank_labels = [u[0] for u in ulist]
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # One figure carrying both settings; two separate plt.figure() calls
    # would leave an extra empty figure and drop the dpi on the real plot.
    plt.figure(figsize=(10, 6), dpi=300)
    plt.plot(names, scores)
    plt.xlabel('大学名称')
    plt.ylabel('总分')
    plt.title('大学排行榜折线图')
    for name, score, rank in zip(names, scores, rank_labels):
        plt.text(name, score, rank, ha='center', va='bottom')
    plt.show()
饼图
# Pie chart
def generatePieChart(ulist, num):
    """Show a pie chart of the province distribution of the first *num* rows.

    Args:
        ulist: Rows of five values each; column 2 is used as the province.
        num: Maximum number of rows to count. If *ulist* holds fewer rows,
            only the available rows are counted.
    """
    # Slicing avoids an IndexError when fewer than ``num`` rows exist.
    rows = ulist[:max(num, 0)]
    provinces = {}
    for row in rows:
        provinces[row[2]] = provinces.get(row[2], 0) + 1
    labels = list(provinces)
    sizes = list(provinces.values())
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # One figure carrying both settings; two separate plt.figure() calls
    # would leave an extra empty figure and drop the dpi on the real plot.
    plt.figure(figsize=(8, 6), dpi=300)
    plt.pie(sizes, labels=labels, autopct='%1.1f%%')
    plt.axis('equal')
    # The title reports the number of rows actually counted instead of a
    # hard-coded 30.
    plt.title(f'大学排行前{len(rows)}名的省份分布')
    plt.show()
散点图
# Scatter plot
def generateScatterPlot(ulist, num):
    """Show a scatter plot of total score against position for *num* rows.

    The x value is the 1-based position of the row in *ulist*, not the rank
    text stored in column 0.

    Args:
        ulist: Rows of five values each; column 4 must be convertible to
            float and column 1 is used as the point label.
        num: Maximum number of rows to plot. If *ulist* holds fewer rows,
            only the available rows are plotted.
    """
    # Slicing avoids an IndexError when fewer than ``num`` rows exist.
    rows = ulist[:max(num, 0)]
    scores = [float(u[4]) for u in rows]
    ranks = list(range(1, len(rows) + 1))
    names = [u[1] for u in rows]
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # One figure carrying both settings; two separate plt.figure() calls
    # would leave an extra empty figure and drop the dpi on the real plot.
    plt.figure(figsize=(12, 8), dpi=300)
    plt.scatter(ranks, scores)
    plt.title('大学排名与总分的关系')
    plt.xlabel('排名')
    plt.ylabel('总分')
    for rank, score, name in zip(ranks, scores, names):
        plt.annotate(name, (rank, score), xytext=(5, 5), textcoords='offset points', fontsize=8)
    plt.show()
箱形图
# Box plot
def generateBoxPlot(ulist, num):
    """Show a box plot of the total scores of the first *num* rows.

    Args:
        ulist: Rows of five values each; column 4 must be convertible to
            float.
        num: Maximum number of rows to include. If *ulist* holds fewer rows,
            only the available rows are included.
    """
    # Slicing avoids an IndexError when fewer than ``num`` rows exist.
    scores = [float(u[4]) for u in ulist[:max(num, 0)]]
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # One figure carrying both settings; two separate plt.figure() calls
    # would leave an extra empty figure and drop the dpi on the real plot.
    plt.figure(figsize=(8, 6), dpi=300)
    plt.boxplot(scores)
    plt.title('大学总分箱形图')
    plt.ylabel('总分')
    plt.show()
环形图
# Donut chart
def generateDonutChart(ulist, num):
    """Show a donut chart of the province distribution of the first *num* rows.

    Args:
        ulist: Rows of five values each; column 2 is used as the province.
        num: Maximum number of rows to count. If *ulist* holds fewer rows,
            only the available rows are counted.
    """
    # Slicing avoids an IndexError when fewer than ``num`` rows exist.
    rows = ulist[:max(num, 0)]
    provinces = {}
    for row in rows:
        provinces[row[2]] = provinces.get(row[2], 0) + 1
    labels = list(provinces)
    sizes = list(provinces.values())
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # One figure carrying both settings; two separate plt.figure() calls
    # would leave an extra empty figure and drop the dpi on the real plot.
    plt.figure(figsize=(8, 6), dpi=300)
    # A wedge width below the radius leaves the hole that makes the donut.
    _, _, autotexts = plt.pie(sizes, labels=labels, autopct='%1.1f%%', wedgeprops=dict(width=0.4))
    plt.setp(autotexts, size=8)
    # The title reports the number of rows actually counted instead of a
    # hard-coded 30.
    plt.title(f'大学排行前{len(rows)}名的省份分布(环形图)')
    plt.axis('equal')
    plt.show()
关注博主下篇更精彩
一键三连!!!
一键三连!!!
一键三连!!!
感谢一键三连!!!