2023深圳杯A

1.1

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Section 1.1: correlation heat map between diet variables and blood indicators.

# Use a CJK-capable font and make the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['simHei']
plt.rcParams['axes.unicode_minus'] = False

# Load the data. Replace this path with the location of your own copy of the
# file; the script cannot run if the path is wrong.
data_path = "附件22.csv"
data = pd.read_csv(data_path)

# The first data row holds the column names used below: promote it to the
# header and then drop it from the data.
data.columns = data.iloc[0]
data = data.drop(0)

# Work on a reproducible random subset of at most 50 rows. Capping at
# len(data) avoids the ValueError that sample(50) raises on smaller files.
data = data.sample(min(50, len(data)), random_state=42)

# Columns to analyse. Adjust this selection to suit your own analysis.
columns_of_interest = [
    "是否吃大米", "食用大米的频率",
    "是否吃小麦面粉", "食用小麦面粉的频率",
    "是否吃杂粮", "食用杂粮的频率",
    "是否吃薯类", "食用薯类的频率",
    "是否吃油炸面食", "食用油炸面食的频率",
    "是否吃猪肉", "食用猪肉的频率",
    "是否吃牛羊肉", "食用牛羊肉的频率",
    "是否吃禽肉", "食用禽肉的频率",
    "是否吃内脏类", "食用内脏的频率",
    "是否吃水产类", "食用水产的频率",
    "是否吃鲜奶", "食用鲜奶的频率",
    "是否吃奶粉", "食用奶粉的频率",
    "是否吃酸奶", "食用酸奶的频率",
    "是否吃蛋类", "食用蛋类的频率",
    "胆固醇", "血糖", "高密度脂蛋白", "低密度脂蛋白", "甘油三酯", "尿酸",
]

data_filtered = data[columns_of_interest]

# Drop columns with a single unique value: they carry no correlation signal.
data_filtered = data_filtered.loc[:, data_filtered.nunique() > 1]

# Drop columns that are entirely NaN.
data_filtered = data_filtered.dropna(how='all', axis=1)

# Convert every column to numeric; values that cannot be parsed become NaN.
data_filtered = data_filtered.apply(pd.to_numeric, errors='coerce')

# Pairwise correlation matrix.
corr_matrix = data_filtered.corr()

# Remove rows/columns of the matrix that are entirely NaN.
corr_matrix = corr_matrix.dropna(how='all').dropna(axis=1, how='all')

# Draw the heat map.
plt.figure(figsize=(14, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='viridis',
            cbar=True, square=True)

plt.title("变量之间的相关性热图")

plt.show()

1.2

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Section 1.2 (part 1): bar chart of the mean consumption frequency per food.

# Use a CJK-capable font and make the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['simHei']
plt.rcParams['axes.unicode_minus'] = False

# Load the data.
data = pd.read_csv("附件22.csv")

# The first data row holds the column names used below: promote it to the
# header and then drop it from the data.
data.columns = data.iloc[0]
data = data.iloc[1:]

# Columns of interest: an "是否吃X" flag and a "食用X的频率" frequency per food.
food_columns = [
    "是否吃大米", "食用大米的频率", "是否吃小麦面粉", "食用小麦面粉的频率", "是否吃杂粮", "食用杂粮的频率",
    "是否吃薯类", "食用薯类的频率", "是否吃油炸面食", "食用油炸面食的频率", "是否吃猪肉", "食用猪肉的频率",
    "是否吃牛羊肉", "食用牛羊肉的频率", "是否吃禽肉", "食用禽肉的频率", "是否吃鲜奶", "食用鲜奶的频率",
    "是否吃奶粉", "食用奶粉的频率", "是否吃酸奶", "食用酸奶的频率", "是否吃蛋类", "食用蛋类的频率",
]

# Select the columns and convert them to numeric (unparseable values -> NaN).
food_data = data[food_columns]
food_data = food_data.apply(pd.to_numeric, errors='coerce')

for column in food_data.columns:
    if column.startswith("是否吃"):
        # Missing "eats X?" flag is filled with 0 (treated as "does not eat").
        food_data[column] = food_data[column].fillna(0)
    elif column.startswith("食用"):
        # Missing frequency is filled with that food's mean frequency.
        food_data[column] = food_data[column].fillna(food_data[column].mean())

# Food name -> bar colour. Every key must match a "食用{food}的频率" column
# in food_columns, because the loop below builds the column name from it.
food_colors = {
    "大米": "#07B9F7", "小麦面粉": "#45A1F1", "杂粮": "#7087E1",
    "薯类": "#906AC5", "油炸面食": "#A54A9E", "猪肉": "#AC2571",
    "牛羊肉": "#07B9F7", "禽肉": "#45A1F1", "鲜奶": "#7087E1",
    "奶粉": "#906AC5", "酸奶": "#A54A9E", "蛋类": "#AC2571",
}

# Initialise the figure.
plt.figure(figsize=(20, 10))

# One bar per food.
for i, (food, color) in enumerate(food_colors.items()):
    # Mean consumption frequency of this food.
    food_frequency = food_data[f"食用{food}的频率"].mean()

    # Bar of width 0.5 at x position i.
    plt.bar(i, food_frequency, color=color, width=0.5)

    # Value label just above the bar.
    plt.text(i, food_frequency + 0.01, str(round(food_frequency, 2)),
             ha='center', va='bottom', fontsize=10, color='black')

# Title and axis labels.
plt.title("食物的平均食用频率", fontsize=16, color='blue')
plt.xlabel("食物", fontsize=12, color='blue')
plt.ylabel("平均食用频率", fontsize=12, color='blue')

# X ticks labelled with the food names.
plt.xticks(range(len(food_colors)), list(food_colors), rotation=45)

# Light dashed grid.
plt.grid(True, linestyle='--', color='grey', alpha=0.5)

# Show the figure.
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# Section 1.2 (part 2): Spearman correlation between each "eats X?" flag and
# blood sugar, min-max scaled for display as a horizontal bar chart.

# Use a CJK-capable font so the Chinese labels and title render.
plt.rcParams['font.sans-serif'] = ['simHei']
plt.rcParams['axes.unicode_minus'] = False

# Load the data; the first line of the file is skipped so that the second
# line becomes the header.
data = pd.read_csv("附件22.csv", skiprows=1)

# Columns of interest.
# NOTE(review): the original listed "是否吃水产"; it is spelled "是否吃水产类"
# here to agree with the section 1.1 column list - confirm against the CSV header.
food_columns = ["是否吃大米", "是否吃小麦面粉", "是否吃杂粮", "是否吃薯类", "是否吃油炸面食",
                "是否吃猪肉", "是否吃牛羊肉", "是否吃禽肉", "是否吃内脏类", "是否吃水产类",
                "是否吃鲜奶", "是否吃奶粉", "是否吃酸奶", "是否吃蛋类"]

# Blood sugar values.
y = data["血糖"]

# Fill missing values with the column mean. numeric_only=True is required on
# pandas >= 2.0, where mean() over text columns raises instead of skipping them.
data_filled = data.fillna(data.mean(numeric_only=True))

# Spearman correlation coefficient of each food flag with blood sugar.
# Stored as an ndarray so the arithmetic below is element-wise.
correlation = np.array([
    spearmanr(data_filled[food_column], data_filled["血糖"])[0]
    for food_column in food_columns
])

# Min-max scale the coefficients to [0, 1] for visualisation. NaN
# coefficients (e.g. from a constant column) are ignored when finding the
# range; a zero range maps everything to 0 instead of dividing by zero.
corr_min = np.nanmin(correlation)
corr_range = np.nanmax(correlation) - corr_min
if corr_range == 0:
    correlation_scaled = np.zeros_like(correlation)
else:
    correlation_scaled = (correlation - corr_min) / corr_range

# Colour cycle, trimmed to one colour per bar.
colors = (["#D707F7", "#FF4577", "#FF914D", "#FFCA46", "#F9F871"] * 3)[:len(food_columns)]

# Create the bar chart.
fig, ax = plt.subplots(figsize=(10, 6))

# One horizontal bar per food column.
bars = ax.barh(range(len(correlation_scaled)), correlation_scaled, color=colors)

# Y-axis tick labels are the column names.
ax.set_yticks(range(len(food_columns)))
ax.set_yticklabels(food_columns)

plt.title('食物与血糖的斯皮尔曼相关性')
plt.show()

2.1

# Import the required libraries
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Section 2.1: chi-square tests of lifestyle / diet habits against
# demographic factors, followed by bar charts of the significant pairs.

# Use a CJK-capable font and make the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['simHei']
plt.rcParams['axes.unicode_minus'] = False

# Read the data (first line of the file skipped) and strip spaces from the
# column names.
data_path = "附件22.csv"
data = pd.read_csv(data_path, skiprows=1)
data.columns = data.columns.str.replace(' ', '')

# Fill missing values with 0.
data_filled = data.fillna(0)

# Age groups. The file stores the birth year, so it is converted to an age
# before binning; binning the raw year would put every row in the last group.
# NOTE(review): 2023 is assumed as the reference year, matching section 4.1.
SURVEY_YEAR = 2023
# Lower edge is 17 because pd.cut intervals are right-closed: (17, 30] -> "18-30".
bins = [17, 30, 40, 50, 60, np.inf]
labels = ['18-30', '31-40', '41-50', '51-60', '61+']
# Use the unfilled birth year so a missing value stays NaN (no age group)
# rather than becoming year 0.
age = SURVEY_YEAR - pd.to_numeric(data['出生年'], errors='coerce')
data_filled['age_group'] = pd.cut(age, bins=bins, labels=labels)

# Lifestyle, diet and demographic columns. Names repeated in the original
# lists are kept once, since a repeated name only repeats the same test.
# NOTE(review): both list literals were cut off in the source; extend them
# with further columns if needed.
lifestyle_cols = ['是否吸烟', '开始吸烟年龄', '平均每周吸烟天数', '一天吸烟支数', '被动吸烟天数',
                  '是否饮酒', '饮酒年数', '是否饮用高度白酒', '饮用频率', '是否饮用黄酒、糯米酒',
                  '平均每次饮用量', '是否饮用葡萄酒']
diet_cols = ['是否吃大米', '食用大米的频率', '平均每次食用量', '是否吃小麦面粉', '食用小麦面粉的频率',
             '是否吃杂粮', '食用杂粮的频率']
demo_cols = ['age_group', '性别', '婚姻状况', '文化程度', '职业']

RESULT_COLUMNS = ['variable1', 'variable2', 'chi2', 'p-value']


def _chi2_against_demographics(frame, habit_cols, demographic_cols):
    """Run a chi-square independence test for every habit/demographic pair.

    Returns a DataFrame with columns variable1 (habit), variable2
    (demographic), chi2 and p-value. Pairs whose contingency table has no
    rows or columns left are skipped.
    """
    rows = []
    for col1 in habit_cols:
        for col2 in demographic_cols:
            contingency_table = pd.crosstab(frame[col1], frame[col2])
            # Drop all-zero rows/columns: chi2_contingency rejects tables
            # whose expected frequencies contain zeros.
            contingency_table = contingency_table.loc[
                contingency_table.sum(axis=1) > 0,
                contingency_table.sum(axis=0) > 0,
            ]
            if contingency_table.size == 0:
                continue
            chi2, p, _, _ = chi2_contingency(contingency_table)
            rows.append({'variable1': col1, 'variable2': col2,
                         'chi2': chi2, 'p-value': p})
    # Collect rows in a list and build the frame once: DataFrame.append was
    # removed in pandas 2.0.
    return pd.DataFrame(rows, columns=RESULT_COLUMNS)


def _plot_significant(results, title_prefix):
    """Draw one subplot per variable1 value showing chi2 per demographic."""
    variables = results['variable1'].unique()
    n_subplots = len(variables)
    if n_subplots == 0:
        # Nothing significant: a figure of height 0 would raise.
        return

    plt.figure(figsize=(10, 6 * n_subplots))
    for i, var in enumerate(variables, start=1):
        plt.subplot(n_subplots, 1, i)
        data_to_plot = results[results['variable1'] == var]
        # NOTE(review): the original explicit colour list was truncated in
        # the source ('#38147c', '#00440', ...); a named palette is used instead.
        sns.barplot(data=data_to_plot, x='chi2', y='variable2', hue='variable2',
                    dodge=False, palette='viridis')
        plt.xlabel('卡方统计量')
        plt.ylabel('人口统计因素')
        plt.title(f'{title_prefix}: {var}')
        plt.legend(title='人口统计因素')

    plt.tight_layout()
    plt.show()


# Run the chi-square tests.
chi2_results_lifestyle = _chi2_against_demographics(data_filled, lifestyle_cols, demo_cols)
chi2_results_diet = _chi2_against_demographics(data_filled, diet_cols, demo_cols)

# Keep the significant pairs (p < 0.05), strongest association first.
subset_lifestyle = chi2_results_lifestyle[chi2_results_lifestyle['p-value'] < 0.05].sort_values(by='chi2', ascending=False)
subset_diet = chi2_results_diet[chi2_results_diet['p-value'] < 0.05].sort_values(by='chi2', ascending=False)

_plot_significant(subset_lifestyle, '生活习惯')
_plot_significant(subset_diet, '饮食习惯')

3.1

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Section 3.1: gradient-boosting classifier predicting the diagnosis column
# from lifestyle and diet columns, plus a feature-importance chart.

# Use a CJK-capable font and make the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['simHei']
plt.rcParams['axes.unicode_minus'] = False

# Read the data (first line of the file skipped) and fill missing values with 0.
data = pd.read_csv("附件22.csv", skiprows=1)
data.fillna(0, inplace=True)

# Feature and target columns.
lifestyle_cols = ['是否吸烟', '开始吸烟年龄', '平均每周吸烟天数', '一天吸烟支数', '被动吸烟天数',
                  '是否饮酒', '饮酒年数', '是否饮用高度白酒', '饮用频率', '平均每次饮用量',
                  '是否饮用低度白酒', '是否饮用啤酒', '是否饮用黄酒、糯米酒', '是否饮用葡萄酒']
# '平均每次食用量' appeared twice in the original list; it is kept once so the
# same column is not fed to the model as two identical features.
diet_cols = ['是否吃大米', '食用大米的频率', '平均每次食用量', '是否吃小麦面粉',
             '食用小麦面粉的频率', '是否吃杂粮', '食用杂粮的频率']
# Alternative target: '您有没有被社区或以上医院的医生诊断过患有糖尿病'
target_col = '您有没有被社区或以上医院的医生诊断过患有高血压'

# Split into training and test sets (80/20, fixed seed).
X_train, x_test, y_train, y_test = train_test_split(
    data[lifestyle_cols + diet_cols], data[target_col],
    test_size=0.2, random_state=42)

# Train the model.
gbm = GradientBoostingClassifier(random_state=42)
gbm.fit(X_train, y_train)

# Predict on the test set.
y_pred = gbm.predict(x_test)

# Print the classification report.
print(classification_report(y_test, y_pred))

# Collect the feature importances into a DataFrame.
feature_importances = gbm.feature_importances_
feature_names = X_train.columns
feature_importances_df = pd.DataFrame({
    '特性': feature_names,
    '重要性': feature_importances
})

# Horizontal bar chart of the 20 most important features. Sort first:
# slicing the unsorted frame would just take the first 20 columns.
plt.figure(figsize=(10, 8))  # adjust the figure size as needed
top_20_features = feature_importances_df.sort_values('重要性', ascending=False).head(20)
barplot = sns.barplot(x='重要性', y='特性', data=top_20_features, ci=None)

plt.title('特征重要性')
plt.xlabel('相对重要性')
plt.show()

3.2

# Import the required libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Section 3.2: gradient-boosting classifier predicting the diagnosis column
# from physical-activity columns, plus a chart of the top-3 feature importances.

# Silence warnings.
warnings.filterwarnings("ignore")

# Use a CJK-capable font and make the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['simHei']
plt.rcParams['axes.unicode_minus'] = False

# Read the data (first line of the file skipped).
dataframe = pd.read_csv("附件22.csv", skiprows=1)

# Fill missing values with 0.
dataframe.fillna(0, inplace=True)

# Feature columns.
feature_cols = ['工作主要属于以下何种活动', '您做休闲、家务活动的强度', '是否参加体育锻炼', '体育锻炼的强度', '平均每天体育锻炼时间']

# Target column. Alternative: '您有没有被社区或以上医院的医生诊断过患有糖尿病'
# NOTE(review): the original string lacked the leading "您"; it is restored
# to match the name used in section 3.1 - confirm against the CSV header.
target_col = '您有没有被社区或以上医院的医生诊断过患有高血压'

# Split into training and test sets (80/20, fixed seed).
train_features, test_features, train_labels, test_labels = train_test_split(
    dataframe[feature_cols], dataframe[target_col],
    test_size=0.2, random_state=42)

# Train the model.
model = GradientBoostingClassifier(random_state=42)
model.fit(train_features, train_labels)

# Predict on the test set.
predicted_labels = model.predict(test_features)

# Build and print the classification report (the original computed it but
# never printed it).
classification_report_output = classification_report(test_labels, predicted_labels)
print(classification_report_output)

# Collect the feature importances into a DataFrame.
importances = model.feature_importances_
feature_names = train_features.columns
importances_dataframe = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Bar chart of the three most important features.
plt.figure(figsize=(10, 8))  # adjust the figure size as needed
sns.barplot(x='Importance', y='Feature',
            data=importances_dataframe.sort_values('Importance', ascending=False).iloc[:3],
            ci=None,
            palette=sns.color_palette(["#86F9Bc", "#4AC087", "#008955"]))

plt.title("特征重要性")
plt.xlabel('相对是否患有高血压重要性')
plt.show()

4.1

# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Section 4.1 (setup): derive age, blood-pressure and blood-sugar categories
# and prepare a 2x2 grid of axes for the per-age-group charts.

# Silence warnings.
warnings.filterwarnings("ignore")

# Use a CJK-capable font and make the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['simHei']
plt.rcParams['axes.unicode_minus'] = False

# Read the CSV file, skipping its first line.
data = pd.read_csv("附件22.csv", skiprows=1)

# Age group, derived from the birth year.
current_year = 2023
data['年龄'] = current_year - data['出生年']
bins = [0, 18, 40, 60, np.inf]
labels = ['少年', '青年', '中年', '老年']
data['年龄段'] = pd.cut(data['年龄'], bins=bins, labels=labels)

# Blood-pressure class from systolic pressure: above 130 counts as hypertension.
bins = [0, 120, 130, np.inf]
labels = ['正常', '高血压高风险', '高血压']
data['血压分类'] = pd.cut(data['收缩压'], bins=bins, labels=labels)

# Blood-sugar class: above 7.0 counts as diabetes.
bins = [0, 5.6, 7.0, np.inf]
labels = ['正常', '糖尿病高风险', '糖尿病']
data['血糖分类'] = pd.cut(data['血糖'], bins=bins, labels=labels)

# Figure with a 2x2 grid of axes, flattened for indexing by counter.
fig, axs = plt.subplots(2, 2, figsize=(14, 14))
axs = axs.ravel()

# Age groups present in the data, in order of first appearance.
age_groups = data['年龄段'].unique()

# Index of the next free subplot.
counter = 0

# Colour list shared by both rings of each chart.
# NOTE(review): four entries were not valid hex colours in the source
# ('#OEA008', '#FE7FF', '#IFFA500', '#F8C00'). They are reconstructed as
# '#0EA008', '#0FE7FF', '#FFA500' and '#FF8C00'; '#0FE7FF' in particular is
# a guess - adjust if the intended palette is known.
colors_list = ['#58D950', '#0EA008', '#0FE7FF', '#5ABAB6', '#FFA500',
               '#F24F00', '#006400', '#8B0000', '#FFD700', '#FF8C00']

def make_autopct(values):
    """Build an ``autopct`` callback that shows both percentage and count.

    Args:
        values: The wedge sizes given to ``pie``; their sum is used to turn
            a percentage back into an absolute count.

    Returns:
        A function taking a percentage ``pct`` and returning a label of the
        form ``"25.00%  (1)"``.
    """
    def my_autopct(pct):
        total = sum(values)
        # Recover the absolute count this percentage represents.
        val = int(round(pct * total / 100.0))
        return '{p:.2f}%  ({v:d})'.format(p=pct, v=val)
    return my_autopct

# Section 4.1 (plotting): one nested ring chart per age group. The outer
# ring shows the blood-pressure classes, the inner ring the blood-sugar classes.
for age_group in age_groups:

    # Rows belonging to the current age group.
    data_age_group = data[data['年龄段'] == age_group]

    # Skip age groups with no rows (this also covers a NaN group label,
    # which never compares equal).
    if data_age_group.empty:
        continue

    # Take the next free subplot.
    ax = axs[counter]
    counter += 1

    # Head count per blood-pressure class. Zero-count classes are dropped so
    # they do not produce empty wedges, labels and legend entries.
    bp_counts = data_age_group['血压分类'].value_counts()
    bp_counts = bp_counts[bp_counts > 0]

    # Head count per blood-sugar class, likewise without zero-count classes.
    bs_counts = data_age_group['血糖分类'].value_counts()
    bs_counts = bs_counts[bs_counts > 0]

    # Consecutive slices of the colour list, so the two rings do not share colours.
    bp_colors = colors_list[:len(bp_counts)]
    bs_colors = colors_list[len(bp_counts):len(bp_counts) + len(bs_counts)]

    # Outer ring: blood pressure.
    bp_wedges, texts, autotexts = ax.pie(
        bp_counts, radius=1, autopct=make_autopct(bp_counts), pctdistance=0.85,
        wedgeprops=dict(width=0.3, edgecolor='w'), colors=bp_colors)

    for text, autotext in zip(texts, autotexts):
        text.set_fontsize(12)
        autotext.set_fontsize(12)

    # Inner ring: blood sugar.
    bs_wedges, texts, autotexts = ax.pie(
        bs_counts, radius=0.7, autopct=make_autopct(bs_counts), pctdistance=0.55,
        wedgeprops=dict(width=0.3, edgecolor='w'), colors=bs_colors)

    for text, autotext in zip(texts, autotexts):
        text.set_fontsize(10)
        autotext.set_fontsize(10)

    # Subplot title.
    ax.set_title(f'年龄段: {age_group}', fontsize=16)

    # Legend covering both rings. Handles and labels are both passed by
    # keyword: mixing positional handles with keyword labels is deprecated
    # in matplotlib and the positional handles may be discarded.
    ax.legend(handles=list(bp_wedges) + list(bs_wedges),
              labels=list(bp_counts.index) + list(bs_counts.index),
              title="血压分类&血糖分类", loc="upper left", bbox_to_anchor=(0, 1.1))

# Remove the subplots that were not used.
for i in range(counter, 4):
    fig.delaxes(axs[i])

# Tidy the layout.
plt.tight_layout()

# Show the figure.
plt.show()

猜你喜欢

转载自blog.csdn.net/qq_53011270/article/details/131977734
今日推荐