Data Mining Competition - Diabetes Genetic Risk Detection Challenge Baseline

This is a data mining competition: participants build a model from the training set, make predictions on the test set, and submit the results.
The task is to build a model that predicts whether a patient has diabetes based on the patient's examination data. This is a typical binary classification problem (with diabetes / without diabetes), where the model outputs 0 or 1 (with diabetes: 1, without diabetes: 0).

Competition link: https://challenge.xfyun.cn/topic/info?type=diabetes&option=tjjg

This walkthrough follows DataWhale's data mining competition tutorial:
https://xj15uxcopw.feishu.cn/docx/doxcn5bbI3eupMF95XW5Y5ZM6jd

DataWhale provides a LightGBM baseline. I also wrote a neural network baseline; although its performance is not as good as LightGBM's, I hope everyone can continue to optimize on this basis.

The downloaded data files are as follows (I renamed them to English):

  • submit.csv : sample submission result
  • test.csv : test set data
  • train.csv : training set data

Run the following code in a Jupyter notebook or any other Python environment.

1. Data preprocessing

Import third-party libraries

import pandas as pd
import numpy as np

data1 = pd.read_csv('data/train.csv', encoding='gbk')
data2 = pd.read_csv('data/test.csv', encoding='gbk')

# Mark the test-set label as -1 so the two sets can be separated again later
data2['患有糖尿病标识'] = -1
# Concatenate the training and test sets
data = pd.concat([data1, data2], axis=0, ignore_index=True)
# Feature engineering
# print(data.isnull().any(0))  # check whether the dataset contains missing values
# Fill missing diastolic blood pressure (舒张压) values with the column mean
data['舒张压'] = data['舒张压'].fillna(data['舒张压'].mean())

"""
将口服耐糖量测试中为-1的值设为所有非-1值的平均值
"""
def SUGAR(a):
   if a==-1:
       return data.loc[data['口服耐糖量测试']!=-1,:]['口服耐糖量测试'].mean()
   else:
       return a
    
data['口服耐糖量测试']=data['口服耐糖量测试'].apply(SUGAR)
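The same replacement can also be done without a per-row Python function; a minimal vectorized sketch, equivalent in effect to SUGAR above but computing the mean only once:

# Vectorized alternative: treat -1 as missing, then fill with the mean
# of the remaining values (computed once instead of once per row).
col = data['口服耐糖量测试'].replace(-1, np.nan)
data['口服耐糖量测试'] = col.fillna(col.mean())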
"""
将出生年份换算成年龄
"""
data['出生年份']=2022-data['出生年份']  #换成年龄
"""
人体的成人体重指数正常值是在18.5-24之间
低于18.5是体重指数过轻
在24-27之间是体重超重
27以上考虑是肥胖
高于32了就是非常的肥胖。
"""
def BMI(a):
    if a < 18.5:
        return 0
    elif 18.5 <= a <= 24:
        return 1
    elif 24 < a <= 27:
        return 2
    elif 27 < a <= 32:
        return 3
    else:
        return 4

data['BMI'] = data['体重指数'].apply(BMI)
"""
无记录
叔叔或者姑姑有一方患有糖尿病/叔叔或姑姑有一方患有糖尿病
父母有一方患有糖尿病
"""
def FHOD(a):
    if a == '无记录':
        return 0
    elif a == '叔叔或者姑姑有一方患有糖尿病' or a == '叔叔或姑姑有一方患有糖尿病':
        return 1
    else:
        return 2

data['糖尿病家族史'] = data['糖尿病家族史'].apply(FHOD)
"""
舒张压范围为60-90
"""
def DBP(a):
   if a<60:
       return 0
   elif 60<=a<=90:
       return 1
   elif a>90:
       return 2
   else:
       return a
data['DBP']=data['舒张压'].apply(DBP)
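As an aside, these hand-written threshold functions can also be expressed with pd.cut; a minimal sketch for the BMI bins (note that a value of exactly 18.5 lands in a different bin here than in BMI above, since pd.cut edges are right-closed):

# pd.cut alternative for the BMI bins; edges are right-closed, so exactly
# 18.5 falls into bin 0 here rather than bin 1 as in the BMI function.
data['BMI'] = pd.cut(data['体重指数'],
                     bins=[-np.inf, 18.5, 24, 27, 32, np.inf],
                     labels=[0, 1, 2, 3, 4]).astype(int)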

# The patient ID (编号) has no relationship with whether the patient has
# diabetes, so drop it as an irrelevant feature.
data = data.drop(['编号'], axis=1)
# Normalization
# Option 1: normalize all feature columns
# cols = [i for i in data.columns]
# cols.remove('患有糖尿病标识')
# Option 2: normalize only the specified columns
cols = ['出生年份','体重指数','舒张压','口服耐糖量测试','胰岛素释放实验','肱三头肌皮褶厚度']

data[cols] = data[cols].apply(lambda x: (x - x.min()) / (x.max()-x.min()))
# Split the data back into training and test sets
train = data[data['患有糖尿病标识'] != -1]
test = data[data['患有糖尿病标识'] == -1]

train_label=train['患有糖尿病标识']
train=train.drop(['患有糖尿病标识'],axis=1)
test=test.drop(['患有糖尿病标识'],axis=1)
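One caveat: the min-max scaling above is computed over the concatenated training and test data, which leaks test-set statistics into the features. This is common in competition baselines, but a leakage-free sketch (assuming the same cols list, applied in place of the joint scaling above) would fit the statistics on the training rows only:

# Leakage-free alternative: compute min/max on the training split only,
# then apply the same transform to both splits.
train = train.copy()
test = test.copy()
col_min, col_max = train[cols].min(), train[cols].max()
train[cols] = (train[cols] - col_min) / (col_max - col_min)
test[cols] = (test[cols] - col_min) / (col_max - col_min)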

2. Model training and prediction

2.1 LightGBM & XGBoost

# Train LightGBM and XGBoost with 5-fold cross-validation and collect
# the 5 resulting sets of test predictions.
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import xgboost as xgb
import lightgbm
def select_by_lgb(train_data, train_label, test_data, random_state=2022, n_splits=5, metric='auc', num_round=10000, early_stopping_rounds=100):
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold = 0
    result = []

    for train_idx, val_idx in kfold.split(train_data):
        random_state += 1
        train_x = train_data.loc[train_idx]
        train_y = train_label.loc[train_idx]
        test_x = train_data.loc[val_idx]
        test_y = train_label.loc[val_idx]

        # LightGBM
        clf = lightgbm
        train_matrix = clf.Dataset(train_x, label=train_y)
        test_matrix = clf.Dataset(test_x, label=test_y)
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'learning_rate': 0.1,
            'metric': metric,
            'seed': 2020,
            'nthread': -1}
        model1 = clf.train(params, train_matrix, num_round, valid_sets=test_matrix, early_stopping_rounds=early_stopping_rounds)
        val1 = model1.predict(test_x)

        # XGBoost
        model2 = xgb.XGBRegressor(max_depth=30, learning_rate=0.1, n_estimators=2000, reg_alpha=0.005, subsample=0.8,
                                  gamma=0, colsample_bylevel=0.8, objective='reg:squarederror')
        model2.fit(train_x, train_y)
        val2 = model2.predict(test_x)

        # Average the two validation predictions and threshold at 0.5
        val = (val1 + val2) / 2
        y = np.int64(val >= 0.5)
        print("f1_score:", f1_score(test_y, y, average='binary'))

        pre1 = model1.predict(test_data)
        pre2 = model2.predict(test_data)
        pre_y = (pre1 + pre2) / 2
        result.append(pre_y)
        fold += 1
    return result

test_data = select_by_lgb(train, train_label, test)
# test_data holds the 5 predictions from the 5-fold cross-validation
pre_y = pd.DataFrame(test_data).T
# Average the 5 predictions (other aggregation methods also work)
pre_y['average'] = pre_y[[i for i in range(5)]].mean(axis=1)
# The competition requires a hard label, while the model outputs probabilities,
# so we treat probability > 0.5 as diabetic and <= 0.5 as non-diabetic.
pre_y['label'] = pre_y['average'].apply(lambda x: 1 if x > 0.5 else 0)
# Save the predictions
result = pd.read_csv('data/submit.csv')
result['label'] = pre_y['label']
result.to_csv('result.csv', index=False)

When predicting with LightGBM alone, the score after submission is 0.96719.

When averaging the LightGBM and XGBoost predictions, the score after submission is 0.96088.

You can try several models, and you can also assign different weights to the predictions of different models, as sketched below.
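For example, if the function is modified to return the two models' test predictions separately (pre1 and pre2 below are hypothetical names for those per-model outputs), a weighted average could look like this minimal sketch; the 0.7/0.3 weights are only illustrative, not tuned:

# Hypothetical weighted ensemble: pre1 and pre2 stand for the LightGBM and
# XGBoost test predictions; LightGBM gets more weight since it scored
# better on its own. Weights are illustrative, not tuned.
w_lgb, w_xgb = 0.7, 0.3
pre_blend = w_lgb * pre1 + w_xgb * pre2
label = np.int64(pre_blend > 0.5)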

2.2 Neural Networks

import torch
import torch.nn as nn
import torch.utils.data as Data

# Define the loss function and the model
loss = nn.MSELoss()
in_features = train.shape[1]

def get_net():
    net = nn.Sequential(nn.Linear(in_features,50),
                        nn.BatchNorm1d(50),
                        nn.ReLU(),
                        nn.Linear(50,25),
                        nn.BatchNorm1d(25),
                        nn.ReLU(),
                        nn.Linear(25,10),
                        nn.BatchNorm1d(10),
                        nn.ReLU(),
                        nn.Linear(10,1))
    return net
# Model training function
def training(net, train_x, train_y, test_x, test_y, num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_x = torch.tensor(train_x.values, dtype=torch.float32)
    train_y = torch.tensor(train_y.values.reshape(-1, 1), dtype=torch.float32)
    if test_x is not None:
        test_x = torch.tensor(test_x.values, dtype=torch.float32)
        test_y = torch.tensor(test_y.values.reshape(-1, 1), dtype=torch.float32)
    dataset = Data.TensorDataset(train_x,train_y)
    train_iter = Data.DataLoader(dataset=dataset, batch_size=batch_size,shuffle=True, num_workers=0)
    # Use the Adam optimizer
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr = learning_rate,
                                 weight_decay = weight_decay)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4, 0.9)
    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
        train_ls.append(loss(net(train_x), train_y).item())  # store as a plain float to avoid retaining graphs
        if test_x is not None:
            test_ls.append(loss(net(test_x), test_y).item())
        scheduler.step()
    return train_ls, test_ls
# Train with 5-fold cross-validation
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
def K_fold(train_data,train_label,num_epochs, learning_rate, weight_decay,batch_size,random_state=1022,n_splits=5):
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    result=[]
    i = 0
    for train_idx, val_idx in kfold.split(train_data):
        net = get_net()
        train_x = train_data.loc[train_idx]
        train_y = train_label.loc[train_idx]
        test_x = train_data.loc[val_idx]
        test_y = train_label.loc[val_idx]
        train_ls,valid_ls = training(net,train_x,train_y,test_x,test_y,num_epochs,learning_rate,weight_decay,batch_size)
        print(f'fold {i + 1}, train loss {float(train_ls[-1]):f}, '
              f'valid loss {float(valid_ls[-1]):f}')
        
        test_x = torch.tensor(test_x.values, dtype=torch.float32)
        val = net(test_x).detach().numpy().reshape(-1)  # flatten instead of hard-coding the fold size
        y = np.int64(val >= 0.5)
        print("f1_score:", f1_score(test_y, y, average='binary'))
        
        i+=1

K_fold(train, train_label, 10, 0.01, 0, 32)  # arguments: train x, train y, epochs, learning rate, weight decay, batch size

After tuning the hyperparameters, retrain on the full training set to obtain the final model.

net = get_net()
train_ls,_ = training(net,train,train_label,None,None,100,0.01,0,32)
print(f'train loss {float(train_ls[-1]):f}')
# Predict and save
test_data = torch.tensor(test.values, dtype=torch.float32)
pre_y = net(test_data).detach().numpy()
pre_y = pd.DataFrame(pre_y,columns=['pre'])
pre_y['pre']=pre_y['pre'].apply(lambda x:1 if x>0.5 else 0)
result=pd.read_csv('data/submit.csv')
result['label']=pre_y['pre']
result.to_csv('result.csv',index=False)
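One detail worth checking here: the network contains BatchNorm layers, which behave differently in training and evaluation mode, and the code above never switches the model out of training mode before predicting. A minimal sketch of inference in eval mode (same net and test_data as above):

# Put BatchNorm layers on their running statistics and disable gradient
# tracking before inference; this can noticeably change the predictions.
net.eval()
with torch.no_grad():
    pre_y = net(test_data).numpy()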

The score after submitting this result is 0.91965.

After changing the model structure, learning rate, and other hyperparameters, I saw no significant improvement. If you have a good optimization method, please share it in the comments. Thank you very much!
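One direction that may be worth trying (untested here, an assumption rather than a verified improvement): since this is binary classification, training with nn.BCEWithLogitsLoss instead of MSE regression is the more conventional setup. A minimal sketch:

# Hypothetical variant: binary cross-entropy on logits instead of MSE.
# The final Linear(10, 1) output is then interpreted as a logit.
loss = nn.BCEWithLogitsLoss()
# Train as before; at prediction time convert logits to probabilities:
probs = torch.sigmoid(net(test_data)).detach().numpy()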

Origin: https://blog.csdn.net/cyj972628089/article/details/125829573