GBDT tuning

 

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pyplot as plt

# Font settings so that non-ASCII labels and minus signs render correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# File path
data_path = '/Users/gaofei/Desktop/ensemble/data.csv'  # change to your own path

# Read the data
data = pd.read_csv(data_path, encoding='gbk')

# Min-max scale the features (all columns except the label in the last column)
scaler = MinMaxScaler()
values = scaler.fit_transform(data.values[:, :-1])

# Split into training and test sets (values already excludes the label column)
X_train, X_test, y_train, y_test = train_test_split(values, data.values[:, -1], test_size=0.3)

Data set address: https://pan.baidu.com/s/1e8txYy-PZrwKKP3JD4sJAg (extraction code: lg9r)
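
Note that the train/test split above does not fix a random seed, so the accuracy and AUC figures reported below will vary slightly from run to run. A minimal sketch of a reproducible, stratified split (the seed value and the assumption that the last column is a class label are mine, not from the original post):

# Fix the seed and stratify on the label so both sets keep the class proportions
# (random_state=10 is an arbitrary choice)
X_train, X_test, y_train, y_test = train_test_split(
    values, data.values[:, -1],
    test_size=0.3, random_state=10, stratify=data.values[:, -1])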

# Model performance with default parameters (no tuning)
def default_param():
    gbdtclf = GradientBoostingClassifier(random_state=10)
    gbdtclf.fit(X_train, y_train)
    y_pre = gbdtclf.predict(X_test)
    y_prb_1 = gbdtclf.predict_proba(X_test)[:, 1]  # predicted probability of the positive class
    fpr, tpr, thresholds = roc_curve(y_test, y_prb_1)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='AUC = {0:.4f}'.format(roc_auc))
    plt.title('ROC curve')
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.show()
    print('Accuracy: {0:.2f}'.format(accuracy_score(y_test, y_pre)))

Without tuning, the accuracy is 0.82 and the AUC is 0.917.

# Start with the learning rate and the number of iterations (n_estimators):
# set the initial learning rate to 0.1 and grid-search n_estimators
def adjust_n_estimators():
    param_dic = {'n_estimators': range(10, 101, 10)}
    gscv = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1, min_samples_split=300,
                                                             min_samples_leaf=20, max_depth=8, max_features='sqrt',
                                                             subsample=0.8, random_state=10),
                        param_grid=param_dic, scoring='roc_auc', cv=5)
    gscv.fit(X_train, y_train)
    print('best_params:{0}'.format(gscv.best_params_))
    print('best_score:{0}'.format(gscv.best_score_))

best_params:{'n_estimators': 60}
best_score:0.9194666836254305
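
To see how the cross-validated AUC moves across the whole grid rather than just at the best point, the fitted GridSearchCV object can be inspected through its cv_results_ attribute; a small sketch that could be added at the end of adjust_n_estimators, after gscv.fit (pd is the pandas import from the top of the script):

# Mean and spread of the cross-validated AUC for each candidate n_estimators
results = pd.DataFrame(gscv.cv_results_)
print(results[['param_n_estimators', 'mean_test_score', 'std_test_score']])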

# With n_estimators settled, tune the decision-tree parameters next:
# first grid-search the maximum tree depth (max_depth) together with the minimum
# number of samples required to split an internal node (min_samples_split)
def adjust_depth_samples():
    param_dic = {'max_depth': range(3, 14, 2), 'min_samples_split': range(100, 801, 200)}
    gscv = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1, n_estimators=60, min_samples_leaf=20,
                                                             max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid=param_dic, scoring='roc_auc', cv=5)
    gscv.fit(X_train, y_train)
    print('best_params:{0}'.format(gscv.best_params_))
    print('best_score:{0}'.format(gscv.best_score_))

best_params:{'max_depth': 7, 'min_samples_split': 500}
best_score:0.9196201483525961

# Fix max_depth at 7. min_samples_split interacts with the other parameters,
# so tune it again, this time together with min_samples_leaf
def adjust_samples_leaf():
    param_dic = {'min_samples_split': range(500, 900, 100), 'min_samples_leaf': range(50, 151, 10)}
    gscv = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1, n_estimators=60, max_depth=7,
                                                             max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid=param_dic, scoring='roc_auc', cv=5)
    gscv.fit(X_train, y_train)
    print('best_params:{0}'.format(gscv.best_params_))
    print('best_score:{0}'.format(gscv.best_score_))

best_params:{'min_samples_leaf': 110, 'min_samples_split': 500}
best_score:0.9202495480403471
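
The same pattern extends to the remaining parameters, typically max_features and subsample. A sketch of that step with the earlier parameters held fixed (the candidate values below are illustrative assumptions; no results for this step appear in the original post):

# Grid-search max_features and subsample; candidate values are illustrative only
def adjust_features_subsample():
    param_dic = {'max_features': ['sqrt', 'log2', None], 'subsample': [0.6, 0.7, 0.8, 0.9]}
    gscv = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1, n_estimators=60, max_depth=7,
                                                             min_samples_leaf=110, min_samples_split=500,
                                                             random_state=10),
                        param_grid=param_dic, scoring='roc_auc', cv=5)
    gscv.fit(X_train, y_train)
    print('best_params:{0}'.format(gscv.best_params_))
    print('best_score:{0}'.format(gscv.best_score_))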

# Final model with the tuned parameters
def best_param():
    gbdtclf = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60, max_depth=7, min_samples_leaf=110,
                                         min_samples_split=500, max_features='sqrt', subsample=0.8, random_state=10)
    gbdtclf.fit(X_train, y_train)
    y_pre = gbdtclf.predict(X_test)
    y_prb_1 = gbdtclf.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_prb_1)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='AUC = {0:.4f}'.format(roc_auc))
    plt.title('ROC curve')
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.show()
    print('Accuracy: {0:.2f}'.format(accuracy_score(y_test, y_pre)))

After tuning, the accuracy is 0.86 and the AUC is 0.92
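
The functions above are standalone; a minimal driver that runs the whole sequence in the order followed in this post:

# Run the baseline, the three tuning steps, and the final tuned model in order
if __name__ == '__main__':
    default_param()          # baseline: accuracy 0.82, AUC 0.917
    adjust_n_estimators()    # -> n_estimators = 60
    adjust_depth_samples()   # -> max_depth = 7, min_samples_split = 500
    adjust_samples_leaf()    # -> min_samples_leaf = 110, min_samples_split = 500
    best_param()             # tuned model: accuracy 0.86, AUC 0.92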

 

Origin: blog.csdn.net/gf19960103/article/details/89487399