LightGBM Usage Guide

1. Preparation

import lightgbm as lgbm
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import log_loss
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline

train = pd.read_csv('Otto_train.csv')

# Labels look like 'Class_1' ... 'Class_9': strip the 'Class_' prefix
# and shift to the 0-based integer labels LightGBM expects.
y_train = train['target']
y_train = y_train.map(lambda s: s[6:])
y_train = y_train.map(lambda s: int(s) - 1)

# Drop the id and target columns to get the feature matrix.
train = train.drop(["id", "target"], axis=1)
X_train = np.array(train)
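
As an aside, the two map calls above can be collapsed into a single vectorized expression. A minimal equivalent, assuming it runs on the raw frame (before the drop) and that every label carries the 'Class_' prefix:

# Equivalent one-liner: 'Class_7' -> '7' -> 7 -> 6
y_train = train['target'].str.slice(6).astype(int) - 1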

2. Usage

params = {'boosting_type': 'gbdt',       # gradient boosted decision trees
          'objective': 'multiclass',
          'nthread': -1,                 # use all available threads
          'silent': True,
          'learning_rate': 0.1,
          'num_leaves': 50,              # maximum leaves per tree
          'max_depth': 6,
          'max_bin': 127,                # maximum histogram bins per feature
          'subsample_for_bin': 50000,    # rows sampled to construct the bins
          'subsample': 0.8,              # row subsampling ratio
          'subsample_freq': 1,           # resample rows every iteration
          'colsample_bytree': 0.8,       # feature subsampling ratio per tree
          'reg_alpha': 1,                # L1 regularization
          'reg_lambda': 0,               # L2 regularization
          'min_split_gain': 0.0,
          'min_child_weight': 1,
          'min_child_samples': 20,
          'scale_pos_weight': 1}

lgbm1 = lgbm.sklearn.LGBMClassifier(num_class=9, n_estimators=1000, seed=0, **params)


params['num_class'] = 9  # the native cv API needs num_class set explicitly for multiclass

lgbmtrain = lgbm.Dataset(X_train, y_train, silent=True)

cv_result = lgbm.cv(params, lgbmtrain, num_boost_round=10000, nfold=5,
                    stratified=False, shuffle=True, metrics='multi_logloss',
                    early_stopping_rounds=10, show_stdv=True, seed=0)
# note: cv_result looks like {"multi_logloss-mean": <list of historical means>,
#                             "multi_logloss-stdv": <list of historical standard deviations>},
# and early stopping truncates the lists at the best round.
print('best n_estimators:', len(cv_result['multi_logloss-mean']))
print('best cv score:', cv_result['multi_logloss-mean'][-1])

# Retrain the model with the best n_estimators found by cross-validation.
lgbm1.set_params(n_estimators=len(cv_result['multi_logloss-mean']))
lgbm1.fit(X_train, y_train)
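
log_loss is imported above but never used; a quick sanity check of the refit model might look like this (the training-set loss shown here will be optimistic relative to the CV score):

# Training log loss of the refit model; expect it to be below the CV score.
train_pred = lgbm1.predict_proba(X_train)
print('training log loss:', log_loss(y_train, train_pred))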

3. Plotting

test_means = cv_result['multi_logloss-mean']
test_stds = cv_result['multi_logloss-stdv']

# cv_result is a plain dict, so use len() rather than .shape.
x_axis = range(0, len(test_means))
pyplot.plot(x_axis, test_means)
pyplot.title("LightGBM n_estimators vs Log Loss")
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.show()
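
To see how stable the CV estimate is, the per-round standard deviation can be drawn as a band around the mean; a small sketch reusing the series retrieved above:

# Shade mean ± one standard deviation of the CV log loss.
means = np.array(test_means)
stds = np.array(test_stds)
pyplot.plot(x_axis, means, label='mean multi_logloss')
pyplot.fill_between(x_axis, means - stds, means + stds, alpha=0.2, label='±1 std')
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.legend()
pyplot.show()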

Appendix: Tuning with GridSearchCV

# Same parameters as before, minus num_leaves, which the grid search below tunes.
params = {'boosting_type': 'gbdt', 
          'objective': 'multiclass', 
          'nthread': -1, 
          'silent': True,
          'learning_rate': 0.1, 
          'max_depth': 6,
          'max_bin': 127, 
          'subsample_for_bin': 50000,
          'subsample': 0.8, 
          'subsample_freq': 1, 
          'colsample_bytree': 0.8, 
          'reg_alpha': 1, 
          'reg_lambda': 0,
          'min_split_gain': 0.0, 
          'min_child_weight': 1, 
          'min_child_samples': 20, 
          'scale_pos_weight': 1}

lgbm2_1 = lgbm.sklearn.LGBMClassifier(num_class=9, n_estimators=539, seed=0, **params)

# Illustrative search grid for num_leaves (values chosen around the earlier 50),
# evaluated with a stratified 5-fold split.
num_leaves = [30, 40, 50, 60, 70]
param_test2_1 = {'num_leaves': num_leaves}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)

gsearch2_1 = GridSearchCV(lgbm2_1, param_grid=param_test2_1, scoring='neg_log_loss',
                          n_jobs=-1, cv=kfold, return_train_score=True)
gsearch2_1.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (gsearch2_1.best_score_, gsearch2_1.best_params_))
test_means = gsearch2_1.cv_results_['mean_test_score']
test_stds = gsearch2_1.cv_results_['std_test_score']
train_means = gsearch2_1.cv_results_['mean_train_score']
train_stds = gsearch2_1.cv_results_['std_train_score']

pd.DataFrame(gsearch2_1.cv_results_).to_csv('my_preds_num_leaves_1.csv')

# plot results (scores are neg_log_loss, so negate them to get log loss)
pyplot.plot(num_leaves, -test_means, label='CV log loss')
pyplot.legend()
pyplot.xlabel('num_leaves')
pyplot.ylabel('Log Loss')
pyplot.show()
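
Because GridSearchCV refits the best configuration on the full training set by default (refit=True), the tuned model is available directly:

# The winning model, already refit on all of X_train.
best_model = gsearch2_1.best_estimator_
print('best num_leaves:', best_model.get_params()['num_leaves'])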


Source: blog.csdn.net/evolution23/article/details/85375310