（六）XGBoost使用交叉验证

import numpy as np
import xgboost as xgb

# Load the training data and set up the training configuration.
# NOTE(review): `basePath` is not defined in the visible code; it must be set
# earlier (presumably a directory prefix ending in a path separator) — confirm.
dtrain = xgb.DMatrix(basePath+'data/agaricus.txt.train')
# Booster parameters used by the first cross-validation runs below.
# NOTE(review): 'silent' looks like the legacy logging switch; newer XGBoost
# releases may expect 'verbosity' instead — confirm against the installed version.
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
# Number of boosting rounds handed to xgb.cv where no explicit count is given.
num_round = 2

# Progress message announcing the first (plain) cross-validation run.
print('running cross validation')

running cross validation

# Run 5-fold cross validation; each round is printed as
#   [iteration]  metric_name:mean_value+std_value
# where std_value is the standard deviation of the metric across folds.
# metrics: evaluation metrics to watch on the validation data; when omitted,
# the default metric is used (rmse for regression, error for classification).
# NOTE(review): xgb.callback.print_evaluation belongs to the older callback
# API — confirm it exists in the installed XGBoost version.
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'error'}, seed=0,
       callbacks=[xgb.callback.print_evaluation(show_stdv=True)])

[0] train-error:0.0506682+0.009201 test-error:0.0557316+0.0158887
[1] train-error:0.0213034+0.00205561 test-error:0.0211884+0.00365323

.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }

	test-error-mean	test-error-std	train-error-mean	train-error-std
0	0.055732	0.015889	0.050668	0.009201
1	0.021188	0.003653	0.021303	0.002056

# Progress message announcing the run that hides the standard deviation.
print('running cross validation, disable standard deviation display')

running cross validation, disable standard deviation display

# Run 5-fold cross validation; each round is printed as
#   [iteration]  metric_name:mean_value
# (show_stdv=False suppresses the +std_value part).
# num_boost_round=10: number of boosting iterations.
# The returned per-round results are kept in `res`.
# NOTE(review): print_evaluation / early_stop belong to the older callback
# API — confirm they exist in the installed XGBoost version.
res = xgb.cv(param, dtrain, num_boost_round=10, nfold=5,
             metrics={'error'}, seed=0,
             callbacks=[xgb.callback.print_evaluation(show_stdv=False),
                        xgb.callback.early_stop(3)])  # early-stopping condition: train until test-error hasn't improved in 3 rounds.

[0] train-error:0.0506682 test-error:0.0557316
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 3 rounds.
[1] train-error:0.0213034 test-error:0.0211884
[2] train-error:0.0099418 test-error:0.0099786
[3] train-error:0.0141256 test-error:0.0144336
[4] train-error:0.0059878 test-error:0.0062948
[5] train-error:0.0020344 test-error:0.0016886
[6] train-error:0.0012284 test-error:0.001228
[7] train-error:0.0012284 test-error:0.001228
[8] train-error:0.0009212 test-error:0.001228
[9] train-error:0.0006142 test-error:0.001228
Stopping. Best iteration:
[6] train-error:0.0012284+0.000260265 test-error:0.001228+0.00104094

# Show the cross-validation results stored in `res` by the xgb.cv call above.
print(res)

   test-error-mean  test-error-std  train-error-mean  train-error-std
0         0.055732        0.015889          0.050668         0.009201
1         0.021188        0.003653          0.021303         0.002056
2         0.009979        0.004828          0.009942         0.006076
3         0.014434        0.003517          0.014126         0.001706
4         0.006295        0.003123          0.005988         0.001878
5         0.001689        0.000574          0.002034         0.001470
6         0.001228        0.001041          0.001228         0.000260

# Progress message announcing the run that uses a preprocessing function.
print('running cross validation, with preprocessing function')

running cross validation, with preprocessing function

# Preprocessing hook for cross validation: it accepts (dtrain, dtest, param)
# and returns the transformed versions. It can be used for weight rescaling
# and similar adjustments; as an example, it sets scale_pos_weight.
def fpreproc(dtrain, dtest, param):
    """Return the data unchanged plus parameters with ``scale_pos_weight`` set.

    ``scale_pos_weight`` balances positive against negative weights, which is
    useful for unbalanced classes; the value used here is
    ``count(label == 0) / count(label == 1)`` computed from ``dtrain``.

    Args:
        dtrain: training data; only ``get_label()`` is called on it.
        dtest: test data; passed through untouched.
        param: mapping of booster parameters. It is never modified.

    Returns:
        A ``(dtrain, dtest, new_param)`` tuple where ``new_param`` is a copy
        of ``param``. ``scale_pos_weight`` is added only when at least one
        positive label exists, so a fold without positives does not end up
        with an infinite or NaN weight.
    """
    label = dtrain.get_label()
    n_negative = np.count_nonzero(label == 0)
    n_positive = np.count_nonzero(label == 1)
    # Work on a copy so the caller's parameter dict is not changed as a
    # side effect of preprocessing.
    new_param = dict(param)
    if n_positive:
        new_param['scale_pos_weight'] = float(n_negative) / n_positive
    return (dtrain, dtest, new_param)

# Run cross validation with a preprocessing hook: for each fold,
# dtrain, dtest and param are passed into fpreproc, and the values it
# returns are used to produce that fold's results.
xgb.cv(param, dtrain, num_round, nfold=5,
       metrics={'auc'}, seed=0, fpreproc=fpreproc)  # auc: area under the curve

.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }

	test-auc-mean	test-auc-std	train-auc-mean	train-auc-std
0	0.958232	0.005778	0.958228	0.001442
1	0.981431	0.002595	0.981414	0.000647

### Using a custom loss function
# You can also do cross validation with a customized loss function.
# See custom_objective.py
# NOTE(review): the message below misspells "customized"; it is left as-is
# because it is runtime output.
print('running cross validation, with cutomsized loss function')

running cross validation, with cutomsized loss function

def logregobj(preds, dtrain):
    """Custom logistic objective: return ``(grad, hess)`` for the scores.

    ``preds`` are treated as raw scores: the sigmoid is applied to them here
    before they are compared with the labels from ``dtrain.get_label()``.

    Returns:
        A pair ``(sigmoid(preds) - labels, sigmoid(preds) * (1 - sigmoid(preds)))``.
    """
    target = dtrain.get_label()
    prob = 1.0 / (1.0 + np.exp(-preds))
    return prob - target, prob * (1.0 - prob)
def evalerror(preds, dtrain):
    """Custom evaluation metric: fraction of misclassified rows.

    A row counts as predicted positive when its score in ``preds`` is greater
    than 0.0; that decision is compared with ``dtrain.get_label()``.

    Returns:
        A ``('error', value)`` pair where ``value`` is the number of
        mismatches divided by the number of labels.

    Raises:
        ZeroDivisionError: if there are no labels.
    """
    labels = dtrain.get_label()
    # Count mismatches in one vectorized pass instead of summing the boolean
    # array element by element with the builtin sum().
    mismatches = np.count_nonzero(labels != (preds > 0.0))
    return 'error', float(mismatches) / len(labels)

# Rebind `param` without an 'objective' entry: the objective is supplied
# through obj= below instead.
param = {'max_depth':2, 'eta':1, 'silent':1}
# Cross validate with the customized objective (logregobj) and the
# customized evaluation function (evalerror).
# NOTE(review): the recorded output also contains rmse columns, presumably the
# default metric of the built-in default objective — confirm.
xgb.cv(param, dtrain, num_round, nfold=5, seed=0,
       obj=logregobj, feval=evalerror)

.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }

	test-error-mean	test-error-std	test-rmse-mean	test-rmse-std	train-error-mean	train-error-std	train-rmse-mean	train-rmse-std
0	0.055732	0.015889	1.598043	0.012826	0.050668	0.009201	1.595072	0.003868
1	0.021188	0.003653	2.449282	0.080900	0.021303	0.002056	2.442600	0.076834

#rmse: root mean square error
#mae: mean absolute error

（六）XGBoost使用交叉验证

猜你喜欢