lightgbm参数解释
boosting = 'gbdt',迭代器选 'rf' 效果略优
is_unbalance=True,实际数据的样本是不均衡的,但是设置该参数导致迭代效果变差
bagging_fraction=0.7,
bagging_freq =1,
使用了bagging方法,随机选择了70%的数据进行训练,每1步迭代做一次bagging
效果无明显提高,但是理应有所提高。(特征太菜)
num_leaves=64,
max_depth=6
num_leaves应设置为2^max_depth左右较为合适,但是附近的值有可能优于该值
colsample_bytree=0.8
随机选取80%的特征进行训练,防止过拟合
扫描二维码关注公众号,回复:
42938 查看本文章
安利一个新鲜的lightgbm中文文档:
http://lightgbm.apachecn.org/cn/latest/Parameters.html
bagging
bagging目前还没有具体实现,只会把两个预测较好的结果进行简单相加再取平均,进阶版本根据训练集logloss的大小对结果取加权平均。但是bagging的提升效果十分稳定,几乎总能改善成绩。
boosting
许多包都包括这个思想,所以没有过多涉及。
stacking
stacking据说是比赛的大杀器,但是跑起来的时间代价很大,xgb的速度太拖后腿了。
所以只是实现了该算法,对每个迭代器的迭代次数都设置在200次左右。
第一层 xgb+lgb+cat
第二层 xgb
效果变差了,但是这个方法从原理上讲是我非常喜欢、认为理所当然的方法,可能是没有理解透彻。
附上简单的stacking+对结果加权平均的bagging代码。
输入:构建好特征的数据集data
# ---------------------------------------------------------------------------
# Two-level stacking on the CTR data set.
#   Level 1: xgboost + catboost + 2x lightgbm, out-of-fold predictions
#            produced with K-fold CV (one fold per learner count).
#   Level 2: lightgbm (and, for comparison, logistic regression) trained
#            on the stacked OOF prediction matrix.
# Assumes a feature-engineered DataFrame `data` with a 'day' column is
# already in scope (built earlier in the notebook/script).
# ---------------------------------------------------------------------------
import numpy as np                      # FIX: used below but never imported
import pandas as pd                     # FIX: used below but never imported
import lightgbm as lgb                  # FIX: used below but never imported
import xgboost as xgb                   # FIX: was imported twice; once is enough
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss    # FIX: used below but never imported
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, BaggingClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVR
from mlxtend.regressor import StackingRegressor
from mlxtend.data import boston_housing_data
from catboost import CatBoostClassifier

# ---------------- train / hold-out split by day ----------------------------
# days 18-23 -> train, day 24 -> hold-out test
train = data[(data['day'] >= 18) & (data['day'] <= 23)]
test = data[(data['day'] == 24)]

drop_name = ['is_trade', 'item_category_list', 'item_property_list',
             'predict_category_property', 'realtime', 'context_timestamp']
col = [c for c in train if c not in drop_name]

X = train[col]
y = train['is_trade'].values
X_tes = test[col]
y_tes = test['is_trade'].values

# ---------------- level-1 learners -----------------------------------------
clfs = [
    # RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    # ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
    xgb.sklearn.XGBClassifier(
        n_estimators=200, max_depth=4, seed=5, learning_rate=0.1,
        subsample=0.8, min_child_weight=6, colsample_bytree=.8,
        scale_pos_weight=1.6, gamma=10, reg_alpha=8, reg_lambda=1.3,
        silent=False, eval_metric='logloss'),
    CatBoostClassifier(
        verbose=True, depth=8, iterations=200, learning_rate=0.1,
        eval_metric='AUC', bagging_temperature=0.8, l2_leaf_reg=4,
        rsm=0.8, random_seed=10086),
    lgb.LGBMClassifier(
        objective='binary', metric='logloss', num_leaves=35, depth=8,
        learning_rate=0.1, seed=2018, colsample_bytree=0.8,
        # min_child_samples=8,
        subsample=0.9, n_estimators=200),
    lgb.LGBMClassifier(
        objective='binary', metric='AUC', num_leaves=35, depth=8,
        learning_rate=0.1, seed=2018, colsample_bytree=0.8,
        # min_child_samples=8,
        subsample=0.9, n_estimators=200, boosting='rf'),
]

cfs = len(clfs)
ntrain = X.shape[0]
ntest = X_tes.shape[0]
# FIX: sklearn >= 0.24 raises if random_state is passed with shuffle=False;
# the original random_state=2017 had no effect anyway, so dropping it keeps
# the splits identical while staying compatible with current sklearn.
kf = KFold(n_splits=cfs)

# Out-of-fold prediction matrices: one column per level-1 learner.
oof_train = np.zeros((ntrain, cfs))
oof_test = np.zeros((ntest, cfs))
for j, clf in enumerate(clfs):
    print('Training classifier [%s]' % (j))   # FIX: Python-3 print function
    oof_test_skf = np.zeros((ntest, cfs))
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        # BUG FIX: KFold yields *positional* indices, but X keeps the
        # original (non-reset) index of `data`, so label-based .loc would
        # select wrong rows or raise; .iloc is the correct accessor.
        kf_x_train = X.iloc[train_index]
        kf_y_train = y[train_index]
        kf_x_test = X.iloc[test_index]
        clf.fit(kf_x_train, kf_y_train)
        # Probability of the positive class only.
        oof_train[test_index, j] = clf.predict_proba(kf_x_test)[:, 1]
        oof_test_skf[:, i] = clf.predict_proba(X_tes)[:, 1]
    # Average the per-fold test predictions for this learner.
    oof_test[:, j] = oof_test_skf.mean(axis=1)

# ---------------- level 2: lightgbm on stacked features --------------------
# (hold-out logloss ~ 0.08248948001930827 in the original run)
lgb0 = lgb.LGBMClassifier(
    objective='binary', metric='logloss', num_leaves=35, depth=8,
    learning_rate=0.05, seed=2018, colsample_bytree=0.8,
    # min_child_samples=8,
    subsample=0.9, n_estimators=200)
lgb_model = lgb0.fit(oof_train, y, eval_set=[(oof_test, y_tes)],
                     early_stopping_rounds=200)
best_iter = lgb_model.best_iteration_
lgb2 = lgb.LGBMClassifier(
    objective='binary', metric='logloss', num_leaves=35, depth=8,
    learning_rate=0.05, seed=2018, colsample_bytree=0.8,
    # min_child_samples=8,
    subsample=0.9, n_estimators=300)
lgb2.fit(oof_train, y)
Y_test_predict = lgb2.predict_proba(oof_test)[:, 1]
pred = pd.DataFrame()
pred['stacking_pred'] = Y_test_predict
pred['is_trade'] = y_tes
logloss = log_loss(pred['is_trade'], pred['stacking_pred'])

# ---------------- level 2: logistic regression -----------------------------
# (hold-out logloss ~ 0.08304983559720211 in the original run)
lr = LogisticRegression(n_jobs=-1)
lr.fit(oof_train, y)
Y_test_predict = lr.predict_proba(oof_test)[:, 1]
pred = pd.DataFrame()
pred['stacking_pred'] = Y_test_predict
pred['is_trade'] = y_tes
logloss = log_loss(pred['is_trade'], pred['stacking_pred'])
# ---------------------------------------------------------------------------
# Baselines on the raw features + simple / weighted blending.
# The loglosses in comments are the scores reported in the original runs.
# All hold-out predictions are accumulated in ONE DataFrame `pred` so the
# blending steps at the bottom can see every model's column.
# ---------------------------------------------------------------------------

# lightgbm on the original features --------------- ~0.08245197939071315
lgb0 = lgb.LGBMClassifier(
    objective='binary',
    # metric='binary_error',
    num_leaves=35, depth=8, learning_rate=0.05, seed=2018,
    colsample_bytree=0.8,
    # min_child_samples=8,
    subsample=0.9, n_estimators=300)
lgb_model = lgb0.fit(X, y)
Y_test_predict = lgb_model.predict_proba(test[col])[:, 1]
pred = pd.DataFrame()
pred['lgb_pred'] = Y_test_predict
pred['is_trade'] = y_tes
logloss = log_loss(pred['is_trade'], pred['lgb_pred'])

# catboost on the original features --------------- ~0.08236626308985852
cat0 = CatBoostClassifier(verbose=True, depth=8, iterations=200,
                          learning_rate=0.1, eval_metric='AUC',
                          bagging_temperature=0.8, l2_leaf_reg=4,
                          rsm=0.8, random_seed=10086)
cat_model = cat0.fit(X, y)
Y_test_predict = cat_model.predict_proba(X_tes)[:, 1]
# BUG FIX: the original re-created `pred = pd.DataFrame()` here, which
# dropped the 'lgb_pred' column and made the bagging below raise KeyError.
pred['cat_pred'] = Y_test_predict
logloss = log_loss(pred['is_trade'], pred['cat_pred'])

# logistic regression on the original features ---- ~0.0914814461101563
lr = LogisticRegression(n_jobs=-1)
lr.fit(X, y)
Y_test_predict = lr.predict_proba(test[col])[:, 1]
pred['lr_pred'] = Y_test_predict
logloss = log_loss(pred['is_trade'], pred['lr_pred'])

# simple bagging: plain average of cat + lgb ------ ~0.08223091072342852
pred['bagg_pred'] = (pred['cat_pred'] + pred['lgb_pred']) / 2
logloss = log_loss(pred['is_trade'], pred['bagg_pred'])

# weighted bagging: weights are the inverses of each model's logloss,
# so better models contribute more to the blend.
lr_loss = (1 / 0.0914814461101563)
cat_loss = (1 / 0.08222911268463354)
lgb_loss = (1 / 0.08237142613041541)
# Three-model variant kept for reference (LR hurt the blend in practice):
# pred['weight_bagg_pred'] = (pred['cat_pred'] * cat_loss +
#                             pred['lgb_pred'] * lgb_loss +
#                             pred['lr_pred'] * lr_loss
#                             ) / (lr_loss + cat_loss + lgb_loss)
pred['weight_bagg_pred'] = (pred['cat_pred'] * cat_loss +
                            pred['lgb_pred'] * lgb_loss
                            ) / (cat_loss + lgb_loss)
logloss = log_loss(pred['is_trade'], pred['weight_bagg_pred'])