总体来说,模型融合和 XGBoost、GBDT 背后的集成思路差不多,主要有 bagging、boosting、stacking 几种方法。简单说:bagging 是多个模型投票(或取平均),boosting 是后一个模型去拟合前一个模型的残差(residual),stacking 是把多个模型的预测结果堆叠起来,作为下一层模型的输入。不过具体细节我还没有完全弄明白。
看起来做法是:先把每个模型各自调好参数,再把它们的预测结果融合在一起,还可以在上面继续叠加一层模型。我还准备再加一个 CatBoost,不过目前还没跑通,遇到了各种 bug。
def build_model_lgb(train_X, train_y):
    """Grid-search the learning rate of a LightGBM regressor and return the search.

    Args:
        train_X: Training features, forwarded unchanged to ``GridSearchCV.fit``.
        train_y: Training targets, forwarded unchanged to ``GridSearchCV.fit``.

    Returns:
        The fitted ``GridSearchCV`` object wrapping the ``LGBMRegressor``;
        callers use its ``predict`` method directly.
    """
    # NOTE(review): LightGBM documents that row subsampling only takes effect
    # when `subsample_freq` is non-zero; it is not set here, so `subsample`
    # may be inert -- confirm whether that is intended.
    base_regressor = lgb.LGBMRegressor(
        objective='huber',
        n_estimators=120,
        subsample=0.47,
        max_depth=52,
        num_leaves=99,
        min_child_samples=99,
    )

    # Only the learning rate is searched; everything else is fixed above.
    search_space = {'learning_rate': [0.01, 0.05, 0.1]}

    grid_search = GridSearchCV(base_regressor, search_space)
    grid_search.fit(train_X, train_y)
    return grid_search
def build_model_xgb(x_train, y_train):
    """Fit an XGBoost regressor with fixed hyper-parameters and return it.

    Args:
        x_train: Training features, forwarded unchanged to ``fit``.
        y_train: Training targets, forwarded unchanged to ``fit``.

    Returns:
        The fitted ``xgb.XGBRegressor`` instance.
    """
    # The objective is deliberately not passed, so the library default applies.
    regressor = xgb.XGBRegressor(
        n_estimators=120,
        learning_rate=0.08,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.9,
        max_depth=5,
    )
    regressor.fit(x_train, y_train)
    return regressor
## Hold out 30% of the training data as a validation set.
# NOTE(review): no random_state is passed, so the split (and every score
# computed from it) differs between runs -- confirm that is acceptable.
(
    x_train,
    x_val,
    y_train,
    y_val,
) = train_test_split(train_X, train_y, test_size=0.3)

## Fit each base model on the training split, then predict on both the
## validation split and the test set (X_test).

# Gradient-boosted trees; build_model_gbdt is defined outside this excerpt.
print('Predict GBDT...')
model_gbdt = build_model_gbdt(x_train, y_train)
val_gbdt = model_gbdt.predict(x_val)
subA_gbdt = model_gbdt.predict(X_test)

# XGBoost
print('predict XGB...')
model_xgb = build_model_xgb(x_train, y_train)
val_xgb = model_xgb.predict(x_val)
subA_xgb = model_xgb.predict(X_test)

# LightGBM (grid-searched)
print('predict lgb...')
model_lgb = build_model_lgb(x_train, y_train)
val_lgb = model_lgb.predict(x_val)
subA_lgb = model_lgb.predict(X_test)
import catboost as cb
## CatBoost base model.
# Columns handed to CatBoost as categorical features.
# NOTE(review): CatBoost requires categorical columns to hold integer or
# string values; if any of these are floats/NaN in x_train they need casting
# first -- verify against the preprocessing done earlier in the file.
cat_features = [
    'name',
    'model',
    'brand',
    'bodyType',
    'fuelType',
    'gearbox',
    'notRepairedDamage',
    'regionCode',
]

cbr = cb.CatBoostRegressor(
    iterations=300,
    depth=12,
    cat_features=cat_features,
    verbose=30,  # log progress every 30 iterations
    loss_function='MAE',
    eval_metric='MAE',
)

print('predict cbr...')
# Fix: the estimator instance is not callable -- `cbr(x_train, y_train)` raised
# TypeError and left the model unfitted. Training must go through fit(), which
# returns the fitted estimator itself, so `model_cbr` and `cbr` refer to the
# same trained model (later code keeps using `cbr`).
model_cbr = cbr.fit(x_train, y_train)
val_cbr = model_cbr.predict(x_val)
subA_cbr = model_cbr.predict(X_test)
## Stacking
## Layer 1: the base models' predictions become the feature columns
## (Method_1..Method_4 = lgb, xgb, gbdt, catboost) of the level-2 inputs.
# NOTE(review): the training columns are predictions on the same rows the base
# models were fitted on (in-sample); out-of-fold predictions are the usual way
# to avoid leaking the target into the level-2 model -- consider revisiting.
train_lgb_pred = model_lgb.predict(x_train)
train_xgb_pred = model_xgb.predict(x_train)
train_gbdt_pred = model_gbdt.predict(x_train)
train_cbr_pred = cbr.predict(x_train)

# Level-2 training features.
Strak_X_train = pd.DataFrame({
    'Method_1': train_lgb_pred,
    'Method_2': train_xgb_pred,
    'Method_3': train_gbdt_pred,
    'Method_4': train_cbr_pred,
})

# Level-2 validation features.
Strak_X_val = pd.DataFrame({
    'Method_1': val_lgb,
    'Method_2': val_xgb,
    'Method_3': val_gbdt,
    'Method_4': val_cbr,
})

# Level-2 test features.
Strak_X_test = pd.DataFrame({
    'Method_1': subA_lgb,
    'Method_2': subA_xgb,
    'Method_3': subA_gbdt,
    'Method_4': subA_cbr,
})
## Level 2: fit the meta-model on the stacked base-model predictions.
## build_model_lr is defined outside this excerpt.
model_lr_Stacking = build_model_lr(Strak_X_train, y_train)

## Training set: MAE of the stacked model on the rows it was fitted on.
train_pre_Stacking = model_lr_Stacking.predict(Strak_X_train)
train_mae_Stacking = mean_absolute_error(y_train, train_pre_Stacking)
print('MAE of Stacking-LR:', train_mae_Stacking)

## Validation set: MAE on the held-out split (printed with the same label as
## the training score above, so the second line of output is the validation one).
val_pre_Stacking = model_lr_Stacking.predict(Strak_X_val)
val_mae_Stacking = mean_absolute_error(y_val, val_pre_Stacking)
print('MAE of Stacking-LR:', val_mae_Stacking)

## Test set: final predictions for the submission file.
print('Predict Stacking-LR...')
subA_Stacking = model_lr_Stacking.predict(Strak_X_test)

# Raise any prediction below 10 up to 10 to remove implausibly small values.
too_small = subA_Stacking < 10
subA_Stacking[too_small] = 10

# Write the submission: one row per SaleID with its predicted price.
sub = pd.DataFrame()
sub['SaleID'] = TestA_data.SaleID
sub['price'] = subA_Stacking
sub.to_csv('./sub_Stacking.csv', index=False)