文章目录
假数据生成
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.feature_selection import RFE
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from scipy.stats import pearsonr
import lightgbm as lgb
import pickle
import pandas as pd

# Toy dataset: four numeric feature columns (A-D) and a binary label column
# `y`. Only the first row is a positive example, so the classes are heavily
# imbalanced (1 positive vs. 7 negatives).
data = pd.DataFrame([
    {'A': 1, 'B': 2, 'C': 3, 'D': 10, 'y': 1},
    {'A': 11, 'B': 22, 'C': 33, 'D': 1, 'y': 0},
    {'A': 111, 'B': 222, 'C': 333, 'D': 1, 'y': 0},
    {'A': 111, 'B': 222, 'C': 333, 'D': 1, 'y': 0},
    {'A': 111, 'B': 21, 'C': 333, 'D': 1, 'y': 0},
    {'A': 111, 'B': 90, 'C': 333, 'D': 1, 'y': 0},
    {'A': 111, 'B': 64, 'C': 12, 'D': 1, 'y': 0},
    {'A': 111, 'B': 222, 'C': 6, 'D': 1, 'y': 0},
])
data  # bare expression: displays the frame when run as a notebook cell
# Every column except the last is a feature; the last column is the label.
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values
[[ 1 2 3 10]
[ 11 22 33 1]
[111 222 333 1]
[111 222 333 1]
[111 21 333 1]
[111 90 333 1]
[111 64 12 1]
[111 222 6 1]]
[1 0 0 0 0 0 0 0]
原生LGB训练
# Hyper-parameters for the native LightGBM training API.
# ('num_leaves' used to be listed twice in this literal; the duplicate key
# was redundant and has been dropped.)
param = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': ['average_precision', 'auc', 'binary_logloss'],
    'num_iterations': 100,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'num_threads': 3,
    'max_depth': -1,
    # NOTE: the toy dataset has only 8 rows, so no split can put 20 rows in
    # a leaf; the model cannot grow any tree and predicts the class prior.
    'min_data_in_leaf': 20,
    'min_sum_hessian_in_leaf': 1e-3,
    'neg_bagging_fraction': 1.0,
    'is_unbalance': True,
}
train_data = lgb.Dataset(X, label=y)
# Filled in by the record_evaluation callback with the per-iteration metrics.
eval_result = {}
# `print_evaluation` was renamed to `log_evaluation` in newer LightGBM
# releases and later removed; use whichever this installation provides.
log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
# NOTE: the training set doubles as the validation set here, so early
# stopping is driven by training metrics only.
model = lgb.train(
    params=param,
    train_set=train_data,
    valid_sets=train_data,
    callbacks=[
        lgb.early_stopping(stopping_rounds=5),
        log_callback(period=10),
        lgb.record_evaluation(eval_result),
    ],
)
save_path = '/Users/wangguisen/Documents/work_space/python/DNCM/oCPX_ctr/oCPX_CTR_test/data/lgb.model'
# Alternative: LightGBM's own text model format instead of pickle.
# model.save_model(save_path)
# model = lgb.Booster(model_file=save_path)
# model.predict(X, num_iteration=model.best_iteration)
# help(pickle.dumps)
with open(save_path, mode='wb') as f:
    pickle.dump(model, f)
# SECURITY: unpickling executes arbitrary code; only load files you wrote
# yourself.
with open(save_path, mode='rb') as f:
    model = pickle.load(f)
model.predict(X, num_iteration=model.best_iteration)
# lgb.plot_metric(eval_result, metric='auc', xlabel='num_iterations', ylabel='auc')
[[ 1 2 3 10]
[ 11 22 33 1]
[111 222 333 1]
[111 222 333 1]
[111 21 333 1]
[111 90 333 1]
[111 64 12 1]
[111 222 6 1]]
[1 0 0 0 0 0 0 0]
[LightGBM] [Warning] There are no meaningful features, as all feature values are constant.
[LightGBM] [Info] Number of positive: 1, number of negative: 7
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 8, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.125000 -> initscore=-1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
[1] training's average_precision: 0.125 training's auc: 0.5 training's binary_logloss: 0.50668
Training until validation scores don't improve for 5 rounds
[LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
[2] training's average_precision: 0.125 training's auc: 0.5 training's binary_logloss: 0.50668
[LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
[3] training's average_precision: 0.125 training's auc: 0.5 training's binary_logloss: 0.50668
[LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
[4] training's average_precision: 0.125 training's auc: 0.5 training's binary_logloss: 0.50668
[LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
[5] training's average_precision: 0.125 training's auc: 0.5 training's binary_logloss: 0.50668
......
array([0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])
sklearn LGB训练
from lightgbm.sklearn import LGBMClassifier

# Same binary-classification setup as the native API, through the
# scikit-learn wrapper. The metric list previously contained 'acc', which is
# not a metric name LightGBM documents; 'auc' matches the other snippets.
lgbmodel = LGBMClassifier(
    objective='binary',
    boosting='gbdt',
    metric=['average_precision', 'auc', 'binary_logloss'],
    num_threads=4,
    num_leaves=31,
    num_iterations=100,
    learning_rate=0.1,
    max_depth=3,
    feature_fraction=0.9,
    lambda_l1=0.1,
    lambda_l2=0.1,
    min_gain_to_split=0,
    min_sum_hessian_in_leaf=0.1,
    # NOTE: with only 8 training rows no leaf can hold 20 samples, so the
    # model cannot split and every row gets the same probability.
    min_data_in_leaf=20,
)
# Every column except the last is a feature; the last column is the label.
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values
lgbmodel.fit(X, y)
# Returns one row per sample: [P(class 0), P(class 1)].
lgbmodel.predict_proba(X)
array([[0.875, 0.125],
[0.875, 0.125],
[0.875, 0.125],
[0.875, 0.125],
[0.875, 0.125],
[0.875, 0.125],
[0.875, 0.125],
[0.875, 0.125]])
随机搜索调参
from sklearn.model_selection import RandomizedSearchCV
from lightgbm.sklearn import LGBMClassifier

print(X)
print(y)

# Settings shared by every candidate model in the search.
fixed_params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': ['average_precision', 'auc', 'binary_logloss'],
    'num_threads': 4,
}

# A wider search space, kept for reference:
# params = {
#     'num_leaves': [20, 25, 31, 35, 40],
#     'num_iterations': [100, 200, 300, 350, 400],
#     'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1, 0.2],
#     'max_depth': [3, 5, 6, 7, 9, 12, 15, 17, 25],
#     'feature_fraction': [0.7, 0.8, 0.9, 1],
#     'lambda_l1': [0, 0.01, 0.05, 0.1],
#     'lambda_l2': [0, 0.01, 0.05, 0.1],
#     'min_gain_to_split': [0, 0.05, 0.1, 0.3],
#     'min_sum_hessian_in_leaf': [0.1, 1, 3, 5, 7],
#     'min_data_in_leaf': [20, 30, 50, 60]
# }

# Reduced search space actually used below (128 combinations in total).
params = {
    'num_leaves': [20, 25],
    'num_iterations': [100],
    'learning_rate': [0.01, 0.015],
    'max_depth': [3, 5],
    'feature_fraction': [0.9, 1],
    'lambda_l1': [0, 0.01],
    'lambda_l2': [0, 0.01],
    'min_gain_to_split': [0, 0.05],
    'min_sum_hessian_in_leaf': [0.1],
    'min_data_in_leaf': [20],
}

model = LGBMClassifier(**fixed_params)
# random_state pins which 50 of the 128 combinations are sampled, so the
# reported best parameters and score are reproducible between runs.
# NOTE(review): the toy labels contain a single positive sample, fewer than
# cv=5 folds; most folds therefore hold no positive example at all and the
# score is not meaningful beyond demonstrating the API.
optimized_GBM = RandomizedSearchCV(
    model,
    params,
    n_iter=50,
    cv=5,
    n_jobs=4,
    random_state=42,
)
optimized_GBM.fit(X, y)
print('参数的最佳取值:{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))
参数的最佳取值:{'num_leaves': 20, 'num_iterations': 100, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.05, 'min_data_in_leaf': 20, 'max_depth': 5, 'learning_rate': 0.015, 'lambda_l2': 0, 'lambda_l1': 0, 'feature_fraction': 0.9}
最佳模型得分:0.9
https://lightgbm.readthedocs.io/en/latest/Parameters.html
-
调参:
注意:LightGBM 底层使用 OpenMP 多线程做并行优化。如果服务接口再用多进程(fork)包装多个模型,模型预测时可能会阻塞(挂起);可以把 num_threads 设为 1,或改用 spawn 方式启动子进程来规避。