假数据生成

import pickle

import lightgbm as lgb
import pandas as pd
from lightgbm.sklearn import LGBMClassifier
from scipy.stats import pearsonr
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.feature_selection import VarianceThreshold
from xgboost.sklearn import XGBClassifier

# Toy dataset: four integer feature columns (A-D) and a binary label column `y`.
# Only the first of the eight rows is a positive example.
data = pd.DataFrame(
    [
        [1, 2, 3, 10, 1],
        [11, 22, 33, 1, 0],
        [111, 222, 333, 1, 0],
        [111, 222, 333, 1, 0],
        [111, 21, 333, 1, 0],
        [111, 90, 333, 1, 0],
        [111, 64, 12, 1, 0],
        [111, 222, 6, 1, 0],
    ],
    columns=['A', 'B', 'C', 'D', 'y'],
)

# Bare expression: displays the frame when run as a notebook cell.
data

# Feature matrix = every column except the label; target vector = the label column.
X = data.drop(columns='y').values
y = data['y'].values

（原文此处为 `data` 的表格截图。）`X` 和 `y` 的内容如下：

[[  1   2   3  10]
 [ 11  22  33   1]
 [111 222 333   1]
 [111 222 333   1]
 [111  21 333   1]
 [111  90 333   1]
 [111  64  12   1]
 [111 222   6   1]]
[1 0 0 0 0 0 0 0]

原生LGB训练

# Parameters for the native LightGBM training API (binary classification, GBDT).
# Each key appears exactly once: a repeated key in a dict literal is silently
# overwritten by its last occurrence, which hides typos.
# NOTE(review): the demo frame has only 8 rows, so 'min_data_in_leaf': 20 looks
# impossible to satisfy and the trees presumably never split -- confirm this is
# intended for the demo, or lower it when training on such a tiny sample.
param = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': ['average_precision', 'auc', 'binary_logloss'],
    'num_iterations': 100,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'num_threads': 3,
    'max_depth': -1,
    'min_data_in_leaf': 20,
    'min_sum_hessian_in_leaf': 1e-3,
    'neg_bagging_fraction': 1.0,
    'is_unbalance': True,
}

train_data = lgb.Dataset(X, label=y)

# Filled in by the record_evaluation callback with the per-iteration metric values.
eval_result = {}

# `print_evaluation` was renamed to `log_evaluation` in newer LightGBM releases
# and the old name was later removed; resolve whichever one this install has.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# The training set doubles as the validation set here, so early stopping and
# the logged metrics are computed on the training data.
model = lgb.train(
    params=param,
    train_set=train_data,
    valid_sets=train_data,
    callbacks=[
        lgb.early_stopping(stopping_rounds=5),
        _log_evaluation(period=10),
        lgb.record_evaluation(eval_result),
    ],
)


# File the trained booster is serialized to (absolute, machine-specific path).
save_path = '/Users/wangguisen/Documents/work_space/python/DNCM/oCPX_ctr/oCPX_CTR_test/data/lgb.model'

# Alternative using LightGBM's own model file format instead of pickle:
# model.save_model(save_path)
# model = lgb.Booster(model_file=save_path)
# model.predict(X, num_iteration=model.best_iteration)

# help(pickle.dumps)

# Round-trip the booster through pickle: write it out, then read it back into `model`.
# NOTE: only unpickle files you produced yourself; pickle.load can execute arbitrary code.
with open(save_path, 'wb') as model_file:
    pickle.dump(model, model_file)

with open(save_path, 'rb') as model_file:
    model = pickle.load(model_file)

# Score the training rows with the reloaded model, using its best iteration.
model.predict(X, num_iteration=model.best_iteration)
# lgb.plot_metric(eval_result, metric='auc', xlabel='num_iterations', ylabel='auc')

[[  1   2   3  10]
 [ 11  22  33   1]
 [111 222 333   1]
 [111 222 333   1]
 [111  21 333   1]
 [111  90 333   1]
 [111  64  12   1]
 [111 222   6   1]]
[1 0 0 0 0 0 0 0]
[LightGBM] [Warning] There are no meaningful features, as all feature values are constant.
[LightGBM] [Info] Number of positive: 1, number of negative: 7
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 8, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.125000 -> initscore=-1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
[1]	training's average_precision: 0.125	training's auc: 0.5	training's binary_logloss: 0.50668
Training until validation scores don't improve for 5 rounds
[LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
[2]	training's average_precision: 0.125	training's auc: 0.5	training's binary_logloss: 0.50668
[LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
[3]	training's average_precision: 0.125	training's auc: 0.5	training's binary_logloss: 0.50668
[LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
[4]	training's average_precision: 0.125	training's auc: 0.5	training's binary_logloss: 0.50668
[LightGBM] [Warning] Stopped training because there are no more leaves that meet the split requirements
[5]	training's average_precision: 0.125	training's auc: 0.5	training's binary_logloss: 0.50668
......
array([0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125])

sklearn LGB训练

from lightgbm.sklearn import LGBMClassifier
# Same binary GBDT model, built through LightGBM's scikit-learn wrapper.
# The metric list uses LightGBM's own metric names ('auc', not 'acc'), matching
# the metrics used by the native-API and random-search sections of this file.
lgbmodel = LGBMClassifier(
    objective='binary',
    boosting='gbdt',
    metric=['average_precision', 'auc', 'binary_logloss'],
    num_threads=4,
    num_leaves=31,
    num_iterations=100,
    learning_rate=0.1,
    max_depth=3,
    feature_fraction=0.9,
    lambda_l1=0.1,
    lambda_l2=0.1,
    min_gain_to_split=0,
    min_sum_hessian_in_leaf=0.1,
    min_data_in_leaf=20,
)

# Features are every column but the last; the last column is the label.
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values
lgbmodel.fit(X, y)

# One row per sample; the columns follow lgbmodel.classes_.
lgbmodel.predict_proba(X)

array([[0.875, 0.125],
       [0.875, 0.125],
       [0.875, 0.125],
       [0.875, 0.125],
       [0.875, 0.125],
       [0.875, 0.125],
       [0.875, 0.125],
       [0.875, 0.125]])

随机搜索调参

from sklearn.model_selection import RandomizedSearchCV
from lightgbm.sklearn import LGBMClassifier

# Show the feature matrix and labels the search is run on.
print(X)
print(y)

# Settings shared by every candidate model; these are not searched over.
fixed_params = dict(
    objective='binary',
    boosting='gbdt',
    metric=['average_precision', 'auc', 'binary_logloss'],
    num_threads=4,
)

# Wider search space, kept for reference:
# params = {
#     'num_leaves': [20, 25, 31, 35, 40],
#     'num_iterations': [100, 200, 300, 350, 400],
#     'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1, 0.2],
#     'max_depth': [3, 5, 6, 7, 9, 12, 15, 17, 25],
#     'feature_fraction': [0.7, 0.8, 0.9, 1],
#     'lambda_l1': [0, 0.01, 0.05, 0.1],
#     'lambda_l2': [0, 0.01, 0.05, 0.1],
#     'min_gain_to_split': [0, 0.05, 0.1, 0.3],
#     'min_sum_hessian_in_leaf': [0.1, 1, 3, 5, 7],
#     'min_data_in_leaf': [20, 30, 50, 60]
# }

# Reduced search space that the randomized search actually samples from.
params = dict(
    num_leaves=[20, 25],
    num_iterations=[100],
    learning_rate=[0.01, 0.015],
    max_depth=[3, 5],
    feature_fraction=[0.9, 1],
    lambda_l1=[0, 0.01],
    lambda_l2=[0, 0.01],
    min_gain_to_split=[0, 0.05],
    min_sum_hessian_in_leaf=[0.1],
    min_data_in_leaf=[20],
)

model = LGBMClassifier(**fixed_params)

# Sample 50 parameter combinations, each scored with 5-fold cross-validation
# across 4 parallel jobs.
optimized_GBM = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=50,
    cv=5,
    n_jobs=4,
)
optimized_GBM.fit(X, y)

# Report the winning combination and its mean cross-validated score.
print(f'参数的最佳取值：{optimized_GBM.best_params_}')
print(f'最佳模型得分:{optimized_GBM.best_score_}')

参数的最佳取值：{'num_leaves': 20, 'num_iterations': 100, 'min_sum_hessian_in_leaf': 0.1, 'min_gain_to_split': 0.05, 'min_data_in_leaf': 20, 'max_depth': 5, 'learning_rate': 0.015, 'lambda_l2': 0, 'lambda_l1': 0, 'feature_fraction': 0.9}
最佳模型得分:0.9

https://lightgbm.readthedocs.io/en/latest/Parameters.html

调参：
- https://blog.csdn.net/weixin_44414593/article/details/107962732
- https://blog.csdn.net/weixin_43655282/article/details/97687758

注意：LightGBM 底层使用 OpenMP 多线程做并行优化。如果服务接口再用多进程（尤其是 Linux 上以 fork 方式）包装多个模型，LGB 模型在预测时可能会阻塞（挂起）。常见的规避办法是把 `num_threads` 设为 1，或改用 spawn 方式启动子进程。

原生LGB训练demo

文章目录

假数据生成

原生LGB训练

sklearn LGB训练

随机搜索调参

猜你喜欢