极端梯度提升算法

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
"""
  1. booster[默认是gbtree]
  选择每次迭代的模型,有两种选择:gbtree基于树的模型、gbliner线性模型
  2. silent[默认是0]
  当这个参数值为1的时候,静默模式开启,不会输出任何信息。一般这个参数保持默认的0,这样可以帮我们更好的理解模型。
  3. nthread[默认值为最大可能的线程数]
  这个参数用来进行多线程控制,应当输入系统的核数,如果你希望使用cpu全部的核,就不要输入这个参数,算法会自动检测。
  4. objective[默认是reg:linear]
  这个参数定义需要被最小化的损失函数。最常用的值有:binary:logistic二分类的逻辑回归,返回预测的概率非类别。multi:softmax使用softmax的多分类器,返回预测的类别。在这种情况下,你还要多设置一个参数:num_class类别数目。
  5. eval_metric[默认值取决于objective参数的取之]
  对于有效数据的度量方法。对于回归问题,默认值是rmse,对于分类问题,默认是error。典型值有:rmse均方根误差;mae平均绝对误差;logloss负对数似然函数值;error二分类错误率;merror多分类错误率;mlogloss多分类损失函数;auc曲线下面积。
  6. seed[默认是0]
随机数的种子,设置它可以复现随机数据的结果,也可以用于调整参数。
"""
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb


def demo1():
# xgboost 有封装好的分类器和回归器,可以直接用 XGBClassifier 建立模型
model = XGBClassifier(learning_rate=0.1)
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True)
# model.fit(X_train, y_train)
# gradient boosting 还有一个优点是可以给出训练好的模型的特征重要性
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


def demo2():
model = XGBClassifier(learning_rate=0.1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
plot_importance(model)
pyplot.show()


def demo3():
model = XGBClassifier()
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = dict(learning_rate=learning_rate)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
# print(kfold)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X, Y)
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
print("%f (%f) with: %r" % (mean, stdev, param))


if __name__ == '__main__':
dataset = loadtxt('../dataset/pima-indians-diabetes.csv', delimiter=",")
X = dataset[:, 0:8]
Y = dataset[:, 8]
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# lightgbm
num_round = 50
lgb_train = lgb.Dataset(X_train, y_train) # 将数据保存到LightGBM二进制文件将使加载更快
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) # 创建验证数据
params = {
'task': 'train',
'boosting_type': 'gbdt', # 设置提升类型
'objective': 'regression', # 目标函数
'metric': {'l2', 'auc'}, # 评估函数
'num_leaves': 31, # 叶子节点数
'learning_rate': 0.05, # 学习速率
'feature_fraction': 0.9, # 建树的特征选择比例
'bagging_fraction': 0.8, # 建树的样本采样比例
'bagging_freq': 5, # k 意味着每 k 次迭代执行bagging
'verbose': 1 # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
}
print('Start training...')
# 训练 cv and train
gbm = lgb.train(params, lgb_train, num_boost_round=20, valid_sets=lgb_eval,
early_stopping_rounds=5) # 训练数据需要参数列表和数据集
print('Start predicting...')
# 预测数据集
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) #如果在训练期间启用了早期停止,可以通过best_iteration方式从最佳迭代中获得预测
# 评估模型
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) # 计算真实值和预测值之间的均方根误差

猜你喜欢

转载自www.cnblogs.com/xiennnnn/p/11970416.html
今日推荐