预估器建模方式

1.Xgboost建模,sklearn评估
2.网格搜索交叉验证找最优参数
3.early-stop早停止
4.特征重要度
5.并行训练加速

#预估器建模方式:sklearn与XGboost配合使用
#xgboost建模,sklearn评估
import pickle
import xgboost as xgb

import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.datasets import load_iris, load_digits, load_boston

rng = np.random.RandomState(31337)

#二分类:混淆矩阵
print("数字0和1的二分类问题")
digits = load_digits(2) #2表示类别的意思
y = digits['target']
X = digits['data']
#数据切分对象
kf = KFold(n_splits=2,shuffle=True,random_state=rng)#对数据进行切分
print("在2着数据上的交叉验证")
#2折交叉验证
for train_index,test_index in kf.split(X):
    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])#初始化XGB分类器,拟合数据
    predictions = xgb_model.predict(X[test_index])#预估
    actuals = y[test_index]
    print("混淆矩阵")
    print(confusion_matrix(actuals,predictions))

#多分类,混淆矩阵
print('\nIris:多分类')
iris = load_iris()
y = iris['target']
X = iris['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print("在2折数据上的交叉验证")
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print("混淆矩阵:")
    print(confusion_matrix(actuals, predictions))


#回归问题:MSE
print("\n波士顿房价回归预测问题")
boston = load_boston()
y = boston['target']
X = boston['data']
kf = KFold(n_splits=2, shuffle=True, random_state=rng)
print("在2折数据上的交叉验证")
for train_index, test_index in kf.split(X):
    xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print("MSE:",mean_squared_error(actuals, predictions))

#网格搜索交叉验证查找最优超参数
print("参数最优化:")
y = boston['target']
X = boston['data']
xgb_model = xgb.XGBRegressor()#以回归类问题为了,初始化一个回归器
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2,4,6],
                    'n_estimators': [50,100,200]}, verbose=1)
clf.fit(X,y)
print(clf.best_score_) #最好的得分
print(clf.best_params_)#最好的参数
#early-stopping 早停(另外一种调参方式)
# 在训练集上学习模型,一颗一颗树添加,在验证集上看效果,当验证集效果不再提升,停止树的添加与生长
X = digits['data']
y = digits['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
clf = xgb.XGBClassifier()
#early_stopping_rounds=10:表示再验证集上最多有耐心等10次,如果加了10棵树还未提升,则停止,追加到10次之前未最好的效果
clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
        eval_set=[(X_val, y_val)])
#

#特征重要度
iris = load_iris()
y = iris['target']
X = iris['data']
xgb_model = xgb.XGBClassifier().fit(X,y)

print('特征排序:')
feature_names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
#模型中的属性feature_importances_
feature_importances = xgb_model.feature_importances_
indices = np.argsort(feature_importances)[::-1]

for index in indices:
    print("特征 %s 重要度为 %f" %(feature_names[index], feature_importances[index]))


import matplotlib.pyplot as plt
plt.figure(figsize=(16,8))
plt.title("feature importances")
plt.bar(range(len(feature_importances)), feature_importances[indices], color='b')
plt.xticks(range(len(feature_importances)), np.array(feature_names)[indices], color='b')
import os

if __name__ == "__main__":
    try:
        from multiprocessing import set_start_method
    except ImportError:
        raise ImportError("Unable to import multiprocessing.set_start_method."
                          " This example only runs on Python 3.4")
    #set_start_method("forkserver")

    import numpy as np
    from sklearn.model_selection import GridSearchCV
    from sklearn.datasets import load_boston
    import xgboost as xgb

    rng = np.random.RandomState(31337)

    print("Parallel Parameter optimization")
    boston = load_boston()

    os.environ["OMP_NUM_THREADS"] = "2"  # or to whatever you want
    y = boston['target']
    X = boston['data']
    xgb_model = xgb.XGBRegressor()
    clf = GridSearchCV(xgb_model, {'max_depth': [2, 4, 6],
                                   'n_estimators': [50, 100, 200]}, verbose=1,
                       n_jobs=2)
    clf.fit(X, y)
    print(clf.best_score_)
    print(clf.best_params_)
发布了30 篇原创文章 · 获赞 0 · 访问量 425

猜你喜欢

转载自blog.csdn.net/qq_38319401/article/details/104360413