# sklearn.GridSearchCV: selecting hyper-parameters (sklearn.GridSearchCV选择超参)

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# Load the handwritten-digits dataset that ships with scikit-learn.
digits = datasets.load_digits()

# A classifier needs a 2-D (samples, features) matrix, so each image is
# flattened from its pixel grid into a single row, one sample per row.
n_samples = digits.images.shape[0]
y = digits.target
X = digits.images.reshape(n_samples, -1)

# Split the dataset into two equal halves: a training part and a test part.
# Note that the split happens here, before any hyper-parameter search.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

# Candidate hyper-parameter grids; the best combination is picked by
# cross-validation, i.e. using validation folds rather than the test set.
# The main SVM hyper-parameters are the regularisation-like coefficient C
# and the kernel function (plus gamma for the RBF kernel).
tuned_parameters = [
    dict(kernel=['rbf'], gamma=[1e-3, 1e-4], C=[1, 10, 100, 1000]),
    dict(kernel=['linear'], C=[1, 10, 100, 1000]),
]

# The search is run once per evaluation metric: precision, then recall.
scores = ['precision', 'recall']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Use GridSearchCV to search for the best hyper-parameters, scoring each
    # candidate by the macro-averaged variant of the current metric.
    clf = GridSearchCV(
        SVC(),
        tuned_parameters,
        cv=5,
        scoring='%s_macro' % score,
    )
    # Only the training half from the earlier split is passed in.
    # Cross-validation splits that data again internally, so its held-out
    # folds can be regarded as validation sets used to choose the
    # hyper-parameters.
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()

    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        # The spread is reported as twice the standard deviation.
        line = "%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)
        print(line)
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()

    # Only this final evaluation on the held-out test set reflects the
    # generalisation error.
    y_pred = clf.predict(X_test)
    y_true = y_test
    print(classification_report(y_true, y_pred))
    print()
    # For the relationship between a validation set and cross-validation,
    # see https://www.jianshu.com/p/67010cba1834
# sklearn.GridSearchCV: selecting hyper-parameters (sklearn.GridSearchCV选择超参)

# You may also like (猜你喜欢)