Cross-validation and splitting into training and test sets

Since the code is fairly simple, its behavior is easiest to understand by looking directly at the output it produces, so little extra explanation is added.
See the official documentation: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-evaluating-estimator-performance

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

Splitting the data into a training set and a test set

iris=datasets.load_iris()
iris.data.shape,iris.target.shape
((150, 4), (150,))
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target,test_size=0.4,random_state=0)
X_train.shape,y_train.shape
((90, 4), (90,))
X_test.shape,y_test.shape
((60, 4), (60,))
clf=svm.SVC(kernel='linear',C=1).fit(X_train,y_train)
clf.score(X_test,y_test)
0.96666666666666667

Calculate metrics for cross-validation

from sklearn.model_selection import cross_val_score
clf=svm.SVC(kernel='linear',C=1)
scores=cross_val_score(clf,iris.data,iris.target,cv=5)# 5-fold cross-validation
scores
array([ 0.96666667,  1.        ,  0.96666667,  0.96666667,  1.        ])
# The mean score and the 95% confidence interval can be computed:
print("Accuracy: %0.2f(+/-%0.2f)" % (scores.mean(),scores.std()*2))
Accuracy: 0.98(+/-0.03)
# By default, the score computed at each CV iteration is the estimator's score method; this can be changed with the scoring parameter:
from sklearn import metrics
scores=cross_val_score(clf,iris.data,iris.target,cv=5,scoring='f1_macro')
scores
array([ 0.96658312,  1.        ,  0.96658312,  0.96658312,  1.        ])
# Because the classes in the iris dataset are balanced, the accuracy and the F1 score are very similar
# Other cross-validation strategies can also be used by passing a cross-validation iterator, for example:
from sklearn.model_selection import ShuffleSplit
n_samples=iris.data.shape[0]
cv=ShuffleSplit(n_splits=3,test_size=0.3,random_state=0)
cross_val_score(clf,iris.data,iris.target,cv=cv)
array([ 0.97777778,  0.97777778,  1.        ])
# Data transformation with held-out data
from sklearn import preprocessing
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target,test_size=0.4,random_state=0)
scaler=preprocessing.StandardScaler().fit(X_train)
X_train_transformed=scaler.transform(X_train)
clf=svm.SVC(kernel='linear',C=1.0).fit(X_train_transformed,y_train)
X_test_transformed=scaler.transform(X_test)
clf.score(X_test_transformed,y_test)
0.93333333333333335
# Pipelines make it easier to compose estimators and provide a more convenient way of working under cross-validation:
from sklearn.pipeline import make_pipeline
clf=make_pipeline(preprocessing.StandardScaler(),svm.SVC(C=1))
cross_val_score(clf,iris.data,iris.target,cv=cv)
array([ 0.97777778,  0.93333333,  0.95555556])
# Obtaining predictions by cross-validation
# cross_val_predict is similar to cross_val_score, but returns, for each element of the input, the prediction obtained for that element
from sklearn.model_selection import cross_val_predict
predicted=cross_val_predict(clf,iris.data,iris.target,cv=10)
metrics.accuracy_score(iris.target,predicted)
0.96666666666666667

Cross-validation iterators

# For data assumed to be independent and identically distributed (i.i.d.)
# KFold divides the samples into k groups (folds); when k=n, this is the leave-one-out method.
import numpy as np
from sklearn.model_selection import KFold
X=["a","b","c","d"]
kf=KFold(n_splits=2)
for train,test in kf.split(X):
    print("%s %s" %(train,test))#生成的折用下标表示原始数据位置
[2 3] [0 1]
[0 1] [2 3]
X=np.array([[0.,0.],[1.,1.],[-1.,-1.],[2.,2.]])
y=np.array([0,1,0,1])
# the index arrays from the last fold can be used to slice the data directly:
X_train,X_test,y_train,y_test=X[train],X[test],y[train],y[test]
# Leave One Out (LOO)
from sklearn.model_selection import LeaveOneOut
X=[1,2,3,4]
loo=LeaveOneOut()
for train,test in loo.split(X):
    print("%s %s"%(train,test))
[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]
# Leave P Out (LPO) is similar to LeaveOneOut: it creates all possible test sets of p samples chosen from the n samples (C(n,p) combinations, here C(4,2)=6)
from sklearn.model_selection import LeavePOut
X=np.ones(4)
lpo=LeavePOut(p=2)
for train,test in lpo.split(X):
    print("%s %s" %(train,test))
[2 3] [0 1]
[1 3] [0 2]
[1 2] [0 3]
[0 3] [1 2]
[0 2] [1 3]
[0 1] [2 3]

Random permutations cross-validation aka Shuffle & Split

# ShuffleSplit lets you set the number of split iterations and the train/test split proportion
from sklearn.model_selection import ShuffleSplit
X=np.arange(5)
ss=ShuffleSplit(n_splits=3,test_size=0.25,random_state=0)# random_state makes the shuffling reproducible
for train_index,test_index in ss.split(X):
    print("%s %s" %(train_index,test_index))
[1 3 4] [2 0]
[1 4 3] [0 2]
[4 0 2] [1 3]

Stratified cross-validation iterators based on class labels

# In supervised classification, when the classes are imbalanced, stratified splitting of the training and test sets should be considered
# StratifiedKFold and StratifiedShuffleSplit

# Stratified k-fold
from sklearn.model_selection import StratifiedKFold
X=np.ones(10)
y=[0,0,0,0,1,1,1,1,1,1]
skf=StratifiedKFold(n_splits=3)
for train,test in skf.split(X,y):
     print("%s %s"%(train,test))
[2 3 6 7 8 9] [0 1 4 5]
[0 1 3 4 5 8 9] [2 6 7]
[0 1 2 4 5 6 7] [3 8 9]
# StratifiedShuffleSplit is a variation of ShuffleSplit that returns stratified splits, i.e. it preserves the same class proportions in the training and validation sets, as in the sketch below.
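StratifiedShuffleSplit is not demonstrated in the original walkthrough; below is a minimal sketch, reusing the toy data from the StratifiedKFold example above with parameter values of my own choosing:

from sklearn.model_selection import StratifiedShuffleSplit
X=np.ones(10)
y=[0,0,0,0,1,1,1,1,1,1]
# by stratification, every test set keeps the 4:6 class ratio (2 zeros, 3 ones)
sss=StratifiedShuffleSplit(n_splits=3,test_size=0.5,random_state=0)
for train,test in sss.split(X,y):
    print("%s %s"%(train,test))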

Cross-validation iterators for grouped data

# Group k-fold ensures that samples from the same group appear only in the training set or only in the test set.
from sklearn.model_selection import GroupKFold
X=[0.1,0.2,2.2,2.4,2.3,4.55,5.8,8.8,9,10]
y=["a","b","b","b","c","c","c","d","d","d"]
groups=[1,1,1,2,2,2,3,3,3,3]
gkf=GroupKFold(n_splits=3)
for train,test in gkf.split(X,y,groups=groups):
    print("%s %s" %(train,test))
[0 1 2 3 4 5] [6 7 8 9]
[0 1 2 6 7 8 9] [3 4 5]
[3 4 5 6 7 8 9] [0 1 2]
#Leave One Group Out
from sklearn.model_selection import LeaveOneGroupOut
X=[1,5,10,50,60,70,80]
y=[0,1,1,2,2,2,2]
groups=[1,1,2,2,3,3,3]
logo=LeaveOneGroupOut()
for train,test in logo.split(X,y,groups=groups):
    print("%s %s"%(train,test))
[2 3 4 5 6] [0 1]
[0 1 4 5 6] [2 3]
[0 1 2 3] [4 5 6]
# Leave P Groups Out
from sklearn.model_selection import LeavePGroupsOut

X = np.arange(6)
y = [1, 1, 1, 2, 2, 2]
groups = [1, 1, 2, 2, 3, 3]
lpgo = LeavePGroupsOut(n_groups=2)
for train, test in lpgo.split(X, y, groups=groups):
    print("%s %s" % (train, test))
[4 5] [0 1 2 3]
[2 3] [0 1 4 5]
[0 1] [2 3 4 5]
#Group Shuffle Split
from sklearn.model_selection import GroupShuffleSplit

X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
y = ["a", "b", "b", "b", "c", "c", "c", "a"]
groups = [1, 1, 2, 2, 3, 3, 4, 4]
gss = GroupShuffleSplit(n_splits=4, test_size=0.5, random_state=0)
for train, test in gss.split(X, y, groups=groups):
    print("%s %s" % (train, test))
[0 1 2 3] [4 5 6 7]
[2 3 6 7] [0 1 4 5]
[2 3 4 5] [0 1 6 7]
[4 5 6 7] [0 1 2 3]

Cross-validation of time series data

# Time Series Split: TimeSeriesSplit uses the first k folds as the training set and the (k+1)-th fold as the test set, so successive training sets are supersets of the earlier ones
from sklearn.model_selection import TimeSeriesSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
tscv=TimeSeriesSplit(n_splits=3)
print(tscv)
for train,test in tscv.split(X):
    print("%s %s"%(train,test))
TimeSeriesSplit(n_splits=3)
[0 1 2] [3]
[0 1 2 3] [4]
[0 1 2 3 4] [5]
