## sklearn 三种分类算法评估

``````from sklearn import cross_validation
from sklearn import datasets
from sklearn import naive_bayes
from sklearn import svm
from sklearn import ensemble
from sklearn import metrics
from numpy import argmax

acc_for_NB = []             #使用accuracy评估三个算法
acc_for_SVC = []
acc_for_RFC = []

f1_for_NB = []              # 使用F1-score评估三个算法
f1_for_SVC = []
f1_for_RFC = []

auc_for_NB = []             # 使用AUC ROC评估三个算法
auc_for_SVC = []
auc_for_RFC = []

X, Y = datasets.make_classification(n_samples = 1000, n_features = 10)
kf = cross_validation.KFold(1000, n_folds = 10, shuffle = True)
for train_index, test_index in kf:
X_train, y_train = X[train_index], Y[train_index]
X_test, y_test = X[test_index], Y[test_index]

#使用NB算法
NBclf = naive_bayes.GaussianNB()
NBclf.fit(X_train, y_train)
NBpred = NBclf.predict(X_test)

#分别用三种方法评估NB算法结果
acc_for_NB.append(metrics.accuracy_score(y_test, NBpred))
f1_for_NB.append(metrics.f1_score(y_test, NBpred))
auc_for_NB.append(metrics.roc_auc_score(y_test, NBpred))

#使用SVC算法，先找出最佳的参数C
nn = len(X_train)
Cvalues = [1e-02, 1e-01, 1e00, 1e01, 1e02]
Cscore = []       #记录每个C取值的评估分数
for C in Cvalues:
#将X_train分为5个fold
ikf = cross_validation.KFold(nn, n_folds = 5, shuffle = True)
innerscore = []
#对一个C的取值进行评估
for inner_train_index, inner_test_index in ikf:
inner_X_train, inner_X_test = X_train[inner_train_index], X_train[inner_test_index]
inner_y_train, inner_y_test = y_train[inner_train_index], y_train[inner_test_index]

innerSVCclf = svm.SVC(C = C, kernel = 'rbf')
innerSVCclf.fit(inner_X_train, inner_y_train)
innerSVCpred = innerSVCclf.predict(inner_X_test)
innerscore.append(metrics.accuracy_score(inner_y_test, innerSVCpred))

#记录一个C取值评估分数的平均值，作为该C取值的评估
Cscore.append(sum(innerscore) / len(innerscore))
#取评估结果最好的C取值，进行SVC算法
bestC = Cvalues[argmax(Cscore)]
print("The best C is:", bestC)
SVCclf = svm.SVC(C = bestC, kernel = 'rbf')
SVCclf.fit(X_train, y_train)
SVCpred = SVCclf.predict(X_test)

#用三种方法评估SVC
acc_for_SVC.append(metrics.accuracy_score(y_test, SVCpred))
f1_for_SVC.append(metrics.f1_score(y_test, SVCpred))
auc_for_SVC.append(metrics.roc_auc_score(y_test, SVCpred))

#下面使用RFC算法，先找出最佳n_estimators取值
n_estimators_values = [10, 100, 1000]
n_estimators_scores = []      #记录每个n_estimators评估分数
for n_estimator in n_estimators_values:
ikf = cross_validation.KFold(nn, n_folds = 5, shuffle = True)
innerscore = []
for inner_train_index, inner_test_index in ikf:
inner_X_train, inner_X_test = X_train[inner_train_index], X_train[inner_test_index]
inner_y_train, inner_y_test = y_train[inner_train_index], y_train[inner_test_index]

innerRFCclf = ensemble.RandomForestClassifier(n_estimators=n_estimator)
innerRFCclf.fit(inner_X_train, inner_y_train)
innerRFCpred = innerRFCclf.predict(inner_X_test)
innerscore.append(metrics.accuracy_score(inner_y_test, innerRFCpred))

#记录每个n_estimators评分平均值，作为一个取值的评分
n_estimators_scores.append(sum(innerscore) / len(innerscore))

#取评分最好的n_estimators取值
best_n_estimators = n_estimators_values[argmax(n_estimators_scores)]
print("The best n_estimator is:", best_n_estimators)
RFCclf = ensemble.RandomForestClassifier(n_estimators=best_n_estimators)
RFCclf.fit(X_train, y_train)
RFCpred = RFCclf.predict(X_test)

#用三种方法评估RFC算法
acc_for_RFC.append(metrics.accuracy_score(y_test, RFCpred))
f1_for_RFC.append(metrics.f1_score(y_test, RFCpred))
auc_for_RFC.append(metrics.roc_auc_score(y_test, RFCpred))

print("Naive Bayes:")

print("Evaluated by accuracy score:")
print(acc_for_NB)
print("Average:", sum(acc_for_NB) / len(acc_for_NB))
print()

print("Evaluated by f1 score:")
print(f1_for_NB)
print("Average:", sum(f1_for_NB) / len(f1_for_NB))
print()

print("Evaluated by roc auc score:")
print(auc_for_NB)
print("Average:", sum(auc_for_NB) / len(auc_for_NB))
print()

print("SVC:")

print("Evaluated by accuracy score:")
print(acc_for_SVC)
print("Average:", sum(acc_for_SVC) / len(acc_for_SVC))
print()

print("Evaluated by f1 score:")
print(f1_for_SVC)
print("Average:", sum(f1_for_SVC) / len(f1_for_SVC))
print()

print("Evaluated by roc auc score:")
print(auc_for_SVC)
print("Average:", sum(auc_for_SVC) / len(auc_for_SVC))
print()

print("Random Forest:")

print("Evaluated by accuracy score:")
print(acc_for_RFC)
print("Average:", sum(acc_for_RFC) / len(acc_for_RFC))
print()

print("Evaluated by f1 score:")
print(f1_for_RFC)
print("Average:", sum(f1_for_RFC) / len(f1_for_RFC))
print()

print("Evaluated by roc auc score:")
print(auc_for_RFC)
print("Average:", sum(auc_for_RFC) / len(auc_for_RFC))
print()``````

``````The best C is: 1.0
The best n_estimator is: 10
The best C is: 1.0
The best n_estimator is: 1000
The best C is: 1.0
The best n_estimator is: 10
The best C is: 1.0
The best n_estimator is: 10
The best C is: 1.0
The best n_estimator is: 1000
The best C is: 1.0
The best n_estimator is: 100
The best C is: 1.0
The best n_estimator is: 100
The best C is: 1.0
The best n_estimator is: 1000
The best C is: 1.0
The best n_estimator is: 10
The best C is: 1.0
The best n_estimator is: 1000``````

SVC算法待确定参数为C，RFC算法待确定参数为n_estimator，在10次迭代中，C的最佳取值总是1.0，而n_estimator的三种取值都可能成为最佳取值。

``````Evaluated by accuracy score:
[0.94, 0.96, 0.98, 0.94, 0.94, 0.95, 0.91, 0.95, 0.93, 0.9]
Average: 0.9400000000000001``````

``````Evaluated by f1 score:
[0.9375, 0.9574468085106385, 0.9743589743589743, 0.9189189189189189, 0.9464285714285715, 0.9532710280373831, 0.9072164948453607, 0.9549549549549549, 0.9421487603305785, 0.8979591836734694]
Average: 0.939020369505885``````

``````Evaluated by roc auc score:
[0.94, 0.9598554797270173, 0.975, 0.9277007145859605, 0.9416564667482661, 0.9487179487179487, 0.909763905562225, 0.9484848484848485, 0.9333616298811546, 0.9007603041216488]
Average: 0.938530129782907``````

``````Evaluated by accuracy score:
[0.95, 0.98, 0.98, 0.98, 0.98, 0.98, 0.97, 0.97, 0.99, 0.97]
Average: 0.975``````

``````Evaluated by f1 score:
[0.9504950495049505, 0.9787234042553191, 0.975, 0.9743589743589743, 0.9821428571428572, 0.9807692307692307, 0.968421052631579, 0.972972972972973, 0.991869918699187, 0.9696969696969697]
Average: 0.9744450430032039``````

``````Evaluated by roc auc score:
[0.95, 0.9799277398635086, 0.9791666666666667, 0.9789827658680118, 0.9824561403508771, 0.9799679487179487, 0.9693877551020409, 0.9686868686868688, 0.9919354838709677, 0.9705882352941176]
Average: 0.9751099604421007``````

``````Evaluated by accuracy score:
[0.99, 0.97, 0.98, 0.98, 0.98, 0.97, 0.96, 0.96, 0.99, 0.97]
Average: 0.975``````

``````Evaluated by f1 score:
[0.99009900990099, 0.968421052631579, 0.9743589743589743, 0.9743589743589743, 0.9821428571428572, 0.9714285714285713, 0.9583333333333333, 0.9642857142857142, 0.991869918699187, 0.9696969696969697]
Average: 0.9744995375837149``````

``````Evaluated by roc auc score:
[0.99, 0.9704937775993576, 0.975, 0.9789827658680118, 0.9824561403508771, 0.969551282051282, 0.9595838335334134, 0.9575757575757576, 0.9919354838709677, 0.9705882352941176]
Average: 0.9746167276143785``````

0条评论