数据:datingTestSet
# -*- coding: utf-8 -*-
"""
Created on Tue May  8 16:44:29 2018

Compare several sklearn classifiers on the dating data set
(datingTestSet.txt): KNN, random forest, decision tree, GBDT.
For each model, report the misclassification rate on the train
and test splits.

@author: sun_y
"""
import numpy as np
import sklearn as skl
# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
import file2

# file2.file2matrix parses the text file into feature matrix X and labels Y.
X, Y = file2.file2matrix("datingTestSet.txt")
# Hold out 30% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

# KNN
from sklearn import neighbors
model1 = neighbors.KNeighborsClassifier()  # KNN classifier, default k
model1.fit(X_train, y_train)
y_pred = model1.predict(X_train)
train_error = np.mean(y_pred != y_train)  # misclassification rate on train
print("KNN train error is %f" % train_error)
y_pred_test = model1.predict(X_test)
test_error = np.mean(y_pred_test != y_test)  # misclassification rate on test
print("KNN test error is %f" % test_error)
print("*********************************")

# randomforest
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
model2 = clf.fit(X_train, y_train)
y_pred = model2.predict(X_train)
train_error = np.mean(y_pred != y_train)
print("randomforest train error is %f" % train_error)
y_pred_test = model2.predict(X_test)
test_error = np.mean(y_pred_test != y_test)
print("randomforest test error is %f" % test_error)
print("*********************************")

# decision tree
from sklearn import tree
clf = tree.DecisionTreeClassifier()
model4 = clf.fit(X_train, y_train)
y_pred = model4.predict(X_train)
train_error = np.mean(y_pred != y_train)
print("decision tree train error is %f" % train_error)
y_pred_test = model4.predict(X_test)
test_error = np.mean(y_pred_test != y_test)
print("decision tree test error is %f" % test_error)
print("*********************************")

# GBDT (Gradient Boosting Decision Tree) Classifier
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=200)
model5 = clf.fit(X_train, y_train)
y_pred = model5.predict(X_train)
train_error = np.mean(y_pred != y_train)
print("GBDT train error is %f" % train_error)
y_pred_test = model5.predict(X_test)
test_error = np.mean(y_pred_test != y_test)
print("GBDT test error is %f" % test_error)
print("*********************************")

# Standardize the data for the scale-sensitive models below (logistic
# regression, SVM, MLP).
# FIX: the original called preprocessing.scale on X_train and X_test
# independently, which leaks test-set statistics and scales the two splits
# with different means/variances.  Fit the scaler on the training set only
# and apply the same transform to both splits.
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty='l2')
model3 = clf.fit(X_train, y_train)
y_pred = model3.predict(X_train)
train_error = np.mean(y_pred != y_train)  # misclassification rate on train
print("Logistic Regression train error is %f" % train_error)
y_pred_test = model3.predict(X_test)
test_error = np.mean(y_pred_test != y_test)
print("Logistic Regression test error is %f" % test_error)
print("*********************************")

# SVM Classifier
from sklearn.svm import SVC
clf = SVC(kernel='rbf', probability=True)
model6 = clf.fit(X_train, y_train)
y_pred = model6.predict(X_train)
train_error = np.mean(y_pred != y_train)
# FIX: use %f for consistency with every other model (original used %r here).
print("SVM train error is %f" % train_error)
y_pred_test = model6.predict(X_test)
test_error = np.mean(y_pred_test != y_test)
print("SVM test error is %f" % test_error)
print("*********************************")

# nn (multi-layer perceptron)
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)
model7 = clf.fit(X_train, y_train)
# FIX: removed a stray bare MLPClassifier(...) expression (pasted repr
# output) that constructed an unused estimator and had no effect.
y_pred = model7.predict(X_train)
train_error = np.mean(y_pred != y_train)
print("NN train error is %f" % train_error)
y_pred_test = model7.predict(X_test)
test_error = np.mean(y_pred_test != y_test)
print("NN test error is %f" % test_error)
【测试结果】:
KNN train error is 0.148571 KNN test error is 0.206667 ********************************* randomforest train error is 0.000000 randomforest test error is 0.036667 ********************************* decision tree train error is 0.000000 decision tree test error is 0.046667 ********************************* GBDT train error is 0.000000 GBDT test error is 0.040000 ********************************* Logistic Regression train error is 0.095714 Logistic Regression test error is 0.086667 ********************************* SVM train error is 0.037142857142857144 SVM test error is 0.03666666666666667 ********************************* NN train error is 0.032857 NN test error is 0.056667
【tips: 】
1.
在将数据输入 logistic regression、svm 以及 nn 之前,需要先对数据进行归一化处理。
2. 但是类似于 KNN 这样需要对数据求距离的模型,不需要对数据进行归一化处理,否则会削弱大距离数据的影响。