Using sklearn's PCA

1. Data preparation

# Import the required packages
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import time

# Read the training and test data
train = pd.read_csv('./data/MNIST_train.csv')
test = pd.read_csv('./data/MNIST_test.csv')

y_train = train.label.values
X_train = train.drop("label",axis=1).values
X_test = test.values 

# The raw values are pixel intensities in [0, 255], so scale them to [0, 1]
X_train = X_train / 255.0
X_test = X_test / 255.0

# Feature dimensionality and number of samples of the raw input
print('the shape of train_image: {}'.format(X_train.shape))
print('the shape of test_image: {}'.format(X_test.shape))

# To keep the runtime manageable on a modest machine, use only the first 10,000 rows
X_train = X_train[:10000]
X_test = X_test[:10000]
y_train = y_train[:10000]

2. Hyperparameter search

# Split the training data into a training part and a validation part; the best hyperparameter (the PCA dimensionality) is selected on the validation set
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=0)

# Sample counts of the training and validation parts after the split
print(X_train_part.shape)
print(X_val.shape)

# Train and evaluate the model for one parameter setting (PCA n_components = n) and return its accuracy on the validation set
def n_component_analysis(n, X_train, y_train, X_val, y_val):
    start = time.time()
    
    pca = PCA(n_components=n)
    print("PCA begin with n_components: {}".format(n));
    pca.fit(X_train)
    
    # Apply the learned projection to the training and validation sets
    X_train_pca = pca.transform(X_train)
    X_val_pca = pca.transform(X_val)
    
    # 利用SVC训练
    print('SVC begin')
    clf1 = svm.SVC()
    clf1.fit(X_train_pca, y_train)
    
    # Accuracy on the validation set
    accuracy = clf1.score(X_val_pca, y_val)
    
    end = time.time()
    print("accuracy: {}, time elaps:{}".format(accuracy, int(end-start)))
    return accuracy



# Search range for the hyperparameter; values in (0, 1) are interpreted by PCA as the fraction of variance to retain (see the note after the loop)
n_s = np.linspace(0.70, 0.85, num=15)
accuracy = []
for n in n_s:
    tmp = n_component_analysis(n, X_train_part, y_train_part, X_val, y_val)
    accuracy.append(tmp)
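
Note that the values in n_s lie between 0 and 1: when n_components is a float in (0, 1), scikit-learn's PCA keeps the smallest number of components whose cumulative explained variance reaches that fraction, rather than treating the value as a component count. A minimal sketch (reusing X_train_part from above) to inspect the resulting dimensionality:

# How many components are needed to retain about 75% of the variance
pca_check = PCA(n_components=0.75)
pca_check.fit(X_train_part)
print(pca_check.n_components_, pca_check.explained_variance_ratio_.sum())
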


# Plot validation accuracy against the PCA setting and pick the value with the highest score (a programmatic selection is sketched below)
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(n_s, np.array(accuracy), 'b-')
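
Instead of reading the best setting off the plot, it can also be selected programmatically. This small addition (not in the original post) picks the value with the highest validation accuracy:

# Value in the search range with the highest validation accuracy
best_n = n_s[int(np.argmax(accuracy))]
print('best n_components: {}, validation accuracy: {}'.format(best_n, max(accuracy)))
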

3. Retraining with the best parameter

# Best hyperparameter found above (retain roughly 75% of the variance)
pca = PCA(n_components=0.75)

# Refit PCA on the full training data with the best parameter
pca.fit(X_train)

# Number of principal components actually kept
pca.n_components_

# Reduce the full training data with the fitted PCA
X_train_pca = pca.transform(X_train)

# Reduce the test data with the same transform
X_test_pca = pca.transform(X_test)

# Feature dimensionality after reduction
print(X_train_pca.shape)
print(X_test_pca.shape)

# Train an SVM classifier on the reduced training data
clf = svm.SVC()
clf.fit(X_train_pca, y_train)

# Use the model trained on the full reduced training set to predict the test set
y_predict = clf.predict(X_test_pca)

# Generate the submission file
df = pd.DataFrame(y_predict)
df.columns=['Label']
df.index+=1
df.index.name = 'Imageid'
df.to_csv('SVC_Minist_submission.csv', header=True)
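
As an optional final check (an addition, assuming the file was written to the current working directory), reload the submission and confirm its shape and columns:

# The submission should have one row per test image and a single Label column
submission = pd.read_csv('SVC_Minist_submission.csv')
print(submission.shape)
print(submission.head())
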

Reposted from blog.csdn.net/evolution23/article/details/85448613