PCA降维--代码实现

1.PCA降维

from numpy import *

def loadDataSet(fileName, delim='\t'):
    """Load a delimiter-separated numeric text file into a NumPy matrix.

    Args:
        fileName: Path of the text file; each non-blank line is one sample.
        delim: Field separator used within a line (tab by default).

    Returns:
        A ``numpy.matrix`` of floats with one row per non-blank line.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the file even if reading fails.
    with open(fileName) as fr:
        # Blank lines (e.g. a trailing newline) carry no sample; skip them
        # instead of failing on float('').
        rows = [line.strip().split(delim) for line in fr if line.strip()]
    # Build concrete lists rather than lazy ``map`` objects so the matrix
    # constructor receives numbers on Python 3 as well as Python 2.
    datArr = [[float(field) for field in row] for row in rows]
    # ``asmatrix`` is the long-standing name behind the ``mat`` alias, which
    # newer NumPy releases no longer export.
    return asmatrix(datArr)

def pca(dataMat, topNfeat=9999999):
    """Project data onto its leading principal components.

    Args:
        dataMat: Samples as rows, features as columns (a matrix, or anything
            ``asmatrix`` accepts).
        topNfeat: Number of principal components to keep. The default is
            large enough to keep every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)`` where ``lowDDataMat`` is the
        mean-centred data expressed in the kept components and ``reconMat``
        is the data mapped back into the original feature space (useful for
        checking how much information the reduction preserved).
    """
    dataMat = asmatrix(dataMat)
    # Column-wise mean: one value per feature.
    meanVals = mean(dataMat, axis=0)
    # Centre the samples on the origin.
    meanRemoved = dataMat - meanVals
    # rowvar=0 tells cov() that columns, not rows, are the variables.
    covMat = asmatrix(cov(meanRemoved, rowvar=0))
    # A covariance matrix is symmetric, so use the symmetric solver: it
    # always returns real eigenvalues/eigenvectors, whereas the general
    # solver can return complex ones because of round-off.
    eigVals, eigVects = linalg.eigh(covMat)
    # argsort orders ascending; reverse it and keep the topNfeat largest.
    eigValInd = argsort(eigVals)[::-1][:topNfeat]
    # Columns are the kept eigenvectors, largest eigenvalue first.
    redEigVects = eigVects[:, eigValInd]
    # Coordinates of every sample in the reduced space.
    lowDDataMat = meanRemoved * redEigVects
    # Map back to the original space and restore the mean.
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat

def replaceNanWithMean(fileName='secom.data', delim=' '):
    """Load a data set and replace each NaN with the mean of its column.

    Args:
        fileName: Path handed to ``loadDataSet``; defaults to 'secom.data'.
        delim: Field separator handed to ``loadDataSet``; defaults to a
            single space.

    Returns:
        The loaded matrix in which every NaN entry has been replaced by the
        mean of the non-NaN entries of the same column. A column with no
        non-NaN entries is left unchanged.
    """
    datMat = loadDataSet(fileName, delim)
    numFeat = shape(datMat)[1]
    for i in range(numFeat):
        # ``.A`` exposes the matrix column as a plain ndarray for isnan().
        nanMask = isnan(datMat[:, i].A)
        # nonzero(mask)[0] gives the row indices where the mask is True,
        # e.g. nonzero([True, False, True]) -> (array([0, 2]),),
        # i.e. the elements at index 0 and index 2.
        nanRows = nonzero(nanMask)[0]
        validRows = nonzero(~nanMask)[0]
        # Nothing to fill, or nothing to average: leave the column alone
        # (this also avoids taking the mean of an empty selection).
        if len(nanRows) == 0 or len(validRows) == 0:
            continue
        # Mean over the entries that are real numbers ...
        meanVal = mean(datMat[validRows, i])
        # ... written into every missing entry of this column.
        datMat[nanRows, i] = meanVal
    return datMat
>>> import matplotlib.pyplot as plt
>>> fig=plt.figure()
>>> ax=fig.add_subplot(111)
>>> import pca
>>> dataMat=pca.loadDataSet('testSet.txt')
>>> lowDMat, reconMat = pca.pca(dataMat,1)
>>> ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0],marker='^',s=90)
>>> ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0],marker='o',s=50,c='red')
>>> plt.show()   #数据由2维降为1维;重构后的样本(红色圆点)落在一条直线上,该直线的方向是样本方差最大(即样本离散程度最大)的方向,原来的2维数据被投影到这一个维度上

>>> lowDMat, reconMat = pca.pca(dataMat,2)
>>> plt.show()   #保留全部2个维度时,重构数据与原始数据完全一致,画图后可看出两组样本点是重合的



猜你喜欢

转载自blog.csdn.net/qq_29422251/article/details/79279446