#coding:utf-8
from numpy import *
"""
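Pseudocode for transforming data onto its top N principal components:
    Remove the mean
    Compute the covariance matrix
    Compute the eigenvalues and eigenvectors of the covariance matrix
    Sort the eigenvalues from largest to smallest
    Keep the top N eigenvectors
    Transform the data into the new space spanned by those N eigenvectors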
"""
def loadDataSet(filename, delim=' '):
    """Load a delimited text file of numbers into a NumPy matrix.

    Every non-blank line is stripped, split on ``delim`` and each field is
    converted with ``float``.

    Args:
        filename: Path of the text file to read.
        delim: Field separator handed to ``str.split`` (one space by default).

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the file even when parsing raises.
    with open(filename) as fr:
        # map() is lazy in Python 3, so list() is needed to get real rows.
        dataArr = [
            list(map(float, line.strip().split(delim)))
            for line in fr
            if line.strip()  # skip blank lines instead of failing on float('')
        ]
    # asmatrix is the function the alias mat() pointed to; mat() itself was
    # removed in NumPy 2.0.
    return asmatrix(dataArr)
def pca(dataMat, topNfeat=99999):
    """Reduce data to its leading principal components.

    The columns are centred on their means, the covariance matrix of the
    columns is eigen-decomposed, and the centred data is projected onto the
    eigenvectors belonging to the ``topNfeat`` largest eigenvalues.

    Args:
        dataMat: 2-D data with one sample per row and one feature per
            column. It is passed through ``asmatrix`` so the ``*`` operators
            below are matrix products.
        topNfeat: Number of components to keep, i.e. the target
            dimensionality. The large default keeps every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the data projected into the
        reduced space, and that projection mapped back into the original
        feature space with the column means added back.
    """
    dataMat = asmatrix(dataMat)
    # axis=0 averages down each column, giving a 1 x n row of feature means.
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    # rowvar=0: each column is a variable and each row an observation.
    covMat = cov(meanRemoved, rowvar=0)
    eigVals, eigVects = linalg.eig(asmatrix(covMat))
    # argsort is ascending; the reversed slice picks the indices of the
    # topNfeat largest eigenvalues, largest first.
    eigValInd = argsort(eigVals)
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]
    lowDDataMat = meanRemoved * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat
if __name__ == "__main__":
    # Demo: load a 2-D data set, reduce it to one principal component and
    # plot the original points against their reconstruction.
    # Raw string: "\P" and "\d" are not valid escape sequences, so the plain
    # literal triggers an invalid-escape warning; the value is unchanged.
    Add = r"D:\PycharmProjects\PCA\dataTest.txt"
    dataMat = loadDataSet(Add)
    lowDMat, reconMat = pca(dataMat, 1)  # reduce to 1 dimension
    print(shape(lowDMat))
    print(shape(reconMat))

    import matplotlib.pyplot as plt

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # .flatten().A[0] turns a matrix column into a plain 1-D array.
    origX = dataMat[:, 0].flatten().A[0]
    origY = dataMat[:, 1].flatten().A[0]
    reconX = reconMat[:, 0].flatten().A[0]
    reconY = reconMat[:, 1].flatten().A[0]
    # Original samples as blue stars, reconstructed samples as red circles.
    ax.scatter(origX, origY, marker="*", s=90, c="b")
    ax.scatter(reconX, reconY, marker="o", s=40, c="r")
    plt.show()
# Function that fills in missing values with the mean:
# replace NaN with the mean
def replaceNaNWithMean(filename='secom.data', delim=' '):
    """Load a data set and replace every NaN with the mean of its column.

    Args:
        filename: Path of the data file, forwarded to ``loadDataSet``.
            NOTE(review): the original call passed no path at all, which
            always raised TypeError; the 'secom.data' default is an
            assumption about the intended data set -- confirm.
        delim: Field separator, forwarded to ``loadDataSet``.

    Returns:
        The loaded matrix with each NaN entry overwritten by the mean of the
        non-NaN entries of the same column.
    """
    dataMat = loadDataSet(filename, delim)
    numFeat = shape(dataMat)[1]
    for i in range(numFeat):
        # .A converts the column to a plain array so isnan gives a mask
        # whose nonzero() row indices can index the matrix.
        nanMask = isnan(dataMat[:, i].A)
        validRows = nonzero(~nanMask)[0]
        missingRows = nonzero(nanMask)[0]
        # Mean over the non-NaN entries of this column only.
        meanVal = mean(dataMat[validRows, i])
        dataMat[missingRows, i] = meanVal
    return dataMat