Machine Learning: PCA

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2018/8/16 16:34
# @Author  : limingyu
# @Site    : 
# @File    : Test_PCA.py
# @Software: PyCharm
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np


df = pd.read_csv('8.iris.data')  #without header=None the first data row becomes the header (see the sketch below)
print(df.head())
#   5.1  3.5  1.4  0.2  Iris-setosa
#0  4.9  3.0  1.4  0.2  Iris-setosa
#1  4.7  3.2  1.3  0.2  Iris-setosa
#2  4.6  3.1  1.5  0.2  Iris-setosa
#3  5.0  3.6  1.4  0.2  Iris-setosa
#4  5.4  3.9  1.7  0.4  Iris-setosa
df.columns = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
print(df.head())
#   sepal_len  sepal_wid  petal_len  petal_wid        class
#0        4.9        3.0        1.4        0.2  Iris-setosa
#1        4.7        3.2        1.3        0.2  Iris-setosa
#2        4.6        3.1        1.5        0.2  Iris-setosa
#3        5.0        3.6        1.4        0.2  Iris-setosa
#4        5.4        3.9        1.7        0.4  Iris-setosa
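#Because read_csv was called without header=None, the first sample row was consumed as the
#header, so df is likely missing one sample (149 rows instead of 150, assuming the standard
#150-row iris file). A minimal sketch of an alternative read that keeps every row
#(df_full is a hypothetical name and is not used below):
df_full = pd.read_csv('8.iris.data', header=None,
                      names=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class'])
print(df_full.shape)  #(150, 5) expected if the file has no header row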
#Split the data
X = df.iloc[:,0:4].values   #feature columns (df.ix is deprecated; use iloc)
y = df.iloc[:,4].values  #class labels
print(X)
print(y)
label_dict = {1:'Iris-setosa',2:'Iris-versicolor',3:'Iris-virginica'}
feature_dict = {0:'sepal length[cm]',1:'sepal width [cm]',2:'petal length[cm]',3:'petal width[cm]'}

#Plot histograms to show that the value ranges differ across features
plt.figure(figsize=(8,6))
for cnt in range(4):
    plt.subplot(2,2,cnt+1)
    for lab in ('Iris-setosa','Iris-versicolor','Iris-virginica'):
        plt.hist(X[y==lab,cnt],label=lab,bins=10,alpha=0.3)
    plt.xlabel(feature_dict[cnt])
    plt.legend(loc='upper right',fancybox=True,fontsize=8)
plt.tight_layout()
plt.show()

#Feature preprocessing: standardize so that all features vary on the same scale
X_std = StandardScaler().fit_transform(X)
print(X_std)
#[[-1.1483555  -0.11805969 -1.35396443 -1.32506301]
#[-1.3905423   0.34485856 -1.41098555 -1.32506301]...
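#For reference, StandardScaler is equivalent to subtracting each column's mean and dividing
#by its population standard deviation; a quick sketch to verify that on X:
X_manual = (X - X.mean(axis=0)) / X.std(axis=0)  #manual z-score standardization
print(np.allclose(X_std, X_manual))  #True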
#Compute the covariance matrix to see how strongly the features are related: positive values mean positive correlation, negative values mean negative correlation
mean_vec = np.mean(X_std,axis=0)
cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0]-1)
print('Covariance matrix: \n%s' %cov_mat)
#Covariance matrix:
#[[ 1.00675676 -0.10448539  0.87716999  0.82249094]
#[-0.10448539  1.00675676 -0.41802325 -0.35310295]
#[ 0.87716999 -0.41802325  1.00675676  0.96881642]
#[ 0.82249094 -0.35310295  0.96881642  1.00675676]]
#Equivalent to calling np.cov directly
print('Numpy covariance matrix: \n%s' %np.cov(X_std.T))
#Numpy covariance matrix:
#[[ 1.00675676 -0.10448539  0.87716999  0.82249094]
#[-0.10448539  1.00675676 -0.41802325 -0.35310295]
#[ 0.87716999 -0.41802325  1.00675676  0.96881642]
#[ 0.82249094 -0.35310295  0.96881642  1.00675676]]
cov_mat = np.cov(X_std.T)
#Eigenvalues and eigenvectors of the covariance matrix
eig_vals,eig_vecs = np.linalg.eig(cov_mat)
print('Eigenvectors: \n%s' %eig_vecs) #eigenvectors (one per column)
#Eigenvectors:
#[[ 0.52308496 -0.36956962 -0.72154279  0.26301409]
# [-0.25956935 -0.92681168  0.2411952  -0.12437342]
# [ 0.58184289 -0.01912775  0.13962963 -0.80099722]
# [ 0.56609604 -0.06381646  0.63380158  0.52321917]]
print('Eigenvalues: \n%s' %eig_vals) #eigenvalues
#Eigenvalues:
#[2.92442837 0.93215233 0.14946373 0.02098259]
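#Since the covariance matrix is symmetric, np.linalg.eigh is the more robust choice
#(it returns the eigenvalues in ascending order); either way each pair should satisfy
#cov_mat.dot(v) == eig_val * v. A small sketch to verify the decomposition:
for lam, v in zip(eig_vals, eig_vecs.T):
    print(np.allclose(cov_mat.dot(v), lam * v))  #True for every component
eig_vals_h, eig_vecs_h = np.linalg.eigh(cov_mat)  #symmetric-matrix solver
print(np.allclose(np.sort(eig_vals), eig_vals_h))  #True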

#Pair each eigenvalue with its corresponding eigenvector
eig_pairs = [(np.abs(eig_vals[i]),eig_vecs[:,i]) for i in range(len(eig_vals))]
print(eig_pairs)
#[(2.9244283691111144, array([ 0.52308496, -0.25956935,  0.58184289,  0.56609604])),
# (0.932152330253508, array([-0.36956962, -0.92681168, -0.01912775, -0.06381646])),
# (0.14946373489813417, array([-0.72154279,  0.2411952 ,  0.13962963,  0.63380158])),
# (0.02098259276427019, array([ 0.26301409, -0.12437342, -0.80099722,  0.52321917]))]
eig_pairs.sort(key=lambda x:x[0],reverse=True)
print("Eigenvalues in descending order:")
#The eigenvalue measures how important its eigenvector is; keep the ones with the largest eigenvalues
for i in eig_pairs:
    print(i[0])
#Eigenvalues in descending order:
#2.9244283691111144
#0.932152330253508
#0.14946373489813417
#0.02098259276427019
#Normalize the eigenvalues into percentages: each component's share of the total variance
tot = sum(eig_vals) #sum of the eigenvalues
print(tot)  #4.027027027027027
#Iterate over the eigenvalues in descending order; each term is that component's share of the total
var_exp = [(i / tot)*100 for i in sorted(eig_vals,reverse=True)]
print(var_exp)  #[72.62003332692029, 23.147406858644157, 3.7115155645845395, 0.5210442498510046]
cum_var_exp = np.cumsum(var_exp) #cumsum(): running total of the explained variance
print(cum_var_exp) #[ 72.62003333  95.76744019  99.47895575 100.        ]

#cumsum example: each output element is the cumulative sum up to that position
a = np.array([1,2,3,4])
print(a) #[1 2 3 4]
print(np.cumsum(a)) #[1 3 6 10]
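#The cumulative curve is what one would use to choose how many components to keep.
#A minimal sketch (the 95% threshold is an arbitrary example, not from the original post):
threshold = 95.0
n_components = int(np.argmax(cum_var_exp >= threshold)) + 1  #first index reaching the threshold
print(n_components)  #2: the first two components explain ~95.8% of the variance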

plt.figure(figsize=(6,4))
plt.bar(range(4),var_exp,alpha=0.5,align='center',label='individual explained variance')
plt.step(range(4),cum_var_exp,where='mid',label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

#Stack the eigenvectors of the two largest eigenvalues into the projection matrix
matrix_w = np.hstack((eig_pairs[0][1].reshape(4,1),eig_pairs[1][1].reshape(4,1)))
print('Matrix W:\n',matrix_w)
#Matrix W:
#[[ 0.52308496 -0.36956962]
# [-0.25956935 -0.92681168]
# [ 0.58184289 -0.01912775]
# [ 0.56609604 -0.06381646]]

Y = X_std.dot(matrix_w)
print(Y)
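#Y is the (n_samples, 2) projection onto the top two eigenvectors. Projecting back with
#matrix_w.T gives an approximate reconstruction; the residual is the information carried
#by the two discarded components. A small sketch (the variable names are illustrative):
X_reconstructed = Y.dot(matrix_w.T)  #back to the 4-D standardized space
reconstruction_error = np.mean((X_std - X_reconstructed) ** 2)
print(Y.shape)  #(n_samples, 2)
print(reconstruction_error)  #small, since ~95.8% of the variance is retained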
#Scatter plot before PCA (first two original features)
plt.figure(figsize=(6,4))
for lab,col in zip(('Iris-setosa','Iris-versicolor','Iris-virginica'),('blue','red','green')):
    plt.scatter(X[y==lab,0],X[y==lab,1],label=lab,c=col)
plt.xlabel('sepal_len')
plt.ylabel('sepal_wid')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

#Scatter plot after PCA (first two principal components)
plt.figure(figsize=(6,4))
for lab,col in zip(('Iris-setosa','Iris-versicolor','Iris-virginica'),('blue','red','green')):
    plt.scatter(Y[y==lab,0],Y[y==lab,1],label=lab,c=col)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='lower center')
plt.tight_layout()
plt.show()
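#As a cross-check, sklearn's PCA should reproduce the same projection up to the sign of each
#component (eigenvector signs are arbitrary). A minimal sketch:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
Y_sklearn = pca.fit_transform(X_std)
print(pca.explained_variance_ratio_)  #approximately [0.726, 0.231]
print(np.allclose(np.abs(Y_sklearn), np.abs(Y)))  #expected True (up to sign flips)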

Reposted from blog.csdn.net/mingyuli/article/details/81747454