Principal component analysis PCA to reduce the dimensionality of the data

Principal component analysis ( the Principal the Analysis the Component , the PCA ), is a statistical method. Orthogonal transform by the presence of a variable set of possible correlation variables into a set of linearly independent, the set of the converted variables called principal components.

In practical topics in order to fully analyze the problem, often raised many variables relating thereto (or factors), because each variable to reflect some information about this subject in varying degrees.

Principal component analysis was first introduced by K. Pearson ( Karl Pearson ) non-random variables introduced later H. Hotelling this method is extended to the case of the random vector. Size information is often used to measure the sum of squares or variance.

# -*- coding: utf-8 -*-
import numpy as np
from matplotlib import pyplot as plt
from scipy import io as spio
from sklearn.decomposition import pca

'''
主成分分析_2维数据降维1维演示函数
'''


def PCA_2D():
    data_2d = spio.loadmat("data.mat")
    X = data_2d['X']
    m = X.shape[0]
    plt = plot_data_2d(X, 'bo')  # 显示二维的数据
    plt.show()

    X_copy = X.copy()
    X_norm, mu, sigma = featureNormalize(X_copy)  # 归一化数据
    # plot_data_2d(X_norm)    # 显示归一化后的数据
    # plt.show()

    Sigma = np.dot(np.transpose(X_norm), X_norm) / m  # 求Sigma
    U, S, V = np.linalg.svd(Sigma)  # 求Sigma的奇异值分解

    plt = plot_data_2d(X, 'bo')  # 显示原本数据
    drawline(plt, mu, mu + S[0] * (U[:, 0]), 'r-')  # 线,为投影的方向

    plt.axis('square')
    plt.show()

    K = 1  # 定义降维多少维(本来是2维的,这里降维1维)
    '''投影之后数据(降维之后)'''
    Z = projectData(X_norm, U, K)  # 投影
    '''恢复数据'''
    X_rec = recoverData(Z, U, K)  # 恢复
    '''作图-----原数据与恢复的数据'''
    plt = plot_data_2d(X_norm, 'bo')
    plot_data_2d(X_rec, 'ro')
    for i in range(X_norm.shape[0]):
        drawline(plt, X_norm[i, :], X_rec[i, :], '--k')
    plt.axis('square')
    plt.show()


'''主成分分析_PCA图像数据降维'''


def PCA_faceImage():
    print(u'加载图像数据.....')
    data_image = spio.loadmat('data_faces.mat')
    X = data_image['X']
    display_imageData(X[0:100, :])
    m = X.shape[0]  # 数据条数

    print(u'运行PCA....')
    X_norm, mu, sigma = featureNormalize(X)  # 归一化

    Sigma = np.dot(np.transpose(X_norm), X_norm) / m  # 求Sigma
    U, S, V = np.linalg.svd(Sigma)  # 奇异值分解
    display_imageData(np.transpose(U[:, 0:36]))  # 显示U的数据

    print(u'对face数据降维.....')
    K = 100  # 降维100维(原先是32*32=1024维的)
    Z = projectData(X_norm, U, K)
    print(u'投影之后Z向量的大小:%d %d' % Z.shape)

    print(u'显示降维之后的数据......')
    X_rec = recoverData(Z, U, K)  # 恢复数据
    display_imageData(X_rec[0:100, :])


# 可视化二维数据
def plot_data_2d(X, marker):
    plt.plot(X[:, 0], X[:, 1], marker)
    return plt


# 归一化数据
def featureNormalize(X):
    '''(每一个数据-当前列的均值)/当前列的标准差'''
    n = X.shape[1]
    mu = np.zeros((1, n));
    sigma = np.zeros((1, n))

    mu = np.mean(X, axis=0)  # axis=0表示列
    sigma = np.std(X, axis=0)
    for i in range(n):
        X[:, i] = (X[:, i] - mu[i]) / sigma[i]
    return X, mu, sigma


# 映射数据
def projectData(X_norm, U, K):
    Z = np.zeros((X_norm.shape[0], K))

    U_reduce = U[:, 0:K]  # 取前K个
    Z = np.dot(X_norm, U_reduce)
    return Z


# 画一条线
def drawline(plt, p1, p2, line_type):
    plt.plot(np.array([p1[0], p2[0]]), np.array([p1[1], p2[1]]), line_type)


# 恢复数据
def recoverData(Z, U, K):
    X_rec = np.zeros((Z.shape[0], U.shape[0]))
    U_recude = U[:, 0:K]
    X_rec = np.dot(Z, np.transpose(U_recude))  # 还原数据(近似)
    return X_rec


# 显示图片
def display_imageData(imgData):
    sum = 0
    '''
    显示100个数(若是一个一个绘制将会非常慢,可以将要画的图片整理好,放到一个矩阵中,显示这个矩阵即可)
    - 初始化一个二维数组
    - 将每行的数据调整成图像的矩阵,放进二维数组
    - 显示即可
    '''
    m, n = imgData.shape
    width = np.int32(np.round(np.sqrt(n)))
    height = np.int32(n / width);
    rows_count = np.int32(np.floor(np.sqrt(m)))
    cols_count = np.int32(np.ceil(m / rows_count))
    pad = 1
    display_array = -np.ones((pad + rows_count * (height + pad), pad + cols_count * (width + pad)))
    for i in range(rows_count):
        for j in range(cols_count):
            max_val = np.max(np.abs(imgData[sum, :]))
            display_array[pad + i * (height + pad):pad + i * (height + pad) + height,
            pad + j * (width + pad):pad + j * (width + pad) + width] = imgData[sum, :].reshape(height, width,
                                                                                               order="F") / max_val  # order=F指定以列优先,在matlab中是这样的,python中需要指定,默认以行
            sum += 1

    plt.imshow(display_array, cmap='gray')  # 显示灰度图像
    plt.axis('off')
    plt.show()


if __name__ == "__main__":
    PCA_2D()
    PCA_faceImage()

Author: WangB

Published 142 original articles · won praise 213 · views 10000 +

Guess you like

Origin blog.csdn.net/s0302017/article/details/104336874
Recommended