七：K-means和PCA

K-means是一个迭代的，无监督的聚类算法，将类似的实例组合成簇。该算法通过猜测每个簇的初始聚类中心开始，然后重复将实例分配给最近的簇，并重新计算该簇的聚类中心。

PCA是在数据集中找到“主成分”或最大方差方向的线性变换。它可以用于降维。

聚类在无监督学习中的应用十分广泛。当没有标签提前对数据进行标注、又需要挖掘数据间的隐藏信息与相关性时，采用K-means可以将数据分成不同的簇。但是需要预先知道自己的分类方向，所以需要在机器进行聚类运算前随机定义相对距离较远的不同点（作为初始聚类中心），然后进行聚类运算。

一：__init__.py（主函数）

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
import function as f

## Test the helper that returns, for every sample, the index of its closest centroid
data = loadmat(r'******')
X = data['X']
initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])  # hand-picked starting centroids
idx = f.find_closest_centroids(X, initial_centroids)

# Show the raw data
data2 = pd.DataFrame(data.get('X'), columns=['X1', 'X2'])
# print(data2.head())

sb.set(context="notebook", style="white")
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the old positional form raises a TypeError there.
sb.lmplot(x='X1', y='X2', data=data2, fit_reg=False)
# plt.show()
# print(f.compute_centorids(X, idx, 3))

# Run 10 iterations of K-means starting from the hand-picked centroids.
idx, centroids = f.run_k_means(X, initial_centroids, 10)

# Split the samples by the cluster index they were assigned.
cluster1 = X[np.where(idx == 0)[0],:]
cluster2 = X[np.where(idx == 1)[0],:]
cluster3 = X[np.where(idx == 2)[0],:]

# Scatter-plot the three clusters in different colours.
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(cluster1[:,0], cluster1[:,1], s=30, color='r', label='Cluster 1')
ax.scatter(cluster2[:,0], cluster2[:,1], s=30, color='g', label='Cluster 2')
ax.scatter(cluster3[:,0], cluster3[:,1], s=30, color='b', label='Cluster 3')
ax.legend()
# plt.show()

# Centroids drawn from random samples.
# print(f.init__centroids(X, 3))

## Apply K-means to image compression
from IPython.display import Image
# NOTE(review): the Image object is created but not used afterwards;
# presumably this only renders when run as the last expression of a
# notebook cell — confirm.
Image(filename=r'******')
image_data = loadmat(r'******')
print(image_data)
# Dimensions of the image
A = image_data['A']
print(A.shape)

# Pre-process the data before handing it to the K-means algorithm.
# (Divides by 255 — presumably the pixel values are 0-255; confirm.)
A = A / 255
# Flatten the first two axes so every row is one pixel and the columns
# are the entries of the last axis.
X = np.reshape(A, (A.shape[0] * A.shape[1], A.shape[2]))

# Randomly initialise 16 cluster centroids
initial_centroids = f.init__centroids(X, 16)
# Run the clustering algorithm for 10 iterations
idx, centroids = f.run_k_means(X, initial_centroids, 10)
# Assign every pixel to its closest final centroid
idx = f.find_closest_centroids(X, centroids)
# Replace every pixel by the centroid of its cluster
# (idx is float-valued, so it is cast to int before being used as an index)
X_recovered = centroids[idx.astype(int),:]
# reshape to the original dimensions
X_recovered = np.reshape(X_recovered, (A.shape[0], A.shape[1], A.shape[2]))

plt.imshow(X_recovered)
# print(plt.show())

### PCA is a linear transformation that finds the "principal components",
### i.e. the directions of greatest variance, in a data set. It can be used
### for dimensionality reduction.
# Implement PCA and apply it to a simple two-dimensional data set
data3 = loadmat(r'******')
X = data3['X']

# Scatter-plot the first two columns of the raw data.
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(X[:, 0], X[:, 1])
# plt.show()

U, S, V = f.pca(X) # U holds the principal components
# Projection helper: keeps only the top k components, which reduces the
# number of dimensions.
def project_data(X, U, k):
    """Project X onto the first k columns of U.

    Args:
        X: data with one sample per row.
        U: matrix whose columns are the components to project onto.
        k: number of leading columns of U to keep.

    Returns:
        The product of X and the first k columns of U.
    """
    return np.dot(X, U[:, :k])
# Project the data onto the first principal component only (k = 1).
Z = project_data(X, U, 1)
# Recover an approximation of the original data by reversing the projection.
def recover_data(Z, U, k):
    """Map projected data Z back into the original feature space.

    Args:
        Z: projected data with one sample per row.
        U: matrix whose columns are the components used for the projection.
        k: number of leading columns of U that were used.

    Returns:
        The product of Z and the transpose of the first k columns of U.
    """
    components_t = U[:, :k].T
    return np.dot(Z, components_t)
# Reconstruct the data from its 1-component projection.
X_recovered = recover_data(Z, U, 1)

# Plot the reconstructed points.
# NOTE(review): the list(...) wrappers are presumably needed because f.pca
# builds np.matrix objects, so the column slices stay 2-D — confirm.
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(list(X_recovered[:, 0]), list(X_recovered[:, 1]))
plt.show()

二：function.py

import numpy as np

def find_closest_centroids(X, centroids):
    """Assign every sample to its nearest centroid.

    Distances are squared Euclidean distances. When two centroids are
    equally close, the one with the lower index wins.

    Args:
        X: 2-D array with one sample per row.
        centroids: 2-D array with one centroid per row and the same number
            of columns as X.

    Returns:
        1-D float array with one entry per row of X: the row index in
        ``centroids`` of the closest centroid.
    """
    # Work in floating point so unsigned-integer inputs (e.g. raw image
    # data) cannot wrap around when subtracted and squared.
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)

    m = X.shape[0]
    idx = np.zeros(m)  # float dtype kept so existing callers see no change
    # Start from +inf instead of an arbitrary large constant; otherwise a
    # sample farther than that constant from every centroid would silently
    # stay assigned to index 0.
    min_dist = np.full(m, np.inf)

    # One vectorised pass per centroid instead of a Python loop per sample.
    for j in range(centroids.shape[0]):
        dist = np.sum((X - centroids[j]) ** 2, axis=1)
        closer = dist < min_dist  # strict: the first centroid wins ties
        min_dist[closer] = dist[closer]
        idx[closer] = j

    return idx

def compute_centorids(X, idx, k):
    """Recompute each centroid as the mean of the samples assigned to it.

    Args:
        X: 2-D array with one sample per row.
        idx: per-sample cluster index (entry i is the cluster of X[i]).
        k: number of clusters.

    Returns:
        Array of shape (k, number of columns of X); row i is the mean of
        the rows of X whose entry in ``idx`` equals i.
    """
    _, n_features = X.shape
    centroids = np.zeros((k, n_features))

    for cluster in range(k):
        # Row numbers of the samples currently assigned to this cluster.
        rows = np.where(idx == cluster)[0]
        members = X[rows]
        # Mean = column-wise sum divided by the number of members.
        centroids[cluster] = members.sum(axis=0) / len(rows)

    return centroids

def run_k_means(X, initial_centroids, max_iters):
    """Run K-means starting from the given centroids.

    Each iteration assigns every sample to its closest centroid and then
    moves every centroid to the mean of its samples.

    Args:
        X: 2-D array with one sample per row.
        initial_centroids: 2-D array with one starting centroid per row.
        max_iters: maximum number of iterations to perform.

    Returns:
        Tuple ``(idx, centroids)``: the per-sample cluster index from the
        last assignment step and the final centroid positions.
    """
    k = initial_centroids.shape[0]
    idx = np.zeros(X.shape[0])  # returned unchanged when max_iters is 0
    centroids = initial_centroids

    for _ in range(max_iters):
        idx = find_closest_centroids(X, centroids)
        new_centroids = compute_centorids(X, idx, k)

        # A cluster that received no samples has no mean; keep its previous
        # position instead of letting an undefined centroid persist for the
        # rest of the run.
        empty = ~np.isin(np.arange(k), idx)
        new_centroids[empty] = centroids[empty]

        # Once the centroids stop moving, further iterations would repeat
        # the same assignment, so stop early.
        converged = np.array_equal(new_centroids, centroids)
        centroids = new_centroids
        if converged:
            break

    return idx, centroids

# Build initial centroids from randomly chosen samples, which gives a better
# starting point than arbitrary positions.
def init__centroids(X, k):
    """Pick k rows of X at random to serve as initial centroids.

    Rows are drawn without replacement so the same sample is not picked
    twice. Picking a sample twice would give two identical centroids and
    leave one of them with an empty cluster. Only when ``k`` exceeds the
    number of samples are repeats allowed, so that such calls keep working.

    Args:
        X: 2-D array with one sample per row.
        k: number of centroids to pick.

    Returns:
        Float array of shape (k, number of columns of X) holding copies of
        the chosen rows.
    """
    X = np.asarray(X)
    m = X.shape[0]
    rows = np.random.choice(m, size=k, replace=k > m)
    # Fancy indexing plus astype yields an independent float copy.
    return X[rows].astype(float)

# PCA: once the data has been normalised, the result is simply the singular
# value decomposition of the covariance matrix of the data
# (np.linalg.svd performs the decomposition).
def pca(X):
    """Run principal component analysis on X.

    Every feature (column) is standardised to zero mean and unit standard
    deviation, the covariance matrix of the standardised data is formed,
    and its singular value decomposition is returned.

    Args:
        X: 2-D array with one sample per row and one feature per column.

    Returns:
        Tuple ``(U, S, V)`` exactly as produced by ``np.linalg.svd`` on the
        covariance matrix: the columns of U are the principal components
        and S holds the singular values.
    """
    X = np.asarray(X, dtype=float)

    # Step 1: normalise per feature. A covariance matrix requires every
    # column to be centred on its own mean; one global mean/std over all
    # entries leaves the columns off-centre.
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    sigma[sigma == 0] = 1.0  # constant feature: avoid dividing by zero
    X_norm = (X - mu) / sigma

    # Step 2: covariance matrix. np.matrix is kept so that U and V have the
    # same type as before for existing callers.
    X_norm = np.matrix(X_norm)
    cov = (X_norm.T * X_norm) / X_norm.shape[0]

    # Step 3: singular value decomposition.
    U, S, V = np.linalg.svd(cov)

    return U, S, V

Chris_hx

发布了40 篇原创文章 · 获赞 4 · 访问量 5171

私信关注

一：__init__.py（主函数）

二：function.py

猜你喜欢

一：init.py（主函数）