A hand-written implementation of the classic k-means algorithm, with a dynamic display of the clustering process

1. Review of the classic kmeans algorithm
step 1: randomly select k objects as the initial cluster centers
step 2: compute the distance from each sample to each of the k centers, and assign every sample to its nearest center
step 3: recalculate the center of each of the k clusters
step 4: repeat steps 2–3 until the movement of the cluster centers falls below a preset threshold, or the centers no longer change
step 5: output the cluster centers and the corresponding cluster assignments
2. The dynamic display process
Every refresh plots the newly computed clustering as a 2-D/3-D image, so the clustering process can be watched as it converges — it is quite interesting, so it is recorded here.
3.Code

import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Use the KaiTi font so Chinese axis labels/titles render correctly,
# and keep the minus sign displayable while a non-ASCII font is active.
plt.rcParams['font.sans-serif']=['KaiTi']
plt.rcParams['axes.unicode_minus'] = False

# Efficiently read a data source (chunked CSV, or Excel) into a DataFrame.
def read_data(data, skip_row, level, sht_name=0):
    """Read a CSV or Excel file into a pandas DataFrame.

    Parameters
    ----------
    data : str
        Path to the source file. Paths ending in '.csv' are read in
        10k-row chunks; anything else is treated as an Excel workbook.
    skip_row : int
        First header row for Excel sources (ignored for CSV).
    level : int
        Number of header rows for Excel sources; 0 or 1 means a single
        header row at `skip_row` (ignored for CSV).
    sht_name : int or str, optional
        Excel sheet to read (default: the first sheet).

    Returns
    -------
    pandas.DataFrame
    """
    if data.endswith('.csv'):
        # With chunksize, decode errors are raised lazily while the
        # chunks are consumed — so the concat must sit inside the try,
        # otherwise the UTF-8 fallback can never trigger.
        try:
            chunks = pd.read_csv(data, header=0, chunksize=10000, encoding='gbk')
            df = pd.concat(chunks)
        except UnicodeDecodeError:
            chunks = pd.read_csv(data, header=0, chunksize=10000, encoding='utf-8')
            df = pd.concat(chunks)
    else:
        if level in (0, 1):
            header = skip_row
        else:
            header = [skip_row + i for i in range(level)]
        df = pd.read_excel(data, sheet_name=sht_name, header=header)
    return df

# Distance function — Euclidean distance.
def cal_distance(a, b):
    """Return the Euclidean distance between points `a` and `b`.

    `a` and `b` are equal-length numeric sequences (lists/arrays).
    """
    return sum((x - y) ** 2 for x, y in zip(a, b)) ** 0.5

# Return the selected columns min-max normalised to [0, 1].
def get_data(df, dimensions):
    """Min-max normalise the given columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
    dimensions : list
        Column labels to normalise.

    Returns
    -------
    pandas.DataFrame
        One column per entry of `dimensions`, each scaled to [0, 1].
        A constant column is mapped to all zeros (the bare formula
        would divide by zero and fill the column with NaN).
    """
    u = []
    for col in dimensions:
        minValue = df[col].min()
        maxValue = df[col].max()
        if maxValue == minValue:
            # Constant column: avoid 0/0 -> NaN; map every value to 0.
            dk = df[col] * 0.0
        else:
            dk = (df[col] - minValue) / (maxValue - minValue)
        u.append(dk)
    return pd.concat(u, axis=1)

# Build the empty per-cluster container and pick n reasonably
# spread-out starting centres at random.
def get_init_params(data, n, std_threshold=None, max_tries=1000):
    """Initialise the result container and the n starting centres.

    Parameters
    ----------
    data : numpy.ndarray
        Sample matrix of shape (n_samples, n_features).
    n : int
        Number of clusters.
    std_threshold : float, optional
        Minimum standard deviation the sampled centres must reach, so
        the random picks are not too tightly bunched. Defaults to the
        module-level ``std_1`` (std of the whole data set), matching
        the original behaviour.
    max_tries : int, optional
        Cap on resampling attempts — the original ``while`` loop could
        spin forever when the threshold was unattainable.

    Returns
    -------
    (dict, numpy.ndarray)
        Empty per-cluster lists keyed 0..n-1, and the n chosen centres.
        Centres are always sampled at least once (the original returned
        an empty list when the threshold was already <= 0).
    """
    resultBox = {i: [] for i in range(n)}  # container for cluster members
    threshold = std_1 if std_threshold is None else std_threshold
    oriPoints = data[random.sample(range(len(data)), n)]
    for _ in range(max_tries):
        std_2 = np.std(oriPoints)
        print(std_2)
        if std_2 >= threshold:
            break
        oriPoints = data[random.sample(range(len(data)), n)]
    return resultBox, oriPoints

def get_length(data):
    """Return the Euclidean (L2) norm of the vector `data`.

    Returns 0.0 for an empty sequence.
    """
    return sum(x * x for x in data) ** 0.5

# Assign every sample to its nearest centre and plot the current state.
def get_clusters(dk, data, clusterBox, center, n=1):
    """Assign each sample in `data` to the nearest of the n `center` points.

    The cluster container is rebuilt on every call: the original kept
    appending to the same lists across k-means iterations, so clusters
    accumulated stale assignments and the centroids were dragged by
    history. Relies on the module-level ``color_list``, ``dimensions``
    and ``plot_data``.

    Returns the dict {cluster index: [member samples]}.
    """
    clusterBox = {j: [] for j in range(n)}  # fresh assignments each round
    colorBox = ['' for _ in range(len(data))]
    for i in range(len(data)):
        distances = [cal_distance(center[j], data[i]) for j in range(n)]
        nearest = distances.index(min(distances))  # argmin, computed once
        clusterBox[nearest].append(data[i])
        colorBox[i] = color_list[nearest]
    plot_data(dk, n, dimensions, color_list, colorBox)
    return clusterBox

# Recompute each cluster centre as the mean of its member points.
def update_center(data):
    """Given {cluster index: [sample vectors]}, return the stacked centroids.

    Returns a 2-D array with one row (the member mean) per cluster,
    in cluster-index order 0..len(data)-1.
    """
    centers = []
    for idx in range(len(data)):
        members = np.vstack(data[idx])
        centers.append(members.mean(axis=0))
    return np.vstack(centers)

# Measure how far each centre moved and decide whether to stop.
def cal_center_change(a, b, err):
    """Relative displacement of each centre between two iterations.

    Parameters
    ----------
    a, b : sequences of centre vectors (old and new, same length).
    err : float, convergence threshold for the relative change.

    Returns
    -------
    (list, int)
        Per-centre relative changes, and 1 when every change is <= err
        (converged), else 0.
    """
    chgBox = [cal_distance(list(old), list(new)) / get_length(list(old))
              for old, new in zip(a, b)]
    # Converged only when *every* centre moved by at most `err`.
    IsWeaken = 1 if all(chg <= err for chg in chgBox) else 0
    return chgBox, IsWeaken

# Plotting function: shows the current clustering for one second, then
# closes — successive calls animate the clustering process. Only draws
# in 2 or 3 dimensions; otherwise it is a no-op.
def plot_data(df, n, dimensions, color_list, colorBox):
    """Scatter-plot `df` with each point coloured by its cluster.

    Draws only when len(dimensions) is 2 or 3 and there are no more
    clusters than available colours.
    """
    if len(dimensions) == 2 and n <= len(color_list):
        plt.figure(figsize=(20, 10), dpi=60)
        plt.scatter(df[dimensions[0]], df[dimensions[1]], c=colorBox, s=10)
        plt.title('二维图像')
        plt.xlabel(dimensions[0])
        plt.ylabel(dimensions[1])
        plt.show(block=False)
        plt.pause(1)
        plt.close()
    elif len(dimensions) == 3 and n <= len(color_list):
        fig = plt.figure(figsize=(20, 10), dpi=60)
        # Axes3D(fig) stopped auto-attaching the axes in matplotlib 3.4+
        # (the figure comes up blank); add_subplot(projection='3d') is
        # the supported way to create a 3-D axes.
        ax = fig.add_subplot(projection='3d')
        ax.scatter(df[dimensions[0]], df[dimensions[1]], df[dimensions[2]],
                   c=colorBox, s=10)
        ax.set_xlabel(dimensions[0])
        ax.set_ylabel(dimensions[1])
        ax.set_zlabel(dimensions[2])
        ax.legend([], title='三维图像')
        plt.show(block=False)
        plt.pause(1)
        plt.close()

# Main driver: alternate assign/update steps until the centres stop
# moving (threshold `err`) or the iteration budget runs out.
def kMeans(df, list_data, interation, err, n):
    """Run k-means on `list_data`, animating each round via `df`.

    Returns (centres, cluster dict, per-centre relative changes,
    number of iterations performed + 1 — same count the original
    while-loop reported).
    """
    clusterBox, center = get_init_params(list_data, n=n)
    chgBox = []
    passes = 0
    for passes in range(1, interation):
        clusterBox = get_clusters(df, list_data, clusterBox, center=center, n=n)
        previous_center = center
        center = update_center(clusterBox)
        chgBox, IsWeaken = cal_center_change(previous_center, center, err)
        if IsWeaken == 1:
            break
    return center, clusterBox, chgBox, passes + 1


f_0='../data_source/clsTest.csv'  # path to the data source (construct your own test data)
color_list=['pink','blue','green','purple','orange','red','grey','yellow','black','DodgerBlue']  # 10 colours for now, one per cluster
# dimensions=['xdata','ydata'] # dimensions/fields used for clustering (2-D variant)
dimensions=['xdata','ydata','zdata'] # dimensions/fields used for clustering

k=7 # number of clusters
interation=500  # maximum number of iterations
err=0.005# convergence threshold for the relative centre movement

if  __name__== "__main__":
    # Read the raw data, min-max normalise the chosen dimensions,
    # then run k-means and report the result.
    ds=read_data(f_0,0,0,0)
    dk=get_data(ds,dimensions)# normalised frame, also used for plotting
    list_data = dk.values
    std_1=np.std(list_data)  # spread of the full data set; read by get_init_params
    print(std_1)
    center,clusterBox,chgBox,interateNum=kMeans(dk,list_data,interation,err=err,n=k)
    print('归一化后聚类中心:\n{}'.format(center))
    print('迭代次数:{}'.format(interateNum))

4.效果展示
二维图像:
insert image description here
三维图像:
insert image description here
5.结果与评价
运行结果
归一化后聚类中心:
[[0.06568673 0.43479818 0.43537879]
[0.84744645 0.0676106 0.08559254]
[0.21654101 0.63332569 0.62638664]
[0.15594514 0.19334235 0.20561701]
[0.4406322 0.53107942 0.52858135]
[0.95099398 0.9540862 0.93101594]
[0.16843369 0.83702506 0.8172746]]
Number of iterations: 8

From the dynamic display of the clustering process it can clearly be seen that:
a. The choice of initial points has a great influence on the result — the algorithm can settle into a local optimum; this is plainly visible in the yellow and grey regions of the 3-D figure.
b. If no cluster is initially seeded in some region, the samples there tend to be split evenly among the neighbouring clusters — for example the region between the blue/red/green clusters in the 3-D figure above.
c. The algorithm is efficient: the preset convergence threshold was reached after only 8 iterations.

Guess you like

Origin blog.csdn.net/Apollo_Guang/article/details/127773974