A simple and easy-to-modify Python implementation of KMeans clustering, essential for mathematical modeling

The usage scenarios and introduction will not be repeated here. Please refer to https://zhuanlan.zhihu.com/p/78798251

This implementation is written in Python; the complete code is given directly below.

Usage guide

  • Load the data set to be clustered into the program. It must be stored in the following structure:

 [[x, y], [x, y], ..., [x, y]]   (each point is a two-element list [x, y])

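  • Instantiate the class, passing in the data set array, the K value, the maximum number of iterations (generally this does not need to be very large), the `min` distance bound (larger values are more accurate, smaller values are rougher: a point that is farther than `min` from every center is simply placed in the first cluster) and any other parameters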
  • Call initKPoint_random() to pick the initial cluster centers at random, or initKPoint_yourself() to specify the cluster centers yourself. Please note that the number of cluster centers must match the K value.
  • Call Kmeans_run() to start Kmeans clustering

Tips

  • The provided draw() function blocks after drawing each figure. Close the matplotlib window manually to continue with the next iteration (this makes it easy to save an image for every iteration). You can rewrite draw() to suit your needs.
  • If you want to print detailed data on every iteration, add the output to the update() function.
  • The download address of the dataset used in this example: https://cloud.tencent.com/developer/article/1869024

Code

import matplotlib.pyplot as plt
import pandas as pd
import random
# Colours used for the clusters when plotting; add more entries here to give
# more clusters their own colour.
color = ["b", "red", "orange", 'green']
# Load the data. Adjust the path and the selected columns to match your own
# data set.
df_l = pd.read_csv("C:/Users/LENOVO/Desktop/人工智能导论/iris.csv", usecols=[3, 4])
# Build the point list [[x, y], [x, y], ...] from the two selected columns.
# The row count comes from the frame itself rather than a hard-coded 150, so
# files of any length load completely and short files no longer raise.
datas = [[df_l.iloc[i, 0], df_l.iloc[i, 1]] for i in range(len(df_l))]
print(datas)
print(len(datas))
# The whole data set wrapped as a single "cluster", i.e. the
# [[point, point, ...]] shape that K_means.draw() takes, so the raw data can
# be plotted before clustering.
datastes = [datas]
class K_means():
    """Minimal K-means clustering for 2-D points with optional plotting.

    Typical use: construct the object with the data set and K, choose the
    initial centers with initKPoint_random() or initKPoint_yourself(), then
    call Kmeans_run().

    Instance attributes:
        datapoints: the data set, [[x, y], [x, y], ...].
        K: number of clusters.
        Max_step: upper bound on the number of iterations of a run.
        min: starting "best distance" used by devide(); a point that is
            farther than this from every center is put into cluster 0.
        K_mean_Points: the initial centers, [[x, y], ...]; at least K entries
            are required by Kmeans_run().
        K_mean_process_Points: the centers as they are updated during a run.
        datapoints_classed: one list of points per cluster for the most
            recent iteration, [[point, ...], [point, ...], ...].
        clock: number of iterations performed by the last Kmeans_run().
    """

    # Only immutable defaults live on the class. The list-valued state is
    # created per instance in __init__, because class-level lists would be
    # shared (and grown) by every instance.
    Max_step = 100
    min = 10000
    K = 0
    clock = 0

    def __init__(self, datapoints, K=2, Max_step=1000, min=10000) -> None:
        """Store the data set and parameters and create empty cluster buckets.

        Args:
            datapoints: the points to cluster, [[x, y], ...].
            K: number of clusters.
            Max_step: maximum number of iterations Kmeans_run() performs.
            min: starting "best distance" for devide() (see class docstring).
        """
        self.Max_step = Max_step
        self.min = min
        self.datapoints = datapoints
        self.K = K
        self.clock = 0
        self.K_mean_Points = []
        self.K_mean_process_Points = []
        self.datapoints_classed = [[] for _ in range(K)]

    def initKPoint_random(self, data_points):
        """Pick K distinct entries of data_points as the initial centers.

        Indices are drawn without replacement, so the same entry is never
        chosen twice and no index can fall outside the data set.

        Raises:
            ValueError: if data_points holds fewer than K points.
        """
        if self.K > len(data_points):
            raise ValueError(
                "cannot pick %d initial centers from %d points"
                % (self.K, len(data_points))
            )
        chosen = random.sample(range(len(data_points)), self.K)
        # Copy each point so the centers never alias rows of the data set.
        self.K_mean_Points = [list(data_points[index]) for index in chosen]

    def initKPoint_yourself(self, mena_points):
        """Use the given centers, [[x1, y1], [x2, y2], ...], as the start."""
        # Copy so later changes to the caller's list do not leak in.
        self.K_mean_Points = [list(point) for point in mena_points]

    def getLenth(self, potA, potB):
        """Return the Euclidean distance between two 2-D points."""
        dx = potA[0] - potB[0]
        dy = potA[1] - potB[1]
        return (dx ** 2 + dy ** 2) ** 0.5

    def devide(self, i):
        """Append data point number i to the bucket of its nearest center.

        The search starts from self.min as the best distance so far, so a
        point farther than self.min from every center ends up in cluster 0.
        Ties go to the center with the higher index because of the `<=`.
        """
        point = self.datapoints[i]
        best_distance = self.min
        classid = 0
        for j in range(self.K):
            distance = self.getLenth(self.K_mean_process_Points[j], point)
            if distance <= best_distance:
                best_distance = distance
                classid = j
        self.datapoints_classed[classid].append(point)

    def update(self):
        """Replace every center with the mean of the points in its bucket.

        A cluster that received no points keeps its previous center instead
        of triggering a division by zero.
        """
        updated_K_mean_Points = []
        for j in range(self.K):
            members = self.datapoints_classed[j]
            if not members:
                updated_K_mean_Points.append(list(self.K_mean_process_Points[j]))
                continue
            x_sum = 0
            y_sum = 0
            for each in members:
                x_sum += each[0]
                y_sum += each[1]
            n = len(members)
            updated_K_mean_Points.append([x_sum / n, y_sum / n])
        self.K_mean_process_Points = updated_K_mean_Points

    def clear_classed_data(self):
        """Reset the per-cluster buckets to K empty lists."""
        self.datapoints_classed = [[] for _ in range(self.K)]

    def draw(self, point_lis, K):
        """Scatter-plot the first K clusters of point_lis and show the figure.

        Args:
            point_lis: clustered points, [[point, ...], [point, ...], ...].
            K: number of clusters from point_lis to draw.

        Colours come from the module-level `color` list and are reused
        cyclically when K is larger than that list. plt.show() is called at
        the end to display the figure.
        """
        xlis_all = [[point[0] for point in cluster] for cluster in point_lis]
        ylis_all = [[point[1] for point in cluster] for cluster in point_lis]

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.set(xlim=[0, 8], ylim=[0, 3], title='the Kmeans', ylabel="PatelWidth", xlabel="PatelLength")
        for i in range(K):
            ax.scatter(xlis_all[i], ylis_all[i], c=color[i % len(color)], s=3)
        plt.show()

    def Kmeans_run(self, show_plot=True):
        """Iterate assignment and center update until the centers stop moving.

        Runs at most Max_step iterations and stops early as soon as an update
        leaves every center unchanged; further iterations would reproduce the
        same assignment and the same centers. The step number is printed after
        each iteration. Afterwards the centers are in K_mean_process_Points,
        the matching assignment is in datapoints_classed and the number of
        iterations performed is in clock.

        Args:
            show_plot: when true (the default) draw() is called after every
                iteration; pass False to run without plotting.

        Raises:
            ValueError: if fewer than K initial centers have been set.
        """
        if len(self.K_mean_Points) < self.K:
            raise ValueError(
                "expected %d initial centers, got %d"
                % (self.K, len(self.K_mean_Points))
            )
        # Work on a copy so the initial centers stay available afterwards.
        self.K_mean_process_Points = [list(point) for point in self.K_mean_Points]
        self.clock = 0
        step = 0
        while step < self.Max_step:
            step += 1
            # Clear at the start of the iteration rather than at the end, so
            # the final assignment is still available once the run finishes.
            self.clear_classed_data()
            for i in range(len(self.datapoints)):
                self.devide(i)
            previous_centers = self.K_mean_process_Points
            self.update()
            if show_plot:
                self.draw(self.datapoints_classed, self.K)
            print(step)
            self.clock = step
            if self.K_mean_process_Points == previous_centers:
                break
            
    
        
        
# Demo: cluster the loaded points into two groups.
# Workflow: create the K_means object first, then set the initial centers
# with either one of the two init methods, then start the run.
test = K_means(datas, K=2)
# To plot the raw, unclustered data instead: test.draw(datastes, 1)
pot1, pot2 = [0, 0], [0, 9]
start_point = [[4, 4], [5, 5]]
# Alternative, random start: test.initKPoint_random(datas)
test.initKPoint_yourself(start_point)
test.Kmeans_run()

Guess you like

Origin blog.csdn.net/Gelercat/article/details/127858387