KMeans聚类 简单易修改的Python实现,数学建模必备

使用场景与简介这里不再赘述,可参考https://zhuanlan.zhihu.com/p/78798251

这里使用python进行实现。代码直接丢在下面

使用指南

  • 把需要进行聚类分析的数据集想办法读到程序里,储存结构如下

 [[point],[],[],...,[]] point:x,y 

  • 实例化,依次传入数据集数组、K值、最大迭代次数Max_step(一般不需要很大)以及初始距离阈值min(应大于任意数据点到聚类中心的距离;若某个点到所有聚类中心的距离都超过该值,它会被直接归入第0类)等参数
  • 调用随机生成起始聚类中心函数,或自己指定聚类中心,需注意聚类中心个数需与K值对应
  • 调用Kmeans_run(),开始Kmeans聚类

Tips

  • 所提供的draw函数会在绘制后卡住,手动关闭matplotlib页面即可继续迭代(为了方便生成每次迭代的图像),可根据所需自行编写
  • 若要输出每次的详细数据,可在update()函数内进行修改
  • 在此例子中使用的数据集下载地址:https://cloud.tencent.com/developer/article/1869024

代码实现

import matplotlib.pyplot as plt
import pandas as pd
import random
# Scatter colours, one per cluster; add more entries to tell more clusters apart.
color = ["b","red","orange",'green']

# Load the data set; adapt the path and the column selection to your own data.
df_l = pd.read_csv("C:/Users/LENOVO/Desktop/人工智能导论/iris.csv",usecols=[3,4])

# Build [[x, y], ...] from every row actually present in the file, rather than
# assuming exactly 150 rows (a shorter file used to raise IndexError and a
# longer one was silently truncated).
datas = [[df_l.iloc[i, 0], df_l.iloc[i, 1]] for i in range(len(df_l))]
print(datas)
print(len(datas))

# The whole data set wrapped as a single "cluster", i.e. the
# [[[x, y], ...]] shape that K_means.draw expects.
datastes = []
datastes.append(datas)
class K_means():
    """Minimal 2-D K-means clustering over a list of ``[x, y]`` points.

    Usage: construct with the data set and K, choose the starting centers
    with ``initKPoint_random`` or ``initKPoint_yourself``, then call
    ``Kmeans_run``.

    Instance attributes (all created per instance in ``__init__``):
        datapoints: the points to cluster, ``[[x, y], ...]``.
        K_mean_Points: the starting centers, ``[[x, y], ...]``.
        K_mean_process_Points: the centers as of the latest update.
        datapoints_classed: K lists; list ``j`` holds the points currently
            assigned to center ``j``.
        clock: number of iterations performed by the last ``Kmeans_run``.
    """

    # Only immutable defaults live on the class. The list-valued state is
    # created in __init__ so that instances never share (and append to) the
    # same list objects.
    Max_step = 100
    min = 10000
    K = 0
    clock = 0

    def __init__(self,datapoints,K=2,Max_step=1000,min=10000) -> None:
        """Store the data set and the clustering parameters.

        Args:
            datapoints: points to cluster, ``[[x, y], ...]``.
            K: number of clusters.
            Max_step: upper bound on the number of iterations.
            min: starting distance threshold used by ``devide``; a point
                farther than this from every center is put in cluster 0.
        """
        self.Max_step = Max_step
        self.min = min
        self.datapoints = datapoints
        self.K = K
        self.clock = 0
        self.K_mean_Points = []
        self.K_mean_process_Points = [[] for _ in range(K)]
        self.datapoints_classed = [[] for _ in range(K)]

    def initKPoint_random(self,data_points):
        """Pick K starting centers at random from ``data_points``.

        Replaces any previously chosen starting centers. Distinct indices
        are drawn whenever the data set has at least K points.

        Raises:
            ValueError: if ``data_points`` is empty.
        """
        count = len(data_points)
        if count == 0:
            raise ValueError("cannot pick initial centers from an empty data set")
        if self.K <= count:
            chosen = random.sample(range(count), self.K)
        else:
            # Fewer points than clusters: repeats are unavoidable.
            # randrange excludes `count`, unlike randint(0, count).
            chosen = [random.randrange(count) for _ in range(self.K)]
        self.K_mean_Points = [data_points[index] for index in chosen]

    def initKPoint_yourself(self,mena_points):
        """Use caller-supplied starting centers ``[[x, y], ...]``.

        Raises:
            ValueError: if fewer than K centers are supplied.
        """
        if len(mena_points) < self.K:
            raise ValueError(
                "expected at least %d initial centers, got %d"
                % (self.K, len(mena_points))
            )
        self.K_mean_Points = mena_points

    def getLenth(self,potA,potB):
        """Return the Euclidean distance between two 2-D points."""
        return ((potA[0]-potB[0])**2+(potA[1]-potB[1])**2)**0.5

    def devide(self,i):
        """Append data point ``i`` to the cluster of its nearest center.

        The search starts from ``self.min`` as the best distance so far, so a
        point farther than ``self.min`` from every center lands in cluster 0.
        On equal distances the later center wins (``<=``).
        """
        point = self.datapoints[i]
        nearest = 0
        best = self.min
        for j in range(self.K):
            dist = self.getLenth(self.K_mean_process_Points[j], point)
            if dist <= best:
                best = dist
                nearest = j
        self.datapoints_classed[nearest].append(point)

    def update(self):
        """Replace each center with the mean of the points assigned to it.

        A cluster that received no points keeps its previous center instead
        of triggering a division by zero.
        """
        updated_K_mean_Points = []
        for j in range(self.K):
            members = self.datapoints_classed[j]
            n = len(members)
            if n == 0:
                updated_K_mean_Points.append(list(self.K_mean_process_Points[j]))
                continue
            x = sum(each[0] for each in members) / n
            y = sum(each[1] for each in members) / n
            updated_K_mean_Points.append([x, y])
        self.K_mean_process_Points = updated_K_mean_Points

    def clear_classed_data(self):
        """Reset the per-cluster point lists to K empty lists."""
        self.datapoints_classed = [[] for _ in range(self.K)]

    def draw(self,point_lis,K):
        """Scatter-plot the first K clusters of ``point_lis`` and show the figure.

        Args:
            point_lis: clusters as ``[[[x, y], ...], ...]``.
            K: number of clusters to draw.

        ``plt.show()`` is called once the figure has been built. Colours come
        from the module-level ``color`` list and are reused cyclically when
        there are more clusters than colours.
        """
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.set(xlim=[0,8],ylim = [0,3],title='the Kmeans',ylabel = "PatelWidth",xlabel = "PatelLength")
        for i, cluster in enumerate(point_lis[:K]):
            xs = [point[0] for point in cluster]
            ys = [point[1] for point in cluster]
            ax.scatter(xs, ys, c=color[i % len(color)], s=3)
        plt.show()

    def Kmeans_run(self, draw=True):
        """Iterate assignment and update until the centers stop moving.

        Runs at most ``Max_step`` iterations and stops early as soon as an
        update leaves every center unchanged. The iteration count is stored
        in ``self.clock``; the final assignment stays available in
        ``self.datapoints_classed`` and the final centers in
        ``self.K_mean_process_Points``.

        Args:
            draw: when true (the default), plot the clusters after every
                iteration via ``self.draw``; pass ``False`` to run without
                plotting.

        Raises:
            ValueError: if fewer than K starting centers have been set.
        """
        if len(self.K_mean_Points) < self.K:
            raise ValueError(
                "set the initial centers with initKPoint_random or "
                "initKPoint_yourself before calling Kmeans_run"
            )
        # Work on plain [x, y] lists so the starting centers are never
        # aliased and centers can be compared with == below.
        self.K_mean_process_Points = [
            [center[0], center[1]] for center in self.K_mean_Points[:self.K]
        ]
        step = 0
        while step < self.Max_step:
            step += 1
            # Clear at the start (not the end) of an iteration so that the
            # last assignment survives after the loop finishes.
            self.clear_classed_data()
            for i in range(len(self.datapoints)):
                self.devide(i)
            previous = self.K_mean_process_Points
            self.update()
            if draw:
                self.draw(self.datapoints_classed,self.K)
            print(step)
            if self.K_mean_process_Points == previous:
                break  # converged: another pass would give the same result
        self.clock = step
            
    
        
        
# Usage: create the K_means object first, then set the starting centers with
# exactly one of the two initialisers before running:
#   test.initKPoint_yourself(start_point)   # explicit centers
#   test.initKPoint_random(datas)           # centers drawn from the data
pot1 = [0, 0]
pot2 = [0, 9]
start_point = [[4, 4], [5, 5]]

test = K_means(datas, K=2)
# test.draw(datastes, 1)  # optional: plot the raw data as a single cluster
test.initKPoint_yourself(mena_points=start_point)
test.Kmeans_run()

猜你喜欢

转载自blog.csdn.net/Gelercat/article/details/127858387