使用场景与简介这里不再赘述,可参考https://zhuanlan.zhihu.com/p/78798251
这里使用python进行实现。代码直接丢在下面
使用指南
- 把需要进行聚类分析的数据集读入程序,储存为由二维点组成的列表,结构如下
[[x1,y1],[x2,y2],...,[xn,yn]] 其中每个元素是一个点 point:[x,y]
- 实例化K_means,依次传入数据集数组、K值、最大迭代次数Max_step(一般不需要很大)以及初始最小距离上限min(应大于任意数据点到聚类中心的距离,默认10000;若某个点到所有中心的距离都超过该值,它会被归入第0类)等参数
- 调用随机生成起始聚类中心函数,或自己指定聚类中心,需注意聚类中心个数需与K值对应
- 调用Kmeans_run(),开始Kmeans聚类
Tips
- 所提供的draw函数会在绘制后卡住,手动关闭matplotlib页面即可继续迭代(为了方便生成每次迭代的图像),可根据所需自行编写
- 若要输出每次的详细数据,可在update()函数内进行修改
- 在此例子中使用的数据集下载地址:https://cloud.tencent.com/developer/article/1869024
代码实现
import matplotlib.pyplot as plt
import pandas as pd
import random
# Scatter colours, one per cluster; append more colours here to tell more
# clusters apart in the plots.
color = ["b","red","orange",'green']

# Load the data set; adapt the path and the selected columns to your own data.
df_l = pd.read_csv("C:/Users/LENOVO/Desktop/人工智能导论/iris.csv",usecols=[3,4])

# Convert the two selected columns into the [[x, y], ...] layout that K_means
# expects. Iterate over every row actually present instead of a hard-coded
# 150, which raised IndexError on shorter files and truncated longer ones.
datas = []
for i in range(len(df_l)):
    datas.append([df_l.iloc[i,0], df_l.iloc[i,1]])
print(datas)
print(len(datas))

# The whole data set wrapped as a single group, i.e. the
# [[[x, y], ...]] shape accepted by K_means.draw(..., 1).
datastes = []
datastes.append(datas)
class K_means():
    """Plain K-means clustering for 2-D points, with optional plotting.

    Typical use: construct with the data set and K, choose the initial
    centres with ``initKPoint_random`` or ``initKPoint_yourself``, then call
    ``Kmeans_run``.

    Instance attributes:
        datapoints: the data set, ``[[x, y], ...]``.
        K: number of clusters.
        Max_step: upper bound on the number of iterations.
        min: initial upper bound for the nearest-centre distance used by
            ``devide``; a point farther than this from every centre is put
            into cluster 0.
        K_mean_Points: the initial centres, ``[[x, y], ...]``.
        K_mean_process_Points: the centres as updated during the run.
        datapoints_classed: points grouped per cluster,
            ``[[[x, y], ...], ...]`` with one inner list per cluster.
        clock: number of iterations performed by the last ``Kmeans_run``.
    """

    # Scalar class-level defaults. All list-valued state is created per
    # instance in __init__ so that instances never share (and corrupt) it.
    Max_step = 100
    min = 10000
    K = 0
    clock = 0

    def __init__(self, datapoints, K=2, Max_step=1000, min=10000) -> None:
        """Store the configuration and create empty per-instance state.

        Args:
            datapoints: list of ``[x, y]`` points to cluster.
            K: number of clusters.
            Max_step: maximum number of iterations for ``Kmeans_run``.
            min: initial nearest-distance bound used by ``devide``.
        """
        self.Max_step = Max_step
        self.min = min
        self.datapoints = datapoints
        self.K = K
        self.clock = 0
        self.K_mean_Points = []
        self.K_mean_process_Points = [[] for _ in range(K)]
        self.datapoints_classed = [[] for _ in range(K)]

    def initKPoint_random(self, data_points):
        """Pick K distinct entries of ``data_points`` as the initial centres.

        Raises:
            ValueError: if ``data_points`` holds fewer than K points.
        """
        # random.sample never indexes past the end (random.randint's upper
        # bound is inclusive) and never selects the same entry twice.
        chosen = random.sample(list(data_points), self.K)
        # Copy the points so the centres are independent of the data set.
        self.K_mean_Points = [list(point) for point in chosen]

    def initKPoint_yourself(self, mena_points):
        """Use caller-supplied initial centres ``[[x, y], ...]``.

        The number of centres should match K.
        """
        self.K_mean_Points = mena_points

    def getLenth(self, potA, potB):
        """Return the Euclidean distance between two 2-D points."""
        return ((potA[0]-potB[0])**2+(potA[1]-potB[1])**2)**0.5

    def devide(self, i):
        """Append data point ``i`` to the cluster of its nearest centre.

        The search starts from ``self.min``; if every centre is farther away
        than that, the point falls into cluster 0. On equal distances the
        later centre wins.
        """
        point = self.datapoints[i]
        nearest = self.min
        classid = 0
        for j in range(self.K):
            distance = self.getLenth(self.K_mean_process_Points[j], point)
            if distance <= nearest:
                nearest = distance
                classid = j
        self.datapoints_classed[classid].append(point)

    def update(self):
        """Recompute every centre as the mean of its assigned points.

        A cluster with no assigned points keeps its previous centre instead
        of dividing by zero.
        """
        updated_K_mean_Points = []
        for j in range(self.K):
            members = self.datapoints_classed[j]
            if not members:
                updated_K_mean_Points.append(list(self.K_mean_process_Points[j]))
                continue
            x_sum = 0
            y_sum = 0
            for each in members:
                x_sum += each[0]
                y_sum += each[1]
            n = len(members)
            updated_K_mean_Points.append([x_sum/n, y_sum/n])
        self.K_mean_process_Points = updated_K_mean_Points

    def clear_classed_data(self):
        """Reset the per-cluster point lists to K empty lists."""
        self.datapoints_classed = [[] for _ in range(self.K)]

    def draw(self, point_lis, K):
        """Scatter-plot the first K groups of ``point_lis`` and show the figure.

        Args:
            point_lis: grouped points, ``[[[x, y], ...], ...]``.
            K: number of groups to plot.

        ``plt.show()`` is called at the end, so the figure is displayed
        before this method returns.
        """
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.set(xlim=[0,8],ylim = [0,3],title='the Kmeans',ylabel = "PatelWidth",xlabel = "PatelLength")
        for i in range(K):
            xlis = [point[0] for point in point_lis[i]]
            ylis = [point[1] for point in point_lis[i]]
            # Cycle through the palette so K may exceed the number of colours.
            plt.scatter(xlis, ylis, c=color[i % len(color)], s=3)
        plt.show()

    def Kmeans_run(self, show_plot=True):
        """Iterate assignment and centre update until convergence.

        Stops as soon as an update leaves every centre unchanged, or after
        ``Max_step`` iterations. Afterwards ``K_mean_process_Points`` holds
        the final centres, ``datapoints_classed`` the final grouping and
        ``clock`` the number of iterations performed.

        Args:
            show_plot: when true (the default), draw the clustering after
                every iteration; pass False to run without plotting.
        """
        # Work on a copy so the stored initial centres stay untouched.
        self.K_mean_process_Points = list(self.K_mean_Points)
        self.clock = 0
        step = 0
        while step < self.Max_step:
            step += 1
            # Clear at the start (not the end) of an iteration so the last
            # grouping is still available once the loop has finished.
            self.clear_classed_data()
            for i in range(len(self.datapoints)):
                self.devide(i)
            previous_centres = self.K_mean_process_Points
            self.update()
            if show_plot:
                self.draw(self.datapoints_classed, self.K)
            print(step)
            self.clock = step
            # Unchanged centres reproduce the same assignment forever, so
            # further iterations cannot change the result.
            if self.K_mean_process_Points == previous_centres:
                break
# Usage: create the K_means object first, then choose the initial centres with
# either initializer (hand-picked or random) before starting the run.
pot1, pot2 = [0, 0], [0, 9]
start_point = [[4, 4], [5, 5]]

test = K_means(datas, K=2)
# To preview the raw, unclustered data: test.draw(datastes, 1)

# Hand-picked initial centres; the random alternative is
# test.initKPoint_random(datas).
test.initKPoint_yourself(start_point)
test.Kmeans_run()