Python实现k均值聚类算法_K-Means聚类算法
若想快速了解k均值聚类算法_K-Means聚类算法,可参考这位大佬的文章,通俗易懂:k均值聚类算法考试例题_K-Means聚类算法,作者:weixin_39789792
感谢这位博主。
本篇博客仅作为自用笔记,如有侵权联系删除。
代码详细
注意:
质心的初始坐标不能取得太离谱:比如三个质心中有一个质心离样本数据中的所有点都很远,就不会有任何样本被分到该簇,导致对应的列表 sse_k1(或 sse_k2、sse_k3)中无数据,即 len(sse_kx) = 0(x 为 1、2、3),在求平均值更新质心时报错:ZeroDivisionError: division by zero
import pylab as pl
def square_Euclid(x, y):
    """Return the squared Euclidean distance between points ``x`` and ``y``.

    For two points in the plane, (x1, y1) and (x2, y2), the result is
    (x1-x2)^2 + (y1-y2)^2; for points in 3-D space it is
    (x1-x2)^2 + (y1-y2)^2 + (z1-z2)^2, and so on for higher dimensions.

    The square root is deliberately not taken: the value is only compared
    against other distances, and squaring preserves their ordering.

    Coordinates are paired positionally; if the sequences differ in length,
    the extra trailing coordinates of the longer one are ignored.
    """
    return sum((a - b) ** 2 for a, b in zip(x, y))
# Load the sample points: one point per line of data01.txt, written as two
# whitespace-separated numbers (x then y).
num_x = []
num_y = []
# The context manager closes the file even if parsing a line fails.
with open('data01.txt') as fl:
    for line in fl:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        num_x.append(float(fields[0]))
        num_y.append(float(fields[1]))
# Starting guesses for the three centroids; clustering produces three groups.
k1, k2, k3 = [-1.7, 1], [-0.5, 0.5], [1, 2]
# One list per cluster, holding the indices of the points assigned to it.
sse_k1, sse_k2, sse_k3 = [], [], []
# Total number of sample points.
n = len(num_x)
# Iterate assignment + centroid update until the centroids stop moving.
while True:
    # Start every pass with empty clusters; they hold point indices.
    sse_k1 = []
    sse_k2 = []
    sse_k3 = []
    for i in range(n):
        point = [num_x[i], num_y[i]]
        square_E1 = square_Euclid(k1, point)
        square_E2 = square_Euclid(k2, point)
        square_E3 = square_Euclid(k3, point)
        num_min = min(square_E1, square_E2, square_E3)
        # Assign the point to the nearest centroid (ties go to the
        # lowest-numbered cluster).
        if num_min == square_E1:
            sse_k1.append(i)
        elif num_min == square_E2:
            sse_k2.append(i)
        elif num_min == square_E3:
            sse_k3.append(i)
    # New centroid = mean of the member coordinates.  A cluster that
    # received no points keeps its previous centroid instead of dividing
    # by zero (previously: ZeroDivisionError for a far-away initial centroid).
    if sse_k1:
        new_k1 = [sum(num_x[i] for i in sse_k1) / len(sse_k1),
                  sum(num_y[i] for i in sse_k1) / len(sse_k1)]
    else:
        new_k1 = k1
    if sse_k2:
        new_k2 = [sum(num_x[i] for i in sse_k2) / len(sse_k2),
                  sum(num_y[i] for i in sse_k2) / len(sse_k2)]
    else:
        new_k2 = k2
    if sse_k3:
        new_k3 = [sum(num_x[i] for i in sse_k3) / len(sse_k3),
                  sum(num_y[i] for i in sse_k3) / len(sse_k3)]
    else:
        new_k3 = k3
    # If no centroid changed, the clustering has converged: stop.
    # Otherwise adopt the new centroids and run another pass.
    if new_k1 == k1 and new_k2 == k2 and new_k3 == k3:
        break
    k1 = new_k1
    k2 = new_k2
    k3 = new_k3
# Collect the coordinates of the points in each final cluster.
kv1_x, kv1_y = [num_x[i] for i in sse_k1], [num_y[i] for i in sse_k1]
kv2_x, kv2_y = [num_x[i] for i in sse_k2], [num_y[i] for i in sse_k2]
kv3_x, kv3_y = [num_x[i] for i in sse_k3], [num_y[i] for i in sse_k3]
# Draw each cluster with its own marker.
for xs, ys, marker in (
    (kv1_x, kv1_y, '+'),
    (kv2_x, kv2_y, '.'),
    (kv3_x, kv3_y, '^'),
):
    pl.plot(xs, ys, marker)
# Axis limits are chosen to fit the value range of the sample data.
pl.xlim(-2, 2.5)
pl.ylim(-1, 2.5)
pl.show()
结果展示
所用样本数据
在 .py 文件所在的文件夹下创建 data01.txt 文本文档,并将以下数据存入(每行一个点,x、y 坐标以空格分隔)
-0.113523119722435 0.305566317246824
-0.0363280337139859 0.110677855003451
0.113494507689856 0.285179031884109
-0.00383816850385252 0.670778674827114
-0.180363593046200 0.394837771823933
0.295367728543231 -0.355182535548782
-0.0296566442720039 0.228722511660635
-0.0930361342677474 0.154377592930645
-0.159633545380951 1.03286272700827
-0.609370592744484 0.0100246598182464
0.164875043625935 0.107920610145671
-0.649661855983650 -0.0264148180075531
-0.0853301136043781 0.194929464533097
-0.0869727732803104 -0.166019322253363
0.267258237150858 0.318664851557507
-0.876324515282669 0.578412914115882
0.290320777421500 0.0269704554131184
-0.164202641138215 -0.0216061750617156
-0.408886348765266 -0.178183406834480
-0.00275690297052195 -0.149757266490323
-0.230897603220972 0.202729565016547
-0.289768125501838 0.299373894453753
0.565273947293806 -0.112025265465832
-0.259434375270518 -0.183038062076565
-0.0622055869197436 0.0178584309105331
-0.281488166956539 -0.282493439656289
0.288003999490542 0.354832178282382
-0.00387861254715821 0.245338598261617
0.0230259610960932 0.304367839506965
0.297069520513791 0.398694925851779
0.213528795047459 -0.0341268311839215
0.248545070529365 -0.182513241920946
-0.674431824833610 0.166219624024427
0.0695478578554150 0.364281641067673
1.52144323033782 1.56356334395462
1.54901744911605 1.44082824131763
1.72628026225810 0.999267392962595
1.34339405843162 1.54435051334828
1.63076888391605 0.822969713727122
1.24625402720513 1.50291563943267
1.49966193305128 1.43962200220279
0.806148334745612 1.59798616598320
1.73765675194197 0.801038214866100
0.688725193167526 1.18560461303177
1.31503430771996 1.25566460922217
1.14051881393761 1.28173391148891
0.883497444350820 1.52712829138676
1.35619761199096 1.47157896393621
-1.41400896645106 1.03490557492282
-1.46921827418174 0.691733912712829
-1.06733046906236 0.945293131396786
-0.789899047908273 1.04583303354796
-0.922550939191143 1.39310184834662
-0.918965347657051 1.44432139992464
-1.03616345036068 1.00166612828372
-1.07715160762591 1.51189230738663
-1.01283275702248 1.46105578965393
-1.48079534886488 1.21031313607727
-0.986518252032434 0.949195019118798
-1.62901492888985 1.53208781532487
-1.05432664597088 1.20897843449092
-1.51323198856773 0.929507861004623
-1.55689740607725 1.32978015955565
-1.39341270591838 1.41557221811715
-1.66195228414799 0.787792125905413
-1.24494832523794 1.84020927229746
-0.898778729417616 0.570410077321060
-1.32885894876685 0.732892764435160
-1.09324537986321 1.63706883409855
-1.19875924554585 1.35282905121539
-0.866788557380931 1.11436578620945
-1.30006378262166 1.25366700524127
-1.15735442373393 1.48126320709162
-0.469640188642725 0.975507100878317
-0.887529056287694 1.54350983044641
-1.54530712190787 1.47051092229069
-0.895890659992745 1.23572220775434
-1.54226615688700 1.31046627190501
-1.24686714416393 1.05116769432966
-1.18900045601094 1.35740905869805
-1.65786095402519 1.03723338930851
-1.37644334323617 1.08018136201292
-1.00479602718570 0.921073237322932
-0.958390570797860 1.56536899409517
-0.761574879786032 1.24176101803965
-1.56925161923031 1.04223861195863
-0.979085655811513 1.46432198217887
-1.14713536403328 1.08846006315455
-0.853944089636229 1.39103904734476