算法步骤
"""
1. 随机选取 k 个中心点
2. 计算所有点到各中心点的距离
3. 将每个点分别放入距离最近的中心点所在的簇
4. 用各簇内所有点的均值更新中心点
5. 如果簇的划分(中心点)不再变化,结束迭代
6. 否则回到第 2 步继续迭代
"""
在文章后面给大家提供一个测试集,虽然模板通用,但要注意数据的格式。
获取数据
def loadDataSet(filename):
    """Load a comma-separated numeric text file into a float array.

    :param filename: path (or anything else ``np.loadtxt`` accepts) of the
        comma-delimited data file
    :return: NumPy array holding the parsed values as floats
    """
    # ``np.float`` was merely an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, where using it raises
    # AttributeError.  The builtin yields the same float64 dtype.
    return np.loadtxt(filename, delimiter=",", dtype=float)
计算两点的距离
def distance(x, y):
    """Return the Euclidean distance between the points ``x`` and ``y``.

    :param x: first point (NumPy array)
    :param y: second point, same shape as ``x``
    :return: square root of the summed squared coordinate differences
    """
    delta = x - y
    squaredTotal = np.sum(delta ** 2)
    return np.sqrt(squaredTotal)
获得随机的k个数据中心,用于算法的随机取中心点
def initCenters(dataset, k):
    """Pick ``k`` distinct entries of ``dataset`` at random as initial centers.

    :param dataset: the data set, indexed along its first axis
    :param k: number of centers to draw
    :return: the ``k`` randomly selected entries of ``dataset``
    """
    rowCount = len(dataset)
    # Drawing indices without replacement means no index is picked twice.
    chosenRows = np.random.choice(rowCount, size=k, replace=False)
    return dataset[chosenRows]
计算两点的距离公式(与上文的 distance 定义相同,此处重复给出)
def distance(x, y):
    """Euclidean distance between two points given as NumPy arrays.

    :param x: first point
    :param y: second point, same shape as ``x``
    :return: ``sqrt(sum((x - y) ** 2))``
    """
    squares = (x - y) ** 2
    return np.sqrt(np.sum(squares))
核心算法部分
def kmeans(dataset, k):
    """Partition ``dataset`` into ``k`` clusters by iterative reassignment.

    Starting from ``k`` centers supplied by ``initCenters``, every sample is
    assigned to its nearest center (per ``distance``), then each center is
    moved to the mean of its assigned samples.  This repeats until a full
    pass changes no assignment.

    :param dataset: 2-D NumPy array, one sample per row
    :param k: number of clusters
    :return: ``(clusters, centers)`` where ``clusters[i]`` is the index of
        the center assigned to row ``i`` (stored in a float array) and
        ``centers`` holds the ``k`` cluster centers
    """
    centers = initCenters(dataset, k)
    # Centers become means of samples; an integer array would silently
    # truncate them on assignment, so promote non-float centers to float.
    if not np.issubdtype(centers.dtype, np.floating):
        centers = centers.astype(float)

    # Unpacking also enforces that the data set is two-dimensional.
    n, _ = dataset.shape
    # NaN never compares equal to an index, so the first pass assigns
    # every sample and marks the assignment as changed.
    clusters = np.full(n, np.nan)

    changed = True
    while changed:
        changed = False

        # Assignment step: attach each sample to its nearest center.
        for i in range(n):
            # np.inf instead of a large magic number, so the nearest
            # center is found however large the distances are.
            minDist, clustersIndex = np.inf, 0
            for j, center in enumerate(centers):
                dist = distance(dataset[i], center)
                if dist < minDist:
                    minDist, clustersIndex = dist, j
            if clusters[i] != clustersIndex:
                clusters[i] = clustersIndex
                changed = True

        # Update step: move each center to the mean of its members.
        for i in range(k):
            members = dataset[clusters == i]
            # The mean of an empty cluster is NaN, which would make the
            # center unreachable forever; keep the previous center instead.
            if len(members):
                centers[i] = np.mean(members, axis=0)

    return clusters, centers
数据可视化
def show(dataset, k, clusters, centers):
    """Plot two-dimensional samples coloured by cluster, plus the centers.

    :param dataset: 2-D NumPy array, one sample per row
    :param k: number of cluster centers to draw
    :param clusters: per-sample cluster index (numeric; float labels are
        converted with ``int``)
    :param centers: indexable collection of the ``k`` center coordinates
    :return: ``1`` when the data is not two-dimensional (nothing is drawn),
        otherwise ``None`` after the figure has been shown
    """
    n, m = dataset.shape
    # Only two-column data can be drawn on the x/y plane; fewer columns
    # would fail on dataset[i][1], so treat them like the m > 2 case.
    if m != 2:
        return 1

    colors = ["r", "g", "b", "y"]
    for i in range(n):
        # ``np.int`` was removed in NumPy 1.24; the builtin int() performs
        # the same float-label -> index conversion.
        clusterIndex = int(clusters[i])
        # Cycle through the palette so more than four clusters do not
        # raise IndexError.
        pointColor = colors[clusterIndex % len(colors)]
        plt.plot(dataset[i][0], dataset[i][1], color=pointColor, marker="o")

    # Draw every center as a square marker.
    for i in range(k):
        plt.scatter(centers[i][0], centers[i][1], marker="s")
    plt.show()
效果图
在最后给大家提供一个测试集
10.235186,11.321997
10.122339,11.810993
9.190236,8.904943
9.306371,9.847394
8.330131,8.340352
10.152785,10.123532
10.408540,10.821986
9.003615,10.039206
9.534872,10.096991
9.498181,10.825446
9.875271,9.233426
10.362276,9.376892
10.191204,11.250851
7.720499,6.476300
9.334379,8.471268
7.963186,6.731333
8.244973,9.013785
9.569196,10.568949
8.854793,9.076536
9.382171,7.240862
8.179055,8.944502
8.267896,8.797017
9.047165,8.725068
8.741043,7.901385
7.190216,7.804587
8.081227,9.314431
8.047460,5.720780
7.917584,7.543254
8.676942,10.102220
9.210251,9.424717
7.732998,9.840202
7.681754,8.609897
7.925863,10.079159
8.261509,8.242080
8.514872,7.527561
10.324450,10.804481
7.856710,7.931543
7.858608,7.995340
9.196808,6.577598
9.644415,10.935081
9.579833,9.085021
7.888484,5.976428
9.072624,9.703344
8.914184,9.298515
7.822869,7.086663
10.538554,11.061464
8.280187,8.709012
8.884223,8.670105
9.359927,10.575055
9.078611,9.710833
7.935134,8.586173
8.805945,10.575145
9.584316,9.614076
11.269714,11.717254
9.120444,9.019774
7.977520,8.313923
8.104648,9.456128
8.617126,7.331723
9.033811,9.469706
8.327680,5.122092
8.532272,10.100909
9.295434,8.933824
9.905202,9.027559
10.585764,10.912733