使用人工数据集:这里构建的是块状(blobs)数据集,用于测试聚类算法。
完整代码:
import tensorflow as tf
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.datasets.samples_generator import make_blobs
from sklearn.datasets.samples_generator import make_circles
# Which synthetic dataset to cluster: 'blobs' or 'circle'.
DATA_TYPE = 'blobs'
# Number of data samples.
N = 200
# Number of clusters; if we choose circles, only 2 will be enough.
K = 2 if DATA_TYPE == 'circle' else 4
# Maximum number of iterations, used if the stopping condition is never met.
MAX_ITERS = 1000
# Wall-clock start time; the elapsed time is reported once clustering ends.
start = time.time()

# Synthetic dataset.
# Centres around which the blobs are generated.
centers = [(-2, -2), (-2, 1.5), (1.5, -2), (2, 1.5)]
if DATA_TYPE == 'circle':
    data, features = make_circles(
        n_samples=N,   # number of points to generate
        shuffle=True,  # whether to shuffle the samples
        noise=0.01,    # random noise added to the circle data
        factor=0.4,    # scale factor between the two circles
    )
else:
    # n_samples follows N so the dataset size always matches the tensor
    # shapes that are built from N further down.
    data, features = make_blobs(
        n_samples=N,
        centers=centers,
        n_features=2,
        cluster_std=0.8,
        shuffle=False,
        random_state=42,
    )

# Plot the generating centres on a figure of their own.
center_xy = np.asarray(centers).transpose()
fig, ax = plt.subplots()
ax.scatter(center_xy[0], center_xy[1], marker='o', s=250)
plt.show()

# If DATA_TYPE is 'blobs', plot the centres together with the samples,
# coloured by the second value returned by the generator (`features`).
fig, ax = plt.subplots()
if DATA_TYPE == 'blobs':
    ax.scatter(center_xy[0], center_xy[1], marker='o', s=250)
    ax.scatter(data.transpose()[0], data.transpose()[1], marker='o', s=100,
               c=features, cmap=plt.cm.coolwarm)
    plt.show()
# Coordinates of the data points.
points = tf.Variable(data)
# Index of the cluster each point is currently assigned to (starts at 0).
cluster_assignments = tf.Variable(tf.zeros([N], dtype=tf.int64))
# Centroid coordinates, initialised from the first K data points.
centroids = tf.Variable(tf.slice(points.initialized_value(), [0, 0], [K, 2]))

sess = tf.Session()
# global_variables_initializer replaces the deprecated
# initialize_all_variables.
sess.run(tf.global_variables_initializer())

# Replicate centroids and points to a common N x K x 2 shape so that every
# point can be compared with every centroid in a single subtraction.
rep_centroids = tf.reshape(tf.tile(centroids, [N, 1]), [N, K, 2])
rep_points = tf.reshape(tf.tile(points, [1, K]), [N, K, 2])
# Squared distance from each point to each centroid: sum the squared
# coordinate differences over the last axis, giving an N x K tensor.
sum_squares = tf.reduce_sum(tf.square(rep_points - rep_centroids), axis=2)
# For each point, the index of the centroid with the smallest distance.
best_centroids = tf.argmin(sum_squares, 1)
# Stopping condition: True while at least one point's best centroid differs
# from its stored assignment.
did_assignments_change = tf.reduce_any(
    tf.not_equal(best_centroids, cluster_assignments))
def bucket_mean(data, bucket_ids, num_buckets):
    """Return the mean of the entries of ``data`` falling in each bucket.

    Args:
        data: Tensor of values to average.
        bucket_ids: Tensor of bucket indices used to group ``data``.
        num_buckets: Number of buckets in the result.

    Returns:
        The per-bucket sum divided by the per-bucket count. A bucket that
        receives no entries has sum 0 and count 0, so its result is 0/0
        (NaN for floating-point data).
    """
    # Sum of the data belonging to each bucket.
    total = tf.unsorted_segment_sum(data, bucket_ids, num_buckets)
    # Number of entries in each bucket, obtained by summing ones.
    count = tf.unsorted_segment_sum(tf.ones_like(data), bucket_ids, num_buckets)
    return total / count
# Mean of the points currently closest to each centroid (K x 2).
means = bucket_mean(points, best_centroids, K)
# Make sure the change flag is evaluated before the assignments below
# overwrite the variables it is computed from.
with tf.control_dependencies([did_assignments_change]):
    # Update the centroids and the per-point cluster indices together.
    do_updates = tf.group(
        centroids.assign(means),
        cluster_assignments.assign(best_centroids),
    )
changed = True
iters = 0
# One colour index per centroid; derived from K so the number of colours
# always matches the number of centroid markers.
colourindexes = [2, 1, 4, 3][:K]
# The update op only assigns centroids and cluster assignments, so the point
# coordinates are constant and can be fetched once.
point_coords = sess.run(points)

while changed and iters < MAX_ITERS:
    fig, ax = plt.subplots()
    iters += 1
    # Run one k-means step, then read back the updated state.
    changed, _ = sess.run([did_assignments_change, do_updates])
    centers, assignments = sess.run([centroids, cluster_assignments])
    # Points coloured by their current cluster, centroids as triangles.
    ax.scatter(point_coords[:, 0], point_coords[:, 1], marker='o', s=200,
               c=assignments, cmap=plt.cm.coolwarm)
    ax.scatter(centers[:, 0], centers[:, 1], marker='^', s=550,
               c=colourindexes, cmap=plt.cm.plasma)
    ax.set_title('Iteration ' + str(iters))
    # Save one image per iteration.
    plt.savefig("kmeans" + str(iters) + ".png")

# Draw the points once more on the last figure, then display the figures.
ax.scatter(point_coords[:, 0], point_coords[:, 1], marker='o', s=200,
           c=assignments, cmap=plt.cm.coolwarm)
plt.show()

end = time.time()
# All three values go inside the print call; previously only the elapsed
# time was printed and the iteration count was silently discarded.
print("Found in %.2f seconds" % (end - start), iters, "iterations")
print("Centroids:")
print(centers)
print("Cluster assignments:", assignments)