《TensorFlow机器学习项目实战》: k-Means Clustering on an Artificial Blob Dataset

We use an artificial dataset, in this case a blob-shaped one, to test the clustering algorithm.
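As a quick orientation before the full listing, here is a minimal sketch of what make_blobs returns (the shapes follow from the parameters used below; the two centers here are only illustrative):

# Minimal sketch: make_blobs returns the sample coordinates plus the
# ground-truth cluster label of each sample.
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=200, centers=[(-2, -2), (2, 1.5)], n_features=2)
print(X.shape)  # (200, 2) -- 2-D point coordinates
print(y.shape)  # (200,)   -- integer cluster label per point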

Full code:

import tensorflow as tf
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs   # sklearn.datasets.samples_generator was removed; import directly
from sklearn.datasets import make_circles

DATA_TYPE = 'blobs'
N = 200  # number of data samples
# Number of clusters; for circles, 2 is enough
if DATA_TYPE == 'circle':
    K = 2
else:
    K = 4
# Maximum number of iterations, in case the stopping condition is never met
MAX_ITERS = 1000
start = time.time()
# Artificial dataset
centers = [(-2, -2), (-2, 1.5), (1.5, -2), (2, 1.5)]
if DATA_TYPE == 'circle':
    data, features = make_circles(n_samples=200,  # number of samples
                                  shuffle=True,   # whether to shuffle the samples
                                  noise=0.01,     # random noise added to the circle data
                                  factor=0.4)     # scale factor between the inner and outer circle
else:
    data, features = make_blobs(n_samples=200, centers=centers, n_features=2,
                                cluster_std=0.8, shuffle=False, random_state=42)
# Plot the ground-truth cluster centers
fig, ax = plt.subplots()
ax.scatter(np.asarray(centers).transpose()[0], np.asarray(centers).transpose()[1], marker='o', s=250)
plt.show()
# If DATA_TYPE is 'blobs', plot the samples together with the centers
fig, ax = plt.subplots()
if DATA_TYPE == 'blobs':
    ax.scatter(np.asarray(centers).transpose()[0], np.asarray(centers).transpose()[1], marker='o', s=250)
    ax.scatter(data.transpose()[0], data.transpose()[1], marker='o', s=100, c=features, cmap=plt.cm.coolwarm)
    plt.show()

points = tf.Variable(data)  # coordinates of the data points
cluster_assignments = tf.Variable(tf.zeros([N], dtype=tf.int64))  # cluster index assigned to each point
centroids = tf.Variable(tf.slice(points.initialized_value(), [0, 0], [K, 2]))  # centroids, initialized to the first K points
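# Note: because make_blobs is called with shuffle=False, the samples are
# grouped by cluster, so these first K points all come from the same blob.
# k-means still converges; it just needs a few iterations to pull the
# centroids apart.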

sess = tf.Session()
sess.run(tf.global_variables_initializer())  # initialize_all_variables() is deprecated
sess.run(centroids)

rep_centroids = tf.reshape(tf.tile(centroids, [N, 1]), [N, K, 2])  # centroids tiled to N x K x 2
rep_points = tf.reshape(tf.tile(points, [1, K]), [N, K, 2])        # each point repeated K times, N x K x 2
sum_squares = tf.reduce_sum(tf.square(rep_points - rep_centroids), axis=2)  # squared distance to every centroid, N x K
best_centroids = tf.argmin(sum_squares, 1)  # index of the nearest centroid for each point
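# Note: tile + reshape is one explicit way to build all N*K pairwise
# differences; an equivalent formulation (a sketch, relying on standard
# TensorFlow broadcasting) would be:
#   diffs = tf.expand_dims(points, 1) - tf.expand_dims(centroids, 0)  # N x K x 2
#   sum_squares = tf.reduce_sum(tf.square(diffs), axis=2)             # N x K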

did_assignments_change = tf.reduce_any(tf.not_equal(best_centroids, cluster_assignments))  # stopping condition: no point changed cluster

def bucket_mean(data, bucket_ids, num_buckets):
    total = tf.unsorted_segment_sum(data, bucket_ids, num_buckets)  # per-cluster sum
    count = tf.unsorted_segment_sum(tf.ones_like(data), bucket_ids, num_buckets)  # per-cluster sample count
    return total / count  # per-cluster mean
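# Worked example (hypothetical values): with data = [[1., 1.], [3., 3.], [5., 5.]],
# bucket_ids = [0, 1, 0] and num_buckets = 2:
#   total       = [[6., 6.], [3., 3.]]  (rows with the same id are summed)
#   count       = [[2., 2.], [1., 1.]]
#   total/count = [[3., 3.], [3., 3.]]  -> the per-cluster means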

means = bucket_mean(points, best_centroids, K)  # mean of the points assigned to each cluster, K x 2

with tf.control_dependencies([did_assignments_change]):
    do_updates = tf.group(centroids.assign(means), cluster_assignments.assign(best_centroids))  # update centroids and assignments
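# The control dependency forces did_assignments_change to be evaluated
# (against the old cluster_assignments) before the assign ops in do_updates
# run, so each loop step below reports whether the new assignment differs
# from the previous one.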

changed = True
iters = 0

fig, ax = plt.subplots()
if DATA_TYPE == 'blobs':
    colourindexes = [2, 1, 4, 3]  # colour indices for the K centroid markers
else:
    colourindexes = [2, 1]
while changed and iters < MAX_ITERS:
    fig, ax = plt.subplots()
    iters += 1
    [changed, _] = sess.run([did_assignments_change, do_updates])
    [centers, assignments] = sess.run([centroids, cluster_assignments])
    ax.scatter(sess.run(points).transpose()[0], sess.run(points).transpose()[1], marker='o', s=200, c=assignments, cmap=plt.cm.coolwarm)
    ax.scatter(centers[:, 0], centers[:, 1], marker='^', s=550, c=colourindexes, cmap=plt.cm.plasma)
    ax.set_title('Iteration ' + str(iters))
    plt.savefig("kmeans" + str(iters) + ".png")

# Plot the final assignments
ax.scatter(sess.run(points).transpose()[0], sess.run(points).transpose()[1], marker='o', s=200, c=assignments, cmap=plt.cm.coolwarm)
plt.show()

end = time.time()
print("Found in %.2f seconds" % (end - start), iters, "iterations")
print("Centroids:")
print(centers)
print("Cluster assignments:", assignments)

The output is one image per iteration (kmeans1.png, kmeans2.png, ...), showing the coloured cluster assignments and the centroid markers converging step by step.
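As a sanity check, the same clustering can be reproduced with scikit-learn's KMeans on the same data array (a sketch; the cluster numbering and exact centroids may differ from the TensorFlow run):

from sklearn.cluster import KMeans
km = KMeans(n_clusters=4, n_init=10, random_state=42).fit(data)
print(km.cluster_centers_)  # compare with the centroids printed above
print(km.labels_[:10])      # compare with the first few assignments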


Reposted from blog.csdn.net/zqzq19950725/article/details/88184654