聚类算法是一种无监督的分类方法,即样本预先不知所属类别或标签,需要根据样本之间的距离或相似程度自动进行分类。聚类算法可以分为基于划分的方法、基于连通性的方法、基于密度的方法、基于概率分布模型的方法等,
K-means(K均值)属于基于划分的聚类方法。
一、基本原理
基于划分的聚类方法是将样本集组成的矢量空间划分为多个区域,每个区域都存在一个区域相关的表示,即区域中心。对于每个样本可以建立一种样本到区域中心的映射:
$$q(x) = \sum_{i=1}^{k} l(x \in S_i)\, c_i$$
其中 l(·) 为指示函数(条件成立时取1,否则取0),S_i 为第 i 个区域,c_i 为该区域的中心。
根据建立的映射q(x),可以将相应的样本分类到相应的中心,得到最终的划分结果。
不同的基于划分的聚类算法的主要区别在于如何建立相应的映射方式q(x)。在经典的K-means聚类算法中,映射是通过样本与各中心之间的距离平方和最小准则来确立的。
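假设有样本集合 $X = \{x_1, x_2, \ldots, x_n\}$,其中 $x_j \in \mathbb{R}^d$。K-means聚类算法的目标是将数据集划分为k(k<n)类:$S = \{S_1, S_2, \ldots, S_k\}$,使划分后的k个子集合满足类内的距离平方和最小:
$$\min_{S} \sum_{i=1}^{k} \sum_{x \in S_i} \|x - c_i\|^2$$
其中,$c_i = \frac{1}{|S_i|} \sum_{x \in S_i} x$ 为子集合 $S_i$ 的中心。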
求解目标函数是一个NP-hard问题,无法保证得到一个稳定的全局最优解。在经典的聚类算法中,采取迭代优化策略,有效地求解目标函数的局部最优解。
算法步骤如下:
步骤1 初始化聚类中心,可选取样本集的前k个样本,或者随机选取k个样本;
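步骤2 分配各样本到相近的聚类集合,样本分配依据为:
$$S_i^{(t)} = \{\, x_p : \|x_p - c_i^{(t)}\|^2 \le \|x_p - c_j^{(t)}\|^2 \,\}$$
式中 i, j = 1, 2, ..., k,i ≠ j,p = 1, 2, ..., n。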
步骤3 根据步骤2的分配结果,更新聚类中心:
$$c_i^{(t+1)} = \frac{1}{|S_i^{(t)}|} \sum_{x_p \in S_i^{(t)}} x_p$$
步骤4 若迭代达到最大迭代步数,或前后两次迭代的目标函数值之差小于设定阈值 ε,即 $|J^{(t+1)} - J^{(t)}| < \varepsilon$(J 为类内距离平方和),则迭代终止,否则重复步骤2。
其中,步骤2和步骤3分别对样本集合重新分配和更新计算聚类中心,通过迭代计算过程中优化目标函数,实现类内距离平方和最小。
二、K-means算法的优化
2.1 聚类中心初始化的优化
K-means对聚类中心的初始化比较敏感,不同初始化值会带来不同的聚类结果,这是因为K-means仅仅对目标函数求取近似局部最优解,不能保证得到全局最优解,即在一定数据分布下聚类结果会因为初始化的不同而产生很大的偏差。
下面介绍一下K-means的改进算法,即K-means++算法,该算法能够有效产生初始的聚类中心。
首先,随机初始化一个聚类中心;
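然后,通过迭代计算最大概率值:
$$x^{*} = \arg\max_{x \in X} \frac{D(x)^2}{\sum_{x' \in X} D(x')^2}$$
其中 D(x) 为样本 x 到已选聚类中心的最近距离。加入下一个聚类中心:
$$c_{i+1} = x^{*}$$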
直到选择k个中心。
K-means++算法的计算复杂度为O(knd),没有增加过多的计算负担,同时可以保证算法更有效地逼近最优解。
2.2 类别个数的自适应确定
经典的K-means算法中,聚类的个数k是预先确定的,不具备自适应选择类别个数的能力。而聚类算法中类别个数的设定将会在很大程度上决定聚类效果。
ISODATA算法与K-means在基本原则上是一致的,通过计算距离平方和最小来实现聚类,但在迭代的过程中会引入类别的合并与分离机制。
在每一次迭代中,ISODATA算法首先在固定类别的情况下进行聚类,然后根据设定样本之间的距离阈值进行合并操作,并根据每一组类别Si中样本协方差矩阵信息来判断是否分开。
但ISODATA算法的效率相比于K-means会大大降低。
附:K-means算法C语言实现
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define sqr(x) ((x)*(x))
#define MAX_CLUSTERS 16
#define MAX_ITERATIONS 100
#define BIG_double (INFINITY)
/*
 * Report a fatal error and terminate the program.
 *
 * Writes str to stdout exactly as given (no newline is appended) and then
 * calls exit(-1); this function does not return.
 *
 * str - NUL-terminated message text; it is treated as plain text, never as
 *       a printf format.
 */
void fail(char *str)
{
    // Passing the message as the printf format would interpret any '%' in
    // it as a conversion specifier; fputs prints the bytes verbatim.
    fputs(str, stdout);
    exit(-1);
}
/*
 * Return the sum of squared coordinate differences between two points.
 *
 * dim - number of coordinates read from each point
 * p1  - first point (dim values)
 * p2  - second point (dim values)
 *
 * The result is the squared distance; no square root is taken.
 */
double calc_distance(int dim, double *p1, double *p2)
{
    double total = 0;
    int axis = 0;

    while (axis < dim) {
        double delta = p1[axis] - p2[axis];
        total += delta * delta;
        axis++;
    }

    return total;
}
/*
 * Compute the distance from every point to every cluster centroid.
 *
 * dim             - number of coordinates per point / centroid
 * n               - number of points in X
 * k               - number of centroids
 * X               - point data; point ii occupies X[ii*dim .. ii*dim + dim - 1]
 * centroid        - centroid data; centroid jj occupies centroid[jj*dim ..]
 * distance_output - receives n*k values; entry [ii*k + jj] is the value
 *                   calc_distance returns for point ii and centroid jj
 *                   (a sum of squared coordinate differences)
 */
void calc_all_distances(int dim, int n, int k, double *X, double *centroid, double *distance_output)
{
    for (int ii = 0; ii < n; ii++) {        // for each point
        for (int jj = 0; jj < k; jj++) {    // for each cluster
            // distance between point ii and the centroid of cluster jj
            distance_output[ii*k + jj] = calc_distance(dim, &X[ii*dim], &centroid[jj*dim]);
        }
    }
}
/*
 * Sum, over all points, the distance between each point and the centroid of
 * the cluster it is assigned to.
 *
 * dim                      - number of coordinates per point / centroid
 * n                        - number of points in X
 * k                        - number of clusters (not used by this function)
 * X                        - point data; point ii starts at X[ii*dim]
 * centroids                - centroid data; centroid c starts at centroids[c*dim]
 * cluster_assignment_index - cluster index of each point (n entries)
 *
 * Returns the accumulated calc_distance values.
 * NOTE: a point with cluster assignment -1 is ignored.
 */
double calc_total_distance(int dim, int n, int k, double *X, double *centroids, int *cluster_assignment_index)
{
    double tot_D = 0;

    for (int ii = 0; ii < n; ii++) {
        // which cluster is this point in?
        int active_cluster = cluster_assignment_index[ii];

        // unassigned points (-1) contribute nothing
        if (active_cluster != -1)
            tot_D += calc_distance(dim, &X[ii*dim], &centroids[active_cluster*dim]);
    }

    return tot_D;
}
/*
 * Assign every point to the cluster with the smallest entry in its row of
 * the distance table.
 *
 * dim                      - not used by this function
 * n                        - number of points (rows of distance_array)
 * k                        - number of clusters (columns of distance_array)
 * distance_array           - n*k table; entry [point*k + cluster]
 * cluster_assignment_index - receives the chosen cluster for each point
 *
 * Ties go to the lowest cluster index because only a strictly smaller value
 * replaces the current best. A point keeps the value -1 when no entry in its
 * row compares less than BIG_double.
 */
void choose_all_clusters_from_distances(int dim, int n, int k, double *distance_array, int *cluster_assignment_index)
{
    for (int point = 0; point < n; point++) {
        int nearest = -1;
        double nearest_distance = BIG_double;
        int cluster = 0;

        // scan this point's row for the smallest distance
        while (cluster < k) {
            double candidate = distance_array[point*k + cluster];
            if (candidate < nearest_distance) {
                nearest_distance = candidate;
                nearest = cluster;
            }
            cluster++;
        }

        cluster_assignment_index[point] = nearest;
    }
}
/*
 * Recompute each cluster centroid as the mean of the points assigned to it.
 *
 * dim                      - number of coordinates per point / centroid
 * n                        - number of points in X
 * k                        - number of clusters
 * X                        - point data; point ii starts at X[ii*dim]
 * cluster_assignment_index - cluster index of each point (n entries); values
 *                            outside 0..k-1 (such as -1) match no cluster
 *                            and are ignored
 * new_cluster_centroid     - k*dim values, updated in place
 *
 * A cluster with no members triggers a warning on stdout and its entries in
 * new_cluster_centroid are left exactly as they were on entry, instead of
 * being divided by a zero member count.
 *
 * Members are counted by scanning the assignments once per cluster, so no
 * fixed-size scratch array is needed and k is not limited by this function.
 */
void calc_cluster_centroids(int dim, int n, int k, double *X, int *cluster_assignment_index, double *new_cluster_centroid)
{
    for (int cc = 0; cc < k; cc++) {
        double *centroid = &new_cluster_centroid[cc*dim];

        // count the members first so an empty cluster can be left untouched
        int member_count = 0;
        for (int ii = 0; ii < n; ii++) {
            if (cluster_assignment_index[ii] == cc)
                member_count++;
        }

        if (member_count == 0) {
            printf("WARNING: Empty cluster %d! \n", cc);
            continue;
        }

        // sum the coordinates of all member points
        for (int jj = 0; jj < dim; jj++)
            centroid[jj] = 0;
        for (int ii = 0; ii < n; ii++) {
            if (cluster_assignment_index[ii] != cc)
                continue;
            for (int jj = 0; jj < dim; jj++)
                centroid[jj] += X[ii*dim + jj];
        }

        // divide each coordinate sum by the member count to get the mean
        for (int jj = 0; jj < dim; jj++)
            centroid[jj] /= member_count;
    }
}
/*
 * Count how many points are assigned to each cluster.
 *
 * n                        - number of points
 * k                        - number of clusters
 * cluster_assignment_index - cluster index of each point (n entries)
 * cluster_member_count     - receives k counts
 *
 * Assignments outside 0..k-1 (for example -1) are skipped so they cannot
 * write outside cluster_member_count.
 */
void get_cluster_member_count(int n, int k, int *cluster_assignment_index, int *cluster_member_count)
{
    // initialize cluster member counts
    for (int cc = 0; cc < k; cc++)
        cluster_member_count[cc] = 0;

    // count members of each cluster
    for (int ii = 0; ii < n; ii++) {
        int cc = cluster_assignment_index[ii];
        if (cc >= 0 && cc < k)
            cluster_member_count[cc]++;
    }
}
/*
 * Fill column cc of the per-point, per-cluster score table.
 *
 * For every point ii the stored value is
 *     d * m / (m - 1)   if point ii is currently assigned to cluster cc
 *     d * m / (m + 1)   otherwise
 * where d is the sum of squared coordinate differences between the point and
 * the centroid of cluster cc, and m = cluster_member_count[cc].
 *
 * dim                    - number of coordinates per point / centroid
 * n                      - number of points
 * k                      - number of clusters (row length of the table)
 * X                      - point data; point ii starts at X[ii*dim]
 * cluster_assignment_cur - current cluster of each point (n entries)
 * cluster_centroid       - centroid data; centroid cc starts at [cc*dim]
 * cluster_member_count   - number of members of each cluster
 * point_move_score_table - n*k table; entry [ii*k + cc] is written
 * cc                     - cluster whose column is updated
 *
 * NOTE(review): when m == 1 and the point belongs to cc the divisor is zero,
 * so the stored value is inf or NaN; callers need to handle that case.
 */
void update_delta_score_table(int dim, int n, int k, double *X, int *cluster_assignment_cur, double *cluster_centroid, int *cluster_member_count, double *point_move_score_table, int cc)
{
    // for every point (both in and not in the cluster)
    for (int ii = 0; ii < n; ii++) {
        double dist_sum = 0;
        for (int kk = 0; kk < dim; kk++) {
            double axis_dist = X[ii*dim + kk] - cluster_centroid[cc*dim + kk];
            dist_sum += axis_dist * axis_dist;
        }

        // member count of cc after the hypothetical move of point ii
        int count_after_move = cluster_member_count[cc] + ((cluster_assignment_cur[ii] == cc) ? -1 : +1);
        double mult = (double)cluster_member_count[cc] / count_after_move;

        // the table has one row of k entries per point, so the row stride is k
        point_move_score_table[ii*k + cc] = dist_sum * mult;
    }
}
/*
 * Move one point to a different cluster and update the assignment, the
 * member counts and both affected centroids incrementally.
 *
 * dim                  - number of coordinates per point / centroid
 * n                    - number of points (not used by this function)
 * k                    - number of clusters (not used by this function)
 * X                    - point data; point p starts at X[p*dim]
 * cluster_assignment   - cluster of each point; entry move_point is updated
 * cluster_centroid     - centroid data; centroid c starts at [c*dim]
 * cluster_member_count - member count of each cluster; updated in place
 * move_point           - index of the point to move
 * move_target_cluster  - cluster the point is moved to
 *
 * Moving a point to the cluster it is already in changes nothing.
 * If the old cluster has no members left after the move, its centroid is
 * left unchanged rather than being divided by a zero member count.
 * A warning is printed whenever the old cluster is left with one member or
 * none.
 */
void perform_move(int dim, int n, int k, double *X, int *cluster_assignment, double *cluster_centroid, int *cluster_member_count, int move_point, int move_target_cluster)
{
    int cluster_old = cluster_assignment[move_point];
    int cluster_new = move_target_cluster;

    // the two incremental updates below are only valid for distinct clusters
    if (cluster_old == cluster_new)
        return;

    // update cluster assignment array
    cluster_assignment[move_point] = cluster_new;

    // update cluster count array
    cluster_member_count[cluster_old]--;
    cluster_member_count[cluster_new]++;

    if (cluster_member_count[cluster_old] <= 1)
        printf("WARNING: Can't handle single-member clusters! \n");

    int old_count = cluster_member_count[cluster_old];
    int new_count = cluster_member_count[cluster_new];

    // update centroid array
    for (int ii = 0; ii < dim; ii++) {
        double coord = X[move_point*dim + ii];

        // mean of the old cluster with the point removed (counts already updated)
        if (old_count > 0)
            cluster_centroid[cluster_old*dim + ii] -= (coord - cluster_centroid[cluster_old*dim + ii]) / old_count;

        // mean of the new cluster with the point added
        cluster_centroid[cluster_new*dim + ii] += (coord - cluster_centroid[cluster_new*dim + ii]) / new_count;
    }
}
/*
 * Print a summary of the clustering to stdout: one line per cluster with its
 * member count and centroid coordinates.
 *
 * dim                      - number of coordinates per centroid
 * n                        - number of points
 * k                        - number of clusters
 * X                        - point data (not used by this function)
 * cluster_assignment_index - cluster index of each point (n entries);
 *                            values that match no cluster are not counted
 * cluster_centroid         - centroid data; centroid c starts at [c*dim]
 *
 * All dim coordinates are printed, separated by single spaces; for dim == 2
 * the line has the form "(x y)". Members are counted per cluster by scanning
 * the assignments, so k is not limited by any fixed-size buffer.
 */
void cluster_diag(int dim, int n, int k, double *X, int *cluster_assignment_index, double *cluster_centroid)
{
    printf(" Final clusters \n");

    for (int cc = 0; cc < k; cc++) {
        int member_count = 0;
        for (int ii = 0; ii < n; ii++) {
            if (cluster_assignment_index[ii] == cc)
                member_count++;
        }

        printf(" cluster %d: members: %8d, centroid (", cc, member_count);
        for (int jj = 0; jj < dim; jj++) {
            if (jj > 0)
                putchar(' ');
            printf("%.1f", cluster_centroid[cc*dim + jj]);
        }
        printf(") \n");
    }
}
/*
 * Copy the first n cluster assignments from src into tgt.
 *
 * n   - number of entries to copy
 * src - source array (read)
 * tgt - destination array (written)
 *
 * Entries are copied one at a time in increasing index order.
 */
void copy_assignment_array(int n, int *src, int *tgt)
{
    int idx = 0;

    while (idx < n) {
        tgt[idx] = src[idx];
        idx++;
    }
}
/*
 * Count the positions at which two assignment arrays differ.
 *
 * n - number of entries compared
 * a - first array
 * b - second array
 *
 * Returns the number of indices i in 0..n-1 with a[i] != b[i].
 */
int assignment_change_count(int n, int a[], int b[])
{
    int differing = 0;

    for (int idx = 0; idx < n; idx++) {
        // a comparison yields 1 when the entries differ and 0 otherwise
        differing += (a[idx] != b[idx]);
    }

    return differing;
}
void kmeans(
int dim, // dimension of data
double *X, // pointer to data
int n, // number of elements
int k, // number of clusters
double *cluster_centroid, // initial cluster centroids
int *cluster_assignment_final // output
)
{
double *dist = (double *)malloc(sizeof(double) * n * k);
int *cluster_assignment_cur = (int *)malloc(sizeof(int) * n);
int *cluster_assignment_prev = (int *)malloc(sizeof(int) * n);
double *point_move_score = (double *)malloc(sizeof(double) * n * k);
if (!dist || !cluster_assignment_cur || !cluster_assignment_prev || !point_move_score)
fail("Error allocating dist arrays");
// initial setup
calc_all_distances(dim, n, k, X, cluster_centroid, dist);
choose_all_clusters_from_distances(dim, n, k, dist, cluster_assignment_cur);
copy_assignment_array(n, cluster_assignment_cur, cluster_assignment_prev);
// BATCH UPDATE
double prev_totD = BIG_double;
int batch_iteration = 0;
while (batch_iteration < MAX_ITERATIONS)
{
// printf("batch iteration %d \n", batch_iteration);
// cluster_diag(dim, n, k, X, cluster_assignment_cur, cluster_centroid);
// update cluster centroids
calc_cluster_centroids(dim, n, k, X, cluster_assignment_cur, cluster_centroid);
// deal with empty clusters
// XXXXXXXXXXXXXX
// see if we've failed to improve
double totD = calc_total_distance(dim, n, k, X, cluster_centroid, cluster_assignment_cur);
if (totD > prev_totD)
// failed to improve - currently solution worse than previous
{
// restore old assignments
copy_assignment_array(n, cluster_assignment_prev, cluster_assignment_cur);
// recalc centroids
calc_cluster_centroids(dim, n, k, X, cluster_assignment_cur, cluster_centroid);
printf(" negative progress made on this step - iteration completed (%.2f) \n", totD - prev_totD);
// done with this phase
break;
}
// save previous step
copy_assignment_array(n, cluster_assignment_cur, cluster_assignment_prev);
// move all points to nearest cluster
calc_all_distances(dim, n, k, X, cluster_centroid, dist);
choose_all_clusters_from_distances(dim, n, k, dist, cluster_assignment_cur);
int change_count = assignment_change_count(n, cluster_assignment_cur, cluster_assignment_prev);
printf("%3d %u %9d %16.2f %17.2f\n", batch_iteration, 1, change_count, totD, totD - prev_totD);
fflush(stdout);
// done with this phase if nothing has changed
if (change_count == 0)
{
printf(" no change made on this step - iteration completed \n");
break;
}
prev_totD = totD;
batch_iteration++;
}
// write to output array
copy_assignment_array(n, cluster_assignment_cur, cluster_assignment_final);
free(dist);
free(cluster_assignment_cur);
free(cluster_assignment_prev);
free(point_move_score);
}
2017.11.17