Kmeans实现起来非常简单,因此在无监督学习中常常被用来做聚类。但是它的缺点也需要注意:
1)Kmeans只适用于球形分布的数据,原因在于它使用欧氏距离来衡量相似度。如果数据分布不是球形的,则最后的聚类结果往往不会太理想。
2)Kmeans 非常容易收敛到局部最小值,而且在大规模数据集上收敛较慢。
3)Kmeans极其容易产生空簇。
下面举一个Kmeans产生空簇的例子:
1. 假设有下面数据
2. 初始分类:
初始分类形成(1,2,3,7)一类,(4,5)一类,(6)一类
可以猜测的是,初始簇中心选择为3,5,6
3.进一步更新类中心
在这一步中,第二类中已经没有任何样本点,形成了空簇,因此无法再更新该类的中心。
具体解决算法网上有很多,我就不列出来了。
C++实现:
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <random>
#include <string>
#include <vector>
using namespace std;
/**
 * Reads whitespace-separated numbers from a text file, `col` values per row.
 *
 * Only complete rows are kept: reading stops at the first row for which fewer
 * than `col` values could be parsed (end of file or a non-numeric token), and
 * that partial row is discarded. A trailing newline therefore no longer
 * produces a bogus extra row.
 *
 * @param filename path of the file to read
 * @param col      number of values per row; if col <= 0 no rows are read
 * @return the rows that were read, in file order
 *
 * Terminates the process with EXIT_FAILURE if the file cannot be opened.
 * Prints a one-line message to stdout describing why reading stopped.
 */
vector<vector<double>> readFile(const string filename, int col) {
    ifstream infile(filename);
    if (!infile.is_open()) {
        cout << "could not open the file!" << endl;
        exit(EXIT_FAILURE);
    }

    vector<vector<double>> alldata;
    // With col <= 0 a "row" would be empty and nothing would ever be consumed
    // from the stream, so the loop could never terminate; skip reading.
    while (col > 0) {
        vector<double> row;
        row.reserve(col);
        double value = 0.0;
        // Check every extraction so that a failed read never stores a value.
        for (int i = 0; i < col && (infile >> value); ++i) {
            row.push_back(value);
        }
        if (row.size() != static_cast<size_t>(col)) {
            break;  // incomplete row: end of file or unparsable token
        }
        alldata.push_back(row);
    }

    if (infile.eof())
        cout << "End of file reached." << endl;
    else if (infile.fail())
        cout << "Input terminated by data mismatch." << endl;
    else
        cout << "Input terminated by unkown reason." << endl;

    // infile is closed by its destructor.
    return alldata;
}
/**
 * Returns a pseudo-random integer in the half-open range [start, end).
 *
 * Uses rand(), so the sequence depends on the seed set via srand() (if any).
 *
 * @param start inclusive lower bound
 * @param end   exclusive upper bound
 * @return a value r with start <= r < end; if the range is empty or inverted
 *         (end <= start) `start` is returned
 */
int Random(int start, int end) {
    const int span = end - start;
    // rand() % 0 is a division by zero, and a negative modulus would yield
    // values outside the requested range, so handle both cases explicitly.
    if (span <= 0) {
        return start;
    }
    return rand() % span + start;
}
/**
 * Computes the Euclidean distance between two points.
 *
 * The points are expected to have the same number of coordinates. If they
 * differ, only the coordinates both points have are compared, so the function
 * never reads past the end of the shorter vector.
 *
 * @param vec1 first point
 * @param vec2 second point
 * @return the square root of the summed squared coordinate differences
 *         (0 for two empty points)
 */
double calEuclid(const vector<double>& vec1, const vector<double>& vec2) {
    const size_t dims = std::min(vec1.size(), vec2.size());
    double sumSquares = 0.0;
    for (size_t i = 0; i < dims; ++i) {
        const double diff = vec1[i] - vec2[i];
        sumSquares += diff * diff;  // plain multiply instead of pow(diff, 2)
    }
    return sqrt(sumSquares);
}
/**
 * Computes the coordinate-wise mean (centroid) of a set of points.
 *
 * The dimensionality is taken from the first point; every other point must
 * have at least that many coordinates.
 *
 * @param vec the points to average
 * @return the mean point, or an empty vector when `vec` contains no points
 *         (an empty cluster has no centroid)
 */
vector<double> calAverd(const vector<vector<double>>& vec) {
    // Indexing vec[0] on an empty set is undefined behaviour.
    if (vec.empty()) {
        return vector<double>();
    }

    const size_t dims = vec[0].size();
    vector<double> mean(dims, 0.0);
    for (const vector<double>& point : vec) {
        for (size_t j = 0; j < dims; ++j) {
            mean[j] += point[j];
        }
    }
    for (double& coordinate : mean) {
        coordinate /= vec.size();
    }
    return mean;
}
/**
 * Picks k initial cluster centres by sampling points from the data set.
 *
 * As long as k does not exceed the number of points, the centres are k
 * distinct data points (sampled without replacement), which avoids starting
 * with duplicate centres that immediately produce empty clusters. If k is
 * larger than the data set, the remaining centres are drawn with replacement.
 *
 * @param dataSet the points to sample from
 * @param k       number of centres requested
 * @return k centres, or an empty vector if dataSet is empty or k <= 0
 */
vector<vector<double>> init_centers(const vector<vector<double>>& dataSet, int k) {
    vector<vector<double>> centers;
    const int n = static_cast<int>(dataSet.size());
    if (n == 0 || k <= 0) {
        return centers;
    }
    centers.reserve(k);

    // Partial Fisher-Yates shuffle over the point indices.
    vector<int> order(n);
    for (int i = 0; i < n; ++i) {
        order[i] = i;
    }

    for (int i = 0; i < k; ++i) {
        int idx = 0;
        if (i < n) {
            // Random's upper bound is exclusive, so pass n (not n - 1) to keep
            // the last point eligible.
            const int pick = Random(i, n);
            const int tmp = order[i];
            order[i] = order[pick];
            order[pick] = tmp;
            idx = order[i];
        } else {
            idx = Random(0, n);
        }
        centers.push_back(dataSet[idx]);
    }
    return centers;
}
/**
 * Clusters the data set with Lloyd's k-means algorithm.
 *
 * Each iteration assigns every point to its nearest centre (Euclidean
 * distance, ties go to the lowest cluster index) and then moves every centre
 * to the mean of its assigned points. Iteration stops after `iter` rounds or
 * as soon as an assignment pass changes no label.
 *
 * A cluster that ends up with no points keeps its previous centre instead of
 * being averaged, since an empty cluster has no mean.
 *
 * @param dataSet points to cluster
 * @param k       number of clusters
 * @param iter    maximum number of iterations
 * @param centers [out] resulting cluster centres; left empty when dataSet is
 *                empty or k <= 0
 * @param cls_idx [out] cluster index of each point, parallel to dataSet
 */
void Kmeans(const vector<vector<double>>& dataSet, int k, int iter, vector<vector<double>>& centers, vector<int>& cls_idx ) {
    cls_idx = vector<int>(dataSet.size(), 0);
    centers.clear();
    if (dataSet.empty() || k <= 0) {
        return;
    }

    centers = init_centers(dataSet, k);
    const int clusterCount = static_cast<int>(centers.size());
    if (clusterCount == 0) {
        return;
    }
    const int pointCount = static_cast<int>(dataSet.size());

    for (int it = 0; it < iter; ++it) {
        // Assignment step: label each point with its nearest centre.
        bool changed = false;
        for (int i = 0; i < pointCount; ++i) {
            const vector<double>& point = dataSet[i];
            // Start from cluster 0 explicitly so a label left over from the
            // previous iteration cannot survive when centre 0 is the nearest.
            int best = 0;
            double bestDist = calEuclid(point, centers[0]);
            for (int j = 1; j < clusterCount; ++j) {
                const double dist = calEuclid(point, centers[j]);
                if (dist < bestDist) {
                    bestDist = dist;
                    best = j;
                }
            }
            if (cls_idx[i] != best) {
                cls_idx[i] = best;
                changed = true;
            }
        }

        // The first pass must always be followed by an update, because the
        // labels start at 0 and "unchanged" says nothing about convergence.
        if (!changed && it > 0) {
            break;
        }

        // Update step: move each centre to the mean of its members.
        for (int id = 0; id < clusterCount; ++id) {
            vector<vector<double>> members;
            for (int i = 0; i < pointCount; ++i) {
                if (cls_idx[i] == id) {
                    members.push_back(dataSet[i]);
                }
            }
            if (!members.empty()) {
                centers[id] = calAverd(members);
            }
        }
    }
}
/**
 * Reads 2-column points from "testSet.txt", clusters them into 4 groups with
 * at most 100 k-means iterations, and prints the resulting centres.
 */
int main(void) {
    vector<vector<double>> alldata = readFile("testSet.txt",2);
    cout << "have read data sizes: "<< alldata.size() << endl;
    // Clustering an empty data set is meaningless; stop before calling Kmeans.
    if (alldata.empty()) {
        cout << "no data points were read, nothing to cluster." << endl;
        return EXIT_FAILURE;
    }

    vector<vector<double>> centers;
    vector<int> cls_idx;
    Kmeans(alldata, 4, 100, centers, cls_idx);
    for (const vector<double>& center : centers) {
        cout << "centers: " << "x; " << center[0] << " y; " << center[1] << endl;
    }
    // Runs the shell command "pause" (a Windows console built-in).
    system("pause");
    return 0;
}