【机器学习】Kmeans的C++实现及空簇的产生原因

在无监督学习中,Kmeans由于非常容易实现,常常被用来做聚类。但是它的缺点也需要注意:
1)Kmeans只适用于球形分布的数据,原因在于它使用欧式距离来衡量相似度。如果数据分布不是球形的,则最后的聚类结果往往不会太理想。
2)Kmeans 非常容易收敛到局部最小值,而且在大规模数据集上收敛较慢。
3)Kmeans极其容易产生空簇。

下面举一个Kmeans产生空簇的例子:
1. 假设有下面数据


这里写图片描述

2. 初始分类:


这里写图片描述

初始分类形成(1,2,3,7)一类,(4,5)一类,(6)一类
可以猜测的是,初始簇中心选择为3,5,6

3.进一步更新类中心


这里写图片描述

在这一步中,第二类中已经没有任何样本点,形成了空簇,因此无法再更新该类的类中心。
具体解决算法网上有很多,我就不列出来了。

C++实现:

#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <random>
#include <string>
#include <vector>
using namespace std;

// Reads whitespace-separated numbers from `filename` and groups them into
// rows of `col` values each.
//
// Parameters:
//   filename - path of the text file to read.
//   col      - number of values per row; a non-positive value yields no rows.
//
// Returns the complete rows that were read, in file order. A row is stored
// only when all `col` values were extracted successfully, so neither the
// failed read at end-of-file nor a partially parsed row ends up in the result.
//
// Terminates the process with EXIT_FAILURE when the file cannot be opened.
// Prints one status line to stdout describing why reading stopped.
vector<vector<double>> readFile(const string filename, int col) {
    ifstream infile(filename);
    if (!infile.is_open()) {
        cout << "could not open the file!" << endl;
        exit(EXIT_FAILURE);
    }

    vector<vector<double>> alldata;
    if (col <= 0) {
        // Nothing can be read per row; returning avoids looping forever on
        // empty rows.
        return alldata;
    }

    const size_t width = static_cast<size_t>(col);
    while (true) {
        vector<double> data_raw;
        data_raw.reserve(width);

        double tmp = 0.0;
        // Stop at the first failed extraction instead of pushing a stale value.
        while (data_raw.size() < width && (infile >> tmp))
            data_raw.push_back(tmp);

        if (data_raw.size() != width)
            break;  // EOF or malformed token: drop the incomplete row.
        alldata.push_back(data_raw);
    }

    if (infile.eof())
        cout << "End of file reached." << endl;
    else if (infile.fail())
        cout << "Input terminated by data mismatch." << endl;
    else
        cout << "Input terminated by unkown reason." << endl;

    infile.close();
    return alldata;
}

// Returns a pseudo-random integer in the half-open range [start, end),
// drawn from rand().
//
// When the range is empty or inverted (end <= start) the function returns
// `start` rather than evaluating `rand() % 0`, which is undefined behaviour.
int Random(int start, int end) {
    const int dis = end - start;
    if (dis <= 0)
        return start;
    return rand() % dis + start;
}

// Returns the Euclidean distance between two points.
//
// The points are taken by const reference so that the call does not copy
// them. If the two vectors differ in length, only the
// leading coordinates present in both are compared, so no element is read out
// of bounds.
double calEuclid(const vector<double>& vec1, const vector<double>& vec2) {
    const size_t dims = vec1.size() < vec2.size() ? vec1.size() : vec2.size();
    double res = 0.0;
    for (size_t i = 0; i < dims; i++) {
        const double diff = vec1[i] - vec2[i];
        res += diff * diff;  // plain multiply instead of the general pow()
    }
    return sqrt(res);
}

// Returns the component-wise mean of a set of points.
//
// The dimensionality is taken from the first point. An empty set (an empty
// cluster) yields an empty vector instead of dereferencing vec[0]; callers
// must check for that before using the result as a center. Rows shorter than
// the first one contribute only the coordinates they have.
vector<double> calAverd(const vector<vector<double>>& vec) {
    if (vec.empty())
        return vector<double>();

    const size_t dims = vec[0].size();
    vector<double> res(dims, 0.0);
    for (const vector<double>& point : vec) {
        for (size_t j = 0; j < dims && j < point.size(); j++)
            res[j] += point[j];
    }
    for (double& component : res)
        component /= vec.size();
    return res;
}

// Picks k initial cluster centers from the data points.
//
// Points are sampled without replacement (a partial Fisher-Yates shuffle over
// the point indices, driven by rand()), so as long as k <= dataSet.size() no
// data point is chosen twice. Two identical starting centers would leave one
// of them with no members after the first assignment step. Every point,
// including the last one, can be selected.
//
// Returns k centers. If k exceeds the number of points, the selection wraps
// around and repeats points. Returns an empty result when the dataset is
// empty or k <= 0. Distinct indices do not guarantee distinct coordinates if
// the data itself contains duplicate points.
vector<vector<double>> init_centers(const vector<vector<double>>& dataSet, int k) {
    vector<vector<double>> centers;
    if (dataSet.empty() || k <= 0)
        return centers;

    const size_t n = dataSet.size();
    vector<size_t> order(n);
    for (size_t i = 0; i < n; i++)
        order[i] = i;

    centers.reserve(static_cast<size_t>(k));
    for (int c = 0; c < k; c++) {
        const size_t slot = static_cast<size_t>(c) % n;
        if (static_cast<size_t>(c) < n) {
            // Swap a random not-yet-used index into position `slot`.
            const size_t pick = slot + static_cast<size_t>(rand()) % (n - slot);
            const size_t held = order[slot];
            order[slot] = order[pick];
            order[pick] = held;
        }
        centers.push_back(dataSet[order[slot]]);
    }
    return centers;
}

// Clusters `dataSet` into k groups by alternating an assignment step and a
// center-update step.
//
// Parameters:
//   dataSet - the points to cluster.
//   k       - requested number of clusters.
//   iter    - maximum number of assignment/update rounds.
//   centers - output: the cluster centers (overwritten).
//   cls_idx - output: for each point, the index of the center it was assigned
//             to in the last assignment step (overwritten).
//
// Each round assigns every point to its nearest center and then recomputes
// each center as the mean of its members. A cluster that ends up with no
// members keeps its previous center instead of being averaged over nothing.
// The loop stops early once a round changes no assignment. With an empty
// dataset or k <= 0 the function returns with `centers` empty.
void Kmeans(const vector<vector<double>>& dataSet, int k, int iter, vector<vector<double>>& centers, vector<int>& cls_idx ) {
    cls_idx = vector<int>(dataSet.size(), 0);
    centers.clear();
    if (dataSet.empty() || k <= 0)
        return;

    centers = init_centers(dataSet, k);
    const size_t nClusters = centers.size();
    if (nClusters == 0)
        return;

    for (int it = 0; it < iter; it++) {
        // Assignment step: attach every point to its nearest center.
        bool changed = false;
        for (size_t i = 0; i < dataSet.size(); i++) {
            const vector<double>& cur = dataSet[i];
            int best = 0;
            double min = calEuclid(cur, centers[0]);
            for (size_t j = 1; j < nClusters; j++) {
                const double dis = calEuclid(cur, centers[j]);
                if (dis < min) {
                    min = dis;
                    best = static_cast<int>(j);
                }
            }
            // Always store the winner, including cluster 0, so that a stale
            // index from an earlier round cannot survive.
            if (cls_idx[i] != best) {
                cls_idx[i] = best;
                changed = true;
            }
        }

        // From the second round on, the centers are already the means of the
        // current assignment; unchanged assignments mean convergence.
        if (it > 0 && !changed)
            break;

        // Update step: move each center to the mean of its members.
        for (size_t id = 0; id < nClusters; id++) {
            vector<vector<double>> cls;
            for (size_t i = 0; i < dataSet.size(); i++) {
                if (cls_idx[i] == static_cast<int>(id))
                    cls.push_back(dataSet[i]);
            }
            if (!cls.empty())
                centers[id] = calAverd(cls);
        }
    }
}


// Entry point: loads 2-column points from "testSet.txt", clusters them into
// 4 groups with at most 100 K-means rounds, and prints the resulting centers.
// Returns EXIT_FAILURE when no data rows were read.
int main(void) {
    const int kColumns = 2;
    const int kClusters = 4;
    const int kMaxIterations = 100;

    vector<vector<double>> alldata = readFile("testSet.txt", kColumns);
    cout << "have read data sizes: "<< alldata.size() << endl;

    if (alldata.empty()) {
        // Clustering an empty dataset has no meaningful result.
        cout << "no data to cluster!" << endl;
        return EXIT_FAILURE;
    }

    vector<vector<double>> centers;
    vector<int> cls_idx;
    Kmeans(alldata, kClusters, kMaxIterations, centers, cls_idx);

    for (size_t i = 0; i < centers.size(); i++) {
        // Skip any center that does not carry both coordinates rather than
        // indexing past its end.
        if (centers[i].size() < static_cast<size_t>(kColumns))
            continue;
        cout << "centers: " << "x; " << centers[i][0] << " y; " << centers[i][1] << endl;
    }

    // NOTE(review): "pause" is a Windows console command; kept because the
    // original relies on it to hold the console window open.
    system("pause");
    return 0;
}

猜你喜欢

转载自blog.csdn.net/shwan_ma/article/details/80096408