Reprinted from: https://blog.csdn.net/hjimce/article/details/45200985
Gaussian mixture models and k-means clustering are two of the most common and simplest clustering algorithms. The k-means clustering algorithm is introduced first.
1. Introduction to the theory of K-means algorithm
K-means is a hard clustering algorithm and a typical representative of prototype-based, objective-function clustering methods. It takes a distance from the data points to the prototypes (the cluster centers) as the objective function to optimize, and the update rules of the iteration are obtained by finding the extremum of that function. K-means uses the Euclidean distance as the similarity measure and seeks the optimal partition for a given initial cluster-center vector V, such that the evaluation index J is minimized. The algorithm uses the sum-of-squared-errors criterion as its clustering criterion function; in short, it minimizes the following formula:
$$J = \sum_{k=1}^{K} \sum_{x_i \in C_k} \lVert x_i - v_k \rVert^2$$
where $C_k$ is the set of points assigned to cluster $k$ and $v_k$ is the center of that cluster.
The algorithm process is as follows:
1) Randomly select K documents from N documents as centroids
2) Measure its distance to each centroid for each remaining document and assign it to the class of the closest centroid
3) Recalculate the centroid of each class that has been obtained
4) Repeat steps 2 and 3 until the new centroids are equal to the previous centroids, or the change between them is less than a specified threshold; then the algorithm ends
2. Implementation of K-means algorithm
The realization of the K -means clustering algorithm is divided into four steps. Let the data set be two-dimensional data , and use matlab to draw the data, as shown below:
Drawing code:
Plot the result:
Now suppose that the dataset is divided into 4 categories, the algorithm steps are as follows:
1) Initialize the cluster centers. Since the number of clusters is 4, we use MATLAB to randomly generate 4 distinct integers, none larger than the number of data points, and take the data points at those indices as the initial cluster centers A, B, C, D.
2) Calculate the distance from each point to the 4 cluster centers and find the smallest one. Assuming that point p is a point in the data set, find the point in A , B , C , and D that is closest to point p . If B is the closest point to P , then cluster point p into class B.
3) Update the cluster centers. Using the clustering result of the data set obtained in step 2, compute the center of gravity (mean) of each class and use it as the updated cluster center. Then go back to step 2 and cluster again; repeat steps 2 and 3 until the iteration converges.
Finally paste the code:
% K-means clustering demo on the 2-D points stored in data.txt.
% Picks kn distinct data points as the initial centers, then alternates
% between assigning every point to its nearest center and recomputing
% each center as the mean of its points, for a fixed number of passes.
% Figure 1 shows the raw data, figure 2 the points colored by cluster.
close all;
clear;
clc;
% % Generate Gaussian random sample set 1
% mu = [2 3];
% SIGMA = [1 0; 0 2];
% r1 = mvnrnd(mu,SIGMA,100);
% plot(r1(:,1),r1(:,2),'r+');
% hold on;
% % Generate Gaussian random sample set 2
% mu = [7 8];
% SIGMA = [1 0; 0 2];
% r2 = mvnrnd(mu,SIGMA,100);
% plot(r2(:,1),r2(:,2),'*')
%
% data=[r1;r2]
data=importdata('data.txt');
% algorithm flow
figure(1);
plot(data(:,1),data(:,2),'*');
figure(2);
[m, n]=size(data);
% The number of clusters is 4, so pick 4 distinct random row indices
p=randperm(m);
kn=4;
d=p(1:kn);
center(1:kn,:)=data(d(:),:);
% flag(ii) holds the index of the cluster that point ii belongs to
flag=zeros(1,m);
it=1;
while (it<30)
    % Assign each point to the nearest of the kn cluster centers
    for ii=1:m
        flag(ii)=-1;
        mindist=inf;
        for jj=1:kn
            dst=norm(data(ii,:)-center(jj,:));
            if (mindist>dst)
                mindist=dst;
                flag(ii)=jj;
            end
        end
    end
    % update cluster center
    prevcenter=center;
    center=zeros(size(center));
    countflag=zeros(1,kn);
    for ii=1:m
        for jj=1:kn
            if (flag(ii)==jj)
                center(jj,:)=center(jj,:)+data(ii,:);
                countflag(jj)=countflag(jj)+1;
            end
        end
    end
    for jj=1:kn
        if (countflag(jj)>0)
            center(jj,:)=center(jj,:)./countflag(jj);
        else
            % No point was assigned to this cluster: keep its previous
            % center instead of producing 0/0 = NaN.
            center(jj,:)=prevcenter(jj,:);
        end
    end
    it=it+1;
end
% Draw every point in the color of its cluster
hold on;
for i=1:m
    if (flag(i)==1)
        plot(data(i,1),data(i,2),'.y');
    elseif (flag(i)==2)
        plot(data(i,1),data(i,2),'.b');
    elseif (flag(i)==3)
        plot(data(i,1),data(i,2),'.k');
    elseif (flag(i)==4)
        plot(data(i,1),data(i,2),'.r');
    end
end
The clustering results are as follows:
//! Initialization modes tested against the `flags` argument of kmeans().
enum
{
    // Start from randomly chosen centers
    KMEANS_RANDOM_CENTERS = 0,
    // Start from the labels supplied by the caller
    KMEANS_USE_INITIAL_LABELS = 1,
    // Seed the centers with the k-Means++ algorithm
    KMEANS_PP_CENTERS = 2
};
//! Clusters the input data using the k-Means algorithm.
//! Parameters, as far as their names and types show:
//!   data       - input samples
//!   K          - number of clusters
//!   bestLabels - input/output array of labels (marked CV_OUT)
//!   criteria   - termination criteria
//!   attempts   - number of attempts
//!   flags      - presumably a KMEANS_* initialization value; verify against the OpenCV reference
//!   centers    - optional output array, defaults to noArray()
//! NOTE(review): the meaning of the returned double is not visible from this declaration; confirm in the OpenCV reference.
CV_EXPORTS_W double kmeans( InputArray data, int K, CV_OUT InputOutputArray bestLabels,
TermCriteria criteria, int attempts,
int flags, OutputArray centers=noArray() );
Implementation:
// Clusters the float samples in `data` into K groups.
// Runs `attempts` independent attempts (at least one). Each attempt alternates
// between recomputing the centers from the current labels and relabeling every
// sample with its nearest center, until the iteration cap is reached or the
// largest center shift drops to criteria.epsilon or below.
// The labels (and the centers, if `_centers` is non-null) of the attempt with the smallest
// compactness are written out, and that compactness is returned.
// Compactness is the sum over all samples of distance() to the nearest center.
// NOTE(review): distance(), generateCentersPP() and generateRandomCenter() are
// defined outside this excerpt; their exact semantics are assumed from their names.
double kmeans( const Mat& data, int K, Mat& best_labels,
TermCriteria criteria, int attempts,
int flags, Mat* _centers )
{
const int SPP_TRIALS = 3;
// N = number of samples, dims = number of float values per sample.
// A single-row matrix is treated as N = cols samples of `channels` values each.
int N = data.rows > 1 ? data.rows : data.cols;
int dims = (data.rows > 1 ? data.cols : 1)*data.channels();
int type = data.depth();
bool simd = checkHardwareSupport(CV_CPU_SSE);
attempts = std::max(attempts, 1);
// Only 32-bit float data and a positive cluster count are accepted.
CV_Assert( type == CV_32F && K > 0 );
Mat _labels;
// NOTE(review): this test uses CV_KMEANS_USE_INITIAL_LABELS while the rest of the
// function uses KMEANS_USE_INITIAL_LABELS; presumably the same value - confirm.
if( flags & CV_KMEANS_USE_INITIAL_LABELS )
{
// Caller-provided labels must be a continuous 1-D CV_32S array with N entries.
CV_Assert( (best_labels.cols == 1 || best_labels.rows == 1) &&
best_labels.cols*best_labels.rows == N &&
best_labels.type() == CV_32S &&
best_labels.isContinuous());
best_labels.copyTo(_labels);
}
else
{
// Reallocate the output labels as N x 1 CV_32S unless they already have a usable layout.
if( !((best_labels.cols == 1 || best_labels.rows == 1) &&
best_labels.cols*best_labels.rows == N &&
best_labels.type() == CV_32S &&
best_labels.isContinuous()))
best_labels.create(N, 1, CV_32S);
_labels.create(best_labels.size(), best_labels.type());
}
// Working label buffer for the current attempt.
int* labels = _labels.ptr<int>();
Mat centers(K, dims, type), old_centers(K, dims, type);
vector<int> counters(K);
// Per-dimension [min, max] of the data; passed to generateRandomCenter().
vector<Vec2f> _box(dims);
Vec2f* box = &_box[0];
double best_compactness = DBL_MAX, compactness = 0;
RNG& rng = theRNG();
int a, iter, i, j, k;
// Normalize the termination criteria. epsilon is squared because it is compared
// against a sum of squared coordinate differences (max_center_shift).
if( criteria.type & TermCriteria::EPS )
criteria.epsilon = std::max(criteria.epsilon, 0.);
else
criteria.epsilon = FLT_EPSILON;
criteria.epsilon *= criteria.epsilon;
// The iteration count is clamped to the range [2, 100]; the default is 100.
if( criteria.type & TermCriteria::COUNT )
criteria.maxCount = std::min(std::max(criteria.maxCount, 2), 100);
else
criteria.maxCount = 100;
// With a single cluster one attempt of two iterations is used.
if( K == 1 )
{
attempts = 1;
criteria.maxCount = 2;
}
// Compute the bounding box of the data, starting from the first sample.
// NOTE(review): samples are read as data.ptr<float>(i) for i in [0, N); when
// data.rows == 1, N is data.cols, so i indexes rows beyond the first - verify
// that single-row input is addressed correctly.
const float* sample = data.ptr<float>(0);
for( j = 0; j < dims; j++ )
box[j] = Vec2f(sample[j], sample[j]);
for( i = 1; i < N; i++ )
{
sample = data.ptr<float>(i);
for( j = 0; j < dims; j++ )
{
float v = sample[j];
box[j][0] = std::min(box[j][0], v);
box[j][1] = std::max(box[j][1], v);
}
}
for( a = 0; a < attempts; a++ )
{
double max_center_shift = DBL_MAX;
for( iter = 0; iter < criteria.maxCount && max_center_shift > criteria.epsilon; iter++ )
{
// After the swap, old_centers holds the centers of the previous iteration.
swap(centers, old_centers);
// First iteration of an attempt: generate initial centers, unless this is the
// first attempt and the caller supplied initial labels.
if( iter == 0 && (a > 0 || !(flags & KMEANS_USE_INITIAL_LABELS)) )
{
if( flags & KMEANS_PP_CENTERS )
generateCentersPP(data, centers, K, rng, SPP_TRIALS);
else
{
for( k = 0; k < K; k++ )
generateRandomCenter(_box, centers.ptr<float>(k), rng);
}
}
else
{
// Caller-supplied labels are validated once: each must lie in [0, K).
if( iter == 0 && a == 0 && (flags & KMEANS_USE_INITIAL_LABELS) )
{
for( i = 0; i < N; i++ )
CV_Assert( (unsigned)labels[i] < (unsigned)K );
}
// compute centers
centers = Scalar(0);
for( k = 0; k < K; k++ )
counters[k] = 0;
// Accumulate every sample into the center of its current label.
for( i = 0; i < N; i++ )
{
sample = data.ptr<float>(i);
k = labels[i];
float* center = centers.ptr<float>(k);
// Manually unrolled by 4; the loop after it handles the remaining dimensions.
for( j = 0; j <= dims - 4; j += 4 )
{
float t0 = center[j] + sample[j];
float t1 = center[j+1] + sample[j+1];
center[j] = t0;
center[j+1] = t1;
t0 = center[j+2] + sample[j+2];
t1 = center[j+3] + sample[j+3];
center[j+2] = t0;
center[j+3] = t1;
}
for( ; j < dims; j++ )
center[j] += sample[j];
counters[k]++;
}
// The shift is only measured from the second iteration on; on iteration 0
// max_center_shift keeps its DBL_MAX value so the loop continues.
if( iter > 0 )
max_center_shift = 0;
for( k = 0; k < K; k++ )
{
float* center = centers.ptr<float>(k);
if( counters[k] != 0 )
{
// Turn the accumulated sum into a mean.
float scale = 1.f/counters[k];
for( j = 0; j < dims; j++ )
center[j] *= scale;
}
else
// Empty cluster: replace its center via generateRandomCenter().
generateRandomCenter(_box, center, rng);
if( iter > 0 )
{
// Squared distance between the new and the previous center.
double dist = 0;
const float* old_center = old_centers.ptr<float>(k);
for( j = 0; j < dims; j++ )
{
double t = center[j] - old_center[j];
dist += t*t;
}
max_center_shift = std::max(max_center_shift, dist);
}
}
}
// assign labels
compactness = 0;
for( i = 0; i < N; i++ )
{
sample = data.ptr<float>(i);
int k_best = 0;
double min_dist = DBL_MAX;
// Find the center with the smallest distance() to this sample.
for( k = 0; k < K; k++ )
{
const float* center = centers.ptr<float>(k);
double dist = distance(sample, center, dims, simd);
if( min_dist > dist )
{
min_dist = dist;
k_best = k;
}
}
compactness += min_dist;
labels[i] = k_best;
}
}
// Keep the result of this attempt if it is the most compact one so far.
if( compactness < best_compactness )
{
best_compactness = compactness;
if( _centers )
centers.copyTo(*_centers);
_labels.copyTo(best_labels);
}
}
return best_compactness;
}
}
How to call it:
const int kMeansItCount = 10; // number of iterations
const int kMeansType = cv::KMEANS_PP_CENTERS; //Use kmeans++ center initialization by Arthur and Vassilvitskii
cv::Mat bgdLabels, fgdLabels; // for each pixel in the background and foreground sample sets, records which Gaussian component of the GMM it belongs to (k_n in the paper)
// The sample matrix passed to kmeans (_bgdSamples in the original comment) holds one sample per row.
// kmeans writes its output to the labels matrix (bgdLabels in the original comment): the class label of
// every sample in the input set, after the samples are clustered into componentsCount classes.
// NOTE(review): these comments refer to _bgdSamples/bgdLabels, but the only call visible here
// clusters _fgdSamples into fgdLabels; the background call is presumably outside this excerpt.
kmeans( _fgdSamples, GMM::componentsCount, fgdLabels,
cv::TermCriteria( CV_TERMCRIT_ITER, kMeansItCount, 0.0), 0, kMeansType );