Unsupervised Learning
Unlike supervised learning (where each input x has a corresponding label y), the data are unlabeled.
Clustering
- k-means
- Density-based clustering
- Expectation-Maximization (EM) clustering
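As a small illustration of the density-based approach listed above, scikit-learn's `DBSCAN` groups points by neighborhood density rather than by distance to a centroid; the toy points and parameter values here are my own, not from these notes:

```python
import numpy as np
from sklearn.cluster import DBSCAN

# two dense blobs plus one far-away outlier
points = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1],
                   [5.0, 5.0], [5.1, 5.0], [5.0, 5.1],
                   [20.0, 20.0]])
# eps: neighborhood radius; min_samples: points needed to form a dense core
labels = DBSCAN(eps=0.5, min_samples=2).fit_predict(points)
print(labels)  # the isolated point is labeled -1 (noise)
```

Unlike k-means, DBSCAN needs no k up front and can mark points as noise.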
Dimensionality reduction
- Latent Semantic Analysis (LSA)
- Principal component analysis (PCA)
- Singular value decomposition (SVD)
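A minimal sketch of PCA from the list above, using scikit-learn (the synthetic data and parameters are my own illustration): points lying almost on a line have nearly all their variance along one direction, so one principal component suffices.

```python
import numpy as np
from sklearn.decomposition import PCA

# points close to the line y = 2x: one direction carries almost all the variance
rng = np.random.default_rng(0)
t = rng.normal(size=(100, 1))
data = np.hstack([t, 2 * t]) + rng.normal(scale=0.05, size=(100, 2))

pca = PCA(n_components=1)
reduced = pca.fit_transform(data)      # project the 2-D points onto 1 component
print(reduced.shape)                   # (100, 1)
print(pca.explained_variance_ratio_)   # close to 1.0 for the first component
```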
The k-means clustering algorithm is one of the simplest and most efficient clustering algorithms, and it belongs to unsupervised learning.
Core idea: the user specifies k initial centroids, one per cluster (category); the algorithm then iterates repeatedly until it converges.
Basic algorithm flow:
- Select k initial centroids;
- repeat:
  - for each sample point, find its closest centroid and assign the point to that centroid's cluster;
  - recompute the centroid of each of the k clusters;
- until the centroids no longer change or the maximum number of iterations is reached.
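The loop above is exactly what scikit-learn's `KMeans` runs internally. As an independent sanity check (the toy points here are my own, separate from the hand-rolled class in these notes), a minimal library version:

```python
import numpy as np
from sklearn.cluster import KMeans

# three obvious groups on a line
pts = np.array([[0.0], [0.2], [5.0], [5.2], [10.0], [10.2]])
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(pts)
print(km.labels_)                           # each point's cluster index
print(sorted(km.cluster_centers_.ravel()))  # ~[0.1, 5.1, 10.1]
```

Each final centroid is the mean of the points assigned to it, matching step 3 of the flow above.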
k-means code implementation
```python
import numpy as np
import matplotlib.pyplot as plt
# generate clustered data directly with sklearn
# (sklearn.datasets.samples_generator was removed in newer versions;
# import make_blobs from sklearn.datasets instead)
from sklearn.datasets import make_blobs
```
Load the data
```python
# 100 samples; centers is the number of generated centers (classes);
# random_state seeds the random number generator;
# cluster_std sets the spread of each cluster
x, y = make_blobs(n_samples=100, centers=6, random_state=1234, cluster_std=0.6)
# x.shape -> (100, 2)
plt.figure(figsize=(6, 6))
plt.scatter(x[:, 0], x[:, 1], c=y)  # c=y colors the points by their 6 classes
plt.show()
```
Algorithm
```python
# use scipy's distance function; cdist computes Euclidean distance by default
from scipy.spatial.distance import cdist

class K_Means(object):
    # initialization: n_clusters (k), max_iter iteration limit, initial centroids
    def __init__(self, n_clusters=6, max_iter=300, centroids=[]):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.centroids = np.array(centroids, dtype=float)

    # training method: the k-means clustering process; pass in the raw data
    def fit(self, data):
        # if no initial centroids were specified, pick random data points instead
        if self.centroids.shape[0] == 0:
            # draw n_clusters random integers in [0, data.shape[0]) as row indices
            self.centroids = data[np.random.randint(0, data.shape[0], self.n_clusters), :]
        # iterate
        for _ in range(self.max_iter):
            # 1. compute the distance matrix, here of shape (100, 6)
            distances = cdist(data, self.centroids)
            # 2. assign each point to the cluster of its nearest centroid
            c_ind = np.argmin(distances, axis=1)  # axis=1: nearest along each row
            # 3. compute the mean of each cluster and update the centroid coordinates
            for i in range(self.n_clusters):
                # skip categories that do not appear in c_ind
                if i in c_ind:
                    # data[c_ind == i] is a boolean index selecting all points in
                    # cluster i; their coordinate mean becomes the new i-th centroid
                    self.centroids[i] = np.mean(data[c_ind == i], axis=0)

    # prediction method: as above, compute the distance matrix,
    # then pick the category of the nearest centroid
    def predict(self, samples):
        distances = cdist(samples, self.centroids)
        c_ind = np.argmin(distances, axis=1)
        return c_ind
```

```python
# quick test: a 5 x 4 distance matrix (4 centroids); each row holds one
# point's distances to the four centroids
dist = np.array([[121, 221, 32, 43],
                 [121, 1, 12, 23],
                 [65, 21, 2, 43],
                 [1, 221, 32, 43],
                 [21, 11, 22, 3]])
c_ind = np.argmin(dist, axis=1)
print(c_ind)  # nearest centroid for each point: [2 1 2 0 3]
x_new = x[0:5]
print(x_new)
print(c_ind == 2)  # [ True False  True False False]
print(x_new[c_ind == 2])  # boolean indexing keeps the rows where the mask is True
np.mean(x_new[c_ind == 2], axis=0)  # column-wise mean of the coordinates
```
----->>
```
[2 1 2 0 3]
[[-0.02708305  5.0215929 ]
 [-5.49252256  6.27366991]
 [-5.37691608  1.51403209]
 [-5.37872006  2.16059225]
 [ 9.58333171  8.10916554]]
[ True False  True False False]
[[-0.02708305  5.0215929 ]
 [-5.37691608  1.51403209]]
Out[14]: array([-2.70199956,  3.26781249])
```
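The `cdist` call used in `fit` and `predict` computes all pairwise distances between two point sets. A tiny check of its behavior (my own example, not from these notes):

```python
import numpy as np
from scipy.spatial.distance import cdist

a = np.array([[0.0, 0.0], [3.0, 4.0]])
b = np.array([[0.0, 0.0], [0.0, 4.0]])
d = cdist(a, b)  # d[i, j] = Euclidean distance between a[i] and b[j]
print(d)  # [[0. 4.] [5. 3.]]
```

Taking `np.argmin(d, axis=1)` then gives each point in `a` the index of its nearest point in `b`, exactly as in the class above.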
Test
```python
# helper that draws the clustering result in a subplot
def plotKMeans(x, y, centroids, subplot, title):
    # allocate the subplot: 121 means 1 row, 2 columns, first plot
    plt.subplot(subplot)
    plt.scatter(x[:, 0], x[:, 1], c='r')
    # draw the centroids
    plt.scatter(centroids[:, 0], centroids[:, 1], c=np.array(range(5)), s=100)
    plt.title(title)

# specify the initial centroids explicitly
kmeans = K_Means(max_iter=300,
                 centroids=np.array([[2, 1], [2, 2], [2, 3], [2, 4], [2, 5]]))
plt.figure(figsize=(16, 6))
# plot the initial state
plotKMeans(x, y, kmeans.centroids, 121, 'Initial State')
# start clustering
kmeans.fit(x)
plotKMeans(x, y, kmeans.centroids, 122, 'Final State')
# predict the categories of new data points
x_new = np.array([[0, 0], [10, 7]])
y_pred = kmeans.predict(x_new)
print(kmeans.centroids)
print(y_pred)
plt.scatter(x_new[:, 0], x_new[:, 1], s=100, c='black')
```
----->>
```
[[ 5.76444812 -4.67941789]
 [-2.89174024 -0.22808556]
 [-5.89115978  2.33887408]
 [-2.8455246   5.87376915]
 [ 9.20551979  7.56124841]]
[1 4]
```
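One question these notes leave open is how to choose k. A common approach (my own addition, not part of the notes) is to compare silhouette scores across candidate values of k; higher is better:

```python
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# same generation parameters as in the notes above
X, _ = make_blobs(n_samples=100, centers=6, random_state=1234, cluster_std=0.6)

# cluster with each candidate k and record the silhouette score
scores = {}
for k in range(2, 9):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
    scores[k] = silhouette_score(X, labels)

best_k = max(scores, key=scores.get)
print(best_k, round(scores[best_k], 3))
```

The elbow method (plotting within-cluster sum of squares against k) is another common choice.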