Machine Learning (4): Practical Applications of K-Means

(1) Picture compression

The following code compresses an image using k-means:

from sklearn.datasets import load_sample_image 
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import sys
import numpy as np
from pylab import mpl

# Specify the default sans-serif font used by matplotlib.
mpl.rcParams['font.sans-serif'] = ['FangSong']
# Fix the minus sign '-' being rendered as a square in figures.
mpl.rcParams['axes.unicode_minus'] = False

## Read the sample image shipped with sklearn.datasets.
image = load_sample_image("china.jpg")

# View the picture.
plt.imshow(image)
plt.show()

## Observe the image's memory footprint and data structure.
# ndarray.nbytes is the size of the pixel buffer itself. sys.getsizeof()
# only includes that buffer when the array owns its data, so for views it
# reports little more than the object header.
print("Image uses memory size:", image.nbytes)
print("Image size:", image.shape)
print(image)

## Data linearization
# Keep every third pixel along the first two axes to shrink the image,
# then flatten it to rows of 3 values (one row per pixel).
image1 = image[::3, ::3]
X = image1.reshape(-1, 3)
print(image.shape, X.shape)


### Use k-means to cluster the image's pixel colors
n_colors = 64
K_model = KMeans(n_clusters=n_colors)
# Color class (cluster index) assigned to each pixel row of X.
y = K_model.fit_predict(X)
# Color of each category (the cluster centers), a two-dimensional array.
colors = K_model.cluster_centers_
print(y.shape, colors.shape)

## Generate the compressed image: replace every pixel by the color of its
## cluster, then restore the dimensions of the subsampled image.
new_image = colors[y].reshape(image1.shape)

# Contrast the two images.
# Use ndarray.nbytes for the sizes: new_image is a reshaped view, and
# sys.getsizeof() on a view reports only the array header, not the pixels.
print("old pictures take up memory: ", image.nbytes)
print(" New image takes up content: ", new_image.nbytes)
print(" Old image memory: ", image.shape)
print(" New image content: ", new_image.shape)

# Generate the comparison figure: original on the left, compressed on the right.
plt.figure(figsize=(10, 5))  # Canvas
plt.subplot(1, 2, 1)
plt.imshow(image)
plt.title("Image before compression")
plt.subplot(1, 2, 2)
plt.title("Compressed image")
# Cast to uint8 for display, since new_image is built from the cluster
# centers rather than from integer pixel values.
plt.imshow(new_image.astype(np.uint8))
plt.suptitle("Comparison before and after compression")
plt.show()
Actual result:

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

(2) Actual application

Use rental listing information to predict whether a rental comes with gas.

# K-means algorithm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.cluster import  KMeans
from sklearn import preprocessing
from sklearn.metrics import jaccard_score,fowlkes_mallows_score,adjusted_rand_score

# Load the preprocessed rental listings. The '燃气' (gas) column is the
# target; every other column is a feature.
data = pd.read_csv("201706120024陈圳锐(处理后).csv")
target_column = '燃气'
y = data[target_column].values

# Columns that are discretized into equal-width bins.
# '面积' (area) and '租金' (rent) are the names used by the earlier tuning
# code for this file.
# NOTE(review): '楼层' (floor) is an assumed reconstruction of the
# machine-translated ' Floor ' column name -- confirm against the CSV header.
binned_columns = ['面积', '租金', '楼层']

# Binning. An earlier, now removed, grid search tried 2..19 bins for the
# area and rent columns and kept the pair with the best accuracy_score;
# presumably that is where the bin count of 19 comes from.
est = preprocessing.KBinsDiscretizer(
    n_bins=[19, 19, 19], encode='ordinal', strategy='uniform'
).fit(data[binned_columns].values)
x1 = est.transform(data[binned_columns].values)

# Remaining features. The target is dropped as well: the original left
# '燃气' among these columns, which would leak the label into the features.
x_rest = data.drop(binned_columns + [target_column], axis=1).values
x = np.hstack((x1, x_rest))

# z-score standardization. Scale the combined matrix built above; the
# original re-scaled the raw data here, silently discarding the binning.
zscore_scaler = preprocessing.StandardScaler()  # Create StandardScaler object
x = zscore_scaler.fit_transform(x)

# Divide the data set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=5
)

# Build the model.
KMeans_model = KMeans(n_clusters=2, random_state=5)
# Train the model.
KMeans_model.fit(x_train)
from sklearn import metrics

# Predict the cluster of every test-set sample.
y_pre = KMeans_model.predict(x_test)

# NOTE(review): k-means cluster indices are arbitrary, so cluster 0/1 need
# not correspond to label 0/1 in y_test. The scores below compare them
# directly, as the original did -- confirm the mapping before trusting them.
accuracy = metrics.accuracy_score(y_test, y_pre)

print(" 201706120024 Chen Zhenrui Class 17 Software Engineering Class 1 ")
print(' Accuracy index: ', accuracy)
print(' Cluster center is: \n ', KMeans_model.cluster_centers_)  # View cluster center
print(' category label is: \n ', KMeans_model.labels_)
print(" Number of participating tests: ", x_test.shape[0])
# normalize=False returns the number of correct predictions directly,
# instead of reconstructing it as a float from count * accuracy.
print(" Predict the correct number: ",
      metrics.accuracy_score(y_test, y_pre, normalize=False))
print(' Accuracy index : ', accuracy)  # Accuracy rate
print(' Kappa indicator: ', metrics.cohen_kappa_score(y_test, y_pre))  # Kappa test
print(' Confusion matrix: \n ', metrics.confusion_matrix(y_test, y_pre))  # Confusion matrix

print(" 201706120024 Chen Zhenrui 17th class software engineering class ")
target_names = ['No gas ', ' With gas ']
print(' Classification report: \n ',
      metrics.classification_report(y_test, y_pre, target_names=target_names))  # Classification report
# Hamming loss: the fraction of labels in y_pre that differ from y_test.
print(' Hanming loss: ', metrics.hamming_loss(y_test, y_pre))
print(' Jaccard coefficient: ', metrics.jaccard_score(y_test, y_pre))



# Compute cp (compactness): for each of the two clusters, the mean Euclidean
# distance from its training samples to the cluster center, then the average
# of the two cluster means.
# KMeans_model.labels_ are the labels of x_train (the data the model was
# fitted on), so distances must be taken from x_train rows. The original
# indexed the full matrix x, whose rows do not line up with the labels
# after the shuffled train/test split.
train_labels = KMeans_model.labels_
cluster_centers = KMeans_model.cluster_centers_
# Distance of every training sample to the center of its own cluster.
center_distances = np.linalg.norm(x_train - cluster_centers[train_labels], axis=1)
cp1 = np.mean(center_distances[train_labels == 0])
cp2 = np.mean(center_distances[train_labels == 1])
cp_mean = (cp1 + cp2) / 2
print("该聚类的cp为:" + str(cp_mean))

# Compute sp (separation): the mean Euclidean distance between every pair of
# distinct cluster centers.
from itertools import combinations

list4 = [
    np.linalg.norm(center_a - center_b)
    for center_a, center_b in combinations(KMeans_model.cluster_centers_, 2)
]
sp = np.mean(list4)
print(" The cluster The sp is: " + str(sp))

import matplotlib.pyplot as plt  # visual drawing

plt.rcParams['font.sans-serif'] = 'SimHei'  # Set Chinese display

# Plot at most the first 100 test samples. Capping at the test-set size
# keeps the x and y lengths equal when fewer than 100 samples exist.
n_shown = min(100, len(y_test))
predicted = KMeans_model.predict(x_test)

fig = plt.figure(figsize=(10, 5))
plt.plot(range(n_shown), predicted[:n_shown], color="red")  # predicted values
plt.plot(range(n_shown), y_test[:n_shown], color="blue")  # real values
plt.title(' 201706120024 Chen Zhenrui 17th class software engineering class \n Comparison chart of real value and predicted value ')
# The file name must end in a real extension (no trailing space) so that
# matplotlib can infer the output format.
plt.savefig('k-means real value vs. predicted value.png')
plt.show()

Actual results of the model:

 

 

 

Guess you like

Origin www.cnblogs.com/renshenbenzuig/p/12733402.html