(1) Image compression
The following code compresses an image with k-means: the pixel colors are clustered into a small palette, and each pixel is then replaced by its nearest cluster center.
from sklearn.datasets import load_sample_image
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import sys
import numpy as np
from pylab import mpl
# Specify a default font that can render Chinese titles
mpl.rcParams['font.sans-serif'] = ['FangSong']
# Keep the minus sign '-' from rendering as a square in saved figures
mpl.rcParams['axes.unicode_minus'] = False
## Read the sample image from the datasets
image = load_sample_image("china.jpg")
# View the image
plt.imshow(image)
plt.show()
## Observe the image's memory usage and data structure
print("Image memory usage:", sys.getsizeof(image))
print("Image shape:", image.shape)
print(image)
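A caveat worth knowing here: sys.getsizeof only measures the ndarray Python object, so for an array that is a view on another array's buffer it can report a tiny number even though the underlying data are large. A minimal sketch (synthetic array, not part of the original code) contrasting it with arr.nbytes:

import sys
import numpy as np

a = np.zeros((100, 100, 3), dtype=np.uint8)   # owns its 30000-byte buffer
v = a.reshape(-1, 3)                          # a view on the same buffer
print(sys.getsizeof(a), a.nbytes)   # object size includes the buffer here
print(sys.getsizeof(v), v.nbytes)   # object size is tiny; nbytes is still 30000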
## Downsample and linearize the data
image1 = image[::3, ::3]       # keep every 3rd pixel in each dimension
X = image1.reshape(-1, 3)      # one row per pixel; columns are R, G, B
print(image.shape, X.shape)
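As a side note, here is a minimal sketch (a tiny synthetic array, not china.jpg) of what the downsampling and reshape above do to the shapes:

import numpy as np

demo = np.zeros((6, 9, 3), dtype=np.uint8)   # a tiny 6x9 RGB "image"
demo1 = demo[::3, ::3]                       # every 3rd row and column -> (2, 3, 3)
demo_X = demo1.reshape(-1, 3)                # one RGB row per pixel -> (6, 3)
print(demo1.shape, demo_X.shape)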
### Cluster the image pixel colors with k-means
n_colors = 64
K_model = KMeans(n_clusters=n_colors)   # number of colors kept after clustering
y = K_model.fit_predict(X)              # cluster label of each pixel
colors = K_model.cluster_centers_       # color of each cluster, a 2-D array
print(y.shape, colors.shape)
## Generate the compressed image: map labels back to colors, reshape to the image's 2-D layout
new_image = colors[y].reshape(image1.shape)
print(new_image.shape)
print(new_image.size)
# Compare memory usage and shape
print("Old image memory usage:", sys.getsizeof(image))
print("New image memory usage:", sys.getsizeof(new_image))
print("Old image shape:", image.shape)
print("New image shape:", new_image.shape)
# Generate the comparison figure
plt.figure(figsize=(10, 5))   # canvas
plt.subplot(1, 2, 1)
plt.imshow(image)
plt.title("Image before compression")
plt.subplot(1, 2, 2)
plt.title("Compressed image")
plt.imshow(new_image.astype(np.uint8))   # cluster centers are floats; cast back to uint8
plt.suptitle("Comparison before and after compression")
plt.show()
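Why 64 colors compress the image: instead of 3 bytes per pixel, one can store a single palette index per pixel plus a 64x3 palette. A minimal sketch of the arithmetic, assuming the variables y, colors, and image1 from the code above:

labels = y.astype(np.uint8)           # one byte per pixel (64 <= 256 values)
palette = colors.astype(np.uint8)     # the 64 x 3 color palette
original_bytes = image1.nbytes        # 3 bytes per pixel (uint8 RGB)
compressed_bytes = labels.nbytes + palette.nbytes
print("compression ratio:", original_bytes / compressed_bytes)   # roughly 3x here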
Actual output: the side-by-side comparison figure produced by the code above.
(2) Practical application
Use rental listing information to predict whether a rental includes gas.
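The code below first discretizes the numeric columns with KBinsDiscretizer before clustering. As a primer, here is a minimal sketch (synthetic values, not the rental data) of what encode='ordinal' with strategy='uniform' does: each column's range is split into equal-width bins and every value is replaced by its bin index.

import numpy as np
from sklearn import preprocessing

area = np.array([[20.0], [50.0], [80.0], [110.0]])
est = preprocessing.KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform').fit(area)
print(est.transform(area).ravel())   # bin indices, e.g. [0. 1. 2. 2.]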
# K-means algorithm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn import metrics

data = pd.read_csv("201706120024陈圳锐(处理后).csv")
y = data['燃气'].values                 # target: gas (yes/no)
x = data.drop('燃气', axis=1).values

# Binning tuning: grid search over bin counts (kept for reference)
# a = 0
# count1 = 0
# count2 = 0
# for i in range(2, 20):
#     for index in range(2, 20):
#         print(i, index)
#         est = preprocessing.KBinsDiscretizer(n_bins=[i, index], encode='ordinal',
#                                              strategy='uniform').fit(data[['面积', '租金']].values)
#         x1 = est.transform(data[['面积', '租金']].values)
#         x = data.drop('面积', axis=1).drop('租金', axis=1).drop('燃气', axis=1).values
#         x = np.hstack((x1, x))
#         # z-score standardization
#         zscore_scaler = preprocessing.StandardScaler()
#         x = zscore_scaler.fit_transform(x)
#         x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)
#         KMeans_model = KMeans(n_clusters=2)
#         KMeans_model.fit(x_train)
#         y_pre = KMeans_model.predict(x_test)
#         if a < metrics.accuracy_score(y_test, y_pre):
#             count1 = i
#             count2 = index
#             a = metrics.accuracy_score(y_test, y_pre)
#         print('Accuracy:', metrics.accuracy_score(y_test, y_pre))

# Binning: discretize area (面积), rent (租金), and floor (楼层) into 19 uniform bins each
est = preprocessing.KBinsDiscretizer(n_bins=[19, 19, 19], encode='ordinal',
                                     strategy='uniform').fit(data[['面积', '租金', '楼层']].values)
x1 = est.transform(data[['面积', '租金', '楼层']].values)
x = data.drop('面积', axis=1).drop('楼层', axis=1).drop('租金', axis=1).drop('燃气', axis=1).values
x = np.hstack((x1, x))

# z-score standardization of the binned feature matrix
zscore_scaler = preprocessing.StandardScaler()
x = zscore_scaler.fit_transform(x)

# Split the data set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

# Build the model
KMeans_model = KMeans(n_clusters=2, random_state=5)
# Train the model
KMeans_model.fit(x_train)
# Predict on the test set
y_pre = KMeans_model.predict(x_test)

print("201706120024 Chen Zhenrui, Software Engineering Class 1 (2017)")
print('Accuracy:', metrics.accuracy_score(y_test, y_pre))
print('Cluster centers:\n', KMeans_model.cluster_centers_)    # view the cluster centers
print('Category labels:\n', KMeans_model.labels_)
print("Number of test samples:", x_test.shape[0])
print("Number predicted correctly:", x_test.shape[0] * metrics.accuracy_score(y_test, y_pre))
print('Kappa:', metrics.cohen_kappa_score(y_test, y_pre))     # Kappa test
print('Confusion matrix:\n', metrics.confusion_matrix(y_test, y_pre))
print("201706120024 Chen Zhenrui, Software Engineering Class 1 (2017)")
target_names = ['No gas', 'With gas']
print('Classification report:\n', metrics.classification_report(y_test, y_pre, target_names=target_names))
print('Hamming loss:', metrics.hamming_loss(y_test, y_pre))
# In multi-label settings, the Hamming loss corresponds to the Hamming
# distance between y_test and y_pre
print('Jaccard coefficient:', metrics.jaccard_score(y_test, y_pre))

# CP (compactness): mean distance from each training sample to its own cluster center
list1 = []
list2 = []
center_index = KMeans_model.labels_
for index in range(len(x_train)):
    if int(center_index[index]) == 0:
        list1.append(np.sqrt(sum((x_train[index, :] - KMeans_model.cluster_centers_[0, :]) ** 2)))
    elif int(center_index[index]) == 1:
        list2.append(np.sqrt(sum((x_train[index, :] - KMeans_model.cluster_centers_[1, :]) ** 2)))
cp1 = np.mean(list1)
cp2 = np.mean(list2)
cp_mean = (cp1 + cp2) / 2
print("CP of this clustering: " + str(cp_mean))

# SP (separation): mean pairwise distance between cluster centers
list4 = []
for index in range(len(KMeans_model.cluster_centers_)):
    for index1 in range(index + 1, len(KMeans_model.cluster_centers_)):
        list4.append(np.sqrt(sum((KMeans_model.cluster_centers_[index, :]
                                  - KMeans_model.cluster_centers_[index1, :]) ** 2)))
sp = np.mean(list4)
print("SP of this clustering: " + str(sp))

# Visual comparison of true vs. predicted values
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = 'SimHei'   # enable Chinese text in the figure
fig = plt.figure(figsize=(10, 5))
plt.plot(range(100), KMeans_model.predict(x_test)[0:100], color="red")
plt.plot(range(100), y_test[0:100], color="blue")
plt.title('201706120024 Chen Zhenrui, Software Engineering Class 1 (2017)\n'
          'Comparison of true and predicted values')
plt.savefig('k-means true vs. predicted values.png')
plt.show()
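One caveat, as a sketch and not part of the original analysis: k-means assigns arbitrary cluster IDs, so cluster 0 does not necessarily mean "no gas". An accuracy well below 0.5 on a 2-cluster problem usually means the IDs are simply flipped relative to the ground truth. A small hypothetical helper that scores both orientations:

import numpy as np
from sklearn import metrics

def aligned_accuracy(y_true, y_cluster):
    # Try both label orientations and keep the better one (2 clusters only)
    y_cluster = np.asarray(y_cluster)
    return max(metrics.accuracy_score(y_true, y_cluster),
               metrics.accuracy_score(y_true, 1 - y_cluster))

It could be used as aligned_accuracy(y_test, y_pre) in place of the raw accuracy above.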
Actual output of the model: the true-vs-predicted comparison figure saved by the code above.