# Cluster the data with K-means, print the number of samples in each cluster, and plot the resulting cluster centers.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import cx_Oracle
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.cluster import KMeans #导入K均值聚类算法
import os
# Load the raw table; the context manager closes the handle even if
# read_excel raises.
with open('test.xlsx', 'rb') as file:
    data = pd.read_excel(file)
data.set_index('CONS_NO', inplace=True)
# Drop this column before the feature matrix is built.
data.drop('label_诉求行为', axis=1, inplace=True)
# Replace missing values with 0.
data.fillna(0, inplace=True)
# Standardise every column except the categorical TS_FLAG. The column list
# is fixed up front so that `data` is never rebound while its columns are
# being iterated; StandardScaler scales each column independently, so one
# fit over all of them gives the same result as one fit per column.
numeric_cols = [col for col in data.columns if col != 'TS_FLAG']
if numeric_cols:
    data[numeric_cols] = StandardScaler().fit_transform(data[numeric_cols])
# One-hot encode the discrete TS_FLAG column (the dummy columns are appended
# after the numeric ones). dtype=float keeps the whole matrix numeric:
# recent pandas would otherwise emit bool columns, producing a mixed-dtype
# frame that converts to an object array.
if 'TS_FLAG' in data.columns:
    data = pd.get_dummies(data, columns=['TS_FLAG'], dtype=float)
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render the Chinese labels

# For each candidate cluster count k, fit K-means and record
#   '类内距离' - sum of Euclidean distances from every sample to its centroid
#   '类间距离' - sum of Euclidean distances over all unordered centroid pairs
# Rows 1..14 are pre-allocated; only k = 3..10 are filled, the rest stay NaN.
distance = pd.DataFrame(index=range(1, 15), columns=['类内距离', '类间距离'])
# Plain float matrix of the features, used for the vectorised distance sums
# (np.mat, used previously, no longer exists in NumPy 2.0).
features = np.asarray(data, dtype=float)
for k in range(3, 11):
    kmodel = KMeans(n_clusters=k)
    kmodel.fit(data)
    r1 = pd.Series(kmodel.labels_).value_counts()  # number of samples per cluster
    print(r1)
    centers = kmodel.cluster_centers_
    distance_in = 0
    distance_ot = 0
    for i in range(k):
        members = features[kmodel.labels_ == i]
        # Row-wise 2-norm (Euclidean distance) of each member to centroid i.
        distance_in += np.linalg.norm(members - centers[i], axis=1).sum()
        # Visit each unordered centroid pair exactly once (i < j).
        for j in range(i + 1, k):
            distance_ot += np.linalg.norm(centers[i] - centers[j])
    distance.loc[k, '类内距离'] = distance_in
    distance.loc[k, '类间距离'] = distance_ot
    print('完成聚类数为 {} 聚类!'.format(k))
# Final clustering into 6 groups.
k = 6
# The `n_jobs` argument was deprecated in scikit-learn 0.23 and removed in
# 1.0, where passing it raises TypeError. Omitting it is equivalent to the
# previous n_jobs=1 on releases that still accept it.
kmodel = KMeans(n_clusters=k)
kmodel.fit(data)  # train the model
r1 = pd.Series(kmodel.labels_).value_counts()  # number of samples per cluster
r2 = pd.DataFrame(kmodel.cluster_centers_)  # cluster centres, one row per cluster
# Column-wise concat aligns on the index: r2's row number and r1's index are
# both the cluster label, so each centre is paired with its own sample count.
r = pd.concat([r2, r1], axis=1)
r.columns = list(data.columns) + [u'类别数目']  # rename the header
# One line colour per cluster label (looked up by label value).
colr = ['#E15759', '#4E79A7', '#76B7B2', '#F28E2B', 'blue', '#F45E2B', '#F67E2B']
plt.figure(figsize=(10, 8))
# Every column of `r` except the trailing sample-count column is a feature.
feature_names = r.columns[:-1]
for cluster_id in r.index:
    # Clusters with 10 samples or fewer are not drawn.
    if not r['类别数目'][cluster_id] > 10:
        continue
    plt.plot(
        feature_names,
        r.loc[cluster_id, feature_names],
        color=colr[cluster_id],
        label=cluster_id,
        linewidth=2,
    )
plt.legend(loc=0, fontsize=14)
plt.axis('tight')
plt.xticks(fontsize=12, rotation=30)
plt.yticks(fontsize=12)
plt.xlabel('标签名', fontsize=20)
plt.ylabel('中心点', fontsize=20)
plt.rcParams.update({'xtick.labelsize': 20, 'ytick.labelsize': 20})
plt.show()
# Store each sample's cluster label as a new column of the data.
data['label_总聚类'] = kmodel.labels_