K-Means聚类算法实现

《机器学习实战》一书中关于 K-Means 的示例代码在运行时会报各种错误。
修改后的代码如下，供大家参考。

def loadDataSet(fileName):
    """Load a whitespace-delimited numeric data file.

    Parameters
    ----------
    fileName : str
        Path to a text file with one sample per line, fields separated
        by whitespace (tabs or spaces).

    Returns
    -------
    list[list[float]]
        One inner list of floats per non-blank input line.
    """
    dataMat = []
    # 'with' guarantees the handle is closed even on error (the original
    # leaked it), and split() handles any whitespace run (the original
    # split on exactly three spaces, which breaks tab-delimited files).
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split()
            if not curLine:
                continue  # skip blank lines instead of crashing on float('')
            dataMat.append(list(map(float, curLine)))
    return dataMat

# 距离函数
import numpy as np

def distEclud(vecA, vecB):
    """Euclidean distance between two row vectors."""
    diff = vecA - vecB
    return np.sqrt(np.multiply(diff, diff).sum())

# 初始化质心
def randCent(dataSet, k):
    """Create k random centroids inside the bounding box of the data.

    Each coordinate is drawn uniformly between that column's minimum
    and maximum, one column at a time.
    """
    mat = np.mat(dataSet)
    dims = np.shape(mat)[1]
    cents = np.mat(np.zeros((k, dims)))
    for col in range(dims):
        lo = np.min(mat[:, col])
        # max of (column - lo) equals max(column) - lo, i.e. the spread
        span = np.max(mat[:, col] - lo)
        cents[:, col] = lo + span * np.random.rand(k, 1)
    return cents

def kMeans(dataSet,k,distMeas=distEclud,createCent=randCent):
    m=np.shape(dataSet)[0]
    clusterAssment=np.mat(np.zeros((m,2))) # 第一列放质心的索引 第二列放到质心的距离
    dataSet=np.mat(dataSet)
    centroids=np.mat(createCent(dataSet,k))
    clusterChanged=True
    ks = 0
    while clusterChanged:
        clusterChanged=False
        for i in range(m):
            minDist=np.inf
            minIndex=-1
            for j in range(k):
                distJI=distMeas(centroids[j,:],dataSet[i,:])
                if distJI<minDist:
                    minDist=distJI;minIndex=j
            # 只要样本的簇序数有变化 就继续迭代 直到不变
            if clusterAssment[i,0]!=minIndex:
               clusterChanged=True
            clusterAssment[i,:]=minIndex,minDist**2
        ks += 1
        for cent in range(k):
            pstInClust=dataSet[np.nonzero(clusterAssment[:,0].A==cent)[0]]
            centroids[cent,:]=np.mean(pstInClust,axis=0)
    return centroids,clusterAssment


# 测试
# dataMat=loadDataSet('testSet.txt')
# dataMat=np.mat(dataMat)
# centroids,clusterAssment=kMeans(dataMat,4)
#
#
# #绘图
# import matplotlib.pyplot as plt
# fig=plt.figure()
# plt.scatter(dataMat[:,0].flatten().A[0],dataMat[:,1].flatten().A[0],c=clusterAssment[:,0].flatten().A[0])
# plt.scatter(centroids[:,0].flatten().A[0],centroids[:,1].flatten().A[0],marker='+',c='red')
# plt.show()


# Bisecting k-means clustering algorithm
def biKmeans(dataSet,k,distMeas=distEclud):
    """Bisecting k-means: start with one cluster, then repeatedly apply
    a 2-way kMeans split to whichever existing cluster yields the lowest
    total SSE, until k clusters exist.

    Parameters
    ----------
    dataSet : np.matrix, shape (m, n) -- one sample per row
    k : int -- desired number of clusters
    distMeas : callable(vecA, vecB) -> float -- distance function

    Returns
    -------
    (np.matrix of the k centroids,
     m x 2 assignment matrix: column 0 = cluster index,
     column 1 = squared distance to that cluster's centroid)
    """
    m=np.shape(dataSet)[0]
    # column 0: cluster index per sample, column 1: squared distance
    clusterAssment=np.mat(np.zeros((m,2)))
    # start with a single cluster whose centroid is the global mean
    centrioid0=np.mean(dataSet,axis=0).tolist()[0]
    centList=[centrioid0]
    print(centList)
    # seed column 1 with each sample's squared distance to that centroid
    for j in range(m):
        clusterAssment[j,1]=distMeas(np.mat(centrioid0),dataSet[j,:])**2
    # keep bisecting until k clusters exist
    while(len(centList)<k):
        lowestSSE=np.inf
        # trial-split every current cluster; keep the split minimizing
        # total SSE = SSE of the split part + SSE of everything else
        for i in range(len(centList)):
            ptsInCurrCluster=dataSet[np.nonzero(clusterAssment[:,0].A==i)[0],:]
            centroidMat,splitClustAss=kMeans(ptsInCurrCluster,2,distMeas)
            sseSplit=np.sum(splitClustAss[:,1])
            # SSE of all samples NOT in cluster i (unchanged by this split)
            sseNotSplit=np.sum(clusterAssment[np.nonzero(clusterAssment[:,0].A!=i)[0],1])
            # NOTE(review): "为划分" in the message below is likely a typo
            # for "未划分" (the not-yet-split part); text left unchanged.
            print("划分部分的误差:"+str(sseSplit))
            print("为划分部分的误差:"+str(sseNotSplit))
            if(sseNotSplit+sseSplit)<lowestSSE:
                bestCentToSplit=i
                bestNewCents=centroidMat
                bestClustAss=splitClustAss.copy()
                lowestSSE=sseNotSplit+sseSplit
        # relabel the winning split: sub-cluster 1 gets a brand-new index,
        # sub-cluster 0 inherits the split cluster's old index.
        # (order matters: ==1 must be relabeled before ==0)
        bestClustAss[np.nonzero(bestClustAss[:,0].A==1)[0],0]=len(centList)
        bestClustAss[np.nonzero(bestClustAss[:,0].A == 0)[0], 0] = bestCentToSplit
        # replace the split centroid and append the new one
        centList[bestCentToSplit]=bestNewCents[0,:].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # write the refreshed assignments back for the split samples
        clusterAssment[np.nonzero(clusterAssment[:,0].A==bestCentToSplit)[0],:]=bestClustAss
    return np.mat(centList),clusterAssment

# Test / demo driver. Guarded by __name__ so that importing this module
# no longer triggers file I/O and plotting as a side effect.
if __name__ == "__main__":
    dataMat = np.mat(loadDataSet('testSet2.txt'))
    centList, clusterAssment = biKmeans(dataMat, 3)

    # Plot samples colored by cluster index, centroids as red crosses.
    import matplotlib.pyplot as plt

    fig = plt.figure()
    plt.scatter(dataMat[:, 0].flatten().A[0], dataMat[:, 1].flatten().A[0],
                c=clusterAssment[:, 0].flatten().A[0])
    plt.scatter(centList[:, 0].flatten().A[0], centList[:, 1].flatten().A[0],
                marker='+', c='red')
    plt.show()

猜你喜欢

转载自blog.csdn.net/uncledrew2017/article/details/82803860