二分K-Means算法

二分K-Means算法是对K-Means算法的优化,主要优化的地方是在选取质心的时候,二分K-Means算法有效地避免了在初始选取质心时的误差,可以有效地提高算法效率。

测试数据:testSet2.txt 中间用tab分隔

3.275154    2.957587
-3.344465    2.603513
0.355083    -3.376585
1.852435    3.547351
-2.078973    2.552013
-0.993756    -0.884433
2.682252    4.007573
-3.087776    2.878713
-1.565978    -1.256985
2.441611    0.444826
-0.659487    3.111284
-0.459601    -2.618005
2.177680    2.387793
-2.920969    2.917485
-0.028814    -4.168078
3.625746    2.119041
-3.912363    1.325108
-0.551694    -2.814223
2.855808    3.483301
-3.594448    2.856651
0.421993    -2.372646
1.650821    3.407572
-2.082902    3.384412
-0.718809    -2.492514
4.513623    3.841029
-4.822011    4.607049
-0.656297    -1.449872
1.919901    4.439368
-3.287749    3.918836
-1.576936    -2.977622
3.598143    1.975970
-3.977329    4.900932
-1.791080    -2.184517
3.914654    3.559303
-1.910108    4.166946
-1.226597    -3.317889
1.148946    3.345138
-2.113864    3.548172
0.845762    -3.589788
2.629062    3.535831
-1.640717    2.990517
-1.881012    -2.485405
4.606999    3.510312
-4.366462    4.023316
0.765015    -3.001270
3.121904    2.173988
-4.025139    4.652310
-0.559558    -3.840539
4.376754    4.863579
-1.874308    4.032237
-0.089337    -3.026809
3.997787    2.518662
-3.082978    2.884822
0.845235    -3.454465
1.327224    3.358778
-2.889949    3.596178
-0.966018    -2.839827
2.960769    3.079555
-3.275518    1.577068
0.639276    -3.412840
 

功能函数:

from numpy import *
from math import *

'''
loadDataSet(fileName)函数将文本文件导入到一个列表中,
文本文件每一行为tab分隔的浮点数,
每一个列表会被添加到dataMat中,最后返回dataMat,
该返回值是一个包含许多其他列表的列表
'''
def loadDataSet(fileName):
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float,curLine))
        dataMat.append(fltLine)
    return dataMat

'''
distEclud(vecA, vecB)函数计算两个向量的欧式距离
公式:sqrt((x1-x2)^2+(y1-y2)^2)
'''
def distEclud(vecA, vecB):
    return math.sqrt(sum(power(vecA - vecB, 2)))

'''
randCent()函数为给定数据集构建一个包含k个随机质心的集合。
随机质心必须要在整个数据集的边界之内,这可以通过找到数据集每一维的最小值和最大值来完成。
然后生成0到1.0之间的随机数并通过取值范围和最小值,以便确保随机点在数据的边界之内。
'''
def randCent(dataSet, k):
    n = shape(dataSet)[1]
    centroids = mat(zeros((k,n)))#创建存储k个质心的矩阵
    for j in range(n):#在边界范围内,随机生成k个质心
        minJ = min(dataSet[:,j]) #边界的最小值
        rangeJ = float(max(dataSet[:,j]) - minJ) #边界范围
        centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1))
    return centroids

二分K-Means算法:


'''
二分K-均值聚类算法
'''
def biKmeans(dataSet, k, distMeas=distEclud):
    m = shape(dataSet)[0]#确定数据集中数据点的总数
    
    #创建一个矩阵来存放每个点的簇分配结果,包含两列:一列是记录簇索引值,第二列是存储误差。
    #误差是指当前点到簇质心的距离,后面将使用该误差来评价聚类的效果。
    clusterAssment = mat(zeros((m,2)))

    centroid0 = mean(dataSet, axis=0).tolist()[0]#计算整个数据集的质心,即初始时的质心的坐标为所有数据点的均值
    centList =[centroid0] #创建一个初始化只要一个初始质心的列表
    
    #计算所有数据点到初始质心的距离平方误差
    for j in range(m):
        clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2
        
    #该while循环不断地对簇进行划分,直到得到设定的簇数目为止
    while (len(centList) < k):
        lowestSSE = inf
        for i in range(len(centList)):#对每一个质心
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]#将当前簇i中的所有数据看成一个小的数据集
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)#通过KMeans()函数,得到生成两个质心的簇,即二分,获取到质心及其每个簇的误差值
            #将二分kMeans结果中的平方和的距离进行求和
            sseSplit = sum(splitClustAss[:,1])#compare the SSE to the currrent minimum
            #将未参与二分kMeans分配结果中的平方和的距离进行求和
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
            print("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
            #???总的(未拆分和已拆分)误差和越小,越相似,效果越优化,划分的结果越好
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList) #调用二分kMeans的结果,默认簇是0,1
        bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit #更新为最佳质心
        print('最好的质心列表是: ',bestCentToSplit)
        print('最好的簇分配结果的长度是the len of bestClustAss is: ', len(bestClustAss))
        #更新质心列表
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]#更新原来的质心list中的第i个质心为使用二分kMeans后最好的质心的第一个质心
        centList.append(bestNewCents[1,:].tolist()[0])#添加最佳质心的第二个质心
        clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss#重新分配最好簇下的数据(质心)以及误差平方和
    return mat(centList), clusterAssment

画图函数:


#画图
def show(dataSet, k, centroids, clusterAssment):
    from matplotlib import pyplot as plt  
    numSamples, dim = dataSet.shape  
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']  
    for i in range(numSamples):  
        markIndex = int(clusterAssment[i, 0])  
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])  
    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']  
    for i in range(k):  
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize = 7, color='orange')
    plt.show()

运行函数:


def main():
    dataMat = mat(loadDataSet('testSet.txt'))
    # 指定获取四个质心
    #myCentroids, clustAssing= kMeans(dataMat,4)
    myCentroids, clustAssing = biKmeans(dataMat, 4)
    print("--------------------------------------------------")
    print("最终的质心列表:")
    print(myCentroids)
    print("--------------------------------------------------")
    show(dataMat, 4, myCentroids, clustAssing)


if __name__ == '__main__':
    main()

运行结果:

--------------------------------------------------
[[ 0.45583556 -2.83951859]
 [ 4.53016863  0.26630517]]
--------------------------------------------------
[[-1.42524564 -0.99766158]
 [ 2.80397812  2.36861932]]
--------------------------------------------------
[[-1.4194757  -1.09467459]
 [ 2.62933196  2.44063554]]
--------------------------------------------------
[[-1.41575251 -1.18274747]
 [ 2.47206763  2.48258193]]
--------------------------------------------------
[[-1.427618   -1.24752315]
 [ 2.35525282  2.471975  ]]
--------------------------------------------------
[[-1.42255751 -1.32962551]
 [ 2.21590952  2.48810334]]
--------------------------------------------------
[[-1.4083314  -1.41809174]
 [ 2.0709171   2.50828943]]
--------------------------------------------------
[[-1.38556017 -1.59171252]
 [ 1.81930722  2.52332178]]
--------------------------------------------------
[[-1.14971512 -2.36515883]
 [ 0.99613495  2.5978361 ]]
--------------------------------------------------
[[-0.80835482 -2.75696741]
 [ 0.56675076  2.72843476]]
--------------------------------------------------
[[-0.54735726 -2.93692713]
 [ 0.2978695   2.76065064]]
--------------------------------------------------
[[-0.2897198  -2.83942545]
 [ 0.08249337  2.94802785]]
sseSplit, and notSplit:  792.9168565373268 0.0
最好的质心列表是:  0
最好的簇分配结果的长度是the len of bestClustAss is:  80
--------------------------------------------------
[[ 0.56946516 -2.26967164]
 [-1.82366006 -2.2423357 ]]
--------------------------------------------------
[[ 2.65077367 -2.79019029]
 [-3.53973889 -2.89384326]]
sseSplit, and notSplit:  84.25921395268443 326.2840752011824
--------------------------------------------------
[[ 2.64689211  2.65678291]
 [-1.06102414  3.09849093]]
--------------------------------------------------
[[ 2.6265299   3.10868015]
 [-2.46154315  2.78737555]]
sseSplit, and notSplit:  66.36683512000786 466.63278133614426
最好的质心列表是:  0
最好的簇分配结果的长度是the len of bestClustAss is:  40
--------------------------------------------------
[[ 2.8424327  -2.29684331]
 [ 0.58029953 -3.49421623]]
--------------------------------------------------
[[ 3.03713839 -2.62802833]
 [ 0.33258533 -3.763162  ]]
sseSplit, and notSplit:  40.03362852612693 348.38731175812995
--------------------------------------------------
[[-0.97552283  2.72658352]
 [ 2.17396721  1.63579194]]
--------------------------------------------------
[[-2.46154315  2.78737555]
 [ 2.6265299   3.10868015]]
sseSplit, and notSplit:  66.36683512000786 84.25921395268443
--------------------------------------------------
[[-3.53396447 -2.69981673]
 [-2.29716453 -2.54415119]]
--------------------------------------------------
[[-3.98132369 -2.837576  ]
 [-2.58297183 -3.01575567]]
--------------------------------------------------
[[-4.12637645 -2.82110091]
 [-2.73311225 -2.993864  ]]
--------------------------------------------------
[[-4.1986774  -2.8253822 ]
 [-2.807585   -2.96991111]]
sseSplit, and notSplit:  12.83784706044089 388.4400525969194
最好的质心列表是:  1
最好的簇分配结果的长度是the len of bestClustAss is:  40
--------------------------------------------------
最终的质心列表:
[[ 2.65077367 -2.79019029]
 [-2.46154315  2.78737555]
 [-3.53973889 -2.89384326]
 [ 2.6265299   3.10868015]]
--------------------------------------------------

猜你喜欢

转载自blog.csdn.net/fu_jian_ping/article/details/88907997