【机器学习实战】K-mean代码

代码注释的请忽略,中文注释算是比较详细。修改了书上的版本,因为Python已经3.6了
1)map 的规则变了
2)源代码的矩阵,list定义有点混乱

'''
Created on Feb 16, 2011
Modify on Mar 27, 2018
k Means Clustering for Ch10 of Machine Learning in Action
@author: Peter Harrington ---++++---YM 
'''
from numpy import *
import numpy as np


def loadDataSet(fileName):      #general function to parse tab -delimited floats
    dataMat = []                #assume last column is target value
    fr = open(fileName)
    for line in fr.readlines():
        fltLine = []
        curLine = line.strip().split('\t')  # 读取的数据是坐标形式          
        fltLine = [float(curLine[0]),float(curLine[1])]
        dataMat.append(fltLine)
        # fltLine = map(float,curLine) # map all elements to float()
        # 为什么会显示map类型呢??????????map不是函数应用吗
        # print(curLine[0])
        # print(type(fltLine))
    return mat(dataMat)


# calculate Euclidean distance  
def euclDistance(vector1, vector2):  
    return sqrt(sum(power(vector2 - vector1, 2)))  #求这两个矩阵的距离,vector1、2均为矩阵

# init centroids with random samples  
#在样本集中随机选取k个样本点作为初始质心
def initCentroids(dataSet, k):  
    numSamples, dim = dataSet.shape   #矩阵的行数、列数 
    centroids = zeros((k, dim))         #感觉要不要你都可以
    for i in range(k):  
        index = int(random.uniform(0, numSamples))  #随机产生一个浮点数,然后将其转化为int型
        centroids[i,:] = dataSet[index, :]  
    return centroids

# k-means cluster 
#dataSet为一个矩阵
#k为将dataSet矩阵中的样本分成k个类 
def kMeans(dataSet, k):  
# def kMeans(dataSet, k, distMeas=euclDistance, createCent=initCentroids):
    numSamples = dataSet.shape[0]  #读取矩阵dataSet的第一维度的长度,即获得有多少个样本数据
    # first column stores which cluster this sample belongs to,  
    # second column stores the error between this sample and its centroid  
    clusterAssment = mat(zeros((numSamples, 2)))  #得到一个N*2的零矩阵
    clusterChanged = True  

    ## step 1: init centroids  
    centroids = initCentroids(dataSet, k)  #在样本集中随机选取k个样本点作为初始质心

    while clusterChanged:  
        clusterChanged = False  
        ## for each sample  
        for i in range(numSamples):  #range
            minDist  = 100000.0  
            minIndex = 0  
            ## for each centroid  
            ## step 2: find the centroid who is closest  
            #计算每个样本点与质点之间的距离,将其归内到距离最小的那一簇
            for j in range(k):  
                distance = euclDistance(centroids[j, :], dataSet[i, :])  
                if distance < minDist:  
                    minDist  = distance  
                    minIndex = j                
            ## step 3: update its cluster 
            #k个簇里面与第i个样本距离最小的的标号和距离保存在clusterAssment中
            #若所有的样本不在变化,则退出while循环
            if clusterAssment[i, 0] != minIndex:  
                clusterChanged = True  
                clusterAssment[i, :] = minIndex, minDist**2  #两个**表示的是minDist的平方

        ## step 4: update centroids  
        for j in range(k):  
            #clusterAssment[:,0].A==j是找出矩阵clusterAssment中第一列元素中等于j的行的下标,返回的是一个以array的列表,第一个array为等于j的下标
            pointsInCluster = dataSet[nonzero(clusterAssment[:, 0].A == j)[0]] #将dataSet矩阵中相对应的样本提取出来 
            centroids[j, :] = mean(pointsInCluster, axis = 0)  #计算标注为j的所有样本的平均值

    print ('Congratulations, cluster complete!')  
    print(type(centroids),type(clusterAssment))
    return mat(centroids), clusterAssment  



def biKmeans(dataSet, k):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m,2)))
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList =[centroid0] #create a list with one centroid
    for j in range(m):#calc initial Error
        clusterAssment[j,1] = euclDistance(mat(centroid0), dataSet[j])**2
    while (len(centList) < k):
        lowestSSE = inf    # 尽量使得SSE最小
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]# get the data points currently in cluster i
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2)
            sseSplit = sum(splitClustAss[:,1])#compare the SSE to the currrent minimum
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
            print("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList) #change 1 to 3,4, or whatever
        bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
        print('the bestCentToSplit is: ',bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]#replace a centroid with two best centroids 
        centList.append(bestNewCents[1,:].tolist()[0])
        clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss#reassign new clusters, and SSE
    return mat(centList), clusterAssment


import matplotlib
import matplotlib.pyplot as plt
def clusterClubs(datMat,numClust):
    # datList = []
    # for line in open('places.txt').readlines():
    #     lineArr = line.split('\t')
    #     datList.append([float(lineArr[4]), float(lineArr[3])])
    # datMat = mat(datList)
    myCentroids, clustAssing = biKmeans(datMat, numClust)  # 用的是二分kMeans聚类
    fig = plt.figure()
    rect=[0.1,0.1,0.8,0.8]
    scatterMarkers=['s', 'o', '^', '8', 'p', \
                    'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    ax0=fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread('Portland.png')
    ax0.imshow(imgP)
    ax1=fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        ptsInCurrCluster = datMat[nonzero(clustAssing[:,0].A==i)[0],:]
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:,0].flatten().A[0], ptsInCurrCluster[:,1].flatten().A[0], marker=markerStyle, s=90)
    ax1.scatter(myCentroids[:,0].flatten().A[0], myCentroids[:,1].flatten().A[0], marker='+', s=300)
    plt.show()


if __name__ == '__main__':    
    # print(type(datmat))
    datmat = loadDataSet('testSet.txt')
    # print(datmat[:][0])# 第一列的最小值    
    # print(distEclud(datmat[0],datmat[1]))  
    #---------------------------  
    # datmat = mat(np.random.rand(100,2))
    # print(type(datmat))
    clusterClubs(datmat,4) 
    # myCentroids,clustAssing = kMeans(datmat,4)

猜你喜欢

转载自blog.csdn.net/acbattle/article/details/80472429