数据挖掘大作业(一):Kmeans+PAM

  • 题目描述
  1. 编程实现K-means算法对waveform数据进行聚类,并对无噪声的图像进行分割;
  2. 编程实现PAM算法对有20%高斯噪声的waveform数据聚类,并对有噪声的图像进行分割。
  • 算法描述

(1) K -means

      

(2) PAM

      

  • 结果展示

1.K-means算法对waveform数据进行聚类(选取数据集的第7维和第10维作为x轴y轴进行可视化)

2.K-means算法对有20%高斯噪声的waveform数据进行聚类(选取数据集的第7维和第10维作为x轴y轴进行可视化)

3.PAM算法对waveform数据进行聚类(选取数据集的第7维和第10维作为x轴y轴进行可视化)

4.PAM算法对有20%高斯噪声的waveform数据进行聚类(选取数据集的第7维和第10维作为x轴y轴进行可视化)

• 图像分割部分

                                 

              无噪声原图                                              有噪声原图

5.K-means算法对无噪声的图像进行分割

6.K-means算法对有噪声的图像进行分割

7.PAM算法对无噪声的图像进行分割

8.PAM算法对有噪声的图像进行分割

  • 实验代码
  • # -*- coding:utf-8 -*-
    from numpy import *
    import pandas as pd
    import matplotlib.pyplot as plt
    import random
    import numpy as np
    from PIL import Image
    
    
    def image_gauss_noise(image):  # add Gaussian noise to an image
        """Add zero-mean Gaussian noise (sigma=10) to every pixel.

        Works in int16 so intermediate values may fall below 0 or exceed
        255, then clips back into [0, 255] and returns a uint8 array of
        the same shape as ``image``.
        """
        # int16 avoids uint8 wrap-around while the noise is applied.
        img = image.astype(np.int16)
        # Vectorized noise replaces the original per-pixel Python loop;
        # astype(int16) truncates toward zero, matching int-element +=.
        noise = np.random.normal(loc=0.0, scale=10.0, size=img.shape)
        img += noise.astype(np.int16)
        # Clamp out-of-range pixels back into the valid uint8 range.
        np.clip(img, 0, 255, out=img)
        return img.astype(np.uint8)
    
    
    def data_gauss_noise(data):  # add Gaussian noise to ~20% of the rows
        """Perturb roughly 20% of the rows with Gaussian noise (mu=0, sigma=0.1).

        Row indices are drawn with replacement and de-duplicated through a
        set, so slightly fewer than 20% of the rows may actually change.
        Mutates ``data`` in place and returns it.
        """
        m, n = shape(data)
        # int(m * 0.2) draws ~20% of the rows (the original comment
        # incorrectly said 10%).
        noisy_rows = set((m * np.random.rand(int(m * 0.2))).astype(int))
        for i in noisy_rows:
            for j in range(n):
                data[i, j] += random.gauss(mu=0, sigma=0.1)
        return data
    
    
    # Euclidean distance between two row vectors.
    def disMea(vecA, vecB):
        diff = vecA - vecB
        return sqrt(multiply(diff, diff).sum())
    
    
    # Pick k random initial centroids: each coordinate is drawn uniformly
    # from the observed [min, max] range of the corresponding column.
    def createCent(dataSet, k):
        numFeatures = shape(dataSet)[1]
        centriods = mat(zeros((k, numFeatures)))
        for col in range(numFeatures):
            colMin = min(dataSet[:, col])
            colSpan = float(max(array(dataSet)[:, col]) - colMin)
            centriods[:, col] = colMin + colSpan * np.random.rand(k, 1)
        return centriods
    
    
    def kmeans(dataSet, k):
        """Standard k-means with random initialization, capped at 10 rounds.

        Returns (centriods, clusterA): a (k, n) matrix of centroids and an
        (m, 1) matrix where clusterA[i, 0] is the cluster index of sample i.
        """
        m = shape(dataSet)[0]
        clusterA = mat(zeros((m, 1)))  # per-sample cluster assignment
        centriods = createCent(dataSet, k)
        clusterC = True  # did any assignment change this round?
        itr = 10  # hard iteration cap
        while clusterC and itr:
            clusterC = False
            # Assignment step: nearest centroid for every sample.
            for i in range(m):
                minDist = inf
                minIndex = -1
                for j in range(k):
                    distJI = disMea(centriods[j, :], dataSet[i, :])
                    if distJI < minDist:
                        minDist = distJI
                        minIndex = j
                if clusterA[i, 0] != minIndex:
                    clusterC = True

                clusterA[i, 0] = int(minIndex)

            # Update step: move each centroid to the mean of its members.
            for cent in range(k):
                ptsInClust = dataSet[nonzero(clusterA[:, 0].A == cent)[0]]
                # BUG FIX: mean() of an empty cluster yields NaN and poisons
                # the centroid from then on; keep the previous centroid instead.
                if len(ptsInClust) > 0:
                    centriods[cent, :] = mean(ptsInClust, axis=0)
            itr -= 1
        return centriods, clusterA
    
    
    def show1(dataSet, k, centriods, clusterA, count):
        """Scatter-plot dimensions 7 and 10 of every sample, colored by
        cluster, and save the figure as Figure_<count>.png.

        ``k`` and ``centriods`` are accepted for interface compatibility;
        only the assignments in ``clusterA`` drive the marker styles.
        """
        plt.figure()
        styles = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
        numSamples = shape(dataSet)[0]
        for idx in range(numSamples):
            cluster = int(clusterA[idx, 0])
            plt.plot(dataSet[idx, 6], dataSet[idx, 9], styles[cluster])
        plt.savefig("Figure_"+str(count)+".png")
    
    
    def pearson_distance(vector1, vector2):
        """Distance between two vectors, returned as a plain float.

        NOTE(review): despite the name, scipy's pdist is called with its
        default metric, which is 'euclidean', NOT Pearson correlation.
        Kept as-is so the clustering results are unchanged; returning a
        float (not a 1-element array) lets callers compare/cache it safely.
        """
        from scipy.spatial.distance import pdist
        X = vstack([vector1, vector2])
        return float(pdist(X)[0])
    
    
    def totalcost(blogwords, medoids_idx):
        """Assign every sample to its nearest medoid.

        Returns (total_cost, medoids): the summed distance of all samples
        to their chosen medoid, and a dict mapping medoid index -> list of
        member sample indices. Distances are cached per call.
        """
        distances_cache = { }  # (medoid, sample) -> distance
        size = shape(blogwords)[0]
        total_cost = 0.0
        medoids = {idx: [] for idx in medoids_idx}
        for i in range(size):
            choice = None
            min_cost = inf
            for m in medoids:
                tmp = distances_cache.get((m, i), None)
                # BUG FIX: 'tmp == None' triggers numpy's element-wise
                # comparison when the distance is an array; the identity
                # test is always safe and unambiguous.
                if tmp is None:
                    tmp = pearson_distance(blogwords[m], blogwords[i])
                    distances_cache[(m, i)] = tmp
                if tmp < min_cost:
                    choice = m
                    min_cost = tmp
            medoids[choice].append(i)
            total_cost += min_cost
        return total_cost, medoids
    
    
    def PAM(dataSet, k):
        """k-medoids (PAM) clustering; returns (centriods, clusterA) in the
        same format as kmeans: a (k, n) matrix of medoid rows and an (m, 1)
        matrix of per-sample cluster indices.

        NOTE(review): the medoid loop below reuses the name ``m`` that holds
        the row count; the count is re-read after the loop, so this works
        but is fragile.
        """
        m, n = shape(dataSet)  # number of rows / columns
        iter_count = 0
        # Randomly pick k distinct samples as the initial medoids.
        CenterIndex = random.sample([i for i in range(m)], k)
        # Initial total cost and clustering.
        pre_cost, medoids = totalcost(dataSet, CenterIndex)
        current_cost = inf
        best_choice = []
        best_res = { }
        itr = 5  # hard cap on swap rounds
        while itr:
            # Iterate over every current medoid.
            for m in medoids:
                # Try replacing the medoid with each member of its cluster.
                for item in medoids[m]:
                    # Only points that are not the medoid itself are candidates.
                    if item != m:
                        #                     print("now replace is %s" % item)
                        # Position of medoid m in the medoid list.
                        #                     print("In for CenterIndex is %s" % CenterIndex)
                        idx = CenterIndex.index(m)
                        #                     print("now will be replaced index is %s" % idx)
                        # Remember the medoid so the trial swap can be undone.
                        swap_temp = CenterIndex[idx]
                        # Perform the trial swap.
                        CenterIndex[idx] = item
                        # Cost and clustering after the swap.
                        tmp, medoids_ = totalcost(dataSet, CenterIndex)
                        # Record the swap if it beats the best cost so far.
                        if tmp < current_cost:
                            # Better medoid set...
                            best_choice = list(CenterIndex)
                            # ...its clustering...
                            best_res = dict(medoids_)
                            # ...and its cost.
                            current_cost = tmp
                        # Undo the swap: every candidate is evaluated against
                        # the same baseline and only the single best swap is
                        # kept, rather than greedily accepting each improvement.
                        CenterIndex[idx] = swap_temp
            # If the best medoid set equals the current one, we have converged.
            if best_choice == CenterIndex:
                break
            # Otherwise adopt the improvement and repeat.
            if current_cost <= pre_cost:
                pre_cost = current_cost
                medoids = best_res
                CenterIndex = best_choice
            itr -= 1
            print(itr)
        # Build kmeans-compatible outputs: medoid rows as "centroids" and a
        # per-sample cluster-index column vector.
        # current_cost, best_choice, best_res
        m, n = shape(dataSet)
        centriods = mat(zeros((k, n)))
        for i in range(k):
            centriods[i, :] = dataSet[best_choice[i], :]
        clusterA = mat(zeros((m, 1)))
        n = 0
        for i in list(best_res.keys()):
            for j in best_res[i]:
                clusterA[j, 0] = n
            n += 1
        return centriods, clusterA
    
    
    
    def fun1(count):  # K-means on the raw waveform data
        frame = pd.read_csv('waveform.csv', header=None)
        samples = mat(frame)[:, 1:22]
        centers, labels = kmeans(samples, 3)
        show1(samples, 3, centers, labels, count)
    
    def fun2(count):  # K-means on waveform data with Gaussian noise added
        frame = pd.read_csv('waveform.csv', header=None)
        samples = data_gauss_noise(mat(frame)[:, 1:22])
        centers, labels = kmeans(samples, 3)
        show1(samples, 3, centers, labels, count)
    
    def fun3(count):  # K-means segmentation of the clean grayscale image
        src = Image.open('lena.jpg').convert('L')
        src.save("lena_1.png")
        w, h = src.size
        pixels = array(src).reshape((w * h, 1))
        centers, labels = kmeans(pixels, 3)
        # Map each pixel's cluster label back onto the (height, width) grid.
        labelGrid = array(labels).reshape((h, w)).astype(int16)
        segmented = Image.new("L", (w, h))
        for x in range(w):
            for y in range(h):
                segmented.putpixel((x, y), tuple(int(v) for v in centers[labelGrid[y][x]]))
        segmented.save("Figure_"+str(count)+".png")
    
    
    def fun4(count):  # K-means segmentation of the noisy grayscale image
        src = Image.open('lena.jpg').convert('L')
        src = Image.fromarray(image_gauss_noise(np.array(src)))
        src.save("lena_2.png")
        w, h = src.size
        pixels = array(src).reshape((w * h, 1))
        centers, labels = kmeans(pixels, 3)
        # Map each pixel's cluster label back onto the (height, width) grid.
        labelGrid = array(labels).reshape((h, w)).astype(int16)
        segmented = Image.new("L", (w, h))
        for x in range(w):
            for y in range(h):
                segmented.putpixel((x, y), tuple(int(v) for v in centers[labelGrid[y][x]]))
        segmented.save("Figure_"+str(count)+".png")
    
    
    def fun5(count):  # PAM on the raw waveform data
        frame = pd.read_csv('waveform.csv', header=None)
        samples = mat(frame)[:, 1:22]
        centers, labels = PAM(samples, 3)
        show1(samples, 3, centers, labels, count)
    
    
    def fun6(count):  # PAM on waveform data with Gaussian noise added
        frame = pd.read_csv('waveform.csv', header=None)
        samples = data_gauss_noise(mat(frame)[:, 1:22])
        centers, labels = PAM(samples, 3)
        show1(samples, 3, centers, labels, count)
    
    
    def fun7(count):  # PAM segmentation of the clean grayscale image
        src = Image.open('lena.jpg').convert('L')
        w, h = src.size
        pixels = array(src).reshape((w * h, 1))
        centers, labels = PAM(pixels, 3)
        # Map each pixel's cluster label back onto the (height, width) grid.
        labelGrid = array(labels).reshape((h, w)).astype(int16)
        segmented = Image.new("L", (w, h))
        for x in range(w):
            for y in range(h):
                segmented.putpixel((x, y), tuple(int(v) for v in centers[labelGrid[y][x]]))
        segmented.save("Figure_"+str(count)+".png")
    
    
    def fun8(count):  # PAM segmentation of the noisy grayscale image
        src = Image.open('lena.jpg').convert('L')
        src = Image.fromarray(image_gauss_noise(np.array(src)))
        w, h = src.size
        pixels = array(src).reshape((w * h, 1))
        centers, labels = PAM(pixels, 3)
        # Map each pixel's cluster label back onto the (height, width) grid.
        labelGrid = array(labels).reshape((h, w)).astype(int16)
        segmented = Image.new("L", (w, h))
        for x in range(w):
            for y in range(h):
                segmented.putpixel((x, y), tuple(int(v) for v in centers[labelGrid[y][x]]))
        segmented.save("Figure_"+str(count)+".png")
    
    
    if '__main__' == __name__:
        # Run all eight experiments in order; the counter doubles as the
        # Figure_<n>.png index each task saves under.
        experiments = (fun1, fun2, fun3, fun4, fun5, fun6, fun7, fun8)
        for figure_no, experiment in enumerate(experiments, start=1):
            experiment(figure_no)

猜你喜欢

转载自blog.csdn.net/shunzi1013238638/article/details/107512775