K-means聚类算法Python实现及绘图

2021年美赛A题真菌种群K-means聚类算法Python实现及绘图

import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math


# calculate the Euclidean distance
def calcDis(dataSet, centroids, k):
    """Compute the Euclidean distance from every sample to every centroid.

    Args:
        dataSet: Iterable of samples, each a sequence of coordinates.
        centroids: ``k`` centroids with the same number of coordinates as
            the samples.
        k: Number of centroids; each sample is repeated ``k`` times before
            the centroids are subtracted from it.

    Returns:
        A NumPy array with one row per sample and one column per centroid.
    """

    def _distances_to_centroids(sample):
        # Stack k copies of the sample so the subtraction is row-wise.
        offsets = np.tile(sample, (k, 1)) - centroids
        return np.sum(offsets ** 2, axis=1) ** 0.5

    return np.array([_distances_to_centroids(sample) for sample in dataSet])


# recalculate the centroids from the current assignment
def classify(dataSet, centroids, k):
    """Run one k-means update step.

    Each sample is assigned to its nearest centroid and every centroid is
    moved to the mean of the samples assigned to it.  A centroid that
    receives no samples keeps its previous position, so the result always
    has the same shape as ``centroids`` and the difference below is well
    defined.

    Args:
        dataSet: Sequence of samples, each a sequence of numeric coordinates.
        centroids: Current centroids, one row per cluster.
        k: Number of clusters.

    Returns:
        A tuple ``(changed, newCentroids)`` of NumPy arrays: the updated
        centroids and their element-wise difference from the input
        centroids (all zeros once the centroids have stopped moving).
    """
    centroids = np.asarray(centroids, dtype=float)

    # calculate the distance between the samples and centroids
    clalist = calcDis(dataSet, centroids, k)

    # group the samples by nearest centroid and average each group
    minDistIndices = np.argmin(clalist, axis=1)
    clusterMeans = pd.DataFrame(dataSet).groupby(minDistIndices).mean()

    # groupby only yields rows for clusters that actually received samples;
    # start from the old centroids and overwrite just those rows.
    newCentroids = centroids.copy()
    newCentroids[clusterMeans.index.to_numpy()] = clusterMeans.values

    changed = newCentroids - centroids

    return changed, newCentroids


def kmeans(dataSet, k):
    """Cluster ``dataSet`` into ``k`` groups with the k-means algorithm.

    The initial centroids are ``k`` samples drawn at random from
    ``dataSet``; the update step is repeated until the centroids stop
    moving.

    Args:
        dataSet: Sequence of samples, each a sequence of coordinates.
        k: Number of clusters.

    Returns:
        A tuple ``(centroids, cluster)``: the final centroids as a sorted
        list of coordinate lists, and a list of ``k`` lists holding the
        samples assigned to each of those centroids.
    """
    seeds = random.sample(dataSet, k)

    # iterate the update step until no centroid coordinate changes
    delta, current = classify(dataSet, seeds, k)
    while not np.all(delta == 0):
        delta, current = classify(dataSet, current, k)

    centroids = current.tolist()
    centroids.sort()

    # assign every sample to its nearest (sorted) centroid
    nearest = np.argmin(calcDis(dataSet, centroids, k), axis=1)
    cluster = [[] for _ in range(k)]
    for idx, label in enumerate(nearest):
        cluster[label].append(dataSet[idx])

    return centroids, cluster


def createDataSet(path=r'D:\MCM2021\data\fungi classification.csv'):
    """Load the two clustering features from a CSV file.

    Args:
        path: Location of the CSV file.  The file must contain the columns
            ``extension rate`` and ``water.niche.width``.  Defaults to the
            location this script was originally written against.

    Returns:
        A list with one ``[extension rate, water.niche.width]`` pair per
        CSV row, in file order.
    """
    df3 = pd.read_csv(path)
    rate3 = df3['extension rate'].values
    wnw3 = df3['water.niche.width'].values
    return [[rate, width] for rate, width in zip(rate3, wnw3)]


def a_plot(k, cluster):
    """Split the points of one cluster into x and y coordinate lists.

    Args:
        k: Index of the cluster to extract.
        cluster: List of clusters, each a list of points whose first two
            items are the x and y coordinates.

    Returns:
        A tuple ``(xlist, ylist)`` of the cluster's coordinates, in the
        order the points are stored.
    """
    members = cluster[k]
    xlist = [member[0] for member in members]
    ylist = [member[1] for member in members]
    return xlist, ylist


def eluer(centroids, cluster, k):
    """Sum the Euclidean distances from each cluster's points to its centroid.

    Only the first two coordinates of every point and centroid are used.

    Args:
        centroids: Centroids, indexed in step with ``cluster``.
        cluster: List of clusters, each a list of points.
        k: Number of clusters to evaluate.

    Returns:
        A list of ``k`` totals, one per cluster; an empty cluster
        contributes ``0``.
    """
    dis = []
    for idx in range(k):
        members = cluster[idx]
        total = 0
        for member in members:
            dx = member[0] - centroids[idx][0]
            dy = member[1] - centroids[idx][1]
            total += math.sqrt(dx ** 2 + dy ** 2)
        dis.append(total)

    return dis


if __name__ == '__main__':
    # Cluster the data set and report the result on stdout.
    k = 3
    dataset = createDataSet()
    centroids, cluster = kmeans(dataset, k)
    dis = eluer(centroids, cluster, k)
    print('距离为:%s' % dis)
    print('质心为:%s' % centroids)
    print('集群为:%s' % cluster)

    # Per-cluster plot styling, indexed by cluster number.  label_list has
    # only three entries, so the loop below supports k <= 3 as written.
    color1 = ["g", "b", "r", 'k', 'y']
    style1 = ['v', 'o', '*', '>', '1']
    label_list = ['survival', 'ordinary', 'aggressive']

    # Figure 1: the clusters in (extension rate, moisture niche width) space.
    fig_1 = plt.figure()
    ax_1 = fig_1.add_subplot(1, 1, 1)
    for i in range(k):
        x_list, y_list = a_plot(i, cluster)
        ax_1.scatter(x_list, y_list, color=color1[i], marker=style1[i], label=label_list[i])
        print(centroids[i][0], centroids[i][1])
    # grid() without arguments toggles visibility on every call, so calling
    # it once per cluster only left the grid on for odd k.  Turn it on
    # explicitly, once.
    ax_1.grid(True)
    ax_1.set_xlabel('Extension Rate', size='14')
    ax_1.set_ylabel('Moisture Niche Width', size='14')
    ax_1.legend()
    plt.savefig("fungi classification")
    print('Saved')
    plt.show()

    # Figure 2: competitive ranking against extension rate.
    df2 = pd.read_excel(r'D:\MCM2021\data\competitive ranking.xlsx')
    cr = df2['ranking'].values
    er = df2['extension'].values
    fig = plt.figure(2)
    ax = fig.add_subplot(1, 1, 1)

    # (label, colour, marker, first row, row count) for each group.
    # NOTE(review): the row ranges are hard-coded and presumably mirror the
    # cluster sizes of an earlier run on rows ordered by group -- confirm
    # against the spreadsheet.
    groups = [
        ('survival', 'g', 'v', 0, 10),
        ('ordinary', 'b', 'o', 10, 9),
        ('aggressive', 'r', '*', 19, 15),
    ]
    for label, color, marker, start, count in groups:
        temx = [er[start + offset] for offset in range(count)]
        temy = [cr[start + offset] for offset in range(count)]
        ax.scatter(temx, temy, color=color, marker=marker, label=label)
        print(temx)

    ax.set_xlabel('Extension Rate', size='14')
    ax.set_ylabel('Competitive Ranking', size='14')
    ax.legend()
    ax.grid()
    plt.savefig('ER-CR')

    plt.show()

结果图

猜你喜欢

转载自blog.csdn.net/joshua_shi_t/article/details/121132847