Experimental clustering (k-means / DBSCAN)

K-Means

# k-means
# Import the libraries needed for k-means
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from scipy.spatial.distance import pdist
from sklearn import metrics, preprocessing

# Location of the input CSV file.
data_path = "D:\\0322_test.csv"

# Load the CSV (decoded as GBK) into a DataFrame.
data_frame = pd.read_csv(filepath_or_buffer=data_path, encoding='gbk')

# Column names of the loaded table, as a plain list.
cols = [*data_frame.columns]

def draw_scatter(x_label, y_label):
    """Show a scatter plot of two columns of the module-level ``data_frame``.

    Args:
        x_label: Name of the column plotted on the x axis.
        y_label: Name of the column plotted on the y axis.
    """
    # Plot settings: SimHei so Chinese labels render correctly, and
    # unicode_minus off so the minus sign renders correctly.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    plt.scatter(data_frame[x_label], data_frame[y_label])
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title('{0}-{1}散点图'.format(x_label, y_label))

    # Must live inside the function: at module level it would run once at
    # import time with nothing drawn, and the figure built here would never
    # be displayed on its own.
    plt.show()

def k_means_cluster(x_label, y_label, k):
    """Run K-Means on two columns of ``data_frame`` and plot the clusters.

    Prints the per-sample cluster labels, the cluster centres and the
    "V value", then shows a scatter plot coloured by cluster label.

    Args:
        x_label: Name of the first feature column (x axis of the plot).
        y_label: Name of the second feature column (y axis of the plot).
        k: Number of clusters to form.
    """
    # Two-column feature matrix taken from the module-level DataFrame.
    features = data_frame[[x_label, y_label]].values
    print(type(features))

    # n_clusters: how many clusters to form; max_iter: iteration cap.
    model = KMeans(n_clusters=k, max_iter=300)
    model.fit(features)

    assignments = model.labels_
    centers = model.cluster_centers_

    # Cluster label of every sample, then a slice of the same labels.
    print('样本所属簇编号:', assignments)
    print(assignments[100:200])
    # Coordinates of the cluster centres.
    print('簇中心坐标:', centers)

    # V value = (inertia averaged over the k clusters) divided by the
    # average pairwise distance between cluster centres.
    mean_center_distance = np.average(pdist(centers))
    v_value = model.inertia_ / (k * mean_center_distance)
    print('v值{0}'.format(v_value))

    # Plot settings: font for Chinese labels, and correct minus signs.
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })

    # Colour every point by its cluster label.
    plt.scatter(data_frame[x_label], data_frame[y_label], c=assignments)
    plt.title('K={0}聚类结果'.format(k))
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

if __name__ == '__main__':
    # Plot the raw data first, then the 2-cluster K-Means result.
    draw_scatter(x_label='col2', y_label='col4')
    k_means_cluster(x_label='col2', y_label='col4', k=2)

Here Insert Picture Description
Here Insert Picture Description
Here Insert Picture Description

DBSCAN

#DBSCAN
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics, preprocessing
# Location of the input CSV file.
data_path = "D:\\0322_test.csv"

# Load the CSV (decoded as GBK) into a DataFrame.
data_frame = pd.read_csv(filepath_or_buffer=data_path, encoding='gbk')

def dbscan_cluster(x_label, y_label, eps=4, min_samples=5):
    """Run DBSCAN on two columns of ``data_frame`` and plot the clusters.

    Prints the per-sample cluster labels and shows a scatter plot coloured
    by those labels.

    Args:
        x_label: Name of the first feature column (x axis of the plot).
        y_label: Name of the second feature column (y axis of the plot).
        eps: Value passed to ``DBSCAN(eps=...)``. Defaults to 4, the value
            this function previously hard-coded.
        min_samples: Value passed to ``DBSCAN(min_samples=...)``. Defaults
            to 5, the value this function previously hard-coded.
    """
    # Build the DBSCAN model with the requested density parameters.
    clu = DBSCAN(eps=eps, min_samples=min_samples)

    # Two-column feature matrix taken from the module-level DataFrame.
    X_value = data_frame[[x_label, y_label]].values

    # Run the clustering and report the label assigned to every sample.
    clu.fit(X_value)
    print('样本所属簇编号:', clu.labels_)

    # Plot settings: font for Chinese labels, and correct minus signs.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Colour every point by its cluster label.
    plt.scatter(data_frame[x_label], data_frame[y_label], c=clu.labels_)

    plt.title('DBSCAN聚类结果')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()
if __name__ == '__main__':
    # The call must be indented under the guard; at column 0 the file
    # fails to parse with an IndentationError.
    dbscan_cluster('col2', 'col4')

Here Insert Picture Description

Published 26 original articles · won praise 23 · views 1623

Guess you like

Origin blog.csdn.net/surijing/article/details/105045059