# K-Means
#k-means
#导入k-means相关的库的方法
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from scipy.spatial.distance import pdist
from sklearn import metrics, preprocessing
# Location of the input CSV file.
data_path = "D:\\0322_test.csv"
# Load the data file; its contents are decoded as GBK.
data_frame = pd.read_csv(filepath_or_buffer=data_path, encoding='gbk')
# Field (column) names of the loaded table.
cols = data_frame.columns.tolist()
# 数据本身的散点图
def draw_scatter(x_label, y_label):
    """Show a scatter plot of two columns of the module-level ``data_frame``.

    Args:
        x_label: Name of the column plotted on the x-axis.
        y_label: Name of the column plotted on the y-axis.
    """
    # Plot settings: the font is needed to display Chinese labels correctly,
    # and the second option makes the minus sign display correctly.
    plot_settings = (
        ('font.sans-serif', ['SimHei']),
        ('axes.unicode_minus', False),
    )
    for setting_name, setting_value in plot_settings:
        plt.rcParams[setting_name] = setting_value

    x_values = data_frame[x_label]
    y_values = data_frame[y_label]
    plt.scatter(x_values, y_values)

    # Annotate the figure, then display it.
    plt.title('{0}-{1}散点图'.format(x_label, y_label))
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.show()
# K-Means聚类
def k_means_cluster(x_label, y_label, k, max_iter=300, random_state=None):
    """Cluster two columns of the module-level ``data_frame`` with K-Means.

    Prints the cluster label of every sample, the cluster centres and the
    V value, then shows a scatter plot coloured by cluster label.

    Args:
        x_label: Name of the column used as first feature and x-axis.
        y_label: Name of the column used as second feature and y-axis.
        k: Number of clusters to form.
        max_iter: Maximum number of iterations passed to ``KMeans``.
        random_state: Seed passed to ``KMeans``. The default ``None`` keeps
            the previous unseeded behaviour; pass an int for repeatable
            results.
    """
    # Build the K-Means model from scikit-learn.
    clu = KMeans(n_clusters=k, max_iter=max_iter, random_state=random_state)

    # Feature matrix: one row per sample, one column per selected field.
    X_value = data_frame[[x_label, y_label]].values
    # Diagnostic output kept so the console output stays unchanged.
    print(type(X_value))

    # Run the clustering.
    clu.fit(X_value)

    # Cluster label of every sample (plus the slice the script has always
    # printed), followed by the coordinates of the cluster centres.
    print('样本所属簇编号:', clu.labels_)
    print(clu.labels_[100:200])
    print('簇中心坐标:', clu.cluster_centers_)

    # V = mean within-cluster sum of squared errors
    #     / mean distance between cluster centres.
    centers = clu.cluster_centers_
    if len(centers) < 2:
        # With a single centre there are no pairwise distances, so V is
        # undefined. Report NaN directly rather than averaging an empty
        # array, which only adds a RuntimeWarning before yielding NaN.
        v_value = float('nan')
    else:
        v_value = clu.inertia_ / (k * np.average(pdist(centers)))
    print('v值{0}'.format(v_value))

    # Visualise the clustering as a scatter plot.
    # The font setting is needed to display Chinese labels correctly; the
    # second option makes the minus sign display correctly.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # The cluster label decides the colour of each point.
    plt.scatter(data_frame[x_label], data_frame[y_label], c=clu.labels_)
    plt.title('K={0}聚类结果'.format(k))
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()
if __name__ == '__main__':
    # Show the raw data first, then cluster the same two columns into
    # two groups.
    draw_scatter(x_label='col2', y_label='col4')
    k_means_cluster(x_label='col2', y_label='col4', k=2)
# DBSCAN
#DBSCAN
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics, preprocessing
# Location of the input CSV file.
data_path = "D:\\0322_test.csv"
# Load the data file; its contents are decoded as GBK.
data_frame = pd.read_csv(filepath_or_buffer=data_path, encoding='gbk')
# DBSCAN聚类
def dbscan_cluster(x_label, y_label, eps=4, min_samples=5):
    """Cluster two columns of the module-level ``data_frame`` with DBSCAN.

    Prints the cluster label of every sample and shows a scatter plot
    coloured by cluster label.

    Args:
        x_label: Name of the column used as first feature and x-axis.
        y_label: Name of the column used as second feature and y-axis.
        eps: Neighbourhood radius passed to ``DBSCAN``. Defaults to the
            previously hard-coded value 4.
        min_samples: Minimum neighbourhood size passed to ``DBSCAN``.
            Defaults to the previously hard-coded value 5.
    """
    # Build the DBSCAN model.
    clu = DBSCAN(eps=eps, min_samples=min_samples)

    # Feature matrix: one row per sample, one column per selected field.
    X_value = data_frame[[x_label, y_label]].values

    # Run the clustering.
    clu.fit(X_value)

    # Cluster label of every sample.
    print('样本所属簇编号:', clu.labels_)

    # Visualise the clustering as a scatter plot.
    # The font setting is needed to display Chinese labels correctly; the
    # second option makes the minus sign display correctly.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    # The cluster label decides the colour of each point.
    plt.scatter(data_frame[x_label], data_frame[y_label], c=clu.labels_)
    plt.title('DBSCAN聚类结果')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()
if __name__ == '__main__':
    # Run DBSCAN on the same two columns used by the K-Means example.
    dbscan_cluster(x_label='col2', y_label='col4')