22.01.Cluster

 

1. 클러스터링 iris 데이터셋 확인

In [2]:
from sklearn import cluster
from sklearn import datasets

iris = datasets.load_iris()
data = iris['data']

model = cluster.KMeans( n_clusters=3 )
model.fit( data )

print(model.labels_)
 
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 0 0 2 0 0 0 0
 0 0 2 2 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 0 0 2 0 0 0 0 2 0 0 0 2 0 0 0 2 0
 0 2]
In [5]:
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn import datasets

# iris 데이터를 로드
iris = datasets.load_iris()
data = iris["data"]

# 학습 → 클러스터 생성
model = cluster.KMeans(n_clusters=3)
model.fit(data)
Out[5]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
In [6]:
# 학습 결과의 라벨 취득
labels = model.labels_

### 그래프 그리기
x_index = 2
y_index = 3

data_x=data[:,x_index]
data_y=data[:,y_index]

x_max = 7.5
x_min = 0
y_max = 3
y_min = 0
x_label = iris["feature_names"][x_index]
y_label = iris["feature_names"][y_index]


plt.scatter(data_x[labels==0], data_y[labels==0],c='black' ,alpha=0.3,s=100, marker="o",label="cluster 0")
plt.scatter(data_x[labels==1], data_y[labels==1],c='black' ,alpha=0.3,s=100, marker="o",label="cluster 1")
plt.scatter(data_x[labels==2], data_y[labels==2],c='black' ,alpha=0.3,s=100, marker="o",label="cluster 2")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xlabel(x_label,fontsize='large')
plt.ylabel(y_label,fontsize='large')
plt.show()
 
 

2. k-means 모델 사용

In [7]:
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn import datasets

# iris 데이터를 로드
iris = datasets.load_iris()
data = iris["data"]

# 초기 중심점을 정의
init_centers=np.array([
       [4,2.5,3,0],
       [5,3  ,3,1],
       [6,4  ,3,2]])

# 데이터 정의와 값 꺼내기
x_index = 1
y_index = 2

data_x=data[:,x_index]
data_y=data[:,y_index]

# 그래프의 스케일과 라벨 정의
x_max = 4.5
x_min = 2
y_max = 7
y_min = 1
x_label = iris["feature_names"][x_index]
y_label = iris["feature_names"][y_index]

def show_result(cluster_centers,labels):
    # cluster 0과 중심점을 그리기
    plt.scatter(data_x[labels==0], data_y[labels==0],c='black' ,alpha=0.3,s=100, marker="o",label="cluster 0")
    plt.scatter(cluster_centers[0][x_index], cluster_centers[0][y_index],facecolors='white', edgecolors='black', s=300, marker="o")

     # cluster 1과 중심점을 그리기
    plt.scatter(data_x[labels==1], data_y[labels==1],c='black' ,alpha=0.3,s=100, marker="^",label="cluster 1")
    plt.scatter(cluster_centers[1][x_index], cluster_centers[1][y_index],facecolors='white', edgecolors='black', s=300, marker="^")

     # cluster 와 중심점을 그리기
    plt.scatter(data_x[labels==2], data_y[labels==2],c='black' ,alpha=0.3,s=100, marker="*",label="cluster 2")
    plt.scatter(cluster_centers[2][x_index], cluster_centers[2][y_index],facecolors='white', edgecolors='black', s=500, marker="*")

    # 그래프의 스케일과 축 라벨을 설정
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xlabel(x_label,fontsize='large')
    plt.ylabel(y_label,fontsize='large')
    plt.show()


# 초기 상태를 표시 
labels=np.zeros(len(data),dtype=np.int)
show_result(init_centers,labels)

for i in range(5):
	model = cluster.KMeans(n_clusters=3,max_iter=1,init=init_centers).fit(data)
	labels = model.labels_
	init_centers=model.cluster_centers_
	show_result(init_centers,labels)
 
 
C:\ProgramData\Anaconda3\envs\machine\lib\site-packages\sklearn\cluster\k_means_.py:969: RuntimeWarning: Explicit initial center position passed: performing only one init in k-means instead of n_init=10
  return_n_iter=True)
 
 
C:\ProgramData\Anaconda3\envs\machine\lib\site-packages\sklearn\cluster\k_means_.py:969: RuntimeWarning: Explicit initial center position passed: performing only one init in k-means instead of n_init=10
  return_n_iter=True)
 
 
C:\ProgramData\Anaconda3\envs\machine\lib\site-packages\sklearn\cluster\k_means_.py:969: RuntimeWarning: Explicit initial center position passed: performing only one init in k-means instead of n_init=10
  return_n_iter=True)
 
 
C:\ProgramData\Anaconda3\envs\machine\lib\site-packages\sklearn\cluster\k_means_.py:969: RuntimeWarning: Explicit initial center position passed: performing only one init in k-means instead of n_init=10
  return_n_iter=True)
 
 
C:\ProgramData\Anaconda3\envs\machine\lib\site-packages\sklearn\cluster\k_means_.py:969: RuntimeWarning: Explicit initial center position passed: performing only one init in k-means instead of n_init=10
  return_n_iter=True)
 
 

2. k-means 모델 사용 2

In [8]:
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn import datasets

# iris 데이터를 로드
iris = datasets.load_iris()
data = iris['data']

# 학습 → 클러스터 생성
model = cluster.KMeans(n_clusters=3)
model.fit(data)
Out[8]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
In [9]:
# 학습 결과의 라벨 취득
labels = model.labels_

# 그래프 그리기
ldata = data[labels == 0]
plt.scatter(ldata[:, 2], ldata[:, 3],
                  c='black' ,alpha=0.3,s=100 ,marker="o")

ldata = data[labels == 1]
plt.scatter(ldata[:, 2], ldata[:, 3],
                  c='black' ,alpha=0.3,s=100 ,marker="^")

ldata = data[labels == 2]
plt.scatter(ldata[:, 2], ldata[:, 3],
                  c='black' ,alpha=0.3,s=100 ,marker="*")

# 축 라벨의 설정
plt.xlabel(iris["feature_names"][2],fontsize='large')
plt.ylabel(iris["feature_names"][3],fontsize='large')

plt.show()
 
 

3. 속성값 4개를 2개씩 조합하여 출력

In [10]:
import matplotlib.pyplot as plt
from sklearn import cluster
from sklearn import datasets


# iris 데이터를 로드
iris = datasets.load_iris()
data = iris['data']

# 학습 → 클러스터 생성
model = cluster.KMeans(n_clusters=3)
model.fit(data)

# 学習結果のラベル取得
labels = model.labels_


### グラフの描画
MARKERS = ["o", "^" , "*" , "v", "+", "x", "d", "p", "s", "1", "2"]

# 指定されたインデックスの feature 値で散布図を作成する関数
def scatter_by_features(feat_idx1, feat_idx2):
    for lbl in range(labels.max() + 1):
        clustered = data[labels == lbl]
        plt.scatter(clustered[:, feat_idx1], clustered[:, feat_idx2],
                    c='black' ,alpha=0.3,s=100,
                    marker=MARKERS[lbl], label='label {}'.format(lbl))

    plt.xlabel(iris["feature_names"][feat_idx1],fontsize='xx-large')
    plt.ylabel(iris["feature_names"][feat_idx2],fontsize='xx-large')


plt.figure(figsize=(16, 16))

# feature "sepal length" と "sepal width"
plt.subplot(3, 2, 1)
scatter_by_features(0, 1)

# feature "sepal length" と "petal length"
plt.subplot(3, 2, 2)
scatter_by_features(0, 2)

# feature "sepal length" と "petal width"
plt.subplot(3, 2, 3)
scatter_by_features(0, 3)

# feature "sepal width" と "petal length"
plt.subplot(3, 2, 4)
scatter_by_features(1, 2)

# feature "sepal width" と "petal width"
plt.subplot(3, 2, 5)
scatter_by_features(1, 3)

# feature "petal length" と "petal width"
plt.subplot(3, 2, 6)
scatter_by_features(2, 3)

plt.tight_layout()
plt.show()
 
 
 

猜你喜欢

转载自www.cnblogs.com/kingboy100/p/11043294.html