scikit-learn使用KNN进行分类

k近邻算法的核心思想是未标记样本的类别,由距离其最近的k个邻居投票决定。

该算法具有准确性高,对异常值和噪声有较高的容忍度等优点。缺点是计算量比较大,内存消耗量也大。

demo code如下:


# KNN classification demo: fit a k-nearest-neighbors classifier on synthetic
# blob data, predict the class of a new point, and visualize its k neighbors.

# NOTE: `sklearn.datasets.samples_generator` was deprecated in scikit-learn
# 0.20 and removed in 0.22 — `make_blobs` lives in `sklearn.datasets`.
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsClassifier


# --- Generate data --------------------------------------------------------
# 60 samples scattered around three cluster centers; `cluster_std` controls
# how tightly each cluster hugs its center. X is the (60, 2) sample matrix,
# y the integer class label per sample.
centers = [[-2, 2], [2, 2], [0, 4]]
X, y = make_blobs(n_samples=60, centers=centers, random_state=0, cluster_std=0.6)

plt.figure(figsize=(16, 10), dpi=144)
c = np.array(centers)
plt.scatter(X[:, 0], X[:, 1], c=y, s=100, cmap='cool')        # samples, colored by class
plt.scatter(c[:, 0], c[:, 1], s=100, marker='^', c='orange')  # cluster centers

# --- Train the KNN model ---------------------------------------------------
k = 5
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(X, y)


# --- Predict a new sample --------------------------------------------------
# `predict` expects a 2-D array of shape (n_samples, n_features); a bare
# [0, 2] raises "Reshape your data either using array.reshape(-1, 1) ...".
# Method 1: wrap the sample in an outer list to make it (1, 2).
X_sample = [[0, 2]]
y_sample = clf.predict(X_sample)
print('y_sample = ', y_sample)

# Method 2: reshape a flat 1-D sample into a single (1, n_features) row.
X_sample2 = [0, 2]
temp = np.array(X_sample2).reshape((1, -1))
print(type(temp))
y_sample2 = clf.predict(temp)
print('y_sample2 = ', y_sample2)
print(type(y_sample2))

# Indices (into X) of the k training samples nearest to the query point.
neighbors = clf.kneighbors(temp, return_distance=False)
print(type(neighbors))

# Mark the query point and connect it to each of its k nearest neighbors.
plt.scatter(X_sample2[0], X_sample2[1], marker='x', c=y_sample2[0], s=100, cmap='cool')
print('neightbors num ', neighbors[0])

for i in neighbors[0]:
    plt.plot([X[i][0], X_sample2[0]], [X[i][1], X_sample2[1]], 'k--', linewidth=0.6)

plt.show()


print('done')

猜你喜欢

转载自blog.csdn.net/szfhy/article/details/80188513
今日推荐