K--最邻近（K-NN）算法

视频地址：
https://study.163.com/course/courseMain.htm?courseId=1005709005

ppt拍照版下载：
https://download.csdn.net/download/yj13811596648/10735093

代码整理：

# -*- coding: utf-8 -*

import numpy as np
import matplotlib.pyplot as plt

from collections import Counter

def dist(A,B):
    """Return the Euclidean (L2) distance between two vectors.

    Both inputs are converted to arrays and flattened first, so nested lists
    and multi-dimensional arrays are accepted.

    Args:
        A: array-like of numbers.
        B: array-like of numbers.

    Returns:
        The L2 norm of ``A - B`` as a NumPy floating-point scalar.
    """
    a = np.asarray(A).ravel()
    b = np.asarray(B).ravel()
    # Promote to at least float64 before subtracting: with unsigned integer
    # inputs (e.g. uint8 image pixels) ``a - b`` would silently wrap around.
    dtype = np.result_type(a, b, np.float64)
    return np.linalg.norm(a.astype(dtype) - b.astype(dtype))

def cosSim(A,B):
    """Return the cosine similarity between two vectors.

    Both inputs are converted to arrays and flattened first. The result is
    ``dot(A, B) / (|A| * |B|)``: 1 for vectors pointing the same way, 0 for
    orthogonal vectors, -1 for opposite vectors.

    Note that this is a similarity (larger means closer), not a distance.

    Args:
        A: array-like of numbers.
        B: array-like of numbers.

    Returns:
        The cosine similarity as a NumPy floating-point scalar. If either
        vector has zero norm the division is undefined and NumPy's usual
        division-by-zero result (nan/inf) is returned.
    """
    a = np.asarray(A).ravel()
    b = np.asarray(B).ravel()
    # Promote to at least float64 before the dot product: with small integer
    # dtypes (e.g. uint8 image pixels) np.dot would overflow in that dtype.
    dtype = np.result_type(a, b, np.float64)
    a = a.astype(dtype)
    b = b.astype(dtype)
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    return numerator / denominator
    
    
#K_NN函数定义：
#data表示训练数据（字典：键为类别，值为该类别的样本点列表）
#predict表示待测试样本
#k表示k_nn中的k值
#distfun表示选择的距离
def k_nn_test(data,predict,k=3,distfun=dist):
    """Classify ``predict`` by majority vote among its k nearest samples.

    Args:
        data: mapping of class label -> list of feature vectors belonging to
            that class (the labelled samples).
        predict: feature vector of the sample to classify.
        k: number of nearest samples that take part in the vote.
        distfun: callable ``distfun(sample, predict)`` returning a score;
            samples are ranked in ascending order of this score, so smaller
            means nearer. NOTE(review): a similarity such as ``cosSim``
            would therefore select the *least* similar samples.

    Returns:
        The label that occurs most often among the k best-ranked samples.
    """
    if k <= len(data):
        print('K is set to a value less than total voting groups!')

    # One [score, label] pair per labelled sample, in dictionary order.
    scored = [
        [distfun(sample, predict), label]
        for label in data
        for sample in data[label]
    ]

    # Ascending by score; equal scores fall back to comparing the labels.
    scored.sort()
    nearest_labels = [label for _, label in scored[:k]]

    # most_common(1) yields [(label, count)] for the majority label.
    winner, _count = Counter(nearest_labels).most_common(1)[0]
    return winner

if __name__ == '__main__':
    # The dataset is a plain dict: each key is a class label (also used as
    # the matplotlib colour code) and each value lists that class's points.
    dataset = {'k':[[1,3],[2,4],[2,1]],'r':[[6,3],[7,7],[5,6]]}
    # The point whose class we want to predict.
    new_features = [4,4]

    # First figure: the labelled points plus the still-unclassified point.
    for label, points in dataset.items():
        for x, y in points:
            plt.scatter(x, y, s=100, color=label)
    plt.scatter(new_features[0], new_features[1], s=100)
    plt.show()

    # Second figure: the new point drawn in the colour of its predicted class.
    result = k_nn_test(dataset, new_features)
    plt.scatter(new_features[0], new_features[1], s=100, color=result)
    plt.show()

代码整理：
训练集下载地址:
https://download.csdn.net/download/yj13811596648/10735063

def loadImage(filename,fsize=40):
    """Load an image file and return it as a flat pixel vector.

    The image is resized to ``fsize`` x ``fsize`` and its pixel array is
    flattened to one dimension.

    Args:
        filename: path of the image file to open.
        fsize: width and height, in pixels, the image is resized to.

    Returns:
        A 1-D NumPy array with the pixel values of the resized image.
        NOTE(review): the length depends on the image mode (presumably
        fsize * fsize * 3 for RGB input) -- confirm all inputs share a mode.
    """
    from PIL import Image

    # Context managers close both image objects even when decoding,
    # resizing or the array conversion raises.
    with Image.open(filename) as image:
        with image.resize((fsize, fsize)) as resized:
            pixels = np.asarray(resized)
    return pixels.ravel()  # one image -> one vector


def loadDataset(root='/Users/he-jia/English_hand_writing/Img/daxie/',fsize=40):
    """Load the upper-case handwriting training set from disk.

    ``root`` is expected to contain one sub-folder per letter ``A``..``Z``;
    every ``.png`` / ``.PNG`` file directly inside a letter folder is loaded
    with ``loadImage`` and labelled with that letter. Letter folders that do
    not exist are treated as empty.

    Args:
        root: directory holding the per-letter sub-folders.
        fsize: edge length passed to ``loadImage`` for every image.

    Returns:
        A tuple ``(X, Y)``: ``X`` is a NumPy matrix with one row per image,
        ``Y`` is a 1-D NumPy array holding the letter label of each row.
    """
    import os

    labels = [chr(ord('A') + i) for i in range(26)]  # 'A' .. 'Z'
    X = []
    Y = []
    for ypath in labels:
        class_dir = os.path.join(root, ypath)
        # Only the first os.walk entry (the folder itself) is needed; the
        # default covers a missing folder instead of raising IndexError.
        _, _, pngfiles = next(os.walk(class_dir), (class_dir, [], []))
        for file in pngfiles:
            if not file.endswith(('.png', '.PNG')):  # skip non-PNG files
                continue
            X.append(loadImage(os.path.join(class_dir, file), fsize))
            Y.append(ypath)
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same conversion.
    return np.asmatrix(X),np.asarray(Y)
#这两个函数都是将列表矩阵化，
#当A文件夹里的训练图片被遍历后，由X转成的矩阵X1最终是个55行4800列的二维矩阵，Y1 =np.asarray(Y)是长度为55的一维数组（每个元素是对应图片的字母标签）
# 55*26 =1430,当A到Z文件夹里的训练图片都被遍历之后，由X转成的矩阵X1最终是个1430行4800列的矩阵，Y1 =np.asarray(Y)是长度为1430的一维数组


import sklearn.neighbors as knnlib
import datetime
begin = datetime.datetime.now()
print(begin)
print('------------------------')

# Load the single image to classify and the labelled training set.
testx =  loadImage('/Users/he-jia/English_hand_writing/test.png')
charX,charY = loadDataset()   # charX: one row per training image; charY: one label per row
# scikit-learn (>= 1.2) rejects np.matrix input, so hand it a plain ndarray.
charX = np.asarray(charX)
k = int(np.sqrt(len(charY)))  # rule of thumb: k = sqrt(number of samples)
# Build the k-NN classifier:
#   algorithm='ball_tree' -> neighbours are searched with a ball tree
#   n_neighbors=k         -> k neighbours take part in the vote
#   weights='distance'    -> closer neighbours get a larger vote
#   p=1                   -> Manhattan distance (Minkowski metric with p=1)
knn = knnlib.KNeighborsClassifier(algorithm = 'ball_tree',n_neighbors=k,weights='distance',p=1)
print(charX.shape)
print(charY.shape)
knn = knn.fit(charX,charY)  # train the k-NN model
# predict() expects a 2-D array with one row per sample; np.mat (used here
# originally) no longer exists in NumPy 2.0, so reshape explicitly.
testx = np.asarray(testx).reshape(1, -1)
y = knn.predict(testx)
print('测试图片结果为：',y)


print('------------------------')
end = datetime.datetime.now()
print(end)

K--最邻近（K-NN）算法

猜你喜欢