k-NN——算法实现

k-NN 没有显式的训练过程:给定训练集、对应的标签和 k,计算待预测样本到训练集中每个样本的距离,选取距离最小的前 k 个训练样本,这 k 个样本中出现次数最多的标签即为预测标签。

本文包含两个示例:约会对象类型分类、手写数字识别分类。

  1. 计算输入数据到每一个训练数据的距离
  2. 选择前k个,判断其中类别最多的类作为预测类
import numpy as np
import operator
import matplotlib
import matplotlib.pyplot as plt

def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        inX: Feature vector to classify, N values (array-like).
        dataSet: Training samples, one row per sample (M x N, array-like).
        labels: Sequence of M class labels; labels[i] belongs to dataSet[i].
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the k training samples closest
        to inX by Euclidean distance. On a tied vote, the tied label whose
        first voter is nearest wins.

    Raises:
        ValueError: If k is smaller than 1 or larger than the number of
            training samples.
    """
    # Accept plain lists as well as arrays.
    dataSet = np.asarray(dataSet, dtype=float)
    inX = np.asarray(inX, dtype=float)
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "({}), got {}".format(dataSetSize, k)
        )

    # Broadcasting subtracts inX from every row; no need to tile it first.
    diffMat = dataSet - inX
    distances = np.sum(diffMat ** 2, axis=1) ** 0.5
    nearest = distances.argsort()[:k]

    # Tally the labels of the k nearest samples, nearest first.
    classCount = {}
    for neighbour in nearest:
        voteLabel = labels[neighbour]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() keeps the first maximal entry, so ties go to the label that
    # entered the tally first.
    return max(classCount.items(), key=operator.itemgetter(1))[0]

def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is split on tabs; its first three fields are read
    as float features and its last field as an integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple (returnMat, classLabelVector): a float array with one row of
        three features per non-blank line, and a list with the matching
        integer labels.

    Raises:
        ValueError: If a feature or label field cannot be parsed as a number.
    """
    with open(filename) as f:
        # Blank lines (e.g. an empty last line) carry no record and would
        # otherwise fail number parsing, so they are skipped.
        rows = [line.strip().split('\t') for line in f if line.strip()]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector

def ex3():
    """Show two side-by-side scatter plots of the dating data.

    The left plot draws feature column 1 against column 2, the right plot
    column 0 against column 1. Marker size and colour are both 15 times the
    sample's label.
    """
    features, labelList = file2matrix("datingTestSet2.txt")
    figure = plt.figure()
    # (x column, y column) for the left and right subplot respectively.
    for position, (xCol, yCol) in enumerate(((1, 2), (0, 1)), start=1):
        axes = figure.add_subplot(1, 2, position)
        axes.scatter(
            features[:, xCol],
            features[:, yCol],
            s=15.0*np.array(labelList),
            c=15.0*np.array(labelList),
        )
    plt.show()

def autoNorm(dataSet):
    """Min-max scale every feature column: (value - min) / (max - min).

    Args:
        dataSet: Samples as an M x N numeric array-like, one row per sample.

    Returns:
        A tuple (normDataSet, ranges, minVals): the scaled data, the
        per-column max - min, and the per-column minimum. A column whose
        values are all equal has range 0 and is scaled to 0.0 rather than
        nan.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    minVals = dataSet.min(axis=0)
    maxVals = dataSet.max(axis=0)
    ranges = maxVals - minVals
    # Dividing by a zero range would yield nan; divide by 1 there instead.
    # The numerator is already 0 for such a column.
    safeRanges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column values to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

def datingClassTest(normDataSet, ranges, minVals, labels):
    """Evaluate k-NN on a random 70/30 split, then classify one typed-in sample.

    Reads k and three feature values from standard input, and prints the
    accuracy and error rate on the held-out 30% followed by the predicted
    class name for the typed-in sample.

    Args:
        normDataSet: Normalised feature matrix, one row per sample.
        ranges: Per-feature divisor applied to the typed-in sample.
        minVals: Per-feature offset subtracted from the typed-in sample.
        labels: Sequence of integer class labels, one per row of normDataSet.
    """
    m = normDataSet.shape[0]
    numOfTrain = int(m*0.7)
    shuffled = np.arange(m)
    np.random.shuffle(shuffled)
    trainIdx, testIdx = shuffled[:numOfTrain], shuffled[numOfTrain:]
    labels = np.array(labels)
    dataSet, dataSetLabels = normDataSet[trainIdx, :], labels[trainIdx]
    testSet, testSetLabels = normDataSet[testIdx, :], labels[testIdx]

    k = int(input("Input k: "))
    results = np.array([classify0(inX, dataSet, dataSetLabels, k) for inX in testSet])
    acc = np.count_nonzero(results == testSetLabels)/len(testSetLabels)
    print("Accuracy: {:.2f}".format(acc))
    print("Error: {:.2f}".format(1-acc))

    classList = ['not at all', 'in small doses', 'in large doses']
    inX1 = float(input("1: percentage of time spent playing video games? "))
    inX2 = float(input("2: frequent flier miles earned per year? "))
    inX3 = float(input("3: liters of ice cream consumed per year? "))
    # NOTE(review): the sample is assembled in prompt order; confirm that this
    # matches the column order of the data passed in as normDataSet.
    inXUser = (np.array([inX1, inX2, inX3]) - minVals)/ranges
    result = classify0(inXUser, dataSet, dataSetLabels, k)
    # classList is positional, but labels are whatever integers the data
    # holds. Indexing with the raw label is only right for labels 0..2
    # (labels 1..3 would be shifted and overflow), so use the label's rank
    # among the distinct label values instead.
    classValues = np.unique(labels)
    print("Predict: ", classList[int(np.searchsorted(classValues, result))])


if __name__ == '__main__':
    # Step-by-step walk-through of the exercises. Only ex2 is enabled; the
    # later steps use names bound by earlier ones (ex4 needs ex2's matrix,
    # ex5 needs ex4's results), so uncomment them in order.

    # -- ex1: classify one point against a four-sample toy data set --
    # inX = [1, 1]
    # dataSet = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    # labels = ['A', 'A', 'B', 'B']
    # k = 3
    # classify0(inX, dataSet, labels, k)

    # -- ex2: load the dating data into a feature matrix and a label list --
    datingDateMat, datingLables = file2matrix("datingTestSet2.txt")

    # -- ex3: scatter plots of the dating features --
    # ex3()

    # -- ex4: min-max normalise the feature matrix --
    # normDataSet, ranges, minVals = autoNorm(datingDateMat)

    # -- ex5: measure accuracy on a random split, then classify user input --
    # datingClassTest(normDataSet, ranges, minVals, datingLables)
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
import operator

def img2vector(filename):
    """Flatten a text image into a single list of integers.

    Every character of every line (with surrounding whitespace stripped) is
    converted with int() and appended in reading order.

    Args:
        filename: Path of the text file holding one image.

    Returns:
        A flat list of ints, one per character in the file.
    """
    with open(filename) as image_file:
        rows = [raw_line.strip() for raw_line in image_file]
    return [int(pixel) for row in rows for pixel in row]


def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        inX: Feature vector to classify, N values (array-like).
        dataSet: Training samples, one row per sample (M x N, array-like).
        labels: Sequence of M class labels; labels[i] belongs to dataSet[i].
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the k training samples closest
        to inX by Euclidean distance. On a tied vote, the tied label whose
        first voter is nearest wins.

    Raises:
        ValueError: If k is smaller than 1 or larger than the number of
            training samples.
    """
    # Accept plain lists as well as arrays.
    dataSet = np.asarray(dataSet, dtype=float)
    inX = np.asarray(inX, dtype=float)
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "({}), got {}".format(dataSetSize, k)
        )

    # Broadcasting subtracts inX from every row; no need to tile it first.
    diffMat = dataSet - inX
    distances = np.sum(diffMat ** 2, axis=1) ** 0.5
    nearest = distances.argsort()[:k]

    # Tally the labels of the k nearest samples, nearest first.
    classCount = {}
    for neighbour in nearest:
        voteLabel = labels[neighbour]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() keeps the first maximal entry, so ties go to the label that
    # entered the tally first.
    return max(classCount.items(), key=operator.itemgetter(1))[0]


def handwriting_class_test(data_set, training_labels, test_set, test_labels, k):
    """Classify every test sample with k-NN and print accuracy and error rate.

    Args:
        data_set: Training samples, one row per sample.
        training_labels: Labels of the training samples, in row order.
        test_set: Samples to classify, one row per sample.
        test_labels: Expected labels of the test samples, in row order.
        k: Number of nearest neighbours that vote on each prediction.
    """
    predicted = np.array(
        [classify0(sample, data_set, training_labels, k) for sample in test_set]
    )
    # Compare array against array. With a plain-list test_labels, a
    # list == list comparison would collapse to one bool and corrupt the count.
    expected = np.asarray(test_labels)
    acc = np.count_nonzero(predicted == expected)/len(expected)
    print("Accuracy: {:.5f}".format(acc))
    print("Error: {:.5f}".format(1-acc))

def _load_digit_directory(directory):
    """Read every image file in *directory* into a sample matrix and labels.

    Each file is flattened with img2vector into one 1024-value row. Its label
    is the part of the file name before the first underscore, stored as a
    float.

    Returns:
        A tuple (samples, labels): a (file count x 1024) array and a vector
        with one label per file, both in os.listdir order.
    """
    file_names = os.listdir(directory)
    samples = np.zeros((len(file_names), 1024))
    labels = np.zeros(len(file_names))
    for row, file_name in enumerate(file_names):
        samples[row] = img2vector(os.path.join(directory, file_name))
        labels[row] = float(file_name.split('_')[0])
    return samples, labels


if __name__ == '__main__':
    import sys

    # The data set location defaults to the original author's path; pass a
    # different directory as the first command-line argument to override it.
    dir_path = sys.argv[1] if len(sys.argv) > 1 else r'H:\ML\MachineLearninginAction\02kNN\digits'
    training_path = os.path.join(dir_path, r'trainingDigits')
    test_path = os.path.join(dir_path, r'testDigits')

    # Training matrix and labels, then test matrix and labels.
    data_set, training_labels = _load_digit_directory(training_path)
    test_set, test_labels = _load_digit_directory(test_path)

    k = 3
    handwriting_class_test(data_set, training_labels, test_set, test_labels, k)
发布了46 篇原创文章 · 获赞 0 · 访问量 1047

猜你喜欢

转载自blog.csdn.net/weixin_37680513/article/details/102991085