The k-NN algorithm

k-NN has no real training phase. Given a training set, its labels, and k, compute the distance from the sample to be predicted to every sample in the training set, select the k training samples with the smallest distances, and predict the label that occurs most often among those k.

Examples: classifying dating-site matches and recognizing handwritten digits.

  1. Compute the distance from the input sample to every training sample.
  2. Select the k nearest training samples; the class that occurs most often among them is the prediction.
import numpy as np
import operator
import matplotlib
import matplotlib.pyplot as plt

def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        inX: Feature vector to classify (N values).
        dataSet: Training samples as an M x N NumPy array.
        labels: Sequence of M labels; labels[i] belongs to dataSet[i].
        k: Number of nearest neighbours that vote (1 <= k <= M).

    Returns:
        The label with the most votes among the k nearest training samples.
        If several labels tie, the one met first while walking the
        neighbours from nearest to farthest wins.

    Raises:
        ValueError: If k is smaller than 1 or larger than the number of
            training samples.
    """
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "({}), got {}".format(dataSetSize, k))

    # Broadcasting subtracts inX from every row, so no tiled copy is needed.
    diffMat = np.asarray(inX) - dataSet
    distances = np.sum(diffMat**2, axis=1)**0.5  # Euclidean distance per row
    sortDistances = distances.argsort()  # training indices, nearest first

    classCount = {}
    for neighbourIndex in sortDistances[:k]:
        voteLabel = labels[neighbourIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() returns the first key that reaches the highest count, i.e. the
    # label that entered the vote earliest when counts are tied.
    return max(classCount, key=classCount.get)

def file2matrix(filename):
    """Load a tab-separated sample file into a feature matrix and label list.

    Each non-blank line is expected to hold three numeric feature columns
    followed by an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple (returnMat, classLabelVector). returnMat is an M x 3 float
        array built from the first three columns of each line;
        classLabelVector is the list of M integer labels taken from the
        last column.
    """
    with open(filename) as f:
        # Skip blank lines (for example a trailing empty line at the end of
        # the file); they would otherwise be counted as rows and fail the
        # numeric conversion below.
        rows = [line.strip().split('\t') for line in f if line.strip()]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

def ex3():
    """Show two scatter plots of feature pairs from datingTestSet2.txt.

    The left subplot plots column 1 against column 2 and the right subplot
    plots column 0 against column 1. Marker size and colour are both set to
    15.0 times the class label of each sample.
    """
    features, classLabels = file2matrix("datingTestSet2.txt")
    # The same scaled label array drives both size and colour of the markers.
    scaledLabels = 15.0*np.array(classLabels)

    figure = plt.figure()
    leftAxes = figure.add_subplot(1, 2, 1)
    leftAxes.scatter(features[:, 1], features[:, 2], s=scaledLabels, c=scaledLabels)
    rightAxes = figure.add_subplot(1, 2, 2)
    rightAxes.scatter(features[:, 0], features[:, 1], s=scaledLabels, c=scaledLabels)
    plt.show()

def autoNorm(dataSet):
    """Rescale every feature column to [0, 1] via (value - min) / (max - min).

    A column whose values are all equal has a range of 0; its normalised
    values are set to 0.0 instead of the NaN a plain division would give.

    Args:
        dataSet: M x N NumPy array, one sample per row.

    Returns:
        A tuple (normDataSet, ranges, minVals): the rescaled M x N array,
        the per-column max - min (0 for constant columns), and the
        per-column minimum.
    """
    minVals = dataSet.min(axis=0)
    maxVals = dataSet.max(axis=0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so 0/0 cannot occur; the numerator is
    # already 0 there, so the result is 0.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column values to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

def datingClassTest(normDataSet, ranges, minVals, labels):
    """Evaluate k-NN on a random 70/30 split, then classify a user-entered sample.

    The rows of normDataSet are shuffled; the first 70% become the training
    set and the rest the test set. k is read from standard input, accuracy
    and error rate on the test set are printed, and finally three feature
    values are read from standard input, normalised with minVals and ranges,
    classified against the training set and the prediction is printed.

    Args:
        normDataSet: M x N NumPy array of already normalised samples.
        ranges: Per-column (max - min) used for the normalisation.
        minVals: Per-column minimum used for the normalisation.
        labels: Sequence of M class labels matching the rows of normDataSet.
    """
    m = normDataSet.shape[0]
    numOfTrain = int(m*0.7)
    shuffledIndex = np.arange(m)
    np.random.shuffle(shuffledIndex)
    trainIndex = shuffledIndex[0:numOfTrain]
    testIndex = shuffledIndex[numOfTrain:]

    labels = np.array(labels)
    dataSet = normDataSet[trainIndex, :]
    dataSetLabels = labels[trainIndex]
    testSet = normDataSet[testIndex, :]
    testSetLabels = labels[testIndex]

    k = int(input("Input k: "))
    results = [classify0(inX, dataSet, dataSetLabels, k) for inX in testSet]
    # Count matches pairwise so the result does not depend on how a list
    # compares against an array.
    numCorrect = sum(1 for predicted, expected in zip(results, testSetLabels)
                     if predicted == expected)
    acc = numCorrect/len(testSetLabels)
    print("Accuracy: {:.2f}".format(acc))
    print("Error: {:.2f}".format(1-acc))

    classList = ['not at all', 'in small doses', 'in large doses']
    inX1 = float(input("1: percentage of time spent playing video games? "))
    inX2 = float(input("2: frequent flier miles earned per year? "))
    inX3 = float(input("3: liters of ice cream consumed per year? "))
    # NOTE(review): datingTestSet2.txt from "Machine Learning in Action"
    # stores its columns as (flier miles, game time %, ice cream), so the
    # answers are reordered to match before normalising. Confirm the column
    # order if a different data file is used.
    inXUser = np.array([inX2, inX1, inX3])
    inXUser = (inXUser - minVals)/ranges
    result = classify0(inXUser, dataSet, dataSetLabels, k)
    # NOTE(review): the labels in that file are 1, 2, 3, hence the -1 to
    # index the 0-based classList. Confirm if labels are encoded differently.
    print("Predict: ", classList[int(result) - 1])



if __name__ == '__main__':
    # Step-by-step walk-through of the dating example. Only ex2 is active;
    # uncomment the other steps to run them. ex4 and ex5 use the variables
    # created by ex2, and ex5 additionally needs the outputs of ex4.

    # # -- ex1: classify a single point against a tiny hand-made data set --
    # inX = [1, 1]
    # dataSet = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    # labels = ['A', 'A', 'B', 'B']
    # k = 3
    # classify0(inX, dataSet, labels, k)

    # # -- ex2: load the feature matrix and labels from the data file --
    datingDateMat, datingLables = file2matrix("datingTestSet2.txt")

    # # -- ex3: scatter plots of the data (reads the file again itself) --
    # ex3()

    # # -- ex4: normalise every feature column --
    # normDataSet, ranges, minVals = autoNorm(datingDateMat)

    # # -- ex5: interactive accuracy test and single prediction --
    # datingClassTest(normDataSet, ranges, minVals, datingLables)
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
import operator

def img2vector(filename):
    """Flatten a text file of digit characters into one list of ints.

    Every line is stripped of surrounding whitespace and each remaining
    character is converted with int(); the values are concatenated in file
    order.

    Args:
        filename: Path of the text file to read.

    Returns:
        A flat list with one int per character of the stripped lines.
    """
    with open(filename) as f:
        rows = f.readlines()
    return [int(char) for row in rows for char in row.strip()]


def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        inX: Feature vector to classify (N values).
        dataSet: Training samples as an M x N NumPy array.
        labels: Sequence of M labels; labels[i] belongs to dataSet[i].
        k: Number of nearest neighbours that vote (1 <= k <= M).

    Returns:
        The label with the most votes among the k nearest training samples.
        If several labels tie, the one met first while walking the
        neighbours from nearest to farthest wins.

    Raises:
        ValueError: If k is smaller than 1 or larger than the number of
            training samples.
    """
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "({}), got {}".format(dataSetSize, k))

    # Broadcasting subtracts inX from every row, so no tiled copy is needed.
    diffMat = np.asarray(inX) - dataSet
    distances = np.sum(diffMat**2, axis=1)**0.5  # Euclidean distance per row
    sortDistances = distances.argsort()  # training indices, nearest first

    classCount = {}
    for neighbourIndex in sortDistances[:k]:
        voteLabel = labels[neighbourIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() returns the first key that reaches the highest count, i.e. the
    # label that entered the vote earliest when counts are tied.
    return max(classCount, key=classCount.get)


def handwriting_class_test(data_set, training_labels, test_set, test_labels, k):
    """Classify every test sample with k-NN and print accuracy and error rate.

    Args:
        data_set: Training samples, one per row.
        training_labels: Labels matching the rows of data_set.
        test_set: Samples to classify, one per row.
        test_labels: Expected labels matching the rows of test_set.
        k: Number of nearest neighbours passed on to classify0.
    """
    results = [classify0(sample, data_set, training_labels, k)
               for sample in test_set]
    # Compare pairwise: `results == test_labels` is only element-wise when
    # test_labels is a NumPy array; with a plain list it collapses to a
    # single bool and the accuracy comes out wrong.
    num_correct = sum(1 for predicted, expected in zip(results, test_labels)
                      if predicted == expected)
    acc = num_correct/len(test_labels)
    print("Accuracy: {:.5f}".format(acc))
    print("Error: {:.5f}".format(1-acc))

if __name__ == '__main__':
    # Folder holding the digit text files, with one sub-folder for the
    # training samples and one for the test samples.
    dir_path = r'H:\ML\MachineLearninginAction\02kNN\digits'
    training_path = os.path.join(dir_path, r'trainingDigits')
    test_path = os.path.join(dir_path, r'testDigits')

    training_files_list = os.listdir(training_path)
    test_files_list = os.listdir(test_path)

    # Training matrix and labels: one 1024-value row per file; the label is
    # the part of the file name before the first underscore.
    m = len(training_files_list)
    data_set = np.zeros((m, 1024))
    training_labels = np.zeros(m)
    for i, file_name in enumerate(training_files_list):
        data_set[i] = img2vector(os.path.join(training_path, file_name))
        training_labels[i] = file_name.split('_')[0]

    # Test matrix and labels, built the same way from the test folder.
    mt = len(test_files_list)
    test_set = np.zeros((mt, 1024))
    test_labels = np.zeros(mt)
    for i, file_name in enumerate(test_files_list):
        test_set[i] = img2vector(os.path.join(test_path, file_name))
        test_labels[i] = file_name.split('_')[0]

    k = 3
    handwriting_class_test(data_set, training_labels, test_set, test_labels, k)

 

 

 

Published 46 original articles · won praise 0 · Views 1047

Guess you like

Origin blog.csdn.net/weixin_37680513/article/details/102991085