一、概述
kNN算法采用测量不同特征值之间的距离方法进行分类。对未知类别属性的数据集中的每个点执行如下操作:
(1)计算已知类别数据集中的点与当前点之间的距离;
(2)按照距离递增次序排序;
(3)选取与当前点距离最小的k个点;
(4)确定前k个点所在类别的出现频率;
(5)返回前k个点出现频率最高的类别作为当前点的预测分类。
二、代码实现
1.基于scikit-learn包实现
# kNN classification of the iris data set using scikit-learn.
import numpy as np
from sklearn import neighbors


def split_data(data, test_size):
    """Randomly split *data* into disjoint train/test row subsets.

    data: 2-D array, one sample per row.
    test_size: fraction in [0, 1] of rows to place in the test set.
    Returns (train_data, test_data).
    """
    data_num = data.shape[0]
    train_ind = list(range(data_num))
    test_ind = []
    test_num = int(data_num * test_size)
    for _ in range(test_num):
        rand_ind = np.random.randint(0, len(train_ind))
        # BUG FIX: record the original row index (train_ind[rand_ind]),
        # not the position in the shrinking train_ind list -- the old code
        # let test rows also remain in the training set.
        test_ind.append(train_ind[rand_ind])
        del train_ind[rand_ind]
    train_data = data[train_ind]
    test_data = data[test_ind]
    return train_data, test_data


if __name__ == "__main__":
    # Load the comma-separated iris data; the last column is the class label.
    mydata = np.loadtxt(open("iris.txt", "rb"), delimiter=",", skiprows=0)
    train_data, test_data = split_data(mydata, 0.3)
    n = mydata.shape[1]
    test_label = test_data[:, n - 1]
    test_data = test_data[:, 0:n - 1]
    train_label = train_data[:, n - 1]
    train_data = train_data[:, 0:n - 1]

    # Fit a default KNeighborsClassifier and predict the held-out rows.
    knn = neighbors.KNeighborsClassifier()
    knn.fit(train_data, train_label)
    print(knn.predict(test_data))
运行结果如下:
2、python代码逐步实现
# Step-by-step (from-scratch) kNN implementation with NumPy.
import numpy as np
import operator


def split_data(data, test_size):
    """Randomly split *data* into disjoint train/test row subsets.

    data: 2-D array, one sample per row.
    test_size: fraction in [0, 1] of rows to place in the test set.
    Returns (train_data, test_data).
    """
    data_num = data.shape[0]
    train_ind = list(range(data_num))
    test_ind = []
    test_num = int(data_num * test_size)
    for _ in range(test_num):
        rand_ind = np.random.randint(0, len(train_ind))
        # BUG FIX: record the original row index (train_ind[rand_ind]),
        # not the position in the shrinking train_ind list -- the old code
        # let test rows also remain in the training set.
        test_ind.append(train_ind[rand_ind])
        del train_ind[rand_ind]
    train_data = data[train_ind]
    test_data = data[test_ind]
    return train_data, test_data


def createDataSet():
    """Return a tiny toy data set: 4 points in 2-D with labels 1/2."""
    group = np.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    # Numeric labels (1 ~ 'A', 2 ~ 'B').
    labels = np.array([1, 1, 2, 2])
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    inX: 1-D feature vector of the query sample.
    dataSet: 2-D array of training samples (one per row).
    labels: label per training row.
    k: number of neighbours to vote.
    Returns the most frequent label among the k Euclidean-nearest rows.
    """
    dataSetSize = dataSet.shape[0]
    # Broadcast the query against every training row and take the
    # Euclidean distance.
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # Most common label wins (ties resolved by sort order).
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def classify1(inX, dataSet, labels, k):
    """Classify every row of *inX*; returns a list of predicted labels.

    Batch front end for classify0 (same distance metric and voting rule).
    """
    # DRY: delegate per-sample work to classify0 instead of duplicating
    # its body, as the original did.
    return [classify0(inX[i, :], dataSet, labels, k)
            for i in range(inX.shape[0])]


if __name__ == "__main__":
    # Load the comma-separated iris data; the last column is the class label.
    mydata = np.loadtxt(open("iris.txt", "rb"), delimiter=",", skiprows=0)
    train_data, test_data = split_data(mydata, 0.3)
    n = mydata.shape[1]
    test_label = test_data[:, n - 1]
    test_data = test_data[:, 0:n - 1]
    train_label = train_data[:, n - 1]
    train_data = train_data[:, 0:n - 1]

    # Predict every test row with the hand-rolled classifier (k = 7).
    result_ind = [classify0(test_data[i, :], train_data, train_label, 7)
                  for i in range(len(test_data))]
    print(result_ind)

    # Alternative batch interface:
    # print(classify1(test_data, train_data, train_label, 3))
运行结果如下: