Pasos del algoritmo:
1. Calcule la distancia entre los datos de entrada y el conjunto de muestra de etiquetas conocido
2. Ordene los valores de distancia calculados en el paso 1.
3. Seleccione los primeros K valores de la clasificación (el origen de K en el algoritmo de proximidad K)
4. Calcule la frecuencia con la que aparece cada etiqueta entre los K valores seleccionados, y utilice la etiqueta con la frecuencia más alta como resultado de la predicción.
Aquí, como fórmula de cálculo de distancia se selecciona la distancia euclidiana, que también es la fórmula de cálculo de distancia más común: $d(x, y) = \sqrt{\sum_{i=1}^{n}(x_i - y_i)^2}$
Conjunto de datos de citas: datingTestSet2.txt
40920 8.326976 0.953952 3
14488 7.153469 1.673904 2
26052 1.441871 0.805124 1
75136 13.147394 0.428964 1
38344 1.669788 0.134296 1
72993 10.141740 1.032955 1
35948 6.830792 1.213192 3
42666 13.276369 0.543880 3
67497 8.631577 0.749278 1
35483 12.273169 1.508053 3
50242 3.723498 0.831917 1
63275 8.385879 1.669485 1
5569 4.875435 0.728658 2
51052 4.680098 0.625224 1
77372 15.299570 0.331351 1
43673 1.889461 0.191283 1
61364 7.516754 1.269164 1
69673 14.239195 0.261333 1
15669 0.000000 1.250185 2
28488 10.528555 1.304844 3
6487 3.540265 0.822483 2
37708 2.991551 0.833920 1
22620 5.297865 0.638306 2
28782 6.593803 0.187108 3
19739 2.816760 1.686209 2
36788 12.458258 0.649617 3
Implementación directa utilizando la biblioteca sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
def file2matrix(filename):
    """Parse a tab-separated dating-data file into features and labels.

    Each line holds three numeric features followed by a label that is
    either a digit (1-3) or one of the textual categories
    'didntLike' / 'smallDoses' / 'largeDoses'.

    Args:
        filename: path to the tab-separated data file.

    Returns:
        (returnMat, classLabelVector): an (n, 3) float ndarray of
        features and a list of int labels (one per line).
    """
    love_dictionary = {
        'largeDoses': 3, 'smallDoses': 2, 'didntLike': 1}
    # Use a context manager so the file handle is always closed
    # (the original opened the file and never closed it).
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    numberOfLines = len(arrayOLines)  # get the number of lines in the file
    returnMat = np.zeros((numberOfLines, 3))  # prepare matrix to return
    classLabelVector = []  # prepare labels return
    for index, line in enumerate(arrayOLines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]
        if listFromLine[-1].isdigit():
            # Numeric label: store directly.
            classLabelVector.append(int(listFromLine[-1]))
        else:
            # Textual label: map through the category dictionary.
            classLabelVector.append(love_dictionary.get(listFromLine[-1]))
    return returnMat, classLabelVector
if __name__ == '__main__':
    # Hold-out evaluation: the first 50% of the rows is the unlabeled
    # test set, the remaining 50% is the known-label sample set.
    hoRatio = 0.50
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')  # load data set from file
    # Min-max normalisation: fit learns per-column min(x)/max(x),
    # transform rescales every column into [0, 1].
    scaler = MinMaxScaler()
    normMat = scaler.fit_transform(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(normMat[numTestVecs:m, :], datingLabels[numTestVecs:m])
    y = model.predict(normMat[0:numTestVecs, :])
    # Count mispredictions: any non-zero difference between predicted
    # and true label is an error. (The original kept an unused
    # errorCount variable and counted errors with a manual loop.)
    error = int(np.count_nonzero(y - datingLabels[0:numTestVecs]))
    print(f'准确率:{(1 - error / numTestVecs) * 100}%')
import numpy as np
from os import listdir
def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest neighbours.

    Args:
        inX: 1-D feature vector to classify.
        dataSet: (n, d) ndarray of known samples, one row per sample.
        labels: sequence of n labels aligned with the rows of dataSet.
        k: number of nearest neighbours to poll.

    Returns:
        The label occurring most often among the k nearest samples
        (on a tie, the label seen first among the neighbours wins).
    """
    dataSetSize = dataSet.shape[0]
    # Euclidean distance from inX to every row of dataSet.
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    sortedDistIndicies = distances.argsort()
    # Tally the labels of the k closest samples.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # Bug fix: the original called operator.itemgetter but `operator` is
    # never imported in this file, so this raised NameError at runtime.
    # max() over the tally needs no import and returns the same winner
    # (first-inserted label on ties, matching the stable reverse sort).
    return max(classCount, key=classCount.get)
def file2matrix(filename):
    """Read a tab-separated dating-data file into a matrix and labels.

    Lines contain three numeric features plus a trailing label, given
    either as a digit (1-3) or as a category name
    ('didntLike' / 'smallDoses' / 'largeDoses').

    Args:
        filename: path to the data file.

    Returns:
        (returnMat, classLabelVector): (n, 3) float ndarray of features
        and a list of int labels.
    """
    love_dictionary = {
        'largeDoses': 3, 'smallDoses': 2, 'didntLike': 1}
    # Fix: open via a context manager — the original leaked the handle.
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    numberOfLines = len(arrayOLines)  # get the number of lines in the file
    returnMat = np.zeros((numberOfLines, 3))  # prepare matrix to return
    classLabelVector = []  # prepare labels return
    for index, line in enumerate(arrayOLines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]
        if listFromLine[-1].isdigit():
            classLabelVector.append(int(listFromLine[-1]))
        else:
            # Translate a textual category to its numeric code.
            classLabelVector.append(love_dictionary.get(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every feature column of dataSet into [0, 1].

    Args:
        dataSet: (n, d) ndarray of raw feature values.

    Returns:
        (normDataSet, ranges, minVals): the min-max-normalised copy of
        dataSet, the per-column value range, and the per-column minimum.
    """
    minVals = dataSet.min(0)
    ranges = dataSet.max(0) - minVals
    # Broadcasting applies the per-column shift and scale to every row,
    # replacing the explicit np.tile copies with the same arithmetic.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals
def datingClassTest():
    """Evaluate the hand-rolled kNN classifier with hold-out validation.

    The first half of 'datingTestSet2.txt' acts as the test set; the
    second half is the labelled reference set. Prints each prediction
    against the true label, the overall accuracy, and the error count.
    """
    hoRatio = 0.50  # fraction of rows held out as the test set
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    total = normMat.shape[0]
    numTestVecs = int(total * hoRatio)
    # The reference (training) portion never changes inside the loop,
    # so slice it once up front.
    refFeatures = normMat[numTestVecs:total, :]
    refLabels = datingLabels[numTestVecs:total]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], refFeatures, refLabels, 3)
        print("返回的分类结果是: %d, 真实的分类标签是: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print('准确率为{:.2f}%'.format((1 - errorCount / float(numTestVecs)) * 100))
    print(errorCount)
# Run the hold-out evaluation when executed as a script.
if __name__ == '__main__':
    datingClassTest()