机器学习kNN算法实战(参考《Machine Learning in Action》)
1. k-近邻算法
knn分类算法( k-Nearest Neighbors classification algorithm)是机器学习中的有监督分类算法。其思想十分简单:
现要判断测试集中某个实例 test_x 的类别:先计算 test_x 到训练集中所有实例 {train_1, train_2, …, train_N} 的距离 {d_1, d_2, …, d_N},然后选出距离最近(即最相似)的 k 个训练集实例,统计这 k 个实例的标签,哪类标签出现的次数最多,就把 test_x 归为哪一类。
举个例子:
如上图所示,训练集有两类标签(红色为label0、蓝色为label1)。设置k=9,我们不妨把点到点的直线距离(欧氏距离)作为相似度度量。可以看到,训练集中与测试实例(灰色)距离最近的9个点中,有3个红色、6个蓝色,所以测试实例可归为label1。
2. python实现最简单的knn算法
使用欧氏距离作为距离度量:
from numpy import *
import operator
def createDataSet():
    """Build the four-point toy training set used by the demo.

    Returns:
        A ``(samples, classes)`` tuple. ``samples`` is a 4x2 array of 2-D
        points and ``classes`` is the list of their labels: the two points
        near (1, 1) are labelled 'A', the two near the origin 'B'.
    """
    labelled_points = (
        ('A', [1.0, 1.1]),
        ('A', [1.0, 1.0]),
        ('B', [0, 0]),
        ('B', [0, 0.1]),
    )
    samples = array([point for _, point in labelled_points])
    classes = [tag for tag, _ in labelled_points]
    return samples, classes
def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean (straight-line) distances between ``inX`` and
    every row of ``dataSet``.

    Args:
        inX: feature vector of the sample to classify (sequence or array
            with one value per feature).
        dataSet: training samples, one row per sample.
        labels: class labels; ``labels[i]`` belongs to ``dataSet[i]``.
        k: number of nearest neighbours that vote; must satisfy
            ``1 <= k <= number of training rows``.

    Returns:
        The label with the most votes among the k nearest rows. On a tie,
        the label whose first vote came from the nearer row wins.

    Raises:
        ValueError: if ``k`` is outside ``1..number of training rows``.
    """
    # Convert to float up front: an integer-valued query such as [0, 0]
    # would otherwise produce an integer array that cannot hold the float
    # differences (the former in-place subtraction raised a casting error).
    dataSet = asarray(dataSet, dtype=float)
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and %d, got %r" % (dataSetSize, k))

    # Euclidean distance from inX to every training row; broadcasting
    # subtracts inX from each row without building a tiled copy.
    diffMat = dataSet - asarray(inX, dtype=float)
    distances = (diffMat ** 2).sum(axis=1) ** 0.5

    # Indices of the training rows ordered by ascending distance.
    sortedDistIndicies = distances.argsort()

    # Count the labels of the k closest rows.
    classCount = {}
    for index in sortedDistIndicies[:k]:
        voteLabel = labels[index]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # sorted() is stable even with reverse=True, so ties keep vote order.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
if __name__ == '__main__':
    # Demo: classify a few hand-picked points against the toy training set.
    dataSet, labels = createDataSet()
    k = 3
    testSet = [[1.1, 1.0], [0.2, 0.3], [0.1, 0.2], [0.9, 0.8]]
    for inX in testSet:
        predicted = classify0(inX, dataSet, labels, k)
        print('inX', inX, '-----> class', predicted)
3.例子:海伦约会
部分数据:
列1:Number of frequent flyer miles earned per year
列2:Percentage of time spent playing video games
列3:Liters of ice cream consumed weekly
列4:分类标签
类1:‘didntLike’,
类2:‘smallDoses’
类3: ‘largeDoses’
40920 8.326976 0.953952 3
14488 7.153469 1.673904 2
26052 1.441871 0.805124 1
75136 13.147394 0.428964 1
38344 1.669788 0.134296 1
72993 10.141740 1.032955 1
35948 6.830792 1.213192 3
42666 13.276369 0.543880 3
67497 8.631577 0.749278 1
35483 12.273169 1.508053 3
50242 3.723498 0.831917 1
63275 8.385879 1.669485 1
5569 4.875435 0.728658 2
51052 4.680098 0.625224 1
77372 15.299570 0.331351 1
43673 1.889461 0.191283 1
61364 7.516754 1.269164 1
69673 14.239195 0.261333 1
15669 0.000000 1.250185 2
28488 10.528555 1.304844 3
#ex_02_02.py
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
import operator
def classify0(inX, dataSet, labels, k):
    """Return the majority label among the k training rows nearest to inX.

    Nearness is measured with the Euclidean distance.

    Args:
        inX: feature vector to classify (sequence or array, one value per
            feature).
        dataSet: training matrix, one row per sample.
        labels: class labels, ``labels[i]`` describing ``dataSet[i]``.
        k: how many nearest rows take part in the vote; must satisfy
            ``1 <= k <= number of training rows``.

    Returns:
        The most frequent label among the k nearest rows. If several labels
        share the top count, the one that received its first vote from the
        nearer row is returned.

    Raises:
        ValueError: if ``k`` is outside ``1..number of training rows``
            (previously this surfaced as an unexplained IndexError).
    """
    dataSet = asarray(dataSet, dtype=float)
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and %d, got %r" % (dataSetSize, k))

    # Distance from inX to every row; broadcasting replaces the tiled copy.
    diffMat = asarray(inX, dtype=float) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    # Tally the labels of the k closest rows.
    classCount = {}
    for index in sortedDistIndicies[:k]:
        voteLabel = labels[index]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # Stable sort: labels with equal counts keep the order of their first vote.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
# Load the data set from a file
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Every non-blank line must contain at least four tab-separated fields:
    the first three are read as float features and the last one as the
    integer class label. Blank lines are skipped.

    Args:
        filename: path of the text file to read.

    Returns:
        ``(dataSet, classLabelVector)``: an (n, 3) float array holding the
        features and a list with the n integer labels, in file order.

    Raises:
        ValueError: if a line has fewer than four fields or a field cannot
            be converted to a number.
    """
    # The context manager closes the file even if parsing fails below.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    dataSet = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        if len(fields) < 4:
            # A short row would otherwise be silently broadcast over the
            # three feature columns.
            raise ValueError(
                "record %d of %s has %d field(s), expected at least 4"
                % (index + 1, filename, len(fields)))
        dataSet[index] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return dataSet, classLabelVector
# Normalise the data set
def autoNorm(dataSet):
    """Rescale every column of dataSet to the range [0, 1] (min-max scaling).

    Each value becomes ``(value - column_min) / column_range``.

    Args:
        dataSet: 2-D array-like, one row per sample and one column per
            feature.

    Returns:
        ``(normDataSet, ranges, minval)``:
        normDataSet - the rescaled data as a float array of the same shape;
        ranges - per-column divisor, i.e. ``max - min``, except that a zero
            range (constant column) is reported as 1 so that
            ``(x - minval) / ranges`` stays finite for new samples;
        minval - per-column minimum.
    """
    dataSet = asarray(dataSet, dtype=float)
    minval = dataSet.min(0)
    ranges = dataSet.max(0) - minval
    # A constant column has range 0; dividing by it would fill the column
    # with NaN. Using 1 as the divisor maps such a column to all zeros.
    ranges = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column shift and scale to every row.
    normDataSet = (dataSet - minval) / ranges
    return normDataSet, ranges, minval
# Hold-out test of the classifier
def datingClassTest():
    """Estimate the k-NN error rate on the dating data with a 10% hold-out.

    Reads 'datingTestSet2.txt', normalises the features, uses the first 10%
    of the rows as test samples and the remaining rows as training data,
    classifies each test sample with k=3, and prints every prediction
    followed by the overall error rate in percent.

    Returns:
        None. Results are printed only.
    """
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat = autoNorm(datingDataMat)[0]
    m = normMat.shape[0]
    hoRatio = 0.10  # 10% of the rows are held out for testing, 90% train
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # With fewer than 10 rows there is nothing to hold out, and the
        # error rate below would divide by zero.
        print("Not enough samples to hold out a test set (m=%d)" % m)
        return

    # The training part is the same for every test sample, so slice it once
    # instead of rebuilding it on every iteration.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0
    for i in range(numTestVecs):
        calssifierResult = classify0(normMat[i, :], trainMat, trainLabels, 3)
        print("The calssifier came back with: %d, the real answer is: %d"
              % (calssifierResult, datingLabels[i]))
        if calssifierResult != datingLabels[i]:
            errorCount += 1
    print("The total error rate is: %f%%"
          % (errorCount / float(numTestVecs) * 100))
# Complete interactive system
def classfiPerson():
    """Ask for one person's three features and print the predicted liking.

    Prompts on standard input for the three feature values, loads and
    normalises 'datingTestSet2.txt', scales the entered values with the same
    minimum and range, classifies them with k=3 and prints the matching
    description.

    Returns:
        None. The prediction is printed only.

    Raises:
        ValueError: if an answer cannot be parsed as a float.
    """
    # Index i holds the description for class label i + 1.
    resultList = ['Not at all', 'In small doses', 'In large doses']
    ffMiles = float(input("frequent flier miles earned per year?"))
    percenTals = float(input("Percentage of time spent playing video games?"))
    # The training column is litres consumed per week, so the prompt must ask
    # for a weekly figure; a yearly amount would be on the wrong scale.
    iceCream = float(input("Liters of ice cream consumed per week?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minval = autoNorm(datingDataMat)
    inArr = array([ffMiles, percenTals, iceCream])
    # Scale the new sample exactly like the training data before comparing.
    calssifierResult = classify0((inArr - minval) / ranges, normMat,
                                 datingLabels, 3)
    print("You will probably like this person: ",
          resultList[calssifierResult - 1])
if __name__ == '__main__':
    # The raw data loaded here is only used by the optional scatter plot
    # below; the two routines that follow read the file themselves.
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    datingClassTest()
    classfiPerson()
    # Optional visualisation (disabled): scatter of column 1 against
    # column 2, with marker size and colour scaled by the class label.
    #
    # fig = plt.figure()
    # ax = fig.add_subplot(111)
    # ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
    #            15.0 * array(datingLabels), 15.0 * array(datingLabels))
    # plt.show()
4. 手写数字识别系统
#ex_02_03.py手写数字识别
from numpy import *
import operator
from os import listdir
from sklearn.neighbors import KNeighborsClassifier as kNN
def classify0(inX, dataSet, labels, k):
    """Predict the label of inX from its k nearest training rows.

    The Euclidean distance between ``inX`` and every row of ``dataSet`` is
    computed, and the label that occurs most often among the k closest rows
    is returned.

    Args:
        inX: sample to classify; a flat feature vector or a single-row
            matrix with one value per feature.
        dataSet: training matrix, one row per sample.
        labels: class labels, ``labels[i]`` describing ``dataSet[i]``.
        k: number of nearest rows that vote; must satisfy
            ``1 <= k <= number of training rows``.

    Returns:
        The label with the highest vote count. Labels with equal counts are
        ranked by the order in which they received their first vote, so the
        nearer one wins.

    Raises:
        ValueError: if ``k`` is outside ``1..number of training rows``
            (previously this surfaced as an unexplained IndexError).
    """
    dataSet = asarray(dataSet, dtype=float)
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and %d, got %r" % (dataSetSize, k))

    # Broadcasting subtracts inX from every training row, so no tiled copy
    # of inX is needed; a single-row inX broadcasts the same way.
    diffMat = asarray(inX, dtype=float) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5

    # Training-row indices ordered by ascending distance.
    sortedDistIndicies = distances.argsort()

    classCount = {}
    for index in sortedDistIndicies[:k]:  # the k closest rows
        voteLabel = labels[index]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # Stable sort keeps first-vote order among labels with equal counts.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def img2Vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    The file must contain 32 non-blank lines of 32 digit characters each
    (the tutorial's images use only '0' and '1'). Blank lines, such as a
    trailing empty line, are ignored.

    Args:
        filename: path of the text image.

    Returns:
        A float array of shape (1, 1024) holding the pixels row by row.

    Raises:
        ValueError: if the image is not 32 lines of 32 characters, or a
            character is not a digit.
    """
    # The context manager closes the file; the old code left it open.
    with open(filename) as fr:
        rows = [line.strip() for line in fr]
    rows = [row for row in rows if row]
    if len(rows) != 32:
        raise ValueError(
            "%s: expected 32 image rows, found %d" % (filename, len(rows)))

    returnVect = zeros((32, 32))
    for i, row in enumerate(rows):
        if len(row) != 32:
            raise ValueError(
                "%s: row %d has %d characters, expected 32"
                % (filename, i + 1, len(row)))
        returnVect[i] = [int(ch) for ch in row]
    return returnVect.reshape(1, 1024)
def _digitFromFileName(fileNameStr):
    """Return the digit that prefixes a '<digit>_<index>.<ext>' file name, or None."""
    classStr = fileNameStr.split('.')[0].split('_')[0]
    try:
        return int(classStr)
    except ValueError:
        return None


def _listDigitFiles(dirName):
    """Return sorted (fileName, digit) pairs for the digit files in dirName."""
    pairs = []
    # Sorted because os.listdir returns names in arbitrary order.
    for fileNameStr in sorted(listdir(dirName)):
        digit = _digitFromFileName(fileNameStr)
        if digit is not None:  # skip stray entries such as '.DS_Store'
            pairs.append((fileNameStr, digit))
    return pairs


def handWritingClassTest():
    """Evaluate k-NN digit recognition on the 'testDigits' folder.

    Every file in 'trainingDigits' is loaded as one 1024-value training row,
    labelled with the digit that prefixes its file name. Each file in
    'testDigits' is then classified with k=3 and compared with the digit in
    its own name. Every prediction is printed, followed by the number of
    errors and the error rate. Entries whose names do not start with a
    number are ignored.

    Returns:
        None. Results are printed only.
    """
    trainingFiles = _listDigitFiles('trainingDigits')
    hwLabels = [digit for _, digit in trainingFiles]
    trainingMat = zeros((len(trainingFiles), 1024))
    for i, (fileNameStr, _) in enumerate(trainingFiles):
        trainingMat[i, :] = img2Vector('trainingDigits/%s' % fileNameStr)

    testFiles = _listDigitFiles('testDigits')
    mTest = len(testFiles)
    if mTest == 0:
        # Nothing to evaluate; the error rate would divide by zero.
        print("no test files found in testDigits")
        return

    errorCount = 0
    for fileNameStr, classNumStr in testFiles:
        vectorUnderTest = img2Vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat,
                                     hwLabels, 3)
        print("classifierResult is: %d the real answer is: %d "
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
if __name__ == "__main__":
    # Run the handwritten-digit evaluation when executed as a script.
    handWritingClassTest()
某个数字2的二值图:
00000000000000100000000000000000
00000000000001111100000000000000
00000000011111111110000000000000
00000000111111111110000000000000
00000001111111111110000000000000
00000001111111111111000000000000
00000011111111101111110000000000
00000011111100001111100000000000
00000011111000000111100000000000
00000111110000000111110000000000
00000111110000000011100000000000
00000011100000000111100000000000
00000000000000000111000000000000
00000000000000000011100000000000
00000000000000000111100000000000
00000000000000000111100000000000
00000000000000000111100000000000
00000000000000001111000000000000
00000000000000001111000000000000
00000000000000001110000000000000
00000000000000011111000000000000
00000000000000011111000000000000
00000000000000111110000000000000
00000000000001111100000000000000
00000000000011111111110000000000
00000000000111111111111111000000
00000000111111111111111111100000
00000000011111111111111111110000
00000000011111111111111111110000
00000000011111111111111111110000
00000000111111111110000000100000
00000000000110000000000000000000