Machine Learning in Action notes: the k-nearest neighbors algorithm, with function explanations

The main purpose of this article is to explain, on top of the book Machine Learning in Action and its practice code, the principles and the special functions used, and to give hyperlinks to the corresponding references.
 
 
Chapter 2: the first classifier, a "men of interest" dating classifier (implementing the k-nearest neighbors algorithm)
from numpy import *
import operator
import numpy as np
def createDataSet():

    group = np.array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]])
    labels = ['A','A','B','B']
    return group, labels

def classify0(inx,dataSet,labels,k):
    dataSetSize=dataSet.shape[0]
    # NumPy functions: usage of shape  https://jingyan.baidu.com/article/a24b33cd5c90b319fe002b9e.html
    diffMat=tile(inx,(dataSetSize,1))-dataSet
    # [Python series] the tile function in NumPy  http://blog.csdn.net/ksearch/article/details/21388985
    sqDiffMat=diffMat**2
    # ** is the power operator: 2**4 is 2 to the fourth power, i.e. 16; here every element is squared individually
    sqDistances=sqDiffMat.sum(axis=1)
    # Python's sum function: .sum(axis=1) sums along each row  http://www.cnblogs.com/yyxayz/p/4033736.html
    distances=sqDistances**0.5
    sortedDistIndcies=distances.argsort()
    # A brief look at the argsort() function in Python  http://www.cnblogs.com/yyxf1413/p/6253995.html
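    # for example, np.array([1.3, 0.1, 0.5]).argsort() returns array([1, 2, 0]):
    # the indices that sort the distances in ascending order, so sortedDistIndcies[0] is the index of the closest training sample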
    classCount={}
    # Differences between curly braces, square brackets and parentheses in Python  https://zhidao.baidu.com/question/484920124.html
    for i in range(k):
    # Detailed notes on Python's range() function  http://www.cnblogs.com/buro79xxd/archive/2011/05/23/2054493.html
        voteIlabel=labels[sortedDistIndcies[i]]
        classCount[voteIlabel]=classCount.get(voteIlabel,0)+1
        # The Python dictionary get() method  http://www.runoob.com/python/att-dictionary-get.html
    sortedClassCount=sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    # The operator.itemgetter function in Python  http://www.cnblogs.com/zhoufankui/p/6274172.html
    # Python dicts: in Python 2 items() returns a list and iteritems() an iterator; in Python 3 items() returns a view  http://www.iplaypy.com/jinjie/items-iteritems.html
    # [Python] the sorted function  http://www.cnblogs.com/sysu-blackbear/p/3283993.html
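    # for example, sorted({'A': 1, 'B': 2}.items(), key=operator.itemgetter(1), reverse=True)
    # returns [('B', 2), ('A', 1)], so sortedClassCount[0][0] below is the most frequent label among the k neighbours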
    return sortedClassCount[0][0]
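
A quick way to sanity check classify0 is to run it on the toy data from createDataSet. A minimal sketch, using only the functions defined above (the query point [0, 0.2] is just an illustrative value):

group, labels = createDataSet()
print(classify0([0, 0.2], group, labels, 3))
# prints 'B': two of the three nearest neighbours of [0, 0.2] are labelled 'B'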

def file2matrix(filename):
    fr=open(filename)
    arrayOLines=fr.readlines()
    # read all lines at once
    numberOfLines=len(arrayOLines)
    returnMat=np.zeros((numberOfLines,3))
    classLabelVector=[]
    index=0
    for line in arrayOLines:
        line=line.strip()
        # strip the trailing newline
        listFromLine=line.split('\t')
        returnMat[index,:]=listFromLine[0:3]
        # the first three columns form the feature matrix
        classLabelVector.append(int(listFromLine[-1]))
        # the last column is the class label
        index+=1
    return returnMat,classLabelVector
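
file2matrix expects a tab-separated text file whose first three columns are numeric features and whose last column is an integer class label (as in the book's datingTestSet2.txt, where the labels are 1, 2 or 3). A minimal sketch of a call, assuming a copy of that file sits in the current working directory:

# each line of the file is expected to look roughly like: 40920\t8.326976\t0.953952\t3
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
print(datingDataMat.shape)   # (number of lines, 3) feature matrix
print(datingLabels[:5])      # the first few integer labels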

def autoNorm(dataset):
    minVals=dataset.min(0)
    maxVals=dataset.max(0)
    # .min() on NumPy arrays  http://blog.csdn.net/qq_18433441/article/details/54743271
    ranges=maxVals-minVals
    normDataSet=np.zeros(shape(dataset))
    m=dataset.shape[0]
    normDataSet=dataset-tile(minVals,(m,1))
    normDataSet=normDataSet/tile(ranges,(m,1))
    return normDataSet,ranges,minVals
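
autoNorm rescales every column to the [0, 1] range with (value - min) / (max - min), column by column. A small sketch with made-up numbers shows the effect:

sample = np.array([[0.0, 10.0], [5.0, 20.0], [10.0, 30.0]])
normed, ranges, minVals = autoNorm(sample)
print(minVals)   # column minimums: [0., 10.]
print(ranges)    # column ranges: [10., 20.]
print(normed)    # rows become [0, 0], [0.5, 0.5], [1, 1]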

def classifyPerson():
    resultList=['not at all','in small doses', 'in large doses']
    percentTats=float(input("percentage of time spent playing video games?"))
    ffMiles=float(input("frequent flier miles earned per year?"))
    iceCream=float(input("liters of ice cream consumed per year?"))
    datingDataMat,datingLabels=file2matrix(r'C:\Users\卢雨辰\Desktop\course resource·\machinelearninginaction\Ch02\datingTestSet2.txt')
    normMat,ranges,minVals=autoNorm(datingDataMat)
    inArr=np.array([ffMiles,percentTats,iceCream])
    Result=classify0((inArr-minVals)/ranges,normMat,datingLabels,3)
    print("You will probably like this person:"+resultList[Result-1])
    print(normMat[0:10])
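
classifyPerson ties the pieces together: it prompts for the three feature values, loads and normalizes the dating data, scales the input with the same ranges and minVals, and prints one of the three categories. The file path above is machine specific, so adjust it before running. A minimal driver:

classifyPerson()
# prompts for the three numbers, then prints something like
# "You will probably like this person:in small doses" (one of the three categories)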


Reprinted from blog.csdn.net/u013166171/article/details/78088222