# 【机器学习实战】 kNN - 约会对象

import numpy as np
#import operator

# Location of the tab-separated dating data file read by loadDataset().
# NOTE(review): `path` is not defined anywhere in the visible source; it has to
# be bound to the data directory before this line runs, otherwise this
# statement raises NameError - confirm where it is supposed to come from.
filename = path + '/datingTestSet.txt'

# Load the data set
def loadDataset(filename):
    """Read a tab-separated data file into feature rows and class labels.

    Every non-blank line is split on tabs. The first three columns are
    converted to ``float`` and form one feature row; the last column is
    taken as the class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(dataSet, labelSet)`` tuple. ``dataSet`` is a list of
        three-element lists of floats, ``labelSet`` is a list of label
        strings with surrounding whitespace (including the line terminator)
        removed. Both lists have one entry per non-blank line.

    Raises:
        ValueError: If one of the first three columns is not numeric.
        IndexError: If a non-blank line has fewer than three columns.
    """
    dataSet = []
    labelSet = []
    with open(filename) as fr:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in fr:
            if not line.strip():
                # Tolerate blank lines (e.g. an empty line at the end of the file).
                continue
            fields = line.rstrip('\r\n').split('\t')
            # Make sure the features are numeric.
            dataSet.append([float(fields[0]), float(fields[1]), float(fields[2])])
            # Strip the label: without this every label keeps its '\n', and the
            # last line of a file lacking a final newline would produce a
            # different string for the same class.
            labelSet.append(fields[-1].strip())
    return dataSet, labelSet

# Min-max normalisation: newValue = (oldValue - min) / (max - min)
def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range [0, 1].

    Args:
        dataSet: Two-dimensional numeric data (list of rows or ndarray),
            one sample per row and one feature per column.

    Returns:
        A float ndarray of the same shape where each column has been mapped
        with ``(value - column_min) / (column_max - column_min)``. A column
        whose values are all equal is mapped to 0 instead of producing NaN.
        Empty input is returned as an empty float array.
    """
    data = np.asarray(dataSet, dtype=float)  # array form is needed for the arithmetic
    if data.size == 0:
        # min()/max() cannot reduce an empty array; nothing to normalise.
        return data

    col_min = data.min(0)  # per-feature minimum (reduction over the rows)
    col_range = data.max(0) - col_min
    # A constant feature has range 0; divide by 1 there so the column becomes
    # all zeros rather than NaN from 0/0.
    safe_range = np.where(col_range == 0, 1.0, col_range)

    # Broadcasting applies the per-column vectors to every row, so the
    # minimum/maximum do not have to be tiled to the data set's shape.
    return (data - col_min) / safe_range

# Build the classification model
def Knn_Classifier_Model(inX,dataSet,labelSet,k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: Feature vector of the sample to classify (one value per feature).
        dataSet: Two-dimensional training data (list of rows or ndarray),
            one sample per row, same number of features as ``inX``.
        labelSet: Sequence of labels; ``labelSet[i]`` belongs to row ``i`` of
            ``dataSet``.
        k: Number of neighbours that vote. Values larger than the number of
            training samples are reduced to that number.

    Returns:
        The label with the most votes. When several labels have the same
        count, the one whose first vote came from the nearest neighbour wins.

    Raises:
        ValueError: If ``dataSet`` has no samples or ``k`` is smaller than 1.
    """
    samples = np.asarray(dataSet)  # also accept a plain list of rows
    m = samples.shape[0]
    if m == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be a positive integer")
    k = min(k, m)  # cannot consult more neighbours than there are samples

    # Euclidean distance from inX to every training row; broadcasting applies
    # inX to each row and sum(axis=1) adds the squared differences per row.
    distance = np.sqrt(((samples - np.asarray(inX)) ** 2).sum(axis=1))
    nearest = distance.argsort()[:k]  # indices of the k smallest distances

    class_count = {}  # label -> number of votes among the k nearest
    for idx in nearest:
        label = labelSet[idx]
        class_count[label] = class_count.get(label, 0) + 1
    # max() returns the first label with the highest count in insertion order,
    # i.e. ties are resolved in favour of the label seen first (nearest).
    return max(class_count, key=class_count.get)

# Hold-out evaluation
def test():
    """Print the error rate of the classifier on a hold-out split.

    Loads the data from the module-level ``filename``, normalises it with
    ``autoNorm`` and uses the first 10% of the rows as test samples. Each test
    sample is classified with ``Knn_Classifier_Model`` (k = 3) against the
    remaining rows, and the fraction of wrong predictions is printed.
    """
    features, labels = loadDataset(filename)
    normalized = autoNorm(features)

    total = len(normalized)
    holdout = int(total * 0.1)  # the leading 10% of the rows form the test set

    # Everything after the hold-out rows is the training data.
    train_rows = normalized[holdout:total]
    train_labels = labels[holdout:total]

    # One miss per test row whose predicted label differs from the true one.
    misses = sum(
        (
            1.0
            for idx in range(holdout)
            if Knn_Classifier_Model(normalized[idx], train_rows, train_labels, 3) != labels[idx]
        ),
        0.0,
    )
    print("错误率:", misses / float(holdout))

# Run the hold-out evaluation only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    test()

# 猜你喜欢
#
# 转载自 blog.csdn.net/weixin_37392582/article/details/80434771