# 机器学习实战——k-近邻算法（代码）

from numpy import *
import operator
import os

# Build a tiny data set for testing
def createDataset():
    """Return a four-point toy data set together with its class labels.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array holding one
        point per row and ``labels`` is a list with the class name of each row.
    """
    labels = list('AABB')
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    return array(samples), labels

# k-nearest-neighbours classifier
def classifier0(inX, dataset, labels, k):
    """Classify ``inX`` by a vote among its ``k`` nearest rows of ``dataset``.

    Args:
        inX: the point to classify.
        dataset: 2-D array of known points, one per row.
        labels: sequence giving the class label of each row of ``dataset``.
        k: how many of the nearest rows take part in the vote.

    Returns:
        The label that collected the most votes among those ``k`` rows.
    """
    rowCount = dataset.shape[0]
    # Repeat inX once per row so it can be subtracted from dataset row by row.
    offsets = tile(inX, (rowCount, 1)) - dataset
    squared = offsets ** 2
    euclidean = squared.sum(axis=1) ** 0.5
    # Row indices ordered from the closest row to the farthest one.
    nearestFirst = euclidean.argsort()

    votes = {}
    for rank in range(k):
        neighbourLabel = labels[nearestFirst[rank]]
        if neighbourLabel in votes:
            votes[neighbourLabel] += 1
        else:
            votes[neighbourLabel] = 1

    # Order the (label, count) pairs by count, highest first.
    ranking = list(votes.items())
    ranking.sort(key=operator.itemgetter(1), reverse=True)
    return ranking[0][0]

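# Notes on the functions used above
# tile(A, (m, n)): builds a new array by repeating A m times along the rows and n times along the columns.
# numpy.argsort(): returns the indices that would sort the array in ascending order.
# dict.get(key, default=None): returns the value stored under key; if the key is not in the dict it returns default (None unless another default is given).
# sorted(iterable, *, key=None, reverse=False)  -- Python 3 signature
# iterable: the iterable object to sort.
# cmp: Python 2 only -- a two-argument comparison function (both arguments are taken from the iterable) returning 1 if greater, -1 if smaller, 0 if equal; it was removed in Python 3, where functools.cmp_to_key can wrap such a function for use as key.
# key: a one-argument function applied to each element of the iterable to produce the value that element is sorted by.
# reverse: sort order, True for descending, False for ascending (the default).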

# 测试
# group, labels = createDataset()
# clf = classifier0([0, 0], group, labels, 3)
# print(clf)

# [Example] Classifying Hellen's dating candidates -- loading the data
def file2matrix(filenname):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each non-blank line must hold at least three numeric fields followed by an
    integer class label in the last field.

    Args:
        filenname: path of the text file to read (the parameter keeps its
            original spelling so existing keyword callers keep working).

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array built from the first three fields of every
        line and ``classLabelVector`` is a list with the ``n`` integer labels.

    Raises:
        ValueError: if a field cannot be converted to a number.
    """
    rows = []
    # The context manager closes the file even if parsing fails later on.
    with open(filenname) as fr:
        for line in fr:
            stripped = line.strip()
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            if stripped:
                rows.append(stripped.split('\t'))

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector


# 数据分析--可视化
# filename = 'D:/2. 数据分析/机器学习实战/machinelearninginaction/Ch02/datingTestSet2.txt'
# datingMat, datingLabels = file2matrix(filename)
# print(datingMat)
# print(datingLabels)
# import matplotlib.pyplot as plt
# plt.figure().add_subplot(111).scatter(datingMat[:, 1], datingMat[:, 2])
# plt.show()

# 相关函数说明
# add_subplot(349)
# 参数349：将画布分割成3行4列，图像画在从左到右从上到下的第9块
# 3410是不行的，可以用另一种方式，(3,4,10)

# Min-max normalisation of the data
def autoNorm(dataset):
    """Rescale every column of ``dataset`` to the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; its normalised
    values are set to 0 instead of dividing by zero.

    Args:
        dataset: 2-D numeric array, one sample per row.

    Returns:
        tuple: ``(normDataset, ranges, minVals)`` where ``normDataset`` has the
        shape of ``dataset``, ``ranges`` holds ``max - min`` per column and
        ``minVals`` holds the minimum per column.
    """
    minVals = dataset.min(0)
    maxVals = dataset.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they come out as 0 rather than NaN;
    # the returned ranges still report the true (zero) spread.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row, so no tiled
    # copies of minVals / ranges are needed.
    normDataset = (dataset - minVals) / safeRanges
    return normDataset, ranges, minVals

# Measure how well the classifier does on the dating data
def datingClassTest(filename='D:/2. 数据分析/机器学习实战/machinelearninginaction/Ch02/datingTestSet2.txt',
                    hoRatio=0.10, k=3):
    """Run a hold-out test of ``classifier0`` on the dating data set.

    The first ``hoRatio`` share of the normalised rows is classified against
    the remaining rows, each prediction is printed, and the error rate is
    printed at the end.

    Args:
        filename: path of the data file handed to ``file2matrix``.
        hoRatio: fraction of the rows (taken from the top) used as test rows.
        k: number of neighbours passed to ``classifier0``.

    Returns:
        float: misclassified test rows divided by the number of test rows,
        or 0.0 when the hold-out slice is empty.
    """
    datingMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVal = autoNorm(datingMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # Rows after the hold-out slice serve as the reference set.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]
    errorCount = 0
    for i in range(numTestVecs):
        clf = classifier0(normMat[i, :], trainMat, trainLabels, k)
        print('the classifier came back with: %d, the real answer is: %d' % (clf, datingLabels[i]))
        if clf != datingLabels[i]:
            errorCount += 1
    # Avoid ZeroDivisionError when there are too few rows to hold any out.
    errorRate = errorCount / numTestVecs if numTestVecs else 0.0
    print('the total error rate is %f' % errorRate)
    return errorRate

# datingClassTest()
# the total error rate is 0.050000

# Help Hellen classify a new dating candidate
def classifyPerson(filename='D:/2. 数据分析/机器学习实战/machinelearninginaction/Ch02/datingTestSet2.txt'):
    """Ask for one person's three features and print the predicted class.

    The answers are read from standard input, normalised with the minimum and
    range of the data loaded from ``filename``, and classified with
    ``classifier0`` using the 3 nearest neighbours.

    Args:
        filename: path of the data file handed to ``file2matrix``.

    Raises:
        ValueError: if an answer typed by the user is not a number.
    """
    # Fixed the wording of the result texts and the first prompt
    # ('is ... doses' -> 'in ... doses', 'cideo' -> 'video').
    results = ['not at all', 'in small doses', 'in large doses']
    percentTat = float(input('percentage of time spent playing video games?'))
    ffMiles = float(input('frequent flier miles earned per year'))
    iceCream = float(input('liters of ice cream consumed per year'))
    datingMat, datingLabel = file2matrix(filename)
    normMat, ranges, minVal = autoNorm(datingMat)
    # NOTE(review): assumes the file's columns are ordered miles, game time,
    # ice cream -- confirm against the data file.
    inArr = array([ffMiles, percentTat, iceCream])
    clf = classifier0((inArr - minVal) / ranges, normMat, datingLabel, 3)
    # NOTE(review): assumes the labels in the file are 1, 2 and 3 so that
    # clf - 1 indexes results -- confirm against the data file.
    print('you will probably like this person', results[clf - 1])

# 【实例】手写数字识别
# 第一次编写的导入数据代码，模型错误率特别高，the error rate is: 0.821353
# def img2vector(filename):
#     returnVect = zeros((1, 1024))
#     for i in range(32):
#         line = open(filename).readline()
#         for j in range(32):
#             returnVect[0, 32*i+j] = int(line[j])
#     return returnVect

# 第二次编写的代码，跟课本上的一模一样，模型错误率大幅度下降，the error rate is: 0.010571
def img2vector(filename):
    """Read a 32x32 text image of digits into a single 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted to
    integers and stored row after row.

    Args:
        filename: path of the text file holding the image.

    Returns:
        A ``(1, 1024)`` float array; element ``32 * i + j`` holds the character
        at line ``i``, column ``j``.

    Raises:
        IndexError: if one of the first 32 lines has fewer than 32 characters.
        ValueError: if a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # Open once and close deterministically: this function is called for every
    # image file, so leaving handles to the garbage collector adds up.
    with open(filename) as fr:
        for i in range(32):
            # readline() on the same handle advances to the next line each time.
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

# 两个导入数据函数最主要的区别在于，循环语句中的readline()函数的对象
# 第一次编写的代码，每次都重新打开文件，每次读取的都是第一行，数据错误

# 《python机器学习应用》中的导入数据代码，数据为一维数组，效率更高
# def img2vector(fileName):
#     retMat = zeros([1024], int)
#     lines = open(fileName).readlines()
#     for i in range(32):
#         for j in range(32):
#             retMat[i*32+j] = lines[i][j]
#     return retMat

def handwritingClassTest(train_path='D:/2. 数据分析/机器学习实战/machinelearninginaction/Ch02/digits/trainingDigits',
                         test_path='D:/2. 数据分析/机器学习实战/machinelearninginaction/Ch02/digits/testDigits',
                         k=3):
    """Evaluate ``classifier0`` on the handwritten-digit image files.

    Every file in ``train_path`` is loaded with ``img2vector`` into one row of
    the training matrix; every file in ``test_path`` is then classified and
    compared with its true label. The label of a file is the integer before
    the first ``'_'`` in its name. The error count and error rate are printed.

    Args:
        train_path: directory holding the training image files.
        test_path: directory holding the test image files.
        k: number of neighbours passed to ``classifier0``.

    Returns:
        float: misclassified test files divided by the number of test files,
        or 0.0 when ``test_path`` contains no files.
    """
    labels = []
    trainingFileList = os.listdir(train_path)
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, filename in enumerate(trainingFileList):
        labels.append(int(filename.split('_')[0]))
        trainingMat[i, :] = img2vector(train_path + '/' + filename)

    testFileList = os.listdir(test_path)
    mTest = len(testFileList)
    errorCount = 0
    for filename_test in testFileList:
        label_test = int(filename_test.split('_')[0])
        vector_test = img2vector(test_path + '/' + filename_test)
        clf = classifier0(vector_test, trainingMat, labels, k)
        # print('the classifier came back with: %d, the real answer is: %d' % (clf, label_test))
        if clf != label_test:
            errorCount += 1
    print('the total nummer of error is: ', errorCount)
    # Avoid ZeroDivisionError when the test directory is empty.
    errorRate = errorCount / mTest if mTest else 0.0
    print('the error rate is: %f' % errorRate)
    return errorRate
# handwritingClassTest()
# the total nummer of error is:  10
# the error rate is: 0.010571
# 机器学习实战——k-近邻算法（代码）

# 猜你喜欢