KNN机器学习实战(sklearn)(包含训练数据)

本文直接给出 sklearn 中 KNN 算法的用法,具体实现过程如下:

# -*- coding: utf-8 -*-
import numpy as np
from sklearn import datasets
import operator
from sklearn import neighbors
import sklearn.model_selection as ms

digits = datasets.load_digits()
totalNum = len(digits.data)
# Hold out 80% of the samples for training and test on the remaining 20%.
trainNum = int(0.8 * totalNum)
trainX, testX, trainY, testY = ms.train_test_split(
    digits.data, digits.target, random_state=1, train_size=0.8)

# Sweep k from 1 to 15 and record the held-out error rate for each value.
ER = []
for n_neighbors in range(1, 16):
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
    clf.fit(trainX, trainY)
    Z = clf.predict(testX)
    # Error rate = 1 - accuracy on the test set.
    ER.append(1 - np.mean(Z == testY))

import pandas as pd  # was missing in the original: L26 used `pd` undefined

pd.DataFrame(ER).plot(title = 'the plot of error rate')

n_neighbors取值为1~15
通过以上的图形可知,n_neighbors = 7,8 时较为合适, 此时的error rate 为0.002778

# -*- coding: utf-8 -*-
import numpy as np
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
# load and shuffle digits
digits = datasets.load_digits()
perm = rng.permutation(digits.target.size)
digits.data = digits.data[perm]
digits.target = digits.target[perm]

def test_neighbors_digits():
    """Sanity check that 'brute' kNN scores identically on uint8 and float.

    The 'brute' algorithm has been observed to fail if the input dtype is
    uint8, due to overflow in the distance calculations.
    """
    X = digits.data.astype('uint8')
    Y = digits.target
    (n_samples, n_features) = X.shape
    # Simple 80/20 front/back split of the (already shuffled) data.
    train_test_boundary = int(n_samples * 0.8)
    train = np.arange(0, train_test_boundary)
    test = np.arange(train_test_boundary, n_samples)
    (X_train, Y_train, X_test, Y_test) = X[train], Y[train], X[test], Y[test]
    # Use two independent classifiers: fit() refits in place and returns
    # `self`, so the original's reuse of one instance made `clf_unit8` and
    # `clf_float` the very same (float-trained) model and the comparison
    # below trivially true.
    clf_uint8 = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='brute')
    clf_uint8.fit(X_train, Y_train)
    clf_float = neighbors.KNeighborsClassifier(n_neighbors=1, algorithm='brute')
    clf_float.fit(X_train.astype(float), Y_train)
    score_uint8 = clf_uint8.score(X_test, Y_test)
    score_float = clf_float.score(X_test.astype(float), Y_test)
    # sklearn.utils.testing (assert_equal) was removed from scikit-learn;
    # a plain assert is the supported replacement.
    assert score_uint8 == score_float
    pred_y = clf_uint8.predict(X_test)
    print("the acurracy rate is :", np.mean(pred_y == Y_test))
test_neighbors_digits()

以下是机器学习实战书中的源代码。


# -*- coding: utf-8 -*-
"""
Created on Mon Sep 17 15:03:26 2018
"""
from numpy import *
import operator
path = r'C:\Users\Administrator\Desktop\python\MLiA_SourceCode\machinelearninginaction\KNN'

def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest training vectors.

    inX     -- feature vector to classify (same width as dataSet rows)
    dataSet -- (m, n) numpy array of training vectors
    labels  -- sequence of m class labels (ints or strings)
    k       -- number of nearest neighbors that vote
    Returns the winning label; ties go to the label seen at the smaller
    distance rank (dict insertion order is preserved by sorted()'s stability).
    """
    dataSetSize = dataSet.shape[0]
    # Euclidean distance from inX to every training vector.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()
    # Tally the labels of the k closest vectors.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    # Return the label itself: the original's int(...) cast raised ValueError
    # on non-numeric labels such as the 'A'/'B' labels from createDataSet().
    return sortedClassCount[0][0]

def createDataSet():
    """Return a tiny toy data set: four 2-D points and their class labels."""
    samples = array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0.0, 0.0],
        [0.0, 0.1],
    ])
    sampleLabels = ['A', 'A', 'B', 'B']
    return samples, sampleLabels

def file2matrix(filename):
    """Parse a tab-separated dating-data file.

    Each line holds three float features followed by an integer class label.
    Returns (returnMat, classLabelVector): an (n, 3) float array and a list
    of n int labels.
    """
    # Read the file once; the original opened it twice and never closed
    # either handle.
    with open(filename) as fr:
        lines = fr.readlines()
    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = list(map(float, listFromLine[0:3]))
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

# Min-max normalization.
def autoNorm(dataSet):
    """Scale each column of dataSet linearly into [0, 1].

    Returns (normDataSet, ranges, minVals) so callers can apply the same
    transform to new samples later.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # NumPy broadcasting applies the per-column shift and scale row-wise
    # without materializing tiled copies.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals

#autoNorm(datingDataMat)

def datingClassTest():
    """Evaluate classify0 on the dating data with a hold-out split."""
    hoRatio = 0.50      # fraction held out for testing (the old comment said 10%, the value is 50%)
    datingDataMat, datingLabels = file2matrix(path+'/datingTestSet2.txt')       #load data setfrom file
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    # Train on the back portion, test each vector from the front portion.
    for i in range(numTestVecs):
        predicted = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                              datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d" % (predicted, datingLabels[i]))
        if predicted != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount/float(numTestVecs)))
    print(errorCount)

#datingClassTest()


###########################################################deal with digit
from os import listdir
pat = r'C:\Users\Administrator\Desktop\python\MLiA_SourceCode\machinelearninginaction\KNN\digits'

# Convert a 32x32 text "image" (one digit character per pixel) into a 1x1024 matrix.
def img2vector(filename):
    """Read a file of 32 lines x 32 digit chars into a (1, 1024) float array."""
    returnVect = zeros((1, 1024))
    # Close the file deterministically; the original leaked the handle.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def handwritingClassTest():
    """Run kNN (k=3) on the handwritten-digit text files and report errors.

    Returns the last prediction, its true label, and the training matrix.
    """
    # Build the training matrix: one 1x1024 row per file; the class label is
    # parsed from the file name pattern "<digit>_<index>.txt".
    trainingFileList = listdir(pat + '/trainingDigits')
    m = len(trainingFileList)
    hwLabels = []
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        classNumStr = int(fileNameStr.split('.')[0].split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector(pat +'/trainingDigits/%s' % fileNameStr)
    # Classify every test file and count misclassifications.
    testFileList = listdir(pat +'/testDigits')
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = int(fileNameStr.split('.')[0].split('_')[0])
        vectorUnderTest = img2vector(pat +'/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount/float(mTest)))
    return classifierResult, classNumStr, trainingMat

# Test / visualization driver.
if __name__ == '__main__':  # original `if __main__ = 'name':` was a SyntaxError
    datingDataMat, datingLabels = file2matrix(path + '/datingTestSet2.txt')
    import matplotlib.pyplot as plt
    import seaborn as sns
    import pandas as pd
    d1 = pd.DataFrame(data = datingDataMat, columns = ['km', 'GameTime', 'IceCream'])
    d2 = pd.DataFrame(datingLabels, columns = ['label'])
    df = pd.concat([d1, d2], axis = 1)
    df.info()
    g = sns.FacetGrid(data = df, hue = 'label', size = 6, palette='Set2')
    g.map(plt.scatter,'GameTime','IceCream').add_legend()
    ax = sns.countplot(x = 'label', data = df, palette= 'Set3')  # labels are evenly distributed
    ax = sns.boxplot(y = 'GameTime', x = 'label', data = df, palette= 'Set3')
    ax = sns.boxplot(y = 'IceCream', x = 'label', data = df, palette= 'Set3')
    ax = sns.boxplot(y = 'km', x = 'label', data = df, palette= 'Set3')
    g = sns.FacetGrid(data= df, hue = 'label', size = 6, palette='Set3')
    g.map(plt.scatter,'GameTime','km').add_legend()
    # Capture the return value: the original discarded it and then read an
    # undefined `trainingMat`, which raised NameError.
    _, _, trainingMat = handwritingClassTest()
    zero = trainingMat[8, :]
    img_0 = zero.reshape(32, 32)
    plt.imshow(img_0)

猜你喜欢

转载自blog.csdn.net/weixin_42983055/article/details/82389392