[Machine learning] [Python] Pandas + KNN: Helen dating classification in under 50 lines (including data loading, plotting, normalization and the algorithm), plus handwritten digit recognition in 60 lines

Table of contents

1. The KNN principle

2. Helen's dating training data set

3. Code for classifying a single test input

4. Illustration of the single-input test result

5. Code for classifying multiple test inputs and computing the accuracy

6. Handwritten digit recognition: code, data and results


 

1. The KNN principle

  1. Compute the distance between the current point and every point in the data set of known classes;
  2. Sort the points in ascending order of distance;
  3. Select the k points closest to the current point;
  4. Count how often each class occurs among those k points;
  5. Return the most frequent class among the k points as the prediction for the current point.

2. Helen's dating training data set

There are 1000 records in total; only a part is shown.

3. Code for classifying a single test input

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import collections

def read_data(filename):
    """Load Helen's tab-separated dating records into a DataFrame.

    Columns: yearly flight miles, % of time gaming, litres of ice cream, label.
    """
    columns = ['里程数', '时间百分比', '公升数', '标签']
    return pd.read_csv(filename, sep='\t', header=None, names=columns)


def pic_show(data):
    """Draw the three pairwise feature scatter plots, coloured by label."""
    # Global plotting configuration: a Chinese-capable font and a readable style.
    plt.style.use('fivethirtyeight')
    plt.rc('font', family='SimHei', size=13)
    plt.rcParams['axes.unicode_minus'] = False
    sns.set_style({'font.sans-serif': ['simhei', 'Arial']})

    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
    pairs = [('里程数', '时间百分比'), ('里程数', '公升数'), ('时间百分比', '公升数')]
    titles = ['每年飞行里程数与玩视频游戏所花时间占比',
              '每年飞行里程数与每周消费的冰淇淋公升数',
              '玩视频游戏所花时间占比与每周消费的冰淇淋公升数']
    for ax, (x_col, y_col), title in zip(axes, pairs, titles):
        sns.scatterplot(x=data[x_col], y=data[y_col], hue=data['标签'],
                        data=data, ax=ax, legend='brief')
        ax.set_title(title, fontsize=10)
    plt.show()

def feature_process(data):
    """Min-max normalize the three feature columns to [0, 1] in place.

    Bug fix: the original denominator was ``max - min + 1``, which skews the
    scale so the column maximum never maps to 1.  Use the standard
    ``(x - min) / (max - min)`` formula, guarding against a constant column
    (zero range) to avoid division by zero.

    Returns the same (mutated) DataFrame for call-chaining, like the original.
    """
    for col in ['里程数', '时间百分比', '公升数']:
        col_min = data[col].min()
        col_range = data[col].max() - col_min
        # A constant column carries no information; map it to all zeros.
        data[col] = (data[col] - col_min) / col_range if col_range else 0.0
    return data

def knn(test, features, labels, k):
    """Classify *test* by majority vote among its k nearest training samples."""
    # 1. Euclidean distance from the test point to every training sample.
    dists = np.sqrt(((features - test) ** 2).sum(axis=1))
    # 2. argsort is ascending, so the first k indices are the k closest points.
    nearest = [labels[i] for i in np.argsort(dists)[:k]]
    # 3-4. Count the class votes and take the most frequent one.
    winner = collections.Counter(nearest).most_common(1)[0][0]
    return f'结果为:Helen {winner} this man'

# --- Script: classify a single hand-entered sample -------------------------
# Load the dating data set.
data = read_data('datingTestSet.txt')
# Visualise the pairwise feature scatter plots.
pic_show(data)
# One hand-made test sample: miles flown, % time gaming, litres of ice cream.
test = np.array([[3000, 20, 0.5]])
print(f'这个男人每年飞行里程数为{test[0][0]}km,玩视频游戏所花时间占为{test[0][1]}%,每周消费的冰淇淋{test[0][2]}公升')
# Normalise the test sample together with the training features so both share
# the same min/max scale.  Bug fix: DataFrame.append was removed in pandas
# 2.0 — build the combined frame with pd.concat instead.
feature_cols = ['里程数', '时间百分比', '公升数']
combined = pd.concat(
    [pd.DataFrame(test, columns=feature_cols), data[feature_cols]],
    ignore_index=True,
)
test = feature_process(combined).iloc[0, :].values
# Normalise the training features themselves.
data = feature_process(data)
# Run KNN with the last column as the label and print the prediction.
print(knn(test=test, features=data.iloc[:, :data.shape[1] - 1].values,
          labels=data.iloc[:, -1].values, k=3))

4. Illustration of the single-input test result

5. Code for classifying multiple test inputs and computing the accuracy

(The data is split into a training set of 900 records and a test set of 100)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import collections

def read_data(filename):
    """Read the tab-separated dating data set.

    Returns a DataFrame with the three feature columns and the label column.
    """
    names = ['里程数', '时间百分比', '公升数', '标签']
    frame = pd.read_table(filename, sep='\t', header=None, names=names)
    return frame


def pic_show(data):
    """Show scatter plots for every pair of the three features, hued by label."""
    # Plot configuration: SimHei font so the Chinese titles render correctly.
    plt.style.use('fivethirtyeight')
    plt.rc('font', family='SimHei', size=13)
    plt.rcParams['axes.unicode_minus'] = False
    sns.set_style({'font.sans-serif': ['simhei', 'Arial']})

    fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
    plot_spec = (
        (ax1, '里程数', '时间百分比', '每年飞行里程数与玩视频游戏所花时间占比'),
        (ax2, '里程数', '公升数', '每年飞行里程数与每周消费的冰淇淋公升数'),
        (ax3, '时间百分比', '公升数', '玩视频游戏所花时间占比与每周消费的冰淇淋公升数'),
    )
    for axis, x_col, y_col, title in plot_spec:
        sns.scatterplot(x=data[x_col], y=data[y_col], hue=data['标签'],
                        data=data, ax=axis, legend='brief')
        axis.set_title(title, fontsize=10)
    plt.show()

def feature_process(data):
    """Scale the three feature columns to [0, 1] in place via min-max.

    Bug fix: the original formula added 1 to ``max - min`` in the denominator,
    so the maximum value never mapped to exactly 1.  Uses the standard
    min-max formula with a guard for constant columns.
    """
    for col in ('里程数', '时间百分比', '公升数'):
        lo, hi = data[col].min(), data[col].max()
        span = hi - lo
        # Constant column: normalise to zeros rather than divide by zero.
        data[col] = (data[col] - lo) / span if span else 0.0
    return data

def knn(test, features, labels, k):
    """KNN classification; returns (display message, predicted label)."""
    # Euclidean distance between the test point and each training sample.
    diffs = features - test
    dists = (diffs ** 2).sum(axis=1) ** 0.5
    # Pick the labels of the k samples with the smallest distances.
    order = dists.argsort()
    votes = collections.Counter(labels[i] for i in order[:k])
    # The majority class among the k neighbours is the prediction.
    label = votes.most_common(1)[0][0]
    return f'结果为:Helen {label} this man', label

# --- Script: evaluate accuracy on a held-out test split --------------------
# Load and normalise the data set.  NOTE(review): fitting min/max on the full
# set before the split leaks test statistics into training; kept here to match
# the original evaluation protocol.
data = read_data('datingTestSet.txt')
data = feature_process(data)
# First 900 rows train, remaining rows test.
test = data.iloc[900:, :]
# Hoist the loop-invariant training slices out of the loop.
train_x = data.iloc[:900, :data.shape[1] - 1].values
train_y = data.iloc[:900, -1].values
count = 0
for row in test.values:
    # knn(...) returns (message, label); we only need the predicted label.
    label = knn(test=row[:3], features=train_x, labels=train_y, k=3)[1]
    if str(label) == str(row[-1]):
        count += 1
# Bug fix: the original printed the raw count as a percentage, which is only
# correct when the test set has exactly 100 rows.
print(f'正确率为{count / len(test) * 100}%')

 

6. Handwritten digit recognition: code, data and results

import numpy as np
from os import listdir

def img2vector(filename):
    """Flatten a 32x32 text image of '0'/'1' characters into a 1x1024 vector.

    Each of the 32 lines in the file holds 32 digit characters; row i, column
    j lands at flat index 32*i + j.

    Bug fix: open the file with a ``with`` block so the handle is always
    closed (the original leaked it).
    """
    vector = np.zeros(shape=(1, 1024))
    with open(filename) as f:
        for i in range(32):
            line = f.readline()
            for j in range(32):
                vector[0, 32 * i + j] = int(line[j])
    return vector

def get_train_data():
    """Load every image in trainingDigits/ into a feature matrix plus labels.

    Each file name encodes its class before the underscore, e.g. '7_45.txt'
    is a sample of the digit 7.  Returns (train_x_mat, train_labels) where
    train_x_mat is (n_files, 1024) and train_labels is a list of ints.
    """
    train_labels = []
    train_file_list = listdir('trainingDigits')
    # One matrix row per training file.
    row = len(train_file_list)
    train_x_mat = np.zeros(shape=(row, 1024))
    for i in range(row):
        filename = train_file_list[i]
        # The class label is the digit before the underscore in the file name.
        class_num = int(filename.split('_')[0])
        train_labels.append(class_num)
        # Bug fix: the original formatted a literal placeholder string instead
        # of the actual file name, so no file could ever be read.
        train_x_mat[i, :] = img2vector(f'trainingDigits/{filename}')
    return train_x_mat, train_labels

def model_train(train_x, train_y):
    """Fit and return a scikit-learn k-nearest-neighbours classifier (k=3)."""
    from sklearn.neighbors import KNeighborsClassifier
    # Key KNeighborsClassifier parameters (per the scikit-learn docs):
    #   n_neighbors: the k in k-NN; default 5.
    #   weights: 'uniform' gives every neighbour an equal vote, 'distance'
    #            weights closer neighbours more; a callable is also accepted.
    #   algorithm: 'auto' chooses among 'ball_tree', 'kd_tree' and 'brute'
    #              (linear scan).  kd-trees are efficient below ~20 dimensions;
    #              ball trees were designed for higher-dimensional data.
    #   leaf_size: node size for the tree structures (default 30); affects
    #              build/query speed and memory, not the result.
    #   metric / p: distance metric; the default minkowski with p=2 is the
    #               Euclidean distance, p=1 gives the Manhattan distance.
    #   metric_params: extra metric keyword args; usually left as None.
    #   n_jobs: parallel workers for the neighbour search (-1 = all cores).
    model = KNeighborsClassifier(n_neighbors=3)
    model.fit(train_x, train_y)
    return model

def knn_predict(knn):
    """Evaluate *knn* on every image in testDigits/ and print the accuracy.

    The true class of each test file is parsed from its name (the digit
    before the underscore), the image is vectorised with img2vector, and the
    model's prediction is compared against it.
    """
    count = 0
    test_labels = []
    test_file_list = listdir('testDigits')
    row = len(test_file_list)
    for i in range(row):
        filename = test_file_list[i]
        class_num = int(filename.split('_')[0])
        # Bug fix: the original formatted a literal placeholder string instead
        # of the actual file name, so no test file could ever be read.
        test_x = img2vector(f'testDigits/{filename}')
        test_labels.append(class_num)
        predict_y = knn.predict(test_x)
        # print("分类返回结果为%d\t真实结果为%d" % (predict_y,class_num))
        if predict_y == class_num:
            count += 1
    print(f'正确率为{count/row*100}%')

def _main():
    """Train on trainingDigits/ and report the accuracy on testDigits/."""
    train_x, train_y = get_train_data()
    knn_predict(model_train(train_x, train_y))


if __name__ == '__main__':
    _main()

result:

Published 44 original articles · won praise 16 · views 10000 +

Guess you like

Origin blog.csdn.net/YYIverson/article/details/101530494