KNN算法python实现(鸢尾花,iris)

读取数据集

我们需要的就是iris中的data和target这两个数组(numpy.ndarray,而不是Python的list):data是特征数据,target是对应的类别标签

import operator
import random

import numpy as np
import sklearn.datasets as sd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Load the Iris dataset that ships with scikit-learn.
iris = sd.load_iris()

# Feature matrix and the class label of every sample.
x_data, y_data = iris['data'], iris['target']

# Human-readable names:
#   iris['feature_names'] -> ['sepal length (cm)', 'sepal width (cm)',
#                             'petal length (cm)', 'petal width (cm)']
#   iris['target_names']  -> array(['setosa', 'versicolor', 'virginica'])
features, labels = iris['feature_names'], iris['target_names']

切分、打乱数据集

自动切分

# Automatic split: let scikit-learn divide the data into training and test
# subsets. No test_size is passed, so the library default applies.
# NOTE: the return order here is (x_train, x_test, y_train, y_test), which
# differs from split_dataset below: (x_train, y_train, x_test, y_test).
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data)

手动切分打乱

def split_dataset(x_data, y_data, test_size=40):
    """Shuffle a dataset and split it into training and test subsets.

    Args:
        x_data: Samples, one per row. Any array-like accepted by
            ``np.asarray`` (NumPy array, nested list, ...).
        y_data: Labels aligned with ``x_data``; array-like.
        test_size: Number of shuffled samples placed in the test subset.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of NumPy arrays. Note
        that this order differs from scikit-learn's ``train_test_split``.
    """
    # Convert up front so plain Python lists can be fancy-indexed too.
    x_data = np.asarray(x_data)
    y_data = np.asarray(y_data)

    # A single shared permutation keeps every sample paired with its label.
    order = list(range(len(x_data)))
    random.shuffle(order)
    x_shuffled = x_data[order]
    y_shuffled = y_data[order]

    # The first test_size shuffled rows form the test set; the remainder is
    # the training set.
    return (
        x_shuffled[test_size:],
        y_shuffled[test_size:],
        x_shuffled[:test_size],
        y_shuffled[:test_size],
    )

定义KNN函数

def KNN(x_test_, x_train, y_train, K):
    """Classify one sample by majority vote among its K nearest neighbours.

    Distances are Euclidean.

    Args:
        x_test_: Feature vector of the sample to classify (1-D array-like).
        x_train: Training samples, one per row (2-D array-like).
        y_train: Labels aligned with the rows of ``x_train``.
        K: Number of neighbours that vote. Values larger than the number of
            training samples are clamped to that number.

    Returns:
        The label that received the most votes. On a tie, the label whose
        first voter is nearest to the sample wins.

    Raises:
        ValueError: If ``K`` is smaller than 1 or ``x_train`` is empty.
    """
    train = np.asarray(x_train, dtype=float)
    sample = np.asarray(x_test_, dtype=float)
    n_train = len(train)
    if K < 1:
        raise ValueError("K must be at least 1")
    if n_train == 0:
        raise ValueError("x_train must contain at least one sample")
    k = min(K, n_train)

    # Broadcasting subtracts the sample from every training row, so no tiled
    # copy of the sample is needed.
    distance = np.sqrt(np.sum((train - sample) ** 2, axis=1))

    # A stable sort orders equidistant neighbours by training index, which
    # makes the result deterministic.
    nearest = np.argsort(distance, kind="stable")[:k]

    # Tally votes; dict insertion order follows neighbour rank.
    votes = {}
    for idx in nearest:
        label_ = y_train[idx]
        votes[label_] = votes.get(label_, 0) + 1

    # max() returns the first key with the highest count, i.e. ties are
    # resolved in favour of the label seen earliest (nearest).
    return max(votes, key=votes.get)

测试

if __name__ == '__main__':
    # To evaluate on the manual split instead of the train_test_split result:
    # x_train, y_train, x_test, y_test = split_dataset(x_data, y_data, test_size=40)

    # Predict a label for every test sample with a 5-nearest-neighbour vote.
    prediction = [KNN(sample, x_train, y_train, K=5) for sample in x_test]

    print(prediction)
    print(y_test)
    print(classification_report(y_test, prediction))  # per-class precision / recall / F1 report
    print(confusion_matrix(y_test, prediction))

结果:输出精度报告(classification_report),混淆矩阵(confusion_matrix)

      precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.82      0.90      0.86        10
           2       0.93      0.88      0.90        16

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38

[[12  0  0]
 [ 0  9  1]
 [ 0  2 14]]

遇到 UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.

出现这个警告的原因通常是:真实标签中的某个类别在预测结果中一次也没有出现。例如下面第一行是预测结果,其中没有类别2;第二行是真实标签,其中包含类别2:


# [0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1]
# [2 1 1 0 0 0 0 0 2 0 2 2 2 0 1 2 2 1 2 1 1 1 0 1 1 1 1 1 1 1 2 2 0 0 1 0 1 2 2 0]
发布了24 篇原创文章 · 获赞 8 · 访问量 2170

猜你喜欢

转载自blog.csdn.net/weixin_44839513/article/details/103567439