KNN: taking the iris data set as an example

For the underlying theory, refer to the many articles already available online; the code is listed directly below.
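In short, KNN classifies a sample by measuring its distance to every labelled training point, keeping the k nearest ones, and taking a majority vote over their labels. A minimal numpy sketch of that idea (a toy example with made-up points, separate from the full script below):

import numpy as np

# Toy training set: four 2-D points with labels 0 and 1.
X = np.array([[0.0, 0.0], [0.1, 0.1], [1.0, 1.0], [0.9, 0.8]])
y = np.array([0, 0, 1, 1])
query = np.array([0.05, 0.0])

k = 3
dists = np.sqrt(((X - query) ** 2).sum(axis=1))   # Euclidean distance to each training point
nearest = np.argsort(dists)[:k]                   # indices of the k closest points
prediction = np.bincount(y[nearest]).argmax()     # majority vote among their labels
print(prediction)                                 # -> 0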

### Import the required packages ###
import numpy as np
import pandas as pd

## Read in the data
irsflowers = pd.read_csv("iris.csv")

'''
    Data preprocessing:
    check whether there are any missing values
'''
# print(irsflowers.isnull().sum())

'''Handling missing values: simply drop any row that contains one'''
irsflowers.dropna(axis=0, inplace=True)

'''Check again: the missing values have been handled'''
# print(irsflowers.isnull().sum())


## Data encoding and normalization
"""
    数据编码
    将三种花种类分好标签
    setosa:0
    versicolor:1
    virginica:2
"""
datas = irsflowers.values
datas[datas == 'setosa'] = 0
datas[datas == 'versicolor'] = 1
datas[datas == 'virginica'] = 2
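# For reference, the same encoding could be done in pandas before calling .values,
# assuming the species name sits in the last column (column names vary between
# copies of iris.csv):
#   irsflowers.iloc[:, -1] = irsflowers.iloc[:, -1].map(
#       {'setosa': 0, 'versicolor': 1, 'virginica': 2})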

train_datas = datas[:, :-1].astype('float32')
train_labels = datas[:, -1:].astype('int64')

'''
    Normalization: min-max scaling to the (0, 1) range
'''


def Normalization(data):
    # Min-max scaling: map every feature column into the [0, 1] range.
    col_max = data.max(axis=0)
    col_min = data.min(axis=0)
    m = data.shape[0]
    after_normalize = data - np.tile(col_min, (m, 1))
    after_normalize = after_normalize / np.tile((col_max - col_min), (m, 1))
    return after_normalize
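# For reference, scikit-learn provides the same (0, 1) scaling; a minimal equivalent
# using sklearn.preprocessing.MinMaxScaler would be:
#   from sklearn.preprocessing import MinMaxScaler
#   after_normalize = MinMaxScaler().fit_transform(train_datas)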


## Split into training and test sets (other approaches can also be tried)
'''
    Randomly assign 80% of the data to the training set and the remaining 20% to the
    test set, using train_test_split from sklearn.model_selection to do the split.
'''

from sklearn.model_selection import train_test_split


def splitdata(after_normalize_def, train_labels_def):
    # Flatten the (n, 1) label array into a 1-D Series and wrap the features in a DataFrame.
    labels = pd.Series(train_labels_def.ravel())
    after_normalize_data = pd.DataFrame(after_normalize_def)
    Y = labels
    X = after_normalize_data
    # 80% training data, 20% test data; random_state fixes the shuffle for reproducibility.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=9)
    return X_train, Y_train, X_test, Y_test


normalization = Normalization(train_datas)
X_train, Y_train, X_test, Y_test = splitdata(normalization, train_labels)

X_train_list = X_train.values
Y_train_list = Y_train.values
X_test_list = X_test.values
Y_test_list = Y_test.values
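# With the standard 150-row iris data and no dropped rows, X_train_list should have
# shape (120, 4) and X_test_list shape (30, 4); the exact counts depend on the CSV used.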

import operator

class MYKNN:
    def __init__(self, k, _distance_type):
        self.k = k
        self._distance_type = _distance_type
        self.x_train = None
        self.y_train = None

    def fit(self, X_train, Y_train):
        self.x_train = X_train
        self.y_train = Y_train

    def predict(self, X_predict):
        return np.array([self.predict_method(x) for x in X_predict])

    def predict_method(self, x):
        dict_eachtype = {"0": 0, "1": 0, "2": 0}
        # Compute the distance between the sample and every training point.
        # Three distance metrics are supported; Euclidean is the usual choice.
        '''  0: Euclidean distance    1: Manhattan distance    2: Chebyshev distance'''
        if self._distance_type == 0:
            distances = np.sqrt(np.sum((x - self.x_train) ** 2, axis=1))
        elif self._distance_type == 1:
            distances = np.sum(abs(self.x_train - x), axis=1)
        else:
            distances = np.max(abs(self.x_train - x), axis=1)
        # Collect the class labels of the k nearest training points.
        for i in range(self.k):
            # Find the smallest remaining distance and its index.
            min_index, min_number = min(enumerate(distances), key=operator.itemgetter(1))
            type_flower = self.y_train[min_index]
            if type_flower == 0:
                dict_eachtype["0"] += 1
            elif type_flower == 1:
                dict_eachtype["1"] += 1
            elif type_flower == 2:
                dict_eachtype["2"] += 1
            # Overwrite the chosen entry with infinity so the next iteration finds the
            # next-smallest distance (the original value of 2 is not safe here: the
            # Manhattan distance between normalized points can exceed 2).
            distances[min_index] = np.inf

        # Majority vote: assign the sample to the class with the most neighbours.
        max1 = max(dict_eachtype, key=dict_eachtype.get)
        return int(max1)
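    # Note: an equivalent, non-destructive way to pick the k nearest neighbours
    # would be np.argsort (a sketch, not used in this implementation):
    #   nearest = np.argsort(distances)[:self.k]
    #   votes = np.bincount(np.asarray(self.y_train)[nearest].astype(int), minlength=3)
    #   return int(votes.argmax())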

    # Compute the accuracy on a labelled test set and return it together with the predictions.
    def acc(self, x_test, label):
        y_predict = self.predict(x_test)
        all_sample = len(label)
        right = 0
        for i, e in enumerate(label):
            if y_predict[i] == e:
                right += 1
        ACC = right / all_sample
        return ACC, y_predict


# Visualize the ROC curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
def visualization(target, predictions):
    # Treat class 2 (virginica) as the positive class. Note that hard class labels,
    # not probability scores, are passed in, so the resulting "curve" has only a few points.
    fpr, tpr, thresholds = roc_curve(target, predictions, pos_label=2)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='ROC (area = {0:.2f})'.format(roc_auc), lw=2, color='red')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()


## Export the results to result_KNN.csv
def to_csv(_result, _predictions, _acc_csv):
    # Write each test sample's true label and prediction, plus the best accuracy, to a text file.
    with open('result_KNN.csv', 'w', encoding='utf-8') as f:
        for i in range(len(_result)):
            f.write("target is : {} , prediction_result is : {}".format(_result[i], _predictions[i]))
            f.write("\n")
        f.write("The Best Acc is : {}".format(_acc_csv))


# Run the experiment: try every combination of K (1 to 15) and distance metric.
_acc = 0
best_prediction = 0
for k in range(15):
    for distance_type in range(3):
        KNN = MYKNN(k + 1, distance_type)
        KNN.fit(X_train_list, Y_train_list)
        result = KNN.acc(X_test_list, Y_test_list)
        print("K is {}, distance_type is {}, ACC is {}".format(k + 1, distance_type, result[0]))
        # Keep the predictions from the best-performing combination.
        if result[0] > _acc:
            _acc = result[0]
            best_prediction = result[-1]
to_csv(Y_test_list, best_prediction, _acc)
visualization(Y_test_list, best_prediction)
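
As a rough sanity check on the hand-written classifier, the same arrays can be fed to scikit-learn's built-in implementation. A minimal sketch, assuming the X_train_list, Y_train_list, X_test_list and Y_test_list arrays produced above:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

sk_knn = KNeighborsClassifier(n_neighbors=5)        # Euclidean distance by default
sk_knn.fit(X_train_list, Y_train_list)              # labels are already a 1-D array
sk_pred = sk_knn.predict(X_test_list)
print("sklearn KNN accuracy:", accuracy_score(Y_test_list, sk_pred))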


The loop above tries different values of K and all three distance metrics, prints the accuracy for each combination, exports the best run to result_KNN.csv, and plots its ROC curve.

Origin blog.csdn.net/huiaixing/article/details/123885081