机器学习-P1 手动实现knn算法

算法实现

1,主函数

算出测试数据到每个训练数据的欧氏距离
收集前k个最小距离
获得前k个最小距离所对应数据的标签
返回出现最多次的标签

def kNN_classify(k, x, x_train, y_train):
    """Classify a test sample by majority vote of its k nearest training samples.

    Parameters
    ----------
    k : int -- number of neighbors, 1 <= k <= number of training samples.
    x : 2-D array of shape (1, n_features) -- the test sample.
    x_train : 2-D array of shape (n_samples, n_features) -- training samples.
    y_train : 1-D array of shape (n_samples,) -- labels for x_train.

    Returns
    -------
    The most common label among the k nearest training samples.
    """
    # Validate the arguments.  (The original split each assert across two
    # lines after the comma with no continuation, which is a SyntaxError;
    # each assert must be a single statement.)
    assert 1 <= k <= x_train.shape[0], "k 必须大于零且小于训练样本个数"
    assert x_train.shape[0] == y_train.shape[0], "每一个训练样本必须有一一对应的分类(标签)"
    assert x_train.shape[1] == x.shape[1], "训练样本和测试样本必须有相同的属性个数"

    # Euclidean distance from x to every training sample.
    dataSize = x_train.shape[0]
    New_x = np.tile(x, (dataSize, 1))
    dist = (np.sum((New_x - x_train) ** 2, axis=1)) ** 0.5

    # Indices sorted by ascending distance; collect the k nearest labels.
    nearest = np.argsort(dist)
    topK_y = [y_train[i] for i in nearest[:k]]

    # Return the label that occurs most often among the k nearest neighbors.
    return Counter(topK_y).most_common(1)[0][0]

2,建立自己的kNN算法

import numpy as np
import matplotlib.pyplot as plt
from collections import Counter


class kNNClassifier:
    """A from-scratch k-nearest-neighbors classifier (Euclidean distance,
    majority vote)."""

    def __init__(self, k):
        """Store the number of neighbors to consult."""
        assert k >= 1, "k 必须大于0"
        self.k = k
        self._x_train = None
        self._y_train = None

    def fit(self, x_train, y_train):
        """Memorize the training set (kNN is a lazy learner). Returns self."""
        assert self.k <= x_train.shape[0], "数据集中样本数量必须大于k"
        assert x_train.shape[0] == y_train.shape[0], "每一个训练样本必须有一一对应的分类(标签)"
        self._x_train = x_train
        self._y_train = y_train
        return self

    def predict(self, x_predict):
        """Predict one label per row of x_predict; returns an ndarray of labels."""
        assert self._x_train is not None and self._y_train is not None, "请加入数据训练集"
        assert self._x_train.shape[1] == x_predict.shape[1], "训练样本和测试样本必须有相同的属性个数"
        return np.array([self._predict(sample) for sample in x_predict])

    def _predict(self, x):
        """Classify one sample: majority vote among the k closest training rows."""
        # Broadcasting subtracts x from every training row (same values as
        # the tile-based formulation, no explicit copy).
        diffs = self._x_train - x
        distances = np.sqrt(np.sum(diffs ** 2, axis=1))

        order = np.argsort(distances)
        votes = [self._y_train[idx] for idx in order[:self.k]]

        return Counter(votes).most_common(1)[0][0]

    def __repr__(self):
        return "kNN: k=%d" % self.k

算法评估

1,随机取样 获得训练集&测试集

方法一:将x,y中所有的索引进行乱序,然后再抽取索引对应的样本

将索引进行乱序

# Produce a random permutation of the sample indices 0..len(x)-1.
# (presumably x is the feature matrix loaded earlier -- confirm in the caller)
shuffle_indexes = np.random.permutation(len(x))
shuffle_indexes

这样就获得了一个乱序索引List,然后再根据索引提取数据

# First choose the test-set ratio, then derive the test-set size from it.
test_ratio = 0.2
test_size = int(len(x) * test_ratio)
# Slice the shuffled indices: the first test_size go to the test set,
# the remainder to the training set.
test_indexes = shuffle_indexes[:test_size]
train_indexes = shuffle_indexes[test_size:]

通过索引提取 训练集和测试集

# Training set (fancy indexing with the same index array keeps each x row
# aligned with its y label).
x_train = x[train_indexes]
y_train = y[train_indexes]
# Test set
x_test = x[test_indexes]
y_test = y[test_indexes]
方法二:将x,y拼接在一起,再进行乱序,然后拆成新的x,y
# step1: reshape y into a column vector so it can sit beside x
# (use len(y) instead of the hard-coded 150 so any sample count works)
new_y = y.reshape(len(y), -1)
# step2: horizontally concatenate (append y as the last column of x)
trains = np.concatenate([x, new_y], axis=1)
# step3: shuffle the rows in place (row contents stay intact, only row order changes)
np.random.shuffle(trains)
# step4: split the shuffled matrix back into new x and y
new_x, new_y = np.hsplit(trains, [-1])
# step5: flatten new_y back to a 1-D label vector.
# BUG FIX: reshape returns a new array -- the original `new_y.reshape(150,)`
# discarded its result, leaving new_y as a (n, 1) column.
new_y = new_y.reshape(-1)
随机取样函数实现
def train_test_split(x, y, test_radio=0.2, seed=None):
    """Randomly split (x, y) into a training set and a test set.

    Parameters
    ----------
    x : 2-D array of shape (n_samples, n_features) -- feature matrix.
    y : 1-D array of shape (n_samples,) -- labels aligned with x.
    test_radio : float in [0, 1] -- fraction of samples used for testing.
    seed : optional int -- random seed for a reproducible split.

    Returns
    -------
    (x_train, y_train, x_test, y_test)
    """
    assert x.shape[0] == y.shape[0], "样本个数与标签个数需一致"
    assert 0.0 <= test_radio <= 1.0, "比例范围[0,1]"

    # BUG FIX: `if seed:` silently ignored seed=0; test identity, not truthiness.
    if seed is not None:
        np.random.seed(seed)

    # Shuffle indices, then carve off the first test_size of them for testing.
    shuffle_indexes = np.random.permutation(len(x))
    test_size = int(len(x) * test_radio)

    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]

    # Fancy indexing with the same index arrays keeps x rows and y labels aligned.
    x_train = x[train_indexes]
    y_train = y[train_indexes]
    x_test = x[test_indexes]
    y_test = y[test_indexes]

    return x_train, y_train, x_test, y_test

2,如何找出最好的k值

# Search k in [1, 10] for the best accuracy on the held-out test set.
# (fixed the misspelled local `best_scroe` -> `best_score`)
best_score = 0.0
best_k = -1
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(x_train, y_train)
    score = knn_clf.score(x_test, y_test)
    if score > best_score:
        best_score = score
        best_k = k

print(best_k)
print(best_score)

3,最值归一化

公式:newValue = (oldValue - min)/(max - min)

def autoNorm(dataSet):
    """
    Min-max normalization -- newValue = (oldValue - min)/(max - min)

    Each column (feature) is scaled independently into [0, 1].
    :param dataSet: 2-D array of shape (n_samples, n_features)
    :return: normalized array of the same shape
    NOTE(review): a constant column (max == min) divides by zero -- the
    original had the same behavior; guard upstream if that can occur.
    """
    minValue = dataSet.min(axis=0)
    maxValue = dataSet.max(axis=0)

    # Broadcasting replaces the original np.tile copies (identical values),
    # and the dead np.zeros allocation -- immediately overwritten -- is gone.
    return (dataSet - minValue) / (maxValue - minValue)

4,均值方差归一化 Standardization

new_x = (x - x_mean)/x_std
发布了17 篇原创文章 · 获赞 4 · 访问量 520

猜你喜欢

转载自blog.csdn.net/weixin_46072771/article/details/104865804
今日推荐