文章目录
算法实现
1,主函数
算出测试数据到每个训练数据的欧氏距离
收集前k个最小距离
获得前k个最小距离所对应数据的标签
返回出现最多次的标签
def kNN_classify(k, x, x_train, y_train):
    """Classify one sample by majority vote among its k nearest neighbours.

    Parameters
    ----------
    k : int, number of neighbours, 1 <= k <= number of training samples.
    x : 2-D array of shape (1, n_features) — the sample to classify.
    x_train : 2-D array of shape (n_samples, n_features).
    y_train : 1-D array of shape (n_samples,) — labels for x_train.

    Returns the most common label among the k nearest training samples
    (Euclidean distance).
    """
    # Input validation. BUG FIX: the original split each assert across two
    # lines with no continuation character, which is a SyntaxError in Python.
    # NOTE(review): asserts are stripped under `python -O`; raise ValueError
    # instead for production code.
    assert 1 <= k <= x_train.shape[0], "k 必须大于零且小于训练样本个数"
    assert x_train.shape[0] == y_train.shape[0], "每一个训练样本必须有一一对应的分类(标签)"
    assert x_train.shape[1] == x.shape[1], "训练样本和测试样本必须有相同的属性个数"
    # Euclidean distance from x to every training sample.
    sample_count = x_train.shape[0]  # renamed from the `dateSize` typo
    tiled_x = np.tile(x, (sample_count, 1))
    dist = (np.sum((tiled_x - x_train) ** 2, axis=1)) ** 0.5
    # Indices of training samples sorted by ascending distance.
    nearest = np.argsort(dist)
    topK_y = [y_train[i] for i in nearest[:k]]
    # Majority vote among the k nearest labels.
    return Counter(topK_y).most_common(1)[0][0]
2,建立自己的kNN算法
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
class kNNClassifier:
    """A minimal k-nearest-neighbours classifier.

    Uses Euclidean distance and a majority vote among the k closest
    training samples. Mirrors the fit/predict interface of scikit-learn.
    """

    def __init__(self, k):
        """Store the neighbour count after validating it."""
        assert k >= 1, "k 必须大于0"
        self.k = k
        self._x_train = None
        self._y_train = None

    def fit(self, x_train, y_train):
        """Memorise the training data; kNN has no real training step."""
        assert self.k <= x_train.shape[0], "数据集中样本数量必须大于k"
        assert x_train.shape[0] == y_train.shape[0], "每一个训练样本必须有一一对应的分类(标签)"
        self._x_train = x_train
        self._y_train = y_train
        return self

    def predict(self, x_predict):
        """Predict a label for every row of x_predict.

        x_predict must have the same number of columns as the training
        matrix; returns an ndarray of predicted labels.
        """
        assert self._x_train is not None and self._y_train is not None, "请加入数据训练集"
        assert self._x_train.shape[1] == x_predict.shape[1], "训练样本和测试样本必须有相同的属性个数"
        return np.array([self._predict(sample) for sample in x_predict])

    def _predict(self, x):
        """Label a single 1-D sample via majority vote of its k neighbours."""
        # Broadcasting replaces the original np.tile copy; values identical.
        diffs = self._x_train - x
        distances = (np.sum(diffs ** 2, axis=1)) ** 0.5
        order = np.argsort(distances)
        votes = [self._y_train[idx] for idx in order[: self.k]]
        return Counter(votes).most_common(1)[0][0]

    def __repr__(self):
        return "kNN: k=%d" % self.k
算法评估
1,随机取样 获得训练集&测试集
方法一:将x,y中所有的索引进行乱序,然后再抽取索引对应的样本
将索引进行乱序
# Produce a random permutation of all sample indices; `x` is the feature
# matrix defined earlier in the article (outside this snippet).
shuffle_indexes = np.random.permutation(len(x))
# Bare expression: REPL-style echo of the shuffled index array (no effect in a script).
shuffle_indexes
这样就获得了一个乱序索引List,然后再根据索引提取数据
# First choose the test-set ratio, then derive its absolute size.
test_ratio = 0.2
test_size = int(len(x) * test_ratio)
# Slice the shuffled indices: the first test_size go to the test set,
# the remainder to the training set.
test_indexes = shuffle_indexes[:test_size]
train_indexes = shuffle_indexes[test_size:]
通过索引提取 训练集和测试集
# Training set (NumPy fancy indexing with the shuffled training indices)
x_train = x[train_indexes]
y_train = y[train_indexes]
# Test set
x_test = x[test_indexes]
y_test = y[test_indexes]
方法二:将x,y拼接在一起,再进行乱序,再拆成新的x,y
# step1: reshape y to (150, 1) so it can be stacked horizontally with x
new_y = y.reshape(150, -1)
# step2: append y as the last column of x
trains = np.concatenate([x, new_y], axis=1)
# step3: shuffle the rows in place (row order changes, columns stay aligned)
np.random.shuffle(trains)
# step4: split the shuffled matrix back into features and the label column
new_x, new_y = np.hsplit(trains, [-1])
# step5: restore new_y's original 1-D shape.
# BUG FIX: ndarray.reshape returns a NEW array and never modifies in place,
# so the original bare `new_y.reshape(150,)` silently discarded its result.
new_y = new_y.reshape(150,)
随机取样函数实现
def train_test_split(x, y, test_radio=0.2, seed=None):
    """Randomly split (x, y) into a training set and a test set.

    Parameters
    ----------
    x : array of shape (n_samples, n_features).
    y : array of shape (n_samples,) — labels, row-aligned with x.
    test_radio : float in [0, 1], fraction of samples used for testing.
    seed : optional int for reproducible shuffling.

    Returns (x_train, y_train, x_test, y_test).
    """
    assert x.shape[0] == y.shape[0], "样本个数与标签个数需一致"
    assert 0.0 <= test_radio <= 1.0, "比例范围[0,1]"
    # BUG FIX: `if seed:` silently ignored seed=0 (falsy); compare to None.
    if seed is not None:
        np.random.seed(seed)
    # Shuffle all indices, then carve off the first test_size for the test set.
    shuffle_indexes = np.random.permutation(len(x))
    test_size = int(len(x) * test_radio)
    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]
    # Fancy indexing extracts both splits without copying index logic twice.
    x_train = x[train_indexes]
    y_train = y[train_indexes]
    x_test = x[test_indexes]
    y_test = y[test_indexes]
    return x_train, y_train, x_test, y_test
2,如何找出最好的k值
# Grid-search the best k for kNN on a held-out test set.
# Relies on sklearn's KNeighborsClassifier and on x_train/y_train/x_test/y_test
# produced earlier in the article.
best_score = 0.0  # fixed misspelling: was `best_scroe`
best_k = -1
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(x_train, y_train)
    score = knn_clf.score(x_test, y_test)
    # Keep the first k that achieves the highest accuracy so far.
    if score > best_score:
        best_score = score
        best_k = k
print(best_k)
print(best_score)
3,最值归一化
公式:newValue = (oldValue - min)/(max - min)
def autoNorm(dataSet):
    """Min-max normalize each column: newValue = (oldValue - min) / (max - min).

    :param dataSet: 2-D array of shape (n_samples, n_features).
    :return: array of the same shape with every column scaled into [0, 1].
             NOTE(review): a constant column yields division by zero, as in
             the original — confirm inputs have per-column spread.
    """
    minValue = dataSet.min(axis=0)
    maxValue = dataSet.max(axis=0)
    # Broadcasting replaces the original np.tile copies, and the dead
    # `np.zeros(...)` assignment (immediately overwritten) is removed.
    return (dataSet - minValue) / (maxValue - minValue)
4,均值方差归一化 Standardization
new_x = (x - x_mean)/x_std