1 介绍
kNN(k-Nearest Neighbors)
1.1 算法原理
给定目标值,找出距离它最近的 k 个样本(k 通常取 3 以上),统计这些样本中哪个类别出现得最多,则目标值就属于这个类别。
距离:一般使用欧几里得距离(欧氏距离),公式为:$d = \sqrt{\sum_{i=1}^{n}\left(X_i^{(a)} - X_i^{(b)}\right)^2}$,其中 i 表示第 i 个特征,(a) 表示第 a 个样本。
1.2 算法实现示例
1)主要步骤
- 准备样本数据值,包含数据特征值和数据标签。
- 准备目标值
- 计算目标值与每个样本值的距离:
- 对距离进行排序,获取排序后各距离在原样本集中对应的索引值
- 根据索引值获取原来样本集中相对应的类别
- 汇总每个类别的数目,找出数目最多的类别
- 目标值属于数目最多的类别
1.3 判断机器学习算法的性能
1)训练数据与测试数据
- 训练数据用于产生模型。
- 测试数据,将其放入模型中运行,来判断模型的好坏,便于在模型进入真实环境前进行修改。
2)判断性能
- 通过分类的准确度来判断算法的性能如何。
- 公式:真实值与预测值相同的个数 / 真实值的个数
3)寻找最好的k
- 超参数:在算法运行前需要决定的参数。
- 模型参数:算法过程中学习的参数。
使用不同的k,结果也会有所不同,分类的准确度也会产生变化。所以通过测试不同的k的结果来找出使得结果最好的参数k。
4)网格搜索
下面代码在 jupyter notebook 中运行。
# Jupyter-notebook demo: tune a KNeighborsClassifier on the sklearn digits
# dataset with GridSearchCV.  The `%%time` line below is IPython cell magic
# and only works at the top of a notebook cell, not in a plain .py file.
import numpy as np
from sklearn import datasets # scikit-learn library
digits=datasets.load_digits() # load the bundled handwritten-digits dataset
x=digits.data
y=digits.target
from sklearn.model_selection import train_test_split
# NOTE(review): "y_trian" is a consistent typo for "y_train", kept as-is.
x_train,x_test,y_trian,y_test=train_test_split(x,y,test_size=0.2,random_state=666)
from sklearn.neighbors import KNeighborsClassifier
knn_clf=KNeighborsClassifier(n_neighbors=4)
knn_clf.fit(x_train,y_trian)
knn_clf.score(x_test,y_test)
# Grid search over the hyper-parameter space: 'uniform' weighting tries only
# k; 'distance' weighting additionally tries the Minkowski exponent p.
param_grid=[{'weights':['uniform'],
'n_neighbors':[i for i in range(1,11)]},
{'weights':['distance'],
'n_neighbors':[i for i in range(1,11)],
'p':[i for i in range(1,6)]}]
knn_clf=KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV
grid_search=GridSearchCV(knn_clf,param_grid) # other useful args: n_jobs (number of CPU cores to use), verbose (how much progress output to print)
%%time
grid_search.fit(x_train,y_trian)
grid_search.best_estimator_
grid_search.best_score_
grid_search.best_params_
knn_clf=grid_search.best_estimator_
knn_clf.score(x_test,y_test)
除了参数k,还有其他一些参数。
- 距离:除了欧几里得距离外,还有其他距离度量,例如闵可夫斯基距离(Minkowski distance)。
- 闵可夫斯基距离公式中的 p 参数(p=1 时为曼哈顿距离,p=2 时为欧氏距离)。
1.4 数据归一化(Feature Scaling)
数据归一化:将所有的数据映射到同一尺度。
1)最值归一化(Normalization)
- 将所有的数据映射到0-1之间
- 适用于分布有明显边界的情况
- 公式:$x_{scale} = \dfrac{x - x_{min}}{x_{max} - x_{min}}$
2)均值方差归一化(Standardization)
- 把所有数据归一到均值为0方差为1的分布中
- 适用于分布没有明显边界,有可能存在极端值的情况
- 公式:$x_{scale} = \dfrac{x - \bar{x}}{s}$,其中 $\bar{x}$ 为均值,s 为标准差。
3)对测试数据集进行归一化
公式:$x_{test\_scale} = \dfrac{x_{test} - mean_{train}}{std_{train}}$,其中 x_test 为测试数据集数据,mean_train 为训练数据集的均值,std_train 为训练数据集的标准差。
2 代码
2.1 kNN算法代码 kNN.py
import numpy as np
from math import sqrt
from collections import Counter
from .testCapability import accuracy_score
class KNNClassfier:
    """k-nearest-neighbors classifier with a scikit-learn-like fit/predict/score API.

    NOTE(review): the (misspelled) public class name is kept so existing
    callers keep working.
    """

    def __init__(self, k):
        """Initialize the classifier with the number of neighbors k (k >= 1)."""
        assert k >= 1, "k must be valid"
        self.k = k
        self._x_train = None  # training feature matrix, set by fit()
        self._y_train = None  # training labels, set by fit()

    def fit(self, x_train, y_train):
        """Store the training set; kNN is a lazy learner, so no real training happens.

        x_train: 2-D array (n_samples, n_features); y_train: 1-D array (n_samples,).
        Returns self to allow chaining.
        """
        assert x_train.shape[0] == y_train.shape[0], \
            "the size of x_train must be equal to the y_train"
        # Fixed typo in the original message: "as least" -> "at least".
        assert self.k <= x_train.shape[0], \
            "the size of x_train must be at least k."
        self._x_train = x_train
        self._y_train = y_train
        return self

    def predict(self, x_predict):
        """Predict a label for every row of x_predict; returns a 1-D ndarray."""
        assert self._x_train is not None and self._y_train is not None, \
            "must fit before predict."
        assert x_predict.shape[1] == self._x_train.shape[1], \
            "the feature number of x_predict must be equal to x_train"
        y_predict = [self._predict(x) for x in x_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Predict the label of a single sample x (1-D feature vector)."""
        assert x.shape[0] == self._x_train.shape[1], \
            "the feature number of x must be equal to x_train "
        # Euclidean distance to every training sample in one vectorized
        # NumPy pass (replaces the original per-row Python loop).
        distances = np.sqrt(np.sum((self._x_train - x) ** 2, axis=1))
        # Indices of training samples sorted by ascending distance.
        nearest = np.argsort(distances)
        # Labels of the k nearest neighbors.
        topk_y = [self._y_train[i] for i in nearest[:self.k]]
        # Majority vote: the most common label among the k neighbors wins.
        votes = Counter(topk_y)
        return votes.most_common(1)[0][0]

    def score(self, x_test, y_test):
        """Return classification accuracy of the fitted model on the test set."""
        y_predict = self.predict(x_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return "KNN(k=%d)" % self.k
2.2 将数据集分为训练和测试 model_selection.py
import numpy as np
def train_test_split(x, y, test_reaio=0.2, seed=None):
    """Split x and y into random train and test subsets.

    Parameters
    ----------
    x : ndarray, shape (n_samples, n_features) -- feature matrix.
    y : ndarray, shape (n_samples,) -- labels aligned with x.
    test_reaio : float -- fraction of samples to place in the test set.
        NOTE(review): the parameter name keeps the original typo ("reaio")
        so any keyword callers keep working; it means "test_ratio".
    seed : int or None -- seed for reproducible shuffling.

    Returns
    -------
    x_train, x_test, y_train, y_test
    """
    # Bug fix: the original read ``x.shapq[0]``, which raised AttributeError.
    assert x.shape[0] == y.shape[0], "the size of x must be equal to the y"
    assert 0.0 <= test_reaio <= 1.0, "test_ratio must be valid"
    # Bug fix: ``if seed:`` silently ignored seed=0; test against None instead.
    if seed is not None:
        np.random.seed(seed)
    shuffled_indexes = np.random.permutation(len(x))
    test_size = int(len(x) * test_reaio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]
    x_train = x[train_indexes]
    y_train = y[train_indexes]
    x_test = x[test_indexes]
    y_test = y[test_indexes]
    return x_train, x_test, y_train, y_test
2.3 归一化处理 my_feature_scaling.py
import numpy as np
class StandardScaler:
    """Standardize features column-wise to zero mean and unit variance."""

    def __init__(self):
        self.mean_ = None   # per-feature mean, learned by fit()
        self.scale_ = None  # per-feature standard deviation, learned by fit()

    def fit(self, x):
        """Learn per-column mean and standard deviation from training data x.

        x must be a 2-D array (n_samples, n_features). Returns self.
        """
        assert x.ndim == 2, "the dimension of x must be 2"
        # Vectorized: one NumPy reduction per statistic instead of the
        # original Python loop over columns (identical results).
        self.mean_ = np.mean(x, axis=0)
        self.scale_ = np.std(x, axis=0)
        return self

    def transform(self, x):
        """Return (x - mean_) / scale_ computed column-wise.

        NOTE(review): a constant column gives scale_ == 0 and hence a
        division-by-zero warning, as in the original -- confirm whether
        such columns can occur before hardening.
        """
        assert x.ndim == 2, "the dimension of x must be 2"
        assert self.mean_ is not None and self.scale_ is not None, "must fit before transform."
        assert x.shape[1] == len(self.mean_), "the feature num of x must be equal to mean_ and std_"
        # Broadcasting replaces the original per-column loop.
        return (x - self.mean_) / self.scale_
2.4 判断性能 testCapability.py
import numpy as np
from math import sqrt
def accuracy_score(y_true, y_predict):
    """Return the accuracy: the fraction of entries where y_predict equals y_true.

    Both arguments are 1-D array-likes of the same length.
    """
    assert len(y_true) == len(y_predict), \
        "the size of y_true must be equal to the size of y_predict"
    total = len(y_true)
    correct = np.sum(y_true == y_predict)
    return correct / total
2.5 测试 test.py
import numpy as np
import kNN_pro.kNN
from kNN_pro.model_selection import train_test_split
import kNN_pro.my_feature_scaling

if __name__ == '__main__':
    # Toy data set: three samples of class 0 and three of class 1.
    raw_data_x = [
        [1, 2],
        [1, 0],
        [1, 1],
        [3, 1],
        [3, 3],
        [4, 4],
    ]
    raw_data_y = [0, 0, 0, 1, 1, 1]
    # Convert the Python lists to numpy arrays.
    features = np.array(raw_data_x)
    labels = np.array(raw_data_y)
    # The single sample we want to classify.
    target = np.array([[3, 3.5]])
    # Split the raw data into a training set and a test set.
    x_train, x_test, y_train, y_test = train_test_split(features, labels)
    # Standardize the features (zero mean, unit variance).
    scaler = kNN_pro.my_feature_scaling.StandardScaler()
    scaler.fit(x_train)
    print("训练集的均值方差归一化为:%s" % scaler.transform(x_train))
    print("测试集的均值方差归一化为:%s" % scaler.transform(x_test))
    # Classify the target with kNN on the raw (unscaled) features.
    clf = kNN_pro.kNN.KNNClassfier(k=3)
    clf.fit(x_train, y_train)
    print("目标值的预测分类是:%s" % clf.predict(target))
    print("训练模型的准确度为:%s" % clf.score(x_test, y_test))
    # Repeat the classification on the standardized features.
    clf.fit(scaler.transform(x_train), y_train)
    print(clf.predict(target))
    print(clf.score(scaler.transform(x_test), y_test))
    # Scan k in [1, 5] and keep the first k with the highest accuracy.
    best_score, best_k = 0.0, -1
    for candidate_k in range(1, 6):
        candidate = kNN_pro.kNN.KNNClassfier(k=candidate_k)
        candidate.fit(x_train, y_train)
        acc = candidate.score(x_test, y_test)
        if acc > best_score:
            best_k, best_score = candidate_k, acc
    print("best_k=", best_k)
    print("best_score=", best_score)
运行结果:
训练集的均值方差归一化为:[[ 1.33333333 1.41421356]
[ 0.5 0.70710678]
[ 0.5 -0.70710678]
[-1.16666667 -1.41421356]
[-1.16666667 0. ]]
测试集的均值方差归一化为:[[-1.16666667 -0.70710678]]
目标值的预测分类是:[1]
训练模型的准确度为:1.0
[1]
1.0
best_k= 1
best_score= 1.0
3 总结
1)通过测试数据来模拟真实环境,很有可能无法得到所有测试数据的均值和方差。
2)kNN算法可以解决分类、多分类问题,也可以解决回归问题,它思想简单、效果强大。
3)kNN算法缺点:
- 效率低下,时间复杂度高,为O(m*n),m是样本数,n是特征数。
- 高度的数据相关性。
- 预测结果具有不可解释性。
- 维数灾难(随着维数的增加,“看似相近”的两个点的距离就越来越大)。