Machine Learning Series (2): kNN (k-Nearest Neighbors), with scikit-learn 2020.6.4

Preface

This installment covers the kNN algorithm,
arguably the simplest and most fundamental algorithm in machine learning

  • The idea is extremely simple
  • It works well in practice
  • It can illustrate many details of machine learning

This installment includes

  • Implementing the underlying logic ourselves
  • Using the scikit-learn library
  • Using kNN to explore some finer points of machine learning

1. A simple implementation of kNN

Put simply, the idea of kNN is:
whichever class dominates among the k nearest samples is the most probable class for the new point

  • You can say kNN has no model at all
  • or equally that the training set itself is the model

[Figure: kNN voting] With k set to 3, the green point is the new sample; 2 of its 3 nearest neighbors are red and 1 is blue, so the green point is classified as red.

Writing the logic ourselves, a simple implementation looks like this:

import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter

# raw data set
raw_data_X = [[3.393533211, 2.331273381],
              [3.110073483, 1.781539638],
              [1.343808831, 3.368360954],
              [3.582294042, 4.679179110],
              [2.280362439, 2.866990263],
              [7.423436942, 4.696522875],
              [5.745051997, 3.533989803],
              [9.172168622, 2.511101045],
              [7.792783481, 3.424088941],
              [7.939820817, 0.791637231]
             ]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
# use as the training set
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)
# the new point
x = np.array([8.093607318, 3.365731514])
# plot the data
plt.scatter(X_train[y_train==0,0], X_train[y_train==0,1], color='g')
plt.scatter(X_train[y_train==1,0], X_train[y_train==1,1], color='r')
plt.scatter(x[0], x[1], color='b')
plt.show()

# the kNN procedure
distances = [sqrt(np.sum((x_train - x)**2)) for x_train in X_train] # Euclidean distances
nearest = np.argsort(distances) # indices sorted by distance
k = 6 # choose k
topK_y = [y_train[neighbor] for neighbor in nearest[:k]] # labels of the k nearest points
votes = Counter(topK_y) # vote counts for each label
predict_y = votes.most_common(1)[0][0] # the label with the most votes
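
For this toy data set, five of the six nearest neighbors belong to class 1, so the vote lands on class 1:

print(predict_y) # 1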

Wrapped up as a callable function:

import numpy as np
from math import sqrt
from collections import Counter

"""进行函数封装"""
# 原始版
"""
def kNN_classify(k, X_train, y_train, x):
    # validate the inputs
    assert 1 <= k <= X_train.shape[0], "k must be valid"
    assert X_train.shape[0] == y_train.shape[0], \
        "the size of X_train must equal to the size of y_train"
    assert X_train.shape[1] == x.shape[0], \
        "the feature number of x must be equal to X_train"
    # the kNN computation
    distances = [sqrt(np.sum((x_train - x)**2)) for x_train in X_train]
    nearest = np.argsort(distances)
    topK_y = [y_train[i] for i in nearest[:k]]
    votes = Counter(topK_y)
    return votes.most_common(1)[0][0]
    """
    
# 优化版
class KNNClassifier:
    def __init__(self, k):
        """初始化kNN分类器"""
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None
        self._y_train = None
        
    def fit(self, X_train, y_train):
        """根据训练数据集X_train和y_train训练kNN分类器"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."
        self._X_train = X_train
        self._y_train = y_train
        return self
        
    def predict(self, X_predict):
        """给定待预测数据集X_predict,返回表示X_predict的结果向量"""
        assert self._X_train is not None and self._y_train is not None, \
                "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
                "the feature number of X_predict must be equal to X_train"
        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)
        
    def _predict(self, x):
        """给定单个待预测数据x,返回x的预测结果值"""
        assert x.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"
        distances = [sqrt(np.sum((x_train - x) ** 2))
                     for x_train in self._X_train]
        nearest = np.argsort(distances)
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]
        
    def __repr__(self):
        return "KNN(k=%d)" % self.k

Using scikit-learn's implementation instead:

from sklearn.neighbors import KNeighborsClassifier
import numpy as np

"""调用scikit里的函数"""
# 原始数据集
raw_data_X = [[3.393533211, 2.331273381],
              [3.110073483, 1.781539638],
              [1.343808831, 3.368360954],
              [3.582294042, 4.679179110],
              [2.280362439, 2.866990263],
              [7.423436942, 4.696522875],
              [5.745051997, 3.533989803],
              [9.172168622, 2.511101045],
              [7.792783481, 3.424088941],
              [7.939820817, 0.791637231]
             ]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
# use as the training set
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)
# the new point
x = np.array([8.093607318, 3.365731514])

# kNN
kNN_classifier = KNeighborsClassifier(n_neighbors=6) # the k value
kNN_classifier.fit(X_train, y_train) # fit
# kNN_classifier.predict(x) # a 1-D array raises an error
X_predict = x.reshape(1, -1) # reshape into a 2-D array (one row per sample)
y_predict = kNN_classifier.predict(X_predict)
print(y_predict[0])

2. Evaluating performance

We need to evaluate how well the algorithm performs

  • First hold out part of the data as a test set
  • Then measure performance on that set, e.g. classification accuracy

Splitting off a test set

A hand-rolled train-test split looks like this:

import numpy as np
from sklearn import datasets

"""分离出train和test的数据集"""
# 鸢尾花数据集
iris = datasets.load_iris()
X = iris.data
y = iris.target
# shuffle the indices first
shuffled_indexes = np.random.permutation(len(X))
# size of the test set
test_ratio = 0.2
test_size = int(len(X) * test_ratio)
# assign the indices
test_indexes = shuffled_indexes[:test_size]
train_indexes = shuffled_indexes[test_size:]
# split
X_train = X[train_indexes]
y_train = y[train_indexes]
X_test = X[test_indexes]
y_test = y[test_indexes]
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

Wrapped into a reusable function:

import numpy as np

"""进行train-test分离函数封装"""
def train_test_split(X, y, test_ratio=0.2, seed=None):
    """将数据 X 和 y 按照test_ratio分割成X_train, X_test, y_train, y_test"""
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"
    if seed is not None: # 'if seed:' would silently ignore seed=0
        np.random.seed(seed)
    # shuffle the indices first
    shuffled_indexes = np.random.permutation(len(X))
    # size of the test set
    test_size = int(len(X) * test_ratio)
    # assign the indices
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]
    # split
    X_train = X[train_indexes]
    y_train = y[train_indexes]
    X_test = X[test_indexes]
    y_test = y[test_indexes]
    return X_train, X_test, y_train, y_test
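
A quick check of this wrapper, reusing the iris X and y loaded above (150 samples, 4 features):

X_train, X_test, y_train, y_test = train_test_split(X, y, test_ratio=0.2, seed=666)
print(X_train.shape, X_test.shape) # (120, 4) (30, 4)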

Using scikit-learn's implementation:

import numpy as np
from sklearn import datasets
"""分离出train和test的数据集"""
# 鸢尾花数据集
iris = datasets.load_iris()
X = iris.data
y = iris.target
"""scikit里的函数"""
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

Measuring accuracy

Wrapped as a function:

import numpy as np

"""accuracy封装"""
def accuracy_score(y_true, y_predict):
    '''Compute the accuracy of y_predict against y_true'''
    assert y_true.shape[0] == y_predict.shape[0], \
        "the size of y_true must be equal to the size of y_predict"
    return sum(y_true == y_predict) / len(y_true)
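
A minimal sketch of this wrapper in use, reusing the KNNClassifier and the train-test split built earlier:

my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(X_train, y_train)
y_predict = my_knn_clf.predict(X_test)
print(accuracy_score(y_test, y_predict))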

Using scikit-learn:

"""scikit中的accuracy"""
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
y_predict = knn_clf.predict(X_test)
print(accuracy_score(y_test, y_predict))
print(knn_clf.score(X_test, y_test)) # same result without calling predict explicitly

3. Hyperparameters

  • Hyperparameters: parameters that must be specified before the algorithm runs
  • Model parameters: parameters learned during training

"Parameter tuning" refers to tuning the hyperparameters.

kNN's hyperparameters are k and the distance weighting; it has no model parameters.
The exponent p of the Minkowski distance is also worth considering.

The Minkowski distance between points a and b is (Σ_i |x_i(a) - x_i(b)|^p)^(1/p): p = 1 gives the Manhattan distance, p = 2 the Euclidean distance.

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

"""寻找最好的超参数k、距离权重和明可夫斯基距离的p"""
# 数据集准备
digits = datasets.load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
# kNN
"""
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
knn_clf.score(X_test, y_test)
"""

# search for the best k, ignoring distance weights
best_score = 0.0
best_k = -1
for k in range(1, 11):
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    score = knn_clf.score(X_test, y_test)
    if score > best_score:
        best_k = k
        best_score = score
print("best_k =", best_k)
print("best_score =", best_score)

# now take distance weights into account
best_score = 0.0
best_k = -1
best_method = ""
for method in ["uniform", "distance"]:
    for k in range(1, 11):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights=method)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_k = k
            best_score = score
            best_method = method
print("best_method =", best_method)
print("best_k =", best_k)
print("best_score =", best_score)

# search for the best p
best_score = 0.0
best_k = -1
best_p = -1
for k in range(1, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_k = k
            best_p = p
            best_score = score
print("best_k =", best_k)
print("best_p =", best_p)
print("best_score =", best_score)

# grid search
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]
knn_clf = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2) # n_jobs: CPU cores to use; -1 means all
grid_search.fit(X_train, y_train)
knn_clf = grid_search.best_estimator_ # the best classifier found
print(grid_search.best_score_) # the best cross-validated accuracy
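
GridSearchCV also exposes best_params_ with the winning combination, and by default it refits the best estimator on the whole training set, so it can be scored on the held-out data:

print(grid_search.best_params_) # the winning hyperparameter combination
print(knn_clf.score(X_test, y_test)) # accuracy of the best classifier on the test set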

4. Feature scaling

Map all features onto a common scale so that no single feature dominates the distance.

  • Min-max scaling (normalization): maps values into [0, 1]; suited to features with clear boundaries
  • Mean-variance scaling (standardization): maps values to a distribution with mean 0 and variance 1; suited to features without clear boundaries

How the two scalings work

Min-max scaling rescales each feature to x_scale = (x - x_min) / (x_max - x_min):

import numpy as np
import matplotlib.pyplot as plt

# min-max scaling (normalization)
# on a vector
x = np.random.randint(0, 100, 100)
x = (x - np.min(x)) / (np.max(x) - np.min(x))
print(x)
# on a matrix
X = np.random.randint(0, 100, (50, 2))
X = np.array(X, dtype=float) # cast to float first
X[:,0] = (X[:,0] - np.min(X[:,0])) / (np.max(X[:,0]) - np.min(X[:,0]))
X[:,1] = (X[:,1] - np.min(X[:,1])) / (np.max(X[:,1]) - np.min(X[:,1]))
print(X)
plt.scatter(X[:,0], X[:,1])
plt.show()

Mean-variance scaling rescales each feature to x_scale = (x - x_mean) / s, where x_mean and s are the feature's mean and standard deviation:

import numpy as np
import matplotlib.pyplot as plt
# mean-variance scaling (standardization)
X2 = np.random.randint(0, 100, (50, 2))
X2 = np.array(X2, dtype=float)
X2[:,0] = (X2[:,0] - np.mean(X2[:,0])) / np.std(X2[:,0])
X2[:,1] = (X2[:,1] - np.mean(X2[:,1])) / np.std(X2[:,1])
print(X2)
plt.scatter(X2[:,0], X2[:,1])
plt.show()
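
A quick sanity check: each column should now have mean approximately 0 and standard deviation 1.

print(np.mean(X2[:,0]), np.std(X2[:,0])) # ~0.0, 1.0
print(np.mean(X2[:,1]), np.std(X2[:,1]))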

Using scaling with kNN

Note: normalize the test set with the mean and standard deviation of the training set.

Wrapped as a class:

import numpy as np

"""归一化的函数封装"""
class StandardScaler:
    def __init__(self):
        self.mean_ = None
        self.scale_ = None
        
    def fit(self, X):
        """根据训练数据集X获得数据的均值和方差"""
        assert X.ndim == 2, "The dimension of X must be 2"
        self.mean_ = np.array([np.mean(X[:,i]) for i in range(X.shape[1])])
        self.scale_ = np.array([np.std(X[:,i]) for i in range(X.shape[1])])
        return self
        
    def transform(self, X):
        """将X根据这个StandardScaler进行均值方差归一化处理"""
        assert X.ndim == 2, "The dimension of X must be 2"
        assert self.mean_ is not None and self.scale_ is not None, \
               "must fit before transform!"
        assert X.shape[1] == len(self.mean_), \
               "the feature number of X must be equal to mean_ and std_"
        resX = np.empty(shape=X.shape, dtype=float)
        for col in range(X.shape[1]):
            resX[:,col] = (X[:,col] - self.mean_[col]) / self.scale_[col]
        return resX
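
A minimal sketch of this class in use, assuming X_train and X_test from an earlier split; the key point from above is that transform reuses the training statistics for both sets:

my_scaler = StandardScaler()
my_scaler.fit(X_train) # learn mean/std from the training set only
X_train_std = my_scaler.transform(X_train)
X_test_std = my_scaler.transform(X_test) # the test set is scaled with the training statistics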

Using scikit-learn's implementation:

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

"""scikit中的归一化scaler"""
# 数据集准备
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

# StandardScaler in scikit-learn
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
standardScaler.fit(X_train)
print(standardScaler.mean_) # the learned means
print(standardScaler.scale_) # the learned standard deviations
X_train = standardScaler.transform(X_train)
X_test_standard = standardScaler.transform(X_test)
print(X_train)
print(X_test_standard)

# kNN
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
score = knn_clf.score(X_test_standard, y_test) # note: do not pass in the unscaled test data here!
print(score)
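
As an aside, scikit-learn's Pipeline can chain the scaler and the classifier so that the training statistics are reused automatically; a sketch, assuming the raw (unscaled) X_train / X_test from train_test_split:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=3)),
])
pipe.fit(X_train, y_train) # the scaler is fit on the training data only
print(pipe.score(X_test, y_test)) # test data is scaled with the training statistics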

Conclusion

This installment walked through the idea, theory, and implementation of kNN fairly completely,
and used kNN to introduce performance evaluation, hyperparameters, and feature scaling

kNN itself is simple and effective, but it has drawbacks:

  • Low efficiency: with m samples and n features, predicting a single point costs O(m*n)
  • Its predictions are not interpretable
  • The curse of dimensionality: as the dimension grows, distances to the "nearest" neighbors become ever larger


Reposted from blog.csdn.net/weixin_44604541/article/details/106543573