Machine Learning (1) - The k-Nearest Neighbors Algorithm

Writing the KNN function

import numpy as np
from math import sqrt
from collections import Counter

def KNN_classify(k, X_train, y_train, x):
    assert 1 <= k < X_train.shape[0], "k must be valid"
    assert X_train.shape[0] == y_train.shape[0], \
        "the size of X_train must equal the size of y_train"
    assert X_train.shape[1] == x.shape[0], \
        "the feature number of x must be equal to X_train"

    # Euclidean distance from x to every training sample
    distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]
    nearest = np.argsort(distances)
    topK_y = [y_train[i] for i in nearest[:k]]
    votes = Counter(topK_y)
    return votes.most_common(1)[0][0]  # return the majority label
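
A quick sanity check of the function with made-up data (all sample values here are hypothetical):

X_train = np.array([[1.0, 2.0], [1.5, 1.8], [5.0, 8.0], [6.0, 9.0]])
y_train = np.array([0, 0, 1, 1])
x = np.array([1.2, 1.9])  # query point near the class-0 cluster
print(KNN_classify(3, X_train, y_train, x))  # -> 0, the majority label among the 3 nearest neighbors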

Using KNN in scikit-learn

from sklearn.neighbors import KNeighborsClassifier
import numpy as np
KNN_classifier=KNeighborsClassifier(n_neighbors=6) # pass in the value of k
# training data made up for illustration
x_train=np.arange(0,100).reshape(-1,2) # x is a matrix
y_train=np.random.randint(0,2,50) # y is an array
KNN_classifier.fit(x_train,y_train) # pass in the training set
x=np.array([1,3]) # test data
x=x.reshape(1,-1) # predict() only accepts a matrix
y=KNN_classifier.predict(x)[0] # only one test sample was passed, so take [0]

Writing a KNN class

KNN.py

import numpy as np
from math import sqrt
from collections import Counter

class KNNClassifier:
    def __init__(self, k):
        """Initialize the kNN classifier"""
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None  # private attributes
        self._y_train = None

    def fit(self, X_train, y_train):
        """根据训练数据集X_train和y_train训练kNN分类器"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."

        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """给定待预测数据集X_predict,返回表示X_predict的结果向量"""
        assert self._X_train is not None and self._y_train is not None, \
                "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
                "the feature number of X_predict must be equal to X_train"

        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """给定单个待预测数据x,返回x的预测结果值"""
        assert x.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"

        distances = [sqrt(np.sum((x_train - x) ** 2))
                     for x_train in self._X_train]
        nearest = np.argsort(distances)

        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(topK_y)

        return votes.most_common(1)[0][0]

    def __repr__(self):  # self-description, printed when the object is displayed
        return "KNN(k=%d)" % self.k

Testing algorithm accuracy

model_selection.py:

import numpy as np


def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Split X and y into X_train, X_test, y_train, y_test according to test_ratio"""
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"

    if seed is not None:  # fix the random seed, convenient for debugging
        np.random.seed(seed)

    shuffled_indexes = np.random.permutation(len(X))  # len(matrix) is the number of rows

    test_size = int(len(X) * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test

You can try calling it in a Jupyter notebook:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

iris = datasets.load_iris()
X=iris.data
y=iris.target

%run F:/python3玩转机器学习/K近邻算法/model_selection.py

X_train, X_test, y_train, y_test=train_test_split(X,y,test_ratio=0.2)

%run F:/python3玩转机器学习/K近邻算法/KNN.py

my_knn_clf = KNNClassifier(k=3)  # instantiate the classifier; k=3 is an arbitrary choice

my_knn_clf.fit(X_train,y_train)

y_predict=my_knn_clf.predict(X_test)

sum(y_predict==y_test)/len(y_test)  # fraction of correct predictions, i.e. the accuracy

model_selection in scikit-learn:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2,random_state=666)

Classification accuracy

Write your own package

Write metrics.py:

import numpy as np


def accuracy_score(y_true, y_predict):
    '''Compute the accuracy between y_true and y_predict'''
    assert y_true.shape[0] == y_predict.shape[0], \
        "the size of y_true must be equal to the size of y_predict"

    return sum(y_true == y_predict) / len(y_true)
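
A quick check of this function with hypothetical labels:

y_true = np.array([0, 1, 1, 0, 1])
y_predict = np.array([0, 1, 0, 0, 1])
print(accuracy_score(y_true, y_predict))  # 0.8 (4 of 5 correct)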

Call it in KNN.py:

import numpy as np
from math import sqrt
from collections import Counter
from K近邻算法包.metrics import accuracy_score

class KNNClassifier:
    def __init__(self, k):
        """Initialize the kNN classifier"""
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None  # private attributes
        self._y_train = None

    def fit(self, X_train, y_train):
        """根据训练数据集X_train和y_train训练kNN分类器"""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."

        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """给定待预测数据集X_predict,返回表示X_predict的结果向量"""
        assert self._X_train is not None and self._y_train is not None, \
                "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
                "the feature number of X_predict must be equal to X_train"

        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """给定单个待预测数据x,返回x的预测结果值"""
        assert x.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"

        distances = [sqrt(np.sum((x_train - x) ** 2))
                     for x_train in self._X_train]
        nearest = np.argsort(distances)

        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(topK_y)

        return votes.most_common(1)[0][0]

    def score(self, X_test, y_test):
        """根据测试数据集 X_test 和 y_test 确定当前模型的准确度"""

        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):  # self-description, printed when the object is displayed
        return "KNN(k=%d)" % self.k


Accuracy testing with scikit-learn

data=datasets.load_digits()

X=data.data

y=data.target

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

from sklearn.neighbors import KNeighborsClassifier

KNN_clf=KNeighborsClassifier(n_neighbors=3)

KNN_clf.fit(X_train,y_train)

y_predict=KNN_clf.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_predict), which is equivalent to KNN_clf.score(X_test, y_test)

Hyperparameters

Hyperparameters: parameters that must be decided before running the algorithm

Model parameters: parameters learned while the algorithm runs

KNN has no model parameters; k is a typical hyperparameter of KNN

On the handwritten-digits dataset above, brute-force search for the best k:

best_score=0.0
best_k=-1
for k in range(1,11):
    knn_clf=KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train,y_train)
    score=knn_clf.score(X_test,y_test)
    if score>best_score:
        best_k=k
        best_score=score

print("best_k=",best_k)
print("best_score=",best_score)

Sometimes the distance weights matter. (figure: the single nearest neighbor is red, while the other nearby neighbors are blue)

The nearest point is red, so instead of a plain majority vote we can weight each neighbor's vote by the inverse of its distance and sum the weights per class.
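
A minimal sketch of this inverse-distance voting, using hypothetical labels and distances:

from collections import Counter

# hypothetical 3 nearest neighbors as (label, distance) pairs
neighbors = [("red", 1.0), ("blue", 3.0), ("blue", 4.0)]

weights = Counter()
for label, d in neighbors:
    weights[label] += 1.0 / d  # each neighbor votes with weight 1/distance

print(weights.most_common(1)[0][0])  # "red": 1.0 beats "blue": 1/3 + 1/4 ≈ 0.58

In scikit-learn this corresponds to passing weights="distance" to KNeighborsClassifier; the default weights="uniform" gives every neighbor an equal vote.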

Minkowski distance: d(x(a), x(b)) = (Σ_i |x_i(a) - x_i(b)|^p)^(1/p), where the sum runs over all n features

This introduces another hyperparameter, p.
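
A sketch of the Minkowski distance in NumPy (minkowski_distance is a hypothetical helper, not part of the code above); p=1 recovers the Manhattan distance and p=2 the Euclidean distance:

import numpy as np

def minkowski_distance(a, b, p):
    """Minkowski distance between two vectors."""
    return np.sum(np.abs(a - b) ** p) ** (1 / p)

a = np.array([0.0, 0.0])
b = np.array([3.0, 4.0])
print(minkowski_distance(a, b, 1))  # 7.0 (Manhattan)
print(minkowski_distance(a, b, 2))  # 5.0 (Euclidean)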

Find the best p and k (grid search):

%%time

best_p=-1
best_score=0.0
best_k=-1
for k in range(1,11):
    for p in range(1,6):
        knn_clf=KNeighborsClassifier(n_neighbors=k,weights="distance",p=p)  # p only takes effect with distance weighting
        knn_clf.fit(X_train,y_train)
        score=knn_clf.score(X_test,y_test)
        if score > best_score:
            best_k=k
            best_p=p
            best_score=score
            
print("best_p=",best_p)
print("best_k=",best_k)
print("best_score=",best_score)

Using scikit-learn's grid search:

Define the parameter grid:

param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]

Initialize a classifier object:

knn_clf=KNeighborsClassifier()

Import grid search:

from sklearn.model_selection import GridSearchCV

Instantiation:

grid_search = GridSearchCV(knn_clf,param_grid)

Fitting:

%%time
grid_search.fit(X_train,y_train)

Inspect the best estimator, best score, and best parameters:

grid_search.best_estimator_

grid_search.best_score_

grid_search.best_params_

Assign the optimal classifier to knn_clf:

knn_clf=grid_search.best_estimator_

knn_clf.score(X_test,y_test)

To parallelize and display progress information:

grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2)  # n_jobs=-1 uses all CPU cores in parallel; verbose=2 prints detailed progress

%%time
grid_search.fit(X_train,y_train)

Data normalization

Map all data to the same scale.

Min-max normalization (normalization)

Maps all data into the interval [0, 1]:

x_scale = (x - x_min) / (x_max - x_min)

Suitable for distributions with clear boundaries; strongly affected by outliers.

For a vector:

x1=np.random.randint(0,100,size=100)

(x1 - np.min(x1)) / (np.max(x1) - np.min(x1))

For a matrix:

X=np.random.randint(0,100,(50,2))

X=np.array(X,dtype=float)

Min-max normalize each column:

X[:,0] = (X[:,0] - np.min(X[:,0])) / (np.max(X[:,0]) - np.min(X[:,0]))

X[:,1] = (X[:,1] - np.min(X[:,1])) / (np.max(X[:,1]) - np.min(X[:,1]))

Draw a scatter plot:

plt.scatter(X[:,0],X[:,1])
plt.show()

Mean-variance normalization (standardization)

Maps all data to a distribution with mean 0 and variance 1.

Suitable for data without clear boundaries; not affected by extreme values.

x_scale = (x - x_mean) / S

where S is the standard deviation.

Example:

x2=np.random.randint(0,100,(50,2))

x2=np.array(x2,dtype=float)

x2[:,0]=(x2[:,0]-np.mean(x2[:,0]))/np.std(x2[:,0])

x2[:,1]=(x2[:,1]-np.mean(x2[:,1]))/np.std(x2[:,1])

plt.scatter(x2[:,0],x2[:,1])
plt.show()

How should the test data set be normalized?

Do not simply normalize the test set with its own statistics; instead use (x_test - mean_train) / std_train, i.e. the mean and standard deviation computed from the training set.
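
A minimal NumPy sketch of this rule, with random data standing in for a real dataset (names like mean_train are illustrative):

import numpy as np

X_train = np.random.random((80, 2))
X_test = np.random.random((20, 2))

mean_train = np.mean(X_train, axis=0)
std_train = np.std(X_train, axis=0)

# both sets are transformed with the training set's statistics
X_train_standard = (X_train - mean_train) / std_train
X_test_standard = (X_test - mean_train) / std_train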

Normalization with scikit-learn

from sklearn import datasets
import numpy as np

iris=datasets.load_iris()

X=iris.data
y=iris.target

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target,test_size=0.2,random_state=666)

from sklearn.preprocessing import StandardScaler

standardScaler=StandardScaler()

standardScaler.fit(X_train)

standardScaler.mean_  # mean of each feature

standardScaler.scale_  # standard deviation of each feature; the old std_ attribute is no longer available

X_train = standardScaler.transform(X_train)  # returns the normalized matrix

X_test_standard = standardScaler.transform(X_test)  # the test set must be normalized with the training set's statistics

Then test the accuracy:

from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=3)

knn_clf.fit(X_train, y_train)  # train with the normalized X_train

knn_clf.score(X_test_standard, y_test)  # score with the normalized X_test

This returns 1.0; the iris dataset is small, so high accuracy is natural.

sklearn.preprocessing also provides MinMaxScaler (min-max normalization), with similar usage.
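
A usage sketch of MinMaxScaler, mirroring the StandardScaler steps above on the same iris split:

from sklearn.preprocessing import MinMaxScaler

minMaxScaler = MinMaxScaler()
minMaxScaler.fit(X_train)
X_train_minmax = minMaxScaler.transform(X_train)
X_test_minmax = minMaxScaler.transform(X_test)  # again scaled with the training set's min and max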

Write your own StandardScaler class

preprocessing.py:

import numpy as np


class StandardScaler:

    def __init__(self):
        self.mean_ = None
        self.scale_ = None

    def fit(self, X):
        """根据训练数据集X获得数据的均值和方差"""
        assert X.ndim == 2, "The dimension of X must be 2"

        self.mean_ = np.array([np.mean(X[:,i]) for i in range(X.shape[1])])
        self.scale_ = np.array([np.std(X[:,i]) for i in range(X.shape[1])])

        return self

    def transform(self, X):
        """将X根据这个StandardScaler进行均值方差归一化处理"""
        assert X.ndim == 2, "The dimension of X must be 2"
        assert self.mean_ is not None and self.scale_ is not None, \
               "must fit before transform!"
        assert X.shape[1] == len(self.mean_), \
               "the feature number of X must be equal to mean_ and scale_"

        resX = np.empty(shape=X.shape, dtype=float)
        for col in range(X.shape[1]):
            resX[:,col] = (X[:,col] - self.mean_[col]) / self.scale_[col]
        return resX
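
A usage sketch of this class on the earlier train/test split (assuming preprocessing.py is importable from the working directory):

from preprocessing import StandardScaler  # hypothetical import path

my_scaler = StandardScaler()
my_scaler.fit(X_train)
X_train_standard = my_scaler.transform(X_train)
X_test_standard = my_scaler.transform(X_test)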

About the k-nearest neighbors algorithm

Biggest drawback: it is inefficient

If the training set has m samples and n features, each prediction costs O(m*n)

KD-Tree or Ball-Tree structures can speed it up, but it remains inefficient

Drawback 2: it is highly data-dependent

Drawback 3: the prediction results are not interpretable

Curse of dimensionality: as the dimension increases, the distance between two seemingly similar points keeps growing

Solution: dimensionality reduction, such as PCA
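
A minimal sketch of dimensionality reduction with scikit-learn's PCA (the number of retained components is an arbitrary choice here):

from sklearn.decomposition import PCA

pca = PCA(n_components=2)  # keep 2 principal components (arbitrary choice)
pca.fit(X_train)
X_train_reduced = pca.transform(X_train)
X_test_reduced = pca.transform(X_test)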

Machine learning process:

(figure: machine learning workflow, from training data through fit to a model that predicts on new inputs)
