《统计学习方法》 (Statistical Learning Methods), Chapter 3: k-Nearest Neighbors in Python


Theoretical derivation of the k-nearest-neighbor method: https://blog.csdn.net/ACM_hades/article/details/89644882
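
For reference, the decision rule derived there is a simple majority vote over the k nearest neighbors: for a query point $x$ with neighborhood $N_k(x)$ in the training set,

$$y = \arg\max_{c_j} \sum_{x_i \in N_k(x)} I(y_i = c_j),$$

where $I$ is the indicator function. The `Counter(belong).most_common(1)` call in the code below implements exactly this vote.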

Code

  • Dataset: the iris dataset, which contains 3 species of iris ([Setosa, Versicolour, Virginica]) described by the features ['sepal length', 'sepal width', 'petal length', 'petal width'], 150 samples in total (a quick check is sketched below).
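
A minimal sanity check of those numbers, separate from the full program:

# Load the iris dataset and confirm its shape and class balance
from sklearn.datasets import load_iris
import numpy as np

iris = load_iris()
print(iris.data.shape)           # (150, 4): 150 samples, 4 features
print(iris.feature_names)        # sepal/petal length and width
print(np.bincount(iris.target))  # [50 50 50]: 50 samples per species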

Full program:

# encoding=utf-8
from collections import Counter
import numpy as np
import time
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# Quickselect: the same partitioning idea as quicksort
def partition_sort(arr, k, axis):
    """
    以位置k为中心将数组划分为两部分, 左侧的元素不大于位置k值;右侧的元素大于位置k值
    :param arr: 待划分数组
    :param p: 枢纽前部元素个数
    :param key: 比较方式
    :return: None
    """
    start, end = 0, len(arr) - 1
    assert 0 <= k <= end
    while True:
        i, j, pivot = start, end, deepcopy(arr[start])
        while i < j:
            # scan from the right for an element smaller than the pivot
            while i < j and pivot[axis] <= arr[j][axis]:
                j -= 1
            if i == j:
                break
            arr[i] = arr[j]

            i += 1
            # scan from the left for an element larger than the pivot
            while i < j and arr[i][axis] <= pivot[axis]:
                i += 1
            if i == j:
                break
            arr[j] = arr[i]
            j -= 1
        arr[i] = pivot

        if i == k:
            return
        elif i < k:
            start = i + 1
        else:
            end = i - 1
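
# Illustrative example: if a = [[3], [1], [2], [5], [4]], then after
# partition_sort(a, 2, 0) we get a[2] == [3], every element before index 2
# has an axis-0 value <= 3, and every element after it has a value > 3.
# Quickselect runs in O(n) expected time, versus O(n log n) for a full sort.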

class Node:
    def __init__(self, data, depth=0, lchild=None, rchild=None):
        self.data = data  # one training sample: feature vector with its label appended
        self.depth = depth
        self.lchild = lchild
        self.rchild = rchild


class KdTree:
    def __init__(self, dimens):
        self.KdTree = None    # root node
        self.dimens = dimens  # number of feature dimensions (the label column is excluded)
        self.nearest = None   # current k-nearest candidates during a search
        self.Max_depth = -1

    def create(self, dataSet, depth=0):
        if len(dataSet) > 0:
            m = np.shape(dataSet)[0]  # number of samples
            axis = depth % self.dimens  # split axis (the last column is the label)
            mid = m >> 1  # median index
            partition_sort(dataSet, mid, axis)  # partition around the median
            node = Node(dataSet[mid], depth)  # build the current node
            if depth == 0:  # root node
                self.KdTree = node
            node.lchild = self.create(dataSet[:mid], depth + 1)  # build subtrees recursively
            node.rchild = self.create(dataSet[mid + 1:], depth + 1)
            return node
        return None
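
    # Build note: selecting the median with quickselect at every node gives an
    # expected O(n log n) construction, and the split axis cycles through the
    # coordinates (depth % dimens), as in the standard kd-tree construction.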

    def Search(self, x, count=1):
        """Return the majority label among the `count` nearest neighbors of x,
        along with the number of nodes visited and of subtrees pruned."""
        nearest = []
        for i in range(count):  # initialize the k nearest candidates as [distance, node] pairs
            nearest.append([-1, None])
        self.nearest = np.array(nearest)
        self.Count_node = 0   # nodes visited
        self.Count_node1 = 0  # subtrees pruned by the hypersphere test
        def recurve(node):  # DFS
            if node is not None:
                self.Count_node+=1
                axis = node.depth % self.dimens  # split axis of the current node
                daxis = x[axis] - node.data[axis]  # signed offset of x from the node along that axis
                if daxis < 0:  # descend toward the leaf region containing x
                    recurve(node.lchild)
                else:
                    recurve(node.rchild)
               
                dist = np.sqrt(np.sum((x - node.data[:-1]) ** 2))  # Euclidean distance from x to this node
                # Update the k smallest distances. Since k is usually small, an
                # insertion-sort style update suffices; for large k a heap would
                # lower the cost (a heapq-based sketch follows the listing).
                for i, d in enumerate(self.nearest):
                    if d[0] < 0 or dist < d[0]:
                        self.nearest = np.insert(self.nearest, i, [dist, node], axis=0)
                        self.nearest = self.nearest[:-1]  # keep the length fixed at k
                        break

                n = list(self.nearest[:, 0]).count(-1)  # number of unfilled (-1) slots
                # If the hypersphere whose radius is the current kth-nearest
                # distance intersects this node's splitting hyperplane, the
                # other subtree may contain closer points and must be visited.
                # print(axis, ":", self.nearest[-n - 1, 0], ":", abs(daxis))
                if self.nearest[-n - 1, 0] > abs(daxis):
                    if daxis < 0:  # recurse into the other subtree
                        recurve(node.rchild)
                    else:
                        recurve(node.lchild)
                else:
                    self.Count_node1+=1


        recurve(self.KdTree)
        knn = self.nearest[:, 1]  # the k nearest nodes
        belong = []
        for i in knn:
            belong.append(i.data[-1])  # label of each neighbor

        b = Counter(belong).most_common(1)[0][0]  # majority vote among the k neighbors
        return b, self.Count_node, self.Count_node1

    def preOrder(self, node):
        # Traverse the tree and record its maximum depth in self.Max_depth.
        if node is not None:
            self.preOrder(node.lchild)
            self.preOrder(node.rchild)
            if node.depth > self.Max_depth:
                self.Max_depth = node.depth



def Predict(testset, kdt, k):
    predict = []
    C = 0
    S = time.time()
    for test_vec in testset:
        C += 1
        Label, C_n, C_m = kdt.Search(test_vec[:-1], k)  # strip the label column before searching
        # print(C, ":", C_n, ":", C_m)
        predict.append(Label)
        if C % 1000 == 0:
            print("Samples processed:", C, "  Cost: ", time.time() - S)
            S = time.time()

    return np.array(predict)


if __name__ == '__main__':
    print('Start read data')
    S = time.time()
    iris = load_iris()
    Data=iris.data
    Label=iris.target
    Data_set = np.hstack((Data, np.reshape(Label, (-1, 1))))  # append the label as the last column
    # Use 2/3 of the data for training and 1/3 for testing
    Train_set, Test_set = train_test_split(Data_set, test_size=0.33, random_state=23323)

    print("Data shape:", Data.shape,type(Data))
    print("Label shape:", Label.shape,type(Label))
    print("Data_set shape:", Data_set.shape, type(Data_set))
    print("Train_set shape:", Train_set.shape, type(Train_set))
    print("Train_set shape:", Test_set.shape, type(Test_set))
    print('read data cost ', time.time() - S, ' second')

    print('Start Train (build KdTree)')
    S = time.time()
    kdt = KdTree(Train_set.shape[-1]-1)
    kdt.create(Train_set)
    # kdt.preOrder(kdt.KdTree)
    # print("Maximum depth of the tree:", kdt.Max_depth)
    print('training cost ', time.time() - S, ' second')

    print('Start predicting')
    S = time.time()
    k = 5
    test_predict = Predict(Test_set,kdt,k)
    print('predicting cost ', time.time() - S, ' second')

    score = accuracy_score(Test_set[:, -1], test_predict)
    print("The accuracy score is ", score)

Result:
	Start read data
	Data shape: (150, 4) <class 'numpy.ndarray'>
	Label shape: (150,) <class 'numpy.ndarray'>
	Data_set shape: (150, 5) <class 'numpy.ndarray'>
	Train_set shape: (100, 5) <class 'numpy.ndarray'>
	Test_set shape: (50, 5) <class 'numpy.ndarray'>
	read data cost  0.0020248889923095703  second
	Start Train (build KdTree)
	training cost  0.0019648075103759766  second
	Start predicting
	predicting cost  0.04487943649291992  second
	The accuracy score is  0.98
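
As an independent cross-check (not part of the original post), scikit-learn's built-in KNeighborsClassifier on the same split and k should give a comparable score; this assumes the Train_set and Test_set arrays from the listing above:

from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(Train_set[:, :-1], Train_set[:, -1])
print("sklearn accuracy:", clf.score(Test_set[:, :-1], Test_set[:, -1]))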


Reprinted from blog.csdn.net/ACM_hades/article/details/89645095