1. K-nearest neighbors in one sentence: a sample takes on the class of the company it keeps ("he who stays near vermilion turns red; he who stays near ink turns black") — classify each point by a majority vote among its k closest training samples.
The idea is that simple, so let's go straight to the code.
2. Generate a dataset with sklearn
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
n_samples = 5000 n_bins = 3 # use 3 bins for calibration_curve as we have 3 clusters here
centers = [(-1, -1), (5, 5)] X, y = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, centers=centers, shuffle=False, random_state=42) y[:n_samples // 2] = 0 y[n_samples // 2:] = 1 sample_weight = np.random.RandomState(42).rand(y.shape[0]) X_train, X_test, y_train, y_test, sw_train, sw_test = \ train_test_split(X, y, sample_weight, test_size=0.1, random_state=42)
3. Classification - KNN
k = 500 sum1 = 0def result(dist,k): index = dist.argsort() #Get the sorted array index index = index[:k] out = y_train[index].tolist() return out.count(0) < k - out.count(0) for i in range(len(X_test)): dist = distance(X_test[i], X_train) sum1 = sum1 + np.equal(y_test[i],result(dist,k))#y_train[dist.index(min(dist))])print(np.float(sum1/len(X_test)))
4. Plotting the results
plt.scatter(X_train[:,0],X_train[:,1],c=y_train) plt.scatter(X_test[:,0],X_test[:,1],c='b') plt.show()
5. Extension — kd-tree storage and nearest-neighbor search
class Node: def __init__(self, data, lchild = None, rchild = None): self.data = data self.lchild = lchild self.rchild = rchild def create ( self , dataSet , depth): # Create a kd tree and return the root node if ( len (dataSet) > 0 ): m , n = np.shape(dataSet) # Find the sample row, column midIndex = m / 2 # The index position of the middle number axis = depth % n # Determine which axis to use to divide the data, corresponding to the algorithm 3.2 (2) formula in the book j() sortedDataSet = self .sort(dataSet , axis) # Sort node = Node(sortedDataSet[midIndex]) # Set the node data field to the median, please refer to the following book for details # print sortedDataSet[midIndex] leftDataSet = sortedDataSet[ : midIndex] # Create 2 copies to the left of the median rightDataSet = sortedDataSet[midIndex + 1 :] print (leftDataSet) print (rightDataSet) node.lchild = self .create(leftDataSet , depth + 1 ) # recursively create the tree by passing in the median left sample node.rchild = self .create(rightDataSet , depth + 1 ) return node else : return None def sort ( self , dataSet , axis): # Use bubble sort and use aixs as the axis to divide sortDataSet = dataSet[:] # Since the original sample cannot be destroyed, create a copy here m , n = np.shape(sortDataSet) for i in range (m): for j in range(0, m - i - 1): if (sortDataSet[j][axis] > sortDataSet[j + 1][axis]): temp = sortDataSet[j] sortDataSet[j] = sortDataSet[j + 1] sortDataSet[j + 1] = temp print(sortDataSet) return sortDataSet def preOrder(self, node): if node != None: print("tttt->%s" % node.data) self.preOrder(node.lchild) self.preOrder(node.rchild) #kd tree search def search ( self , tree , x): # search for self .nearestPoint = None # save the nearest point self .nearestValue = 0 # save the nearest value def travel (node , depth= 0 ): # recursive search if node != None : # recursive termination condition n = len (x) # feature number axis = depth % n # calculation axis if x[axis] < node.data[axis]: # if the data is 
less than the node, then go to the left Click to find travel(node.lchild , depth + 1) else: travel(node.rchild, depth + 1) # The following is the end of the recursion, corresponding to algorithm 3.3(3) distNodeAndX = self.dist(x , node.data) # The distance between the target and the node is judged if (self.nearestPoint == None ): # Determine the current point and update the nearest point and the nearest value, corresponding to Algorithm 3.3(3)(a) self.nearestPoint = node.data self.nearestValue = distNodeAndX elif (self.nearestValue > distNodeAndX): self.nearestPoint = node.data self.nearestValue = distNodeAndX print (node.data , depth , self.nearestValue , node.data[axis] , x[axis]) if ( abs (x[axis] - node.data[axis]) <= self.nearestValue): # Determine if You need to go to the area of the child node to find (circle judgment), corresponding to algorithm 3.3(3)(b) if x[axis] < node.data[axis]: travel(node.rchild, depth + 1) else: travel(node.lchild, depth + 1) travel(tree) return self.nearestPoint def dist ( self , x1 , x2): # Euclidean distance calculation return ((np.array(x1) - np.array(x2)) ** 2 ).sum() ** 0.5