from numpy import *
import operator
from os import listdir

# Plotting is only needed by the optional writeData2Picture(); keep the
# classifier itself usable on machines where matplotlib is not installed.
try:
    import matplotlib
    import matplotlib.pyplot as plt
except ImportError:
    matplotlib = None
    plt = None

# The path to the dataset
filePath = 'E:\\bigData\\ml\\dataset\\datingTestSet2.txt'


# kNN core algorithm
def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Parameters:
        inX: the sample to classify (one value per feature).
        dataSet: the already-classified training samples, one row per sample.
        labels: the category of each row of dataSet, in the same order.
        k: how many of the closest training samples take part in the vote;
            values larger than the number of training samples are clamped.

    Returns:
        The label that occurs most often among the k closest samples
        (Euclidean distance). On a tie, the label whose first vote came from
        the nearer sample wins.

    Raises:
        IndexError: if no sample can vote (k < 1 or dataSet has no rows).
    """
    dataSet = asarray(dataSet)
    # Number of samples (rows) in the training set
    dataSetSize = dataSet.shape[0]
    # Broadcasting subtracts every training row from inX: (x1-y1), (x2-y2), ...
    diffMat = asarray(inX) - dataSet
    # (x1-y1)^2 + (x2-y2)^2 + ... + (xn-yn)^2 for every training row
    sqDistances = (diffMat ** 2).sum(axis=1)
    # Square root gives the Euclidean distance to every training sample
    distances = sqDistances ** 0.5
    # Indices of the training rows, ordered from closest to farthest
    sortedDistIndicies = distances.argsort()
    # Never ask for more neighbours than there are training samples
    if k > dataSetSize:
        k = dataSetSize
    # Count the labels of the k closest samples
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # The most frequent label is the prediction (stable sort keeps the
    # nearer label first when counts are equal)
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# Convert from file to matrix
def file2matrix(filename):
    """Load a tab-separated dataset file into a feature matrix and labels.

    Every non-blank line must hold at least three numeric feature columns
    and an integer class label in its last column.

    Returns:
        (returnMat, classLabelVector): an (n, 3) float array built from the
        first three columns, and a list with the n integer labels.
    """
    # Read the file once and close it right away; blank lines (e.g. a
    # trailing newline at the end of the file) carry no sample.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]
    returnMat = zeros((len(rows), 3))  # prepare matrix to return
    classLabelVector = []              # prepare labels to return
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


# Normalize the feature values and map them to between 0-1
def autoNorm(dataSet):
    """Rescale every feature column of dataSet linearly to the range 0-1.

    Returns:
        (normDataSet, ranges, minVals) such that
        normDataSet == (dataSet - minVals) / ranges, column by column.
        A column whose values are all equal has no spread; its entry in
        ranges is set to 1 so the column normalizes to 0 instead of NaN,
        and callers can reuse ranges to normalize new samples safely.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Avoid dividing by zero for constant columns
    ranges = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column offset and scale to every row
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


# Test the kNN classification algorithm
def datingClassTest():
    """Measure the classifier's error rate on a held-out part of the dataset.

    The first hoRatio share of the samples is classified (k = 3) against the
    remaining samples; every prediction, the error rate and the error count
    are printed.
    """
    hoRatio = 0.50  # hold out 50% of the samples for testing
    datingDataMat, datingLabels = file2matrix(filePath)  # load data set from file
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # With no test samples there is nothing to get wrong; avoid dividing by 0
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is: %f" % errorRate)
    print(errorCount)


# Classification test algorithm
def classifyPerson():
    """Ask for one person's three feature values and print the prediction.

    The answers are normalized with the offsets and scales of the training
    data and classified with k = 3 against the whole dataset.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    iceCream = float(input("litres of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(filePath)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # Labels are 1-based, resultList is 0-based
    print("Your will probably like this person:", resultList[classifierResult - 1])


# Data visualization function (optional)
def writeData2Picture():
    """Show a scatter plot of the first two features, sized/colored by label.

    Raises:
        ImportError: if matplotlib is not installed.
    """
    if plt is None:
        raise ImportError("matplotlib is required for writeData2Picture()")
    datingDataMat, datingLabels = loadingData()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
               15.0 * array(datingLabels), 15.0 * array(datingLabels))
    plt.show()


# Data loading function (optional)
def loadingData():
    """Load the dataset at filePath and return (feature matrix, labels)."""
    datingDataMat, datingLabels = file2matrix(filePath)
    return datingDataMat, datingLabels


# Test function for the first and second functions, run it in the main function (optional)
def myKNNTest1():
    """Print the loaded feature matrix and the class predicted for the origin."""
    group, labels = loadingData()
    print(group)
    # The query needs one value per feature column (the dataset has three)
    category = classify0([0, 0, 0], group, labels, 3)
    print(category)


# main function
if __name__ == '__main__':
    classifyPerson()
# Screenshot of code running
# Dataset download address:
# Link: https://pan.baidu.com/s/1MR7CnBU8bZztb1tlpR4XyQ
# Password: jeec