机器学习实战笔记--KNN-约会网站配对

可视化 

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.lines as mlines

def file2matrix(filename):
    fr=open(filename)
    arrayOLines=fr.readlines()
    numberOfLines=len(arrayOLines)
    returnMat=np.zeros((numberOfLines,3))
    classLabelVector=[]
    
    index=0
    for line in arrayOLines:
        line=line.strip()
        listFromLine=line.split('\t')
        returnMat[index,:]=listFromLine[0:3]
        classLabelVector.append(str(listFromLine[-1]))
        index +=1
    return returnMat,classLabelVector
#返回数据和相应的标签


def showdata(datingDataMat,datingLabels):
    color=[]
    for i,value in enumerate (datingLabels):
        if value=='largeDoses':
            color.append('red')
        if value=='smallDoses':
            color.append('orange')
        if value=='didntLike':
            color.append('black')
    fig,axs=plt.subplots()
    axs.scatter(x=datingDataMat[:,1], y=datingDataMat[:,2],color=color,s=15, alpha=.5)
    axs.set_xlabel(u'玩视频游戏所耗时间百分比',fontproperties='SimHei')
    axs.set_ylabel(u'每周所消费的冰淇淋公升数',fontproperties='SimHei')

    #设置图例
    didntLike = mlines.Line2D([], [], color='black', marker='.',markersize=6, label='didntLike')
    smallDoses = mlines.Line2D([], [], color='orange', marker='.',markersize=6, label='smallDoses')
    largeDoses = mlines.Line2D([], [], color='red', marker='.',markersize=6, label='largeDoses')
    #添加图例
    axs.legend(handles=[didntLike,smallDoses,largeDoses])
    plt.show()


datingDataMat,datingLabels=file2matrix('D:/anicode/spyderworkspace/datingTestSet.txt')
showdata(datingDataMat,datingLabels)

str(listFromLine[-1],机器学习实战上为int有误

归一化+分类

def file2matrix(filename):
    fr=open(filename)
    arrayOLines=fr.readlines()
    numberOfLines=len(arrayOLines)
    returnMat=np.zeros((numberOfLines,3))
    classLabelVector=[]
    
    index=0
    for line in arrayOLines:
        line=line.strip()
        listFromLine=line.split('\t')
        returnMat[index,:]=listFromLine[0:3]
        classLabelVector.append(str(listFromLine[-1]))
        index +=1
    return returnMat,classLabelVector

#归一化
def autoNorm(dataSet):
    minVals=dataSet.min(0)#返回每列的最小值,共3列,返回三个值
    maxVals=dataSet.max(0)#返回每列的最大值,共3列,返回三个值
    ranges=maxVals-minVals
    normDataSet=np.zeros(dataSet.shape)
    m=dataSet.shape[0]
    normDataSet=dataSet-np.tile(minVals,(m,1))
    normDataSet=normDataSet/np.tile(ranges,(m,1))
    return normDataSet,ranges,minVals

#分类
def classify0(inX,dataSet,labels,k):
    dataSetSize=dataSet.shape[0]
    diffMat=np.tile(inX,(dataSetSize,1))-dataSet
    sqDiffMat=diffMat**2
    sqDistances=sqDiffMat.sum(axis=1)
    distances=sqDistances**0.5
    sortedDistIndicies=distances.argsort()#返回数组从小到大的索引值
    classCount={}
    for i in range(k):
        voteIlabel=labels[sortedDistIndicies[i]]#下标对应着相应的标签 
        classCount[voteIlabel]=classCount.get(voteIlabel,0)+1#返回的是字典        
        #items()将字典中的项按列表返回
        #key,按照key给的关键字进行排序
        sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),
                                reverse=True)
        return sortedClassCount[0][0]

#测试1
datingDataMat,datingLabels=file2matrix('D:/anicode/spyderworkspace/datingTestSet.txt')
normDataSet,ranges,minVals=autoNorm(datingDataMat)
#获取所有数据的10%
hoRatio=0.10
m=normDataSet.shape[0]
numTestVecs=int(m*hoRatio)
#分类错误计数
errorCount=0.0
for i in range(numTestVecs):
    clsResult=classify0(normDataSet[i,:],normDataSet[numTestVecs:m,:],
                        datingLabels[numTestVecs:m],4)
    print("分类结果:%s\t真实类别:%s"%(clsResult,datingLabels[i]))
    if clsResult!=datingLabels[i]:
        errorCount+=1.0
print("错误率:%f%%"%(errorCount/float(numTestVecs)*100))

#测试2
percentTats=float(input("玩视频游戏所耗时间百分比:"))
ffMiles=float(input("每年获得的飞行常客里程数:"))
iceCream=float(input("每周消费的冰淇淋公升数:"))

datingDataMat,datingLabels=file2matrix('D:/anicode/spyderworkspace/datingTestSet.txt')
normDataSet,ranges,minVals=autoNorm(datingDataMat)

#生成numpy数组,以便进行测试
inArr=np.array([ffMiles,percentTats,iceCream])
#测试集归一化
norminArr=(inArr-minVals)/ranges
#返回分类结果
clsResult=classify0(norminArr,normDataSet,datingLabels,3)
print("你可能%s这个人"%(clsResult))

 

猜你喜欢

转载自blog.csdn.net/sinat_38998284/article/details/81569844