from collections import Counter

import numpy as np
#import operator
# Location of the data file that test() passes to loadDataset().
# NOTE(review): `path` is not defined anywhere in the visible source; unless it
# is bound before this line runs, importing the module raises NameError — confirm.
filename = path + '/datingTestSet.txt'
# Load the data
def loadDataset(filename):
    """Read a tab-separated data file into feature rows and labels.

    The first three columns of every non-blank line are parsed as floats and
    the last column is taken as the label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(dataSet, labelSet)`` tuple. ``dataSet`` is a list of
        ``[float, float, float]`` rows and ``labelSet`` is the list of label
        strings, both in file order.

    Raises:
        IndexError: If a non-blank line has fewer than three columns.
        ValueError: If one of the first three columns is not numeric.
    """
    dataSet = []
    labelSet = []
    with open(filename) as fr:
        # Iterate the file lazily instead of materializing it with readlines().
        for line in fr:
            line = line.rstrip('\r\n')
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) carry no sample.
                continue
            line_arr = line.split('\t')
            # Convert explicitly so the features are numeric, not strings.
            dataSet.append([float(line_arr[0]), float(line_arr[1]), float(line_arr[2])])
            # Strip whitespace so a label compares equal whether or not its
            # line ended with a newline.
            labelSet.append(line_arr[-1].strip())
    return dataSet, labelSet
# Normalize the data: newValue = (oldValue - min) / (max - min)
def autoNorm(dataSet):
    """Min-max scale every feature column into the range [0, 1].

    Applies ``newValue = (oldValue - min) / (max - min)`` to each column.

    Args:
        dataSet: 2-D array-like of numeric values, one row per sample and
            one column per feature.

    Returns:
        numpy.ndarray of floats with the same shape as ``dataSet``. A column
        whose values are all equal is mapped to 0.0 instead of NaN.

    Raises:
        ValueError: If ``dataSet`` is empty (NumPy cannot take the min/max of
            an empty array).
    """
    dataSet = np.asarray(dataSet, dtype=float)
    minVal = dataSet.min(axis=0)
    ranges = dataSet.max(axis=0) - minVal
    # A constant column has zero range; divide by 1 so it becomes 0.0 rather
    # than producing 0/0 = NaN.
    ranges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column min and range to every row, so no
    # explicit np.tile expansion is needed.
    return (dataSet - minVal) / ranges
# Build the classification model
def Knn_Classifier_Model(inX, dataSet, labelSet, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify (1-D array-like).
        dataSet: 2-D array-like of training samples, one row per sample.
        labelSet: Sequence of labels; ``labelSet[i]`` labels row ``i`` of
            ``dataSet``.
        k: Number of nearest neighbours that vote.

    Returns:
        The label that received the most votes. When several labels tie, the
        one whose first vote came from the closest neighbour wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            training rows.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    m = dataSet.shape[0]
    if not 1 <= k <= m:
        raise ValueError(
            "k must be between 1 and the number of training rows "
            "(%d), got %r" % (m, k)
        )
    # Broadcasting subtracts inX from every row; summing along axis=1 adds the
    # squared differences across the features of each row.
    diff = dataSet - np.asarray(inX, dtype=float)
    distance = np.sqrt((diff ** 2).sum(axis=1))
    # A stable sort keeps equally distant rows in their original order, so
    # the result is deterministic.
    nearest = distance.argsort(kind="stable")[:k]
    # Counter remembers first-insertion order, and most_common() orders equal
    # counts by first occurrence, so ties go to the closer neighbour's label.
    votes = Counter(labelSet[i] for i in nearest)
    return votes.most_common(1)[0][0]
# Test code
def test():
    """Evaluate the kNN classifier on a hold-out split and print the error rate.

    Loads the data set from the module-level ``filename``, normalizes it, uses
    the first 10% of the rows as test samples and the remaining rows as the
    training set, then classifies each test sample with k=3.

    Raises:
        ValueError: If the data set is too small to yield any test sample.
    """
    hoRatio = 0.1  # fraction of rows held out as the test set
    dataSet, labelSet = loadDataset(filename)
    normDataset = autoNorm(dataSet)
    m = len(normDataset)
    m_test = int(m * hoRatio)  # number of test samples
    if m_test == 0:
        # Without this guard the error-rate division below would raise
        # ZeroDivisionError.
        raise ValueError(
            "data set has %d rows; at least %d are needed for a hold-out test"
            % (m, int(np.ceil(1 / hoRatio)))
        )
    # The training split is the same for every test sample, so slice it once.
    train_data = normDataset[m_test:m]
    train_labels = labelSet[m_test:m]
    error_count = sum(
        Knn_Classifier_Model(normDataset[i], train_data, train_labels, 3) != labelSet[i]
        for i in range(m_test)
    )
    print("错误率:", error_count / float(m_test))
# Run the hold-out evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test()
# [Machine Learning in Action] kNN - dating partner
# You may also like
# Reposted from blog.csdn.net/weixin_37392582/article/details/80434771
# Today's recommendations
# Weekly ranking