一、手动构造k近邻分类
import numpy as np
import operator
# Toy training set: each row is (kiss-scene count, fight-scene count) of a movie.
group = np.array([[1, 101], [5, 89], [108, 5], [115, 8]])
# Genre label of each training row (fixed: the original line was missing the closing bracket).
labels = ['爱情片', '爱情片', '动作片', '动作片']
# Hand-rolled k-nearest-neighbors classifier.
def classifier(inX, dataset, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbors.

    Parameters
    ----------
    inX : array-like of shape (n_features,)
        The point to classify.
    dataset : ndarray of shape (n_samples, n_features)
        Training feature matrix.
    labels : sequence of length n_samples
        Class label of each training row.
    k : int
        Number of nearest neighbors that vote.

    Returns
    -------
    The label receiving the most votes among the k closest training points.
    """
    datasize = dataset.shape[0]  # number of training samples
    # Tile inX to the training-set shape, subtract, then take the
    # Euclidean distance to every training point.
    diffMat = np.tile(inX, (datasize, 1)) - dataset
    sqdiffMat = diffMat ** 2
    sumdiffMat = sqdiffMat.sum(axis=1)
    distances = sumdiffMat ** .5
    sorteddistindex = distances.argsort()  # indices sorted by ascending distance
    # Tally the label of each of the k closest points.
    classcount = {}
    for i in range(k):
        votelabel = labels[sorteddistindex[i]]
        classcount[votelabel] = classcount.get(votelabel, 0) + 1
    # Sort (label, count) pairs by count, descending; the majority label wins.
    sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]
# Classify a new sample point with the 3 nearest neighbors and show the result.
test = [101, 20]
test_class = classifier(inX=test, dataset=group, labels=labels, k=3)
print(test_class)
动作片
二、引用sklearn包中的KNeighborsClassifier进行实例练习
加载数据
import pandas as pd
data_path = r'C:\Users\Machine Learning\dating.txt'
# dating.txt is a headerless tab-separated file; header=None keeps the first
# data row (the original call silently consumed it as column names — see the
# printed head() where a data row appears as the header).
dating = pd.read_csv(data_path, sep='\t', header=None)
print(dating.head())
40920 8.326976 0.953952 largeDoses
0 14488 7.153469 1.673904 smallDoses
1 26052 1.441871 0.805124 didntLike
2 75136 13.147394 0.428964 didntLike
3 38344 1.669788 0.134296 didntLike
4 72993 10.141740 1.032955 didntLike
# Materialize the DataFrame as a NumPy array, then split it into the
# feature columns (all but the last) and the label column (the last).
dating = np.array(dating)
dating_data, dating_target = dating[:, :-1], dating[:, -1]
可视化
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
%matplotlib inline
data_labels = np.unique(dating_target)  # sorted unique class names
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)  # CJK-capable font for labels
# One fixed color per class; replaces the original if-chain whose stray
# `LabelsColors = []` list was immediately overwritten by a string.
class_colors = {'didntLike': 'black', 'largeDoses': 'red', 'smallDoses': 'orange'}
for lab in data_labels:
    mask = dating_target == lab
    # Scatter feature 0 (frequent-flyer miles) against feature 1 (gaming time %).
    plt.scatter(dating_data[mask][:, 0], dating_data[mask][:, 1],
                c=class_colors[lab], alpha=.7)
# NOTE(review): matplotlib's text property is `fontproperties`; the original
# capitalized `FontProperties=` kwarg is rejected by current matplotlib.
plt.title(u'每年获得的飞行常客里程数与玩视频游戏所消耗时间占比', fontproperties=font)
plt.xlabel(u'每年获得的飞行常客里程数', fontproperties=font)
plt.ylabel(u'玩视频游戏所消耗时间占比', fontproperties=font)  # fixed truncated label text
# Legend entries must follow plotting order, i.e. np.unique's sorted order.
plt.legend(['didntLike', 'largeDoses', 'smallDoses'])
plt.show()
# One fixed color per class (this plot uses orange/red for large/smallDoses).
class_colors2 = {'didntLike': 'black', 'largeDoses': 'orange', 'smallDoses': 'red'}
for lab in data_labels:
    mask = dating_target == lab
    # Scatter feature 1 (gaming time %) against feature 2 (ice cream liters).
    plt.scatter(dating_data[mask][:, 1], dating_data[mask][:, 2],
                c=class_colors2[lab], alpha=.7)
plt.title(u'玩视频游戏所消耗时间占比与每周消费的冰激淋公升数', fontproperties=font)
plt.xlabel(u'玩视频游戏所消耗时间占比', fontproperties=font)  # fixed truncated label text
plt.ylabel(u'每周消费的冰激淋公升数', fontproperties=font)
# Fixed: legend entries must follow plotting order (np.unique is sorted:
# didntLike, largeDoses, smallDoses); the original order mislabeled two classes.
plt.legend(['didntLike', 'largeDoses', 'smallDoses'])
plt.show()
# One fixed color per class (same palette as the previous plot).
class_colors3 = {'didntLike': 'black', 'largeDoses': 'orange', 'smallDoses': 'red'}
for lab in data_labels:
    mask = dating_target == lab
    # Scatter feature 0 (frequent-flyer miles) against feature 2 (ice cream liters).
    plt.scatter(dating_data[mask][:, 0], dating_data[mask][:, 2],
                c=class_colors3[lab], alpha=.7)
plt.title(u'每年获得的飞行常客里程数与每周消费的冰激淋公升数', fontproperties=font)
plt.xlabel(u'每年获得的飞行常客里程数', fontproperties=font)
plt.ylabel(u'每周消费的冰激淋公升数', fontproperties=font)
# Fixed: legend entries must follow plotting order (np.unique is sorted:
# didntLike, largeDoses, smallDoses); the original order mislabeled two classes.
plt.legend(['didntLike', 'largeDoses', 'smallDoses'])
plt.show()
归一化,三列特征的数量级不同,直接计算距离不妥,先进行归一化
from sklearn import preprocessing
# The three feature columns have very different magnitudes, so scale each
# to [0, 1] before distance-based classification.
min_max_scaler = preprocessing.MinMaxScaler()
# Cast the object-dtype array (it came from a mixed DataFrame) to float up
# front, instead of letting the scaler convert implicitly — this silences the
# DataConversionWarning seen in the original run.
data = min_max_scaler.fit_transform(dating_data.astype(float))
print(data)
[[0.15873259 0.34195467 0.98724416]
[0.28542943 0.06892523 0.47449629]
[0.82320073 0.62848007 0.25248929]
...
[0.29115949 0.50910294 0.51079493]
[0.52711097 0.43665451 0.4290048 ]
[0.47940793 0.3768091 0.78571804]]
F:\Anaconda3\lib\site-packages\sklearn\utils\validation.py:595: DataConversionWarning: Data with input dtype object was converted to float64 by MinMaxScaler.
warnings.warn(msg, DataConversionWarning)
验证分类效果
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Hold out 20% of the samples for evaluation; fixed seed for reproducibility.
train_data, test_x, train_y, test_y = train_test_split(
    data, dating_target, test_size=0.2, random_state=3)
# Fit a 5-nearest-neighbor classifier on the training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_data, train_y)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=5, p=2,
weights='uniform')
data_pred = knn.predict(test_x)
# Show each prediction next to the true label; zip replaces the
# range(0, len(...)) index loop and the dead commented-out line is dropped.
for pred, truth in zip(data_pred, test_y):
    print(pred, truth, '\n')
smallDoses smallDoses
didntLike didntLike
didntLike largeDoses
smallDoses smallDoses
smallDoses smallDoses
largeDoses largeDoses
smallDoses smallDoses
smallDoses smallDoses
smallDoses didntLike
smallDoses smallDoses
didntLike didntLike
# Mean accuracy on the held-out split; wrapped in print() so the score is
# visible when run as a script (a bare expression only displays in a REPL).
print(knn.score(test_x, test_y))
# 10-fold cross-validation gives a less split-dependent accuracy estimate.
scores = cross_val_score(knn, data, dating_target, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())
[0.96078431 0.91089109 0.92 0.93 0.97 0.97
0.93939394 0.98989899 0.97979798 0.94949495]
0.952026126142026