Introduction to machine learning (1) k-nearest neighbor algorithm

Introduction to machine learning k-nearest neighbor algorithm

[Machine Learning of Ju An Jiang] Phase 1 k-nearest neighbor algorithm

KNN classification of action movies and romance movies

# Output:
D:\Anaconda3\envs\pcd\python.exe D:/机器学习/01K_Nearest.py
['爱情片']

Process finished with exit code 0

import pandas as pd
import numpy  as np
# 1. Build the data set: one row per movie holding its title, the number of
#    fight scenes, the number of kiss scenes and its genre label.
rowdata = {
    '电影名称': ['无问西东', '后来的我们', '前任3', '红海行动', '唐人街探案', '战狼'],
    '打斗镜头': [1, 5, 12, 108, 112, 115],
    '接吻镜头': [101, 89, 97, 5, 9, 8],
    '电影类型': ['爱情片', '爱情片', '爱情片', '动作片', '动作片', '动作片'],
}
movie_data = pd.DataFrame(rowdata)

# 2. Euclidean distance between the new sample and each of the six movies,
#    computed over the two numeric feature columns (positions 1 and 2).
new_data = [24, 67]
feature_cols = movie_data.iloc[:6, 1:3]
squared_diff = (feature_cols - new_data) ** 2
dist = squared_diff.sum(1) ** 0.5

# 3. Sort the distances in ascending order and keep the k closest rows.
k = 4
dist_l = pd.DataFrame({'dist': dist, 'labels': movie_data.iloc[:6, 3]})
dr = dist_l.sort_values(by='dist').head(k)

# Count how often each label occurs among the k nearest neighbours.
re = dr['labels'].value_counts()

# The most frequent label among the neighbours is the prediction.
result = [re.index[0]]

# 4. Wrap the steps above in a reusable function.
def classify0(inX, dataSet, k):
    """Classify one sample with the k-nearest-neighbour rule.

    Parameters:
        inX: feature values of the sample to classify, one value per
            feature column of ``dataSet`` (e.g. ``[24, 67]``).
        dataSet: labelled training DataFrame. Column 0 is treated as an
            identifier and skipped, the last column holds the class label,
            and every column in between is a numeric feature.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        A one-element list containing the most frequent label among the
        ``k`` training rows closest to ``inX`` (Euclidean distance).
    """
    # Use the arguments instead of module-level globals so the function
    # works for any sample / training set passed in.
    features = dataSet.iloc[:, 1:-1]
    labels = dataSet.iloc[:, -1]

    # Euclidean distance from inX to every training row.
    dist = ((features - inX) ** 2).sum(1) ** 0.5

    # Keep the k closest rows together with their labels.
    dist_l = pd.DataFrame({'dist': dist, 'labels': labels})
    dr = dist_l.sort_values(by='dist')[:k]

    # value_counts() orders labels by frequency, so index[0] is the winner.
    votes = dr.loc[:, 'labels'].value_counts()
    return [votes.index[0]]

if __name__ == '__main__':
    # Classify the new sample against the movie data set using its
    # four nearest neighbours and print the predicted label.
    inX, dataSet, k = new_data, movie_data, 4
    result = classify0(inX, dataSet, k)
    print(result)

Dating Site KNN Classification

Scatter plot data analysis

[Figure: scatter plots of the dating data set features, colored by label]

Code snippet

    # 1. Load the data set (no header row in the file).
    datingTest = pd.read_table('dataset/datingTestSet.txt',header=None)
    datingTest.head()

    # 2. Analyse the data: map each row's label (last column) to a plot
    #    colour. Rows whose label is not one of the three keys add nothing.
    label_colors = {'didntLike': 'black', 'smallDoses': 'orange', 'largeDoses': 'red'}
    Colors = [
        label_colors[label]
        for label in datingTest.iloc[:, -1]
        if label in label_colors
    ]

    # Draw pairwise scatter plots of the three feature columns.
    plt.rcParams['font.sans-serif']=['Simhei']  # use the SimHei font for figure text
    pl = plt.figure(figsize=(12,8))

    # (subplot position, x column, y column, x label, y label)
    panels = [
        (221, 1, 2, '玩游戏视频所占时间比', '每周消费冰淇淋的公升数'),
        (222, 0, 1, '每年飞行常客里程', '玩游戏视频所占时间比'),
        (223, 0, 2, '每年飞行常客里程', '每周消费冰淇淋的公升数'),
    ]
    for position, x_col, y_col, x_label, y_label in panels:
        # add_subplot makes the new axes current, so the plt.* calls below
        # draw into it.
        pl.add_subplot(position)
        plt.scatter(datingTest.iloc[:,x_col],datingTest.iloc[:,y_col],marker='.',c=Colors)
        plt.xlabel(x_label)
        plt.ylabel(y_label)

Complete code

D:\Anaconda3\envs\pcd\python.exe D:/机器学习/01K_Nearest.py
模型预测准确率为0.92
D:/机器学习/01K_Nearest.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predict'] = result

Process finished with exit code 0

import pandas as pd
import numpy  as np

import matplotlib as mpl
import matplotlib.pyplot as plt

# 4. KNN classifier for the dating data set.
def datingClass(train,test,k):
    """Predict a label for every row of ``test`` with k-nearest neighbours.

    Both frames are expected to share the same layout: all columns except
    the last are numeric features and the last column is the class label.

    Parameters:
        train: labelled training DataFrame.
        test: DataFrame to classify; its last column is used as the true
            label when computing the accuracy.
        k: number of nearest neighbours that take part in each vote.

    Returns:
        A copy of ``test`` with an extra ``'predict'`` column holding the
        predicted label of each row. The caller's ``test`` frame is left
        untouched. The accuracy is printed as a side effect.
    """
    n = train.shape[1] - 1

    # Loop-invariant slices: training features / labels and test features.
    train_features = train.iloc[:, :n]
    train_labels = train.iloc[:, n]
    test_features = test.iloc[:, :n]

    # Work on a copy so that adding the 'predict' column neither mutates the
    # caller's frame nor triggers SettingWithCopyWarning when ``test`` is a
    # slice of a larger DataFrame.
    test = test.copy()

    result = []
    for i in range(test.shape[0]):
        # Euclidean distance from test row i to every training row.
        dist = list((((train_features - test_features.iloc[i]) ** 2).sum(1)) ** 0.5)
        dist_l = pd.DataFrame({'dist': dist, 'labels': train_labels})
        dr = dist_l.sort_values(by = 'dist')[: k]     # the k closest training rows
        re = dr.loc[:, 'labels'].value_counts()       # votes per label
        result.append(re.index[0])                    # most frequent label wins

    # Assign the plain list so predictions are matched to rows by position,
    # whatever index ``test`` carries (a Series would align on index labels).
    test['predict'] = result

    # Accuracy: share of rows whose prediction equals the true label, which
    # is now the second-to-last column.
    acc = (test.iloc[:,-1]==test.iloc[:,-2]).mean()
    print(f'模型预测准确率为{acc}')
    return test
# Min-max normalization
def minmax(dataSet):
    """Rescale each column of ``dataSet`` linearly to the range [0, 1].

    Every value becomes ``(value - column_min) / (column_max - column_min)``.

    Parameters:
        dataSet: numeric DataFrame (or Series) to normalize.

    Returns:
        A new object of the same shape with the rescaled values. A column
        whose values are all identical is mapped to 0 instead of NaN.
    """
    minDf = dataSet.min()
    maxDf = dataSet.max()
    span = maxDf - minDf

    # A constant column has a zero range; dividing by it would turn the
    # whole column into NaN (0 / 0). Divide by 1 instead so it becomes 0.
    if isinstance(span, pd.Series):
        span = span.where(span != 0, 1)
    elif span == 0:
        span = 1

    normSet = (dataSet - minDf) / span
    return normSet

def randSplit(dataset, rate=0.9):
    """Split ``dataset`` into a training part and a test part.

    The split is by row order, not random: the first ``int(n * rate)`` rows
    become the training set and the remaining rows become the test set.

    Parameters:
        dataset: DataFrame to split.
        rate: fraction of rows assigned to the training set (default 0.9).

    Returns:
        ``(train, test)``. Both are independent copies of the input rows;
        ``test`` is re-indexed from 0.
    """
    n = dataset.shape[0]
    m = int(n*rate)
    # Copy the slices so callers can add or modify columns on the results
    # without touching ``dataset`` and without SettingWithCopyWarning.
    train = dataset.iloc[:m,:].copy()
    test = dataset.iloc[m:,:].copy()
    test.index = range(test.shape[0])
    return train,test

if __name__ == '__main__':
    # 1. Load the data set (no header row in the file).
    datingTest = pd.read_table('dataset/datingTestSet.txt',header=None)
    datingTest.head()

    # 2. Analyse the data: map each row's label (last column) to a plot
    #    colour. Rows whose label is not one of the three keys add nothing.
    label_colors = {'didntLike': 'black', 'smallDoses': 'orange', 'largeDoses': 'red'}
    Colors = [
        label_colors[label]
        for label in datingTest.iloc[:, -1]
        if label in label_colors
    ]

    # Draw pairwise scatter plots of the three feature columns.
    plt.rcParams['font.sans-serif']=['Simhei']  # use the SimHei font for figure text
    pl = plt.figure(figsize=(12,8))

    # (subplot position, x column, y column, x label, y label)
    panels = [
        (221, 1, 2, '玩游戏视频所占时间比', '每周消费冰淇淋的公升数'),
        (222, 0, 1, '每年飞行常客里程', '玩游戏视频所占时间比'),
        (223, 0, 2, '每年飞行常客里程', '每周消费冰淇淋的公升数'),
    ]
    for position, x_col, y_col, x_label, y_label in panels:
        # add_subplot makes the new axes current, so the plt.* calls below
        # draw into it.
        pl.add_subplot(position)
        plt.scatter(datingTest.iloc[:,x_col],datingTest.iloc[:,y_col],marker='.',c=Colors)
        plt.xlabel(x_label)
        plt.ylabel(y_label)

    plt.show()

    # 3. Normalize the three feature columns so that no single feature
    #    dominates the distance, then re-attach the label column.
    normalized_features = minmax(datingTest.iloc[:,:3])
    label_column = datingTest.iloc[:,3]
    datingT = pd.concat([normalized_features, label_column], axis=1)

    # First 90% of the rows for training, the remaining 10% for testing.
    train,test = randSplit(datingT)
    datingClass(train=train,test=test,k=4)




Guess you like

Origin blog.csdn.net/weixin_41281151/article/details/108720591