# 采用 KNN 算法实现一个简单的推荐系统 (a simple KNN-based recommender system)

# coding=utf-8
# @Time    : 2019/12/7 15:46
# @Author  : Z
# @Email   : S
# @File    : KNNtest.py
import numpy as np
import pandas as pd
from sklearn import model_selection as ms
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
import warnings

# A simple recommender system implemented with the KNN algorithm.

# Silence all warnings globally (e.g. pandas/NumPy deprecation noise) to keep
# the console output readable. NOTE(review): this also hides warnings that may
# indicate real problems — consider narrowing the filter.
warnings.filterwarnings("ignore")

#训练集和测试集的划分标准
def train_test_split(fileName, type=1):
    """Load a MovieLens ratings file and build dense train/test rating matrices.

    Parameters
    ----------
    fileName : str
        Path to the ratings file (ml-100k ``u.data`` or ml-1m ``ratings.dat``).
    type : int, optional
        1 -> tab-separated file (ml-100k); any other value -> ``::``-separated
        file (ml-1m). (Parameter name kept for backward compatibility even
        though it shadows the builtin.)

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_data_matrix, test_data_matrix)``, each of shape
        ``(max_user_id, max_item_id)``; entry [u-1, i-1] holds the rating of
        user u for item i, and 0 marks "no rating".
    """
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    if type == 1:
        # ml-100k: tab-separated, no header row.
        df = pd.read_csv(fileName, sep='\t', names=header)
    else:
        # ml-1m: '::' separator; a multi-char sep needs the python engine.
        df = pd.read_csv(fileName, sep='::', names=header, engine='python')
    # Count of distinct users / items actually present in the file.
    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]
    # IDs are 1-based and may have gaps, so size the matrices by the max ID,
    # not by the distinct count.
    users = df.user_id.max()
    items = df.item_id.max()

    print('Number of users =' + str(n_users) + ' | Number of movies = ' + str(n_items))
    print('The biggest ID of users = ' + str(users) + ' | The biggest ID of movies = ' + str(items))

    # Hold out 10% of the ratings as the test set.
    train_data, test_data = ms.train_test_split(df, test_size=0.1)
    train_data = pd.DataFrame(train_data)
    test_data = pd.DataFrame(test_data)
    # Build two dense user-item matrices, one for training and one for testing.
    train_data_matrix = np.zeros((users, items))
    # itertuples() yields (index, user_id, item_id, rating, timestamp).
    for line in train_data.itertuples():
        train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    test_data_matrix = np.zeros((users, items))
    # BUG FIX: the original iterated train_data here, so the "test" matrix was
    # a copy of the training data and every metric was computed on seen data.
    for line in test_data.itertuples():
        test_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    return train_data_matrix, test_data_matrix

def fast_similarity(ratings, kind='user', epsilon=1e-9):
    """Compute the cosine-similarity matrix of a user-item rating matrix.

    Parameters
    ----------
    ratings : numpy.ndarray
        Dense (n_users, n_items) rating matrix.
    kind : str, optional
        'user' for user-user similarity, 'item' for item-item similarity.
    epsilon : float, optional
        Small constant added to avoid division by zero for all-zero rows/cols.

    Returns
    -------
    numpy.ndarray
        Square similarity matrix with ones on the diagonal (up to epsilon).

    Raises
    ------
    ValueError
        If *kind* is neither 'user' nor 'item'.
    """
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        # BUG FIX: the original fell through with `sim` unbound, producing a
        # confusing UnboundLocalError; fail with a clear message instead.
        raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))
    # The diagonal of X.X^T holds squared norms; normalise rows and columns
    # to obtain cosine similarity.
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return sim / norms / norms.T

# 使用sklearn计算MSE,首先去除数据矩阵中的无效0值,然后直接调用sklearn里面的mean_squared_error函数计算MSE
# Use sklearn to compute the error metrics: first drop the invalid 0 entries
# (unrated cells) from the data matrix, then call the sklearn metric functions.
def get_rmse_mae(pred, actual):
    """Return (RMSE, MAE) of *pred* against *actual* over rated entries only.

    Parameters
    ----------
    pred, actual : numpy.ndarray
        Matrices of the same shape; zeros in *actual* mean "no rating" and
        are excluded from the comparison.

    Returns
    -------
    tuple of float
        ``(rmse, mae)`` — root-mean-squared error and median absolute error.
    """
    # nonzero() gives the indices of rated cells; use them as a mask so the
    # huge number of empty (zero) cells does not dilute the error.
    mask = actual.nonzero()
    pred = pred[mask].flatten()
    actual = actual[mask].flatten()
    # BUG FIX: the original returned (sqrt(MSE), MSE) while the caller labels
    # the second value "MAE" — which is why the printed "MAE" exceeded the
    # RMSE (it was RMSE squared). Use median_absolute_error, which the module
    # imports but never used. Also compute mean_squared_error only once.
    mse = mean_squared_error(pred, actual)
    return sqrt(mse), median_absolute_error(pred, actual)

# 将Top-k和偏置消除算法结合起来,计算基于User的和基于Item的MSE,并分别取k=25,50,75,100,125,150
def predict_topk_nobias(ratings,similarity,kind='user',k=40):
    # shape函数是numpy.core.fromnumeric中的函数,它的功能是查看矩阵或者数组的维数。
    pred=np.zeros(ratings.shape)
    if kind == 'user':
        # mean()函数功能:求取均值
        # 经常操作的参数为axis,以m * n矩阵举例:
        # axis不设置值,对m * n个数求均值,返回一个实数
        # axis = 0:压缩行,对各列求均值,返回1 * n矩阵
        # axis = 1 :压缩列,对各行求均值,返回m * 1矩阵
        user_bias=ratings.mean(axis=1)
        # np.newaxis的作用就是选取部分的数据增加一个维度
        ratings=(ratings - user_bias[:,np.newaxis]).copy()
        for i in range(ratings.shape[0]):
            top_k_users=[np.argsort(similarity[:,i])[:-k-1:-1]]
            for j in range(ratings.shape[1]):
                pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users])
                pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))
        pred+=user_bias[:,np.newaxis]
    if kind == 'item':
        item_bias = ratings.mean(axis=0)
        ratings = (ratings - item_bias[np.newaxis, :]).copy()
        for j in range(ratings.shape[1]):
            top_k_items = [np.argsort(similarity[:, j])[:-k - 1:-1]]
            for i in range(ratings.shape[0]):
                pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T)
                pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))
        pred+=item_bias[np.newaxis, :]

    return pred

if __name__ == "__main__":
    print("Please choose which dataset you want to use\n1:ml-100k\n2:ml-1m")
    n = input("Your choose:")
    if(n==1):
        train, test = train_test_split('./u.data', 1)
    else:
        train, test = train_test_split('./ratings.dat', 2)
    user_similarity = fast_similarity(train, kind='user')
    item_similarity = fast_similarity(train, kind='item')

    print(item_similarity[:4, :4])

    print("%3s%20s%20s" % ('K', "RMSE", 'MAE'))

    for k in [25, 50, 75, 100, 125, 150]:
        user_pred = predict_topk_nobias(train, user_similarity, kind='user', k=k)
        item_pred = predict_topk_nobias(train, item_similarity, kind='item', k=k)
        user_test_rmse, user_test_mae = get_rmse_mae(user_pred, test)
        item_test_rmse, item_test_mae = get_rmse_mae(item_pred, test)
        print("%3d%19.3f%19.3f%20s" % (k, user_test_rmse, user_test_mae, "user_test"))
        print("%3d%19.3f%19.3f%20s" % (k, item_test_rmse, item_test_mae, "item_test"))

# Please choose which dataset you want to use
# 1:ml-100k
# 2:ml-1m
# Your choose:2
# Number of users =6040 | Number of movies = 3706
# The biggest ID of users = 6040 | The biggest ID of movies = 3952
# [[1.         0.35834266 0.24283893 0.16486061]
#  [0.35834266 1.         0.22002872 0.13618566]
#  [0.24283893 0.22002872 1.         0.18803194]
#  [0.16486061 0.13618566 0.18803194 1.        ]]
#   K                RMSE                 MAE
#  25              2.071              4.288           user_test
#  25              2.094              4.385           item_test
#  50              2.202              4.848           user_test
#  50              2.264              5.124           item_test
#  75              2.263              5.123           user_test
#  75              2.349              5.518           item_test
# 100              2.303              5.304           user_test
# 100              2.405              5.786           item_test
# 125              2.332              5.440           user_test
# 125              2.448              5.993           item_test
# 150              2.356              5.552           user_test
# 150              2.482              6.161           item_test
# --- Blog-post residue (kept for provenance, commented out so the file parses) ---
# 发布了189 篇原创文章 · 获赞 13 · 访问量 1万+
# 猜你喜欢
# 转载自blog.csdn.net/NewBeeMu/article/details/103439094
# 今日推荐