# coding=utf-8
# @Time : 2019/12/7 15:46
# @Author : Z
# @Email : S
# @File : KNNtest.py
import numpy as np
import pandas as pd
from sklearn import model_selection as ms
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
import warnings
#采用 KNN 算法实现一个简单的推荐系统
warnings.filterwarnings("ignore")
#训练集和测试集的划分标准
def train_test_split(fileName, type=1):
    """Load a MovieLens ratings file and split it into train/test rating matrices.

    Parameters
    ----------
    fileName : str
        Path to the ratings file ('u.data' for ml-100k, 'ratings.dat' for ml-1m).
    type : int, optional
        1 -> tab-separated ml-100k format; any other value -> '::'-separated ml-1m.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Dense matrices of shape (max_user_id, max_item_id); entry [u-1, i-1]
        holds the rating user u gave item i, and 0 means "not rated".
    """
    header = ['user_id', 'item_id', 'rating', 'timestamp']
    if type == 1:
        # ml-100k is tab-separated.
        df = pd.read_csv(fileName, sep='\t', names=header)
    else:
        # ml-1m uses '::'; the python engine is required for multi-char separators.
        df = pd.read_csv(fileName, sep='::', names=header, engine='python')
    # Count of distinct users/items vs. the largest ID seen; the matrices must
    # be sized by the largest ID because IDs may be sparse (see ml-1m output).
    n_users = df.user_id.unique().shape[0]
    users = df.user_id.max()
    n_items = df.item_id.unique().shape[0]
    items = df.item_id.max()
    print('Number of users =' + str(n_users) + ' | Number of movies = ' + str(n_items))
    print('The biggest ID of users = ' + str(users) + ' | The biggest ID of movies = ' + str(items))
    # Hold out 10% of the ratings for evaluation.
    train_data, test_data = ms.train_test_split(df, test_size=0.1)
    train_data = pd.DataFrame(train_data)
    test_data = pd.DataFrame(test_data)
    # Build one user-item matrix for training and one for testing.
    # itertuples() yields (index, user_id, item_id, rating, timestamp);
    # IDs are 1-based, hence the -1 offsets.
    train_data_matrix = np.zeros((users, items))
    for line in train_data.itertuples():
        train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    test_data_matrix = np.zeros((users, items))
    # BUG FIX: the original iterated train_data here as well, so the "test"
    # matrix duplicated the training data and evaluation never used the
    # held-out 10% at all.
    for line in test_data.itertuples():
        test_data_matrix[line[1] - 1, line[2] - 1] = line[3]
    return train_data_matrix, test_data_matrix
def fast_similarity(ratings, kind='user', epsilon=1e-9):
    """Compute a cosine-similarity matrix from a user-item rating matrix.

    Parameters
    ----------
    ratings : numpy.ndarray
        Rating matrix of shape (n_users, n_items).
    kind : str
        'user' -> (n_users, n_users) user-user similarity;
        'item' -> (n_items, n_items) item-item similarity.
    epsilon : float
        Small constant added to avoid division by zero for all-zero rows/columns.

    Returns
    -------
    numpy.ndarray
        Symmetric cosine-similarity matrix with ones on the diagonal.

    Raises
    ------
    ValueError
        If kind is neither 'user' nor 'item'.
    """
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        # BUG FIX: the original had no else-branch, so an unknown kind left
        # `sim` unbound and crashed with an unrelated NameError below.
        raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))
    # The diagonal of the Gram matrix holds squared vector norms; dividing by
    # norms on both axes normalises the dot products to cosine similarities.
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return sim / norms / norms.T
# 使用sklearn计算MSE,首先去除数据矩阵中的无效0值,然后直接调用sklearn里面的mean_squared_error函数计算MSE
def get_rmse_mae(pred, actual):
    """Return (RMSE, MAE) of predictions over the rated entries of `actual`.

    Only positions where `actual` is non-zero (i.e. an observed rating exists)
    are evaluated; zeros in the rating matrix mean "not rated", not a rating
    of 0.

    Parameters
    ----------
    pred : numpy.ndarray
        Predicted rating matrix, same shape as `actual`.
    actual : numpy.ndarray
        Ground-truth rating matrix (0 = unrated).

    Returns
    -------
    (float, float)
        Root-mean-squared error and mean absolute error.
    """
    # Keep only the entries with an observed rating.
    mask = actual.nonzero()
    pred = pred[mask].flatten()
    actual = actual[mask].flatten()
    errors = pred - actual
    # BUG FIX: the original returned (sqrt(MSE), MSE) — the second value was
    # mean_squared_error again, not an absolute error, so the printed "MAE"
    # column was actually the MSE (visible in the sample output, where
    # 4.288 ≈ 2.071**2). Compute a true mean absolute error instead.
    mse = np.mean(errors ** 2)
    return float(np.sqrt(mse)), float(np.mean(np.abs(errors)))
# 将Top-k和偏置消除算法结合起来,计算基于User的和基于Item的MSE,并分别取k=25,50,75,100,125,150
def predict_topk_nobias(ratings, similarity, kind='user', k=40):
    """Predict ratings with top-k collaborative filtering plus bias removal.

    Each user's (or item's) mean rating is subtracted before averaging over
    the k most similar neighbours, then added back, so systematic rating
    offsets ("bias") don't distort the weighted average.

    Parameters
    ----------
    ratings : numpy.ndarray
        (n_users, n_items) rating matrix.
    similarity : numpy.ndarray
        User-user or item-item cosine-similarity matrix matching `kind`.
    kind : str
        'user' for user-based CF, 'item' for item-based CF.
    k : int
        Number of nearest neighbours to average over.

    Returns
    -------
    numpy.ndarray
        Predicted rating matrix, same shape as `ratings`.
    """
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        # Remove each user's mean rating (row means) before averaging.
        user_bias = ratings.mean(axis=1)
        ratings = (ratings - user_bias[:, np.newaxis]).copy()
        for i in range(ratings.shape[0]):
            # Indices of the k most similar users to user i (argsort is
            # ascending, so take the last k in reverse order).
            # BUG FIX: the original wrapped this array in a Python list and
            # indexed with it — numpy's long-deprecated list-of-arrays
            # indexing, removed in modern numpy.
            top_k_users = np.argsort(similarity[:, i])[:-k - 1:-1]
            # Hoist the loop invariants: neighbour similarities and their
            # absolute sum do not depend on the item j.
            sim_row = similarity[i, top_k_users]
            denom = np.sum(np.abs(sim_row))
            for j in range(ratings.shape[1]):
                pred[i, j] = sim_row.dot(ratings[top_k_users, j]) / denom
        pred += user_bias[:, np.newaxis]
    if kind == 'item':
        # Remove each item's mean rating (column means) before averaging.
        item_bias = ratings.mean(axis=0)
        ratings = (ratings - item_bias[np.newaxis, :]).copy()
        for j in range(ratings.shape[1]):
            top_k_items = np.argsort(similarity[:, j])[:-k - 1:-1]
            sim_col = similarity[j, top_k_items]
            denom = np.sum(np.abs(sim_col))
            for i in range(ratings.shape[0]):
                pred[i, j] = sim_col.dot(ratings[i, top_k_items]) / denom
        pred += item_bias[np.newaxis, :]
    return pred
if __name__ == "__main__":
print("Please choose which dataset you want to use\n1:ml-100k\n2:ml-1m")
n = input("Your choose:")
if(n==1):
train, test = train_test_split('./u.data', 1)
else:
train, test = train_test_split('./ratings.dat', 2)
user_similarity = fast_similarity(train, kind='user')
item_similarity = fast_similarity(train, kind='item')
print(item_similarity[:4, :4])
print("%3s%20s%20s" % ('K', "RMSE", 'MAE'))
for k in [25, 50, 75, 100, 125, 150]:
user_pred = predict_topk_nobias(train, user_similarity, kind='user', k=k)
item_pred = predict_topk_nobias(train, item_similarity, kind='item', k=k)
user_test_rmse, user_test_mae = get_rmse_mae(user_pred, test)
item_test_rmse, item_test_mae = get_rmse_mae(item_pred, test)
print("%3d%19.3f%19.3f%20s" % (k, user_test_rmse, user_test_mae, "user_test"))
print("%3d%19.3f%19.3f%20s" % (k, item_test_rmse, item_test_mae, "item_test"))
# Please choose which dataset you want to use
# 1:ml-100k
# 2:ml-1m
# Your choose:2
# Number of users =6040 | Number of movies = 3706
# The biggest ID of users = 6040 | The biggest ID of movies = 3952
# [[1. 0.35834266 0.24283893 0.16486061]
# [0.35834266 1. 0.22002872 0.13618566]
# [0.24283893 0.22002872 1. 0.18803194]
# [0.16486061 0.13618566 0.18803194 1. ]]
# K RMSE MAE
# 25 2.071 4.288 user_test
# 25 2.094 4.385 item_test
# 50 2.202 4.848 user_test
# 50 2.264 5.124 item_test
# 75 2.263 5.123 user_test
# 75 2.349 5.518 item_test
# 100 2.303 5.304 user_test
# 100 2.405 5.786 item_test
# 125 2.332 5.440 user_test
# 125 2.448 5.993 item_test
# 150 2.356 5.552 user_test
# 150 2.482 6.161 item_test
# A simple recommender system implemented with the KNN algorithm.
# Adapted from: https://blog.csdn.net/NewBeeMu/article/details/103439094