一个简单的推荐系统实现

引言

利用 Jupyter Notebook 实现一个简单的推荐系统,数据来源是 MovieLens 中约 5 MB 的压缩文件(即下文路径中的 ml-100k 数据集)。

数据导入

准备以下三份数据来进行训练:

(1)数据是拥有1682份数据的电影集

(2)用户点评数据

(3)电影明细表

# 先用 pandas 把这三份数据文件读入内存。
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise  import cosine_similarity
import time

def init(data_dir=None):
    """Load the three MovieLens 100k tables into pandas DataFrames.

    Args:
        data_dir: Directory that contains ``u.user``, ``u.data`` and
            ``u.item``. When omitted, the original hard-coded Windows
            location is used, so existing ``init()`` calls behave as before.

    Returns:
        tuple: ``(users, ratings, items)`` -- the user table, the rating
        table and the movie detail table.
    """
    import os  # local import keeps this block self-contained

    if data_dir is None:
        data_dir = "E:\\Mycode\\Recommended_system\\ml-100k"

    user_file = os.path.join(data_dir, "u.user")
    data_file = os.path.join(data_dir, "u.data")
    item_file = os.path.join(data_dir, "u.item")

    # Column names are passed explicitly via ``names=`` for every file.
    u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
    users = pd.read_csv(user_file, sep="|", names=u_cols, encoding='latin-1')

    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv(data_file, sep='\t', names=r_cols, encoding='latin-1')

    i_cols = ['movie_id', 'movie_title', 'release date', 'video release date', 'IMDb URL', 'unknown',
              'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary',
              'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
              'Thriller', 'War', 'Western']
    items = pd.read_csv(item_file, sep='|', names=i_cols, encoding='latin-1')
    return users, ratings, items
# NOTE: run at this point, this cell raises NameError (see the traceback below):
# `users` is only a local variable inside init(), and init() has not been called.
users.head()
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

<ipython-input-2-505e5cd46540> in <module>
----> 1 users.head()


NameError: name 'users' is not defined
# Load the user table; the file is "|"-separated and the column names are
# supplied explicitly through `names`.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
user_file = "E:\\Mycode\\Recommended_system\\ml-100k\\u.user"
users = pd.read_csv(
    user_file,
    names=u_cols,
    sep="|",
    encoding='latin-1',
)
# Last expression of the cell: the notebook displays the DataFrame.
users
user_id age sex occupation zip_code
0 1 24 M technician 85711
1 2 53 F other 94043
2 3 23 M writer 32067
3 4 24 M technician 43537
4 5 33 F other 15213
5 6 42 M executive 98101
6 7 57 M administrator 91344
7 8 36 M administrator 05201
8 9 29 M student 01002
9 10 53 M lawyer 90703
10 11 39 F other 30329
11 12 28 F other 06405
12 13 47 M educator 29206
13 14 45 M scientist 55106
14 15 49 F educator 97301
15 16 21 M entertainment 10309
16 17 30 M programmer 06355
17 18 35 F other 37212
18 19 40 M librarian 02138
19 20 42 F homemaker 95660
20 21 26 M writer 30068
21 22 25 M writer 40206
22 23 30 F artist 48197
23 24 21 F artist 94533
24 25 39 M engineer 55107
25 26 49 M engineer 21044
26 27 40 F librarian 30030
27 28 32 M writer 55369
28 29 41 M programmer 94043
29 30 7 M student 55436
... ... ... ... ... ...
913 914 44 F other 08105
914 915 50 M entertainment 60614
915 916 27 M engineer N2L5N
916 917 22 F student 20006
917 918 40 M scientist 70116
918 919 25 M other 14216
919 920 30 F artist 90008
920 921 20 F student 98801
921 922 29 F administrator 21114
922 923 21 M student E2E3R
923 924 29 M other 11753
924 925 18 F salesman 49036
925 926 49 M entertainment 01701
926 927 23 M programmer 55428
927 928 21 M student 55408
928 929 44 M scientist 53711
929 930 28 F scientist 07310
930 931 60 M educator 33556
931 932 58 M educator 06437
932 933 28 M student 48105
933 934 61 M engineer 22902
934 935 42 M doctor 66221
935 936 24 M other 32789
936 937 48 M educator 98072
937 938 38 F technician 55038
938 939 26 F student 33319
939 940 32 M administrator 02215
940 941 20 M student 97229
941 942 48 F librarian 78209
942 943 22 M student 77841

943 rows × 5 columns

# Load the rating table; the file is tab-separated and the column names are
# supplied explicitly through `names`.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
data_file = "E:\\Mycode\\Recommended_system\\ml-100k\\u.data"
ratings = pd.read_csv(
    data_file,
    names=r_cols,
    sep='\t',
    encoding='latin-1',
)
# Last expression of the cell: the notebook displays the DataFrame.
ratings
user_id movie_id rating unix_timestamp
0 196 242 3 881250949
1 186 302 3 891717742
2 22 377 1 878887116
3 244 51 2 880606923
4 166 346 1 886397596
5 298 474 4 884182806
6 115 265 2 881171488
7 253 465 5 891628467
8 305 451 3 886324817
9 6 86 3 883603013
10 62 257 2 879372434
11 286 1014 5 879781125
12 200 222 5 876042340
13 210 40 3 891035994
14 224 29 3 888104457
15 303 785 3 879485318
16 122 387 5 879270459
17 194 274 2 879539794
18 291 1042 4 874834944
19 234 1184 2 892079237
20 119 392 4 886176814
21 167 486 4 892738452
22 299 144 4 877881320
23 291 118 2 874833878
24 308 1 4 887736532
25 95 546 2 879196566
26 38 95 5 892430094
27 102 768 2 883748450
28 63 277 4 875747401
29 160 234 5 876861185
... ... ... ... ...
99970 449 120 1 879959573
99971 661 762 2 876037121
99972 721 874 3 877137447
99973 821 151 4 874792889
99974 764 596 3 876243046
99975 537 443 3 886031752
99976 618 628 2 891308019
99977 487 291 3 883445079
99978 113 975 5 875936424
99979 943 391 2 888640291
99980 864 685 4 888891900
99981 750 323 3 879445877
99982 279 64 1 875308510
99983 646 750 3 888528902
99984 654 370 2 887863914
99985 617 582 4 883789294
99986 913 690 3 880824288
99987 660 229 2 891406212
99988 421 498 4 892241344
99989 495 1091 4 888637503
99990 806 421 4 882388897
99991 676 538 4 892685437
99992 721 262 3 877137285
99993 913 209 2 881367150
99994 378 78 3 880056976
99995 880 476 3 880175444
99996 716 204 5 879795543
99997 276 1090 1 874795795
99998 13 225 2 882399156
99999 12 203 3 879959583

100000 rows × 4 columns

# Load the movie detail table: five descriptive columns followed by
# 19 genre flag columns ('unknown' through 'Western').
i_cols = [
    'movie_id', 'movie_title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
item_file = "E:\\Mycode\\Recommended_system\\ml-100k\\u.item"
items = pd.read_csv(
    item_file,
    names=i_cols,
    sep='|',
    encoding='latin-1',
)
# Last expression of the cell: the notebook displays the DataFrame.
items
movie_id movie_title release date video release date IMDb URL unknown Action Adventure Animation Children's ... Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
0 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
1 2 GoldenEye (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 3 Four Rooms (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
3 4 Get Shorty (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Get%20Shorty%... 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 5 Copycat (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Copycat%20(1995) 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
5 6 Shanghai Triad (Yao a yao yao dao waipo qiao) ... 01-Jan-1995 NaN http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6 7 Twelve Monkeys (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Twelve%20Monk... 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
7 8 Babe (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Babe%20(1995) 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
8 9 Dead Man Walking (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Dead%20Man%20... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9 10 Richard III (1995) 22-Jan-1996 NaN http://us.imdb.com/M/title-exact?Richard%20III... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
10 11 Seven (Se7en) (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Se7en%20(1995) 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
11 12 Usual Suspects, The (1995) 14-Aug-1995 NaN http://us.imdb.com/M/title-exact?Usual%20Suspe... 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
12 13 Mighty Aphrodite (1995) 30-Oct-1995 NaN http://us.imdb.com/M/title-exact?Mighty%20Aphr... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
13 14 Postino, Il (1994) 01-Jan-1994 NaN http://us.imdb.com/M/title-exact?Postino,%20Il... 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
14 15 Mr. Holland's Opus (1995) 29-Jan-1996 NaN http://us.imdb.com/M/title-exact?Mr.%20Holland... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15 16 French Twist (Gazon maudit) (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Gazon%20maudi... 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
16 17 From Dusk Till Dawn (1996) 05-Feb-1996 NaN http://us.imdb.com/M/title-exact?From%20Dusk%2... 0 1 0 0 0 ... 0 0 1 0 0 0 0 1 0 0
17 18 White Balloon, The (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Badkonake%20S... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
18 19 Antonia's Line (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Antonia%20(1995) 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
19 20 Angels and Insects (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Angels%20and%... 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
20 21 Muppet Treasure Island (1996) 16-Feb-1996 NaN http://us.imdb.com/M/title-exact?Muppet%20Trea... 0 1 1 0 0 ... 0 0 0 1 0 0 0 1 0 0
21 22 Braveheart (1995) 16-Feb-1996 NaN http://us.imdb.com/M/title-exact?Braveheart%20... 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
22 23 Taxi Driver (1976) 16-Feb-1996 NaN http://us.imdb.com/M/title-exact?Taxi%20Driver... 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
23 24 Rumble in the Bronx (1995) 23-Feb-1996 NaN http://us.imdb.com/M/title-exact?Hong%20Faan%2... 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
24 25 Birdcage, The (1996) 08-Mar-1996 NaN http://us.imdb.com/M/title-exact?Birdcage,%20T... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
25 26 Brothers McMullen, The (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Brothers%20Mc... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
26 27 Bad Boys (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Bad%20Boys%20... 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
27 28 Apollo 13 (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Apollo%2013%2... 0 1 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
28 29 Batman Forever (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Batman%20Fore... 0 1 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
29 30 Belle de jour (1967) 01-Jan-1967 NaN http://us.imdb.com/M/title-exact?Belle%20de%20... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1652 1653 Entertaining Angels: The Dorothy Day Story (1996) 27-Sep-1996 NaN http://us.imdb.com/M/title-exact?Entertaining%... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1653 1654 Chairman of the Board (1998) 01-Jan-1998 NaN http://us.imdb.com/Title?Chairman+of+the+Board... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1654 1655 Favor, The (1994) 01-Jan-1994 NaN http://us.imdb.com/M/title-exact?Favor,%20The%... 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
1655 1656 Little City (1998) 20-Feb-1998 NaN http://us.imdb.com/M/title-exact?Little+City+(... 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
1656 1657 Target (1995) 28-Feb-1996 NaN http://us.imdb.com/M/title-exact?Target%20(1995) 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1657 1658 Substance of Fire, The (1996) 06-Dec-1996 NaN http://us.imdb.com/M/title-exact?Substance%20o... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1658 1659 Getting Away With Murder (1996) 12-Apr-1996 NaN http://us.imdb.com/Title?Getting+Away+With+Mur... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1659 1660 Small Faces (1995) 09-Aug-1996 NaN http://us.imdb.com/M/title-exact?Small%20Faces... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1660 1661 New Age, The (1994) 01-Jan-1994 NaN http://us.imdb.com/M/title-exact?New%20Age,%20... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1661 1662 Rough Magic (1995) 30-May-1997 NaN http://us.imdb.com/M/title-exact?Rough%20Magic... 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
1662 1663 Nothing Personal (1995) 30-Apr-1997 NaN http://us.imdb.com/M/title-exact?Nothing%20Per... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
1663 1664 8 Heads in a Duffel Bag (1997) 18-Apr-1997 NaN http://us.imdb.com/Title?8+Heads+in+a+Duffel+B... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1664 1665 Brother's Kiss, A (1997) 25-Apr-1997 NaN http://us.imdb.com/M/title-exact?Brother%27s%2... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1665 1666 Ripe (1996) 02-May-1997 NaN http://us.imdb.com/M/title-exact?Ripe%20%28199... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1666 1667 Next Step, The (1995) 13-Jun-1997 NaN http://us.imdb.com/M/title-exact?Next%20Step%2... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1667 1668 Wedding Bell Blues (1996) 13-Jun-1997 NaN http://us.imdb.com/M/title-exact?Wedding%20Bel... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1668 1669 MURDER and murder (1996) 20-Jun-1997 NaN http://us.imdb.com/M/title-exact?MURDER+and+mu... 0 0 0 0 0 ... 0 0 0 0 1 0 0 0 0 0
1669 1670 Tainted (1998) 01-Feb-1998 NaN http://us.imdb.com/M/title-exact?Tainted+(1998) 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
1670 1671 Further Gesture, A (1996) 20-Feb-1998 NaN http://us.imdb.com/M/title-exact?Further+Gestu... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1671 1672 Kika (1993) 01-Jan-1993 NaN http://us.imdb.com/M/title-exact?Kika%20(1993) 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1672 1673 Mirage (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Mirage%20(1995) 0 1 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
1673 1674 Mamma Roma (1962) 01-Jan-1962 NaN http://us.imdb.com/M/title-exact?Mamma%20Roma%... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1674 1675 Sunchaser, The (1996) 25-Oct-1996 NaN http://us.imdb.com/M/title-exact?Sunchaser,%20... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1675 1676 War at Home, The (1996) 01-Jan-1996 NaN http://us.imdb.com/M/title-exact?War%20at%20Ho... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1676 1677 Sweet Nothing (1995) 20-Sep-1996 NaN http://us.imdb.com/M/title-exact?Sweet%20Nothi... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1677 1678 Mat' i syn (1997) 06-Feb-1998 NaN http://us.imdb.com/M/title-exact?Mat%27+i+syn+... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1678 1679 B. Monkey (1998) 06-Feb-1998 NaN http://us.imdb.com/M/title-exact?B%2E+Monkey+(... 0 0 0 0 0 ... 0 0 0 0 0 1 0 1 0 0
1679 1680 Sliding Doors (1998) 01-Jan-1998 NaN http://us.imdb.com/Title?Sliding+Doors+(1998) 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 0
1680 1681 You So Crazy (1994) 01-Jan-1994 NaN http://us.imdb.com/M/title-exact?You%20So%20Cr... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1681 1682 Scream of Stone (Schrei aus Stein) (1991) 08-Mar-1996 NaN http://us.imdb.com/M/title-exact?Schrei%20aus%... 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1682 rows × 24 columns

构建用户矩阵

# Number of distinct users in the user table.
num_users = len(users.user_id.unique())
# Number of distinct movies that appear in the rating table.
num_items = len(ratings.movie_id.unique())
# Zero-filled (users x movies) matrix; 0 stands for "no rating recorded".
data_matrix = np.zeros(shape=(num_users, num_items))

def constructUserMovieMatrix(users, ratings):
    """Build the dense user x movie rating matrix.

    Ids are used as 1-based positions: the rating of user ``u`` for movie
    ``m`` is stored at ``[u - 1, m - 1]``; cells without a rating stay 0.
    The first three columns of ``ratings`` are read positionally as
    (user id, movie id, rating).

    The matrix has at least one row per distinct ``users.user_id`` and one
    column per distinct ``ratings.movie_id``. It is enlarged when the largest
    id found in ``ratings`` would not fit, so non-contiguous ids no longer
    raise ``IndexError``.

    Args:
        users: User table with a ``user_id`` column.
        ratings: Rating table with a ``movie_id`` column, whose first three
            columns are user id, movie id and rating.

    Returns:
        numpy.ndarray: Float matrix of ratings, users on rows, movies on
        columns.
    """
    num_users = users.user_id.unique().shape[0]
    num_items = ratings.movie_id.unique().shape[0]
    if len(ratings):
        # Counting unique ids only equals the largest id when ids are exactly
        # 1..n; grow the matrix so that every id has a valid position.
        num_users = max(num_users, int(ratings.iloc[:, 0].max()))
        num_items = max(num_items, int(ratings.iloc[:, 1].max()))

    data_matrix = np.zeros((num_users, num_items))
    # Plain loop on purpose: if a (user, movie) pair occurs more than once,
    # the last occurrence wins, exactly as before.
    for row in ratings.itertuples(index=False):
        data_matrix[row[0] - 1, row[1] - 1] = row[2]
    return data_matrix
# Build the user-movie rating matrix from the loaded tables; as the last
# expression of the cell, the resulting array is displayed by the notebook.
constructUserMovieMatrix(users, ratings)
array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

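1. unique() 方法用于去重;
2. shape 返回表示数组维度的元组,shape[0] 是其中第一个元素,这里即去重后的元素个数;
3. line 是 itertuples() 逐行产生的元组:line[0] 为行索引,line[1]、line[2]、line[3] 依次为 user_id、movie_id、rating。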

def calculationSimilarity(data_matrix):
    """Compute the movie-to-movie cosine similarity matrix.

    ``cosine_similarity`` compares the rows of its input. The rating matrix is
    transposed first so that each movie becomes a row; passing it untransposed
    would yield user-to-user similarity instead. Only the movie similarity is
    returned, so the previously computed (and discarded) user-to-user matrix
    is no longer calculated.

    Args:
        data_matrix: Rating matrix; presumably users on rows and movies on
            columns, as produced by ``constructUserMovieMatrix``.

    Returns:
        Dense similarity matrix between the columns of ``data_matrix``.
    """
    item_similarity = cosine_similarity(data_matrix.T, dense_output=True)
    return item_similarity

def rec_sys(items, ratings, item_similarity, keywords, k):
    """Recommend the ``k`` movies most similar to the one matching ``keywords``.

    Args:
        items: Movie detail table with ``movie_id`` and ``movie_title`` columns.
        ratings: Rating table with ``movie_id`` and ``rating`` columns.
        item_similarity: Movie-to-movie similarity matrix in which row/column
            ``i`` corresponds to ``movie_id == i + 1``.
        keywords: Movie title, or a fragment of it. It is matched as a literal,
            case-sensitive substring; the first matching movie is the query.
        k: Number of recommendations; values below zero are treated as zero.

    Returns:
        list: One list per recommended movie, most similar first, laid out as
        ``[title, similarity, mean_rating, rating_count]``.

    Raises:
        IndexError: If no movie title contains ``keywords``.
    """
    # regex=False: titles contain "(" and ")", which a regex would read as a
    # group, so a full title like "Toy Story (1995)" would never match.
    # na=False: a missing title simply does not match instead of breaking the mask.
    title_mask = items['movie_title'].str.contains(keywords, regex=False, na=False)
    matched_ids = items.loc[title_mask, 'movie_id']
    if matched_ids.empty:
        raise IndexError("no movie title contains {!r}".format(keywords))

    movie_id = matched_ids.iloc[0]
    query_index = movie_id - 1
    movie_similarity = item_similarity[query_index]

    # Rank every movie by descending similarity, then drop the query movie
    # itself by index. Skipping "the first entry" is not reliable: on ties
    # (or an all-zero row) the query movie need not be ranked first.
    ranked = np.argsort(-movie_similarity)
    ranked = ranked[ranked != query_index][:max(k, 0)]

    movie_list = []
    for index in ranked:
        rec_id = index + 1
        # Filter the rating table once per recommendation and reuse the slice.
        movie_ratings = ratings.loc[ratings['movie_id'] == rec_id, 'rating']
        rec_movie = list(items.loc[items['movie_id'] == rec_id, 'movie_title'])  # title
        rec_movie.append(movie_similarity[index])   # similarity to the query movie
        rec_movie.append(movie_ratings.mean())      # mean rating
        rec_movie.append(len(movie_ratings))        # number of ratings
        movie_list.append(rec_movie)
    return movie_list
if __name__ == '__main__':
    # Demo run: recommend k movies similar to the one whose title contains
    # the keyword, and report how long the whole pipeline took.
    start_time = time.time()
    k = 5
    # Title-case the keyword before it is matched against the movie titles.
    keywords = "Assassins".title()

    users, ratings, items = init()
    data_matrix = constructUserMovieMatrix(users, ratings)
    similarity = calculationSimilarity(data_matrix)
    movie_list = rec_sys(items, ratings, similarity, keywords, k)

    print(movie_list)
    print("推荐耗时:", time.time() - start_time)
[['Outbreak (1995)', 0.477148560717635, 3.2403846153846154, 104], ['Client, The (1994)', 0.4185073174104336, 3.381443298969072, 97], ['Net, The (1995)', 0.40070132781155243, 3.0083333333333333, 120], ['Under Siege 2: Dark Territory (1995)', 0.3980230845871082, 2.4583333333333335, 48], ['Demolition Man (1993)', 0.39672213644098525, 3.152173913043478, 92]]
推荐耗时: 0.5808792114257812

参考链接

博客

猜你喜欢

转载自blog.csdn.net/Zengmeng1998/article/details/109029031