Machine Learning Collaborative Filtering Algorithm

Collaborative filtering algorithm

A collaborative filtering algorithm infers unknown data from existing data: it searches a large dataset for users whose similarity to you falls within a specified range, treats those users as your neighbors, and recommends to you the items your neighbors like.

cosine similarity

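$$\cos(u, v) = \frac{\sum_{i \in I_u \cap I_v} r_{u,i}\, r_{v,i}}{\sqrt{\sum_{i \in I_u} r_{u,i}^2}\,\sqrt{\sum_{i \in I_v} r_{v,i}^2}}$$
where $r_{u,i}$ is user $u$'s rating of item $i$ and $I_u$ is the set of items rated by user $u$.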
Evaluate the similarity of two vectors by computing the cosine of the angle between them

modified cosine similarity

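$$\mathrm{sim}(u, v) = \frac{\sum_{i \in I_u \cap I_v} (r_{u,i} - \bar{r}_u)(r_{v,i} - \bar{r}_v)}{\sqrt{\sum_{i \in I_u} (r_{u,i} - \bar{r}_u)^2}\,\sqrt{\sum_{i \in I_v} (r_{v,i} - \bar{r}_v)^2}}$$
where $r_{u,i}$ is user $u$'s rating of item $i$, $I_u$ is the set of items rated by user $u$, and $\bar{r}_u$ is the mean of user $u$'s ratings over $I_u$.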

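Compute the cosine similarity after centering each user's ratings, i.e. after subtracting that user's mean rating from every one of their ratings.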

Pearson coefficient method

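$$\rho(u, v) = \frac{\sum_{i \in I_{uv}} (r_{u,i} - \bar{r}_u)(r_{v,i} - \bar{r}_v)}{\sqrt{\sum_{i \in I_{uv}} (r_{u,i} - \bar{r}_u)^2}\,\sqrt{\sum_{i \in I_{uv}} (r_{v,i} - \bar{r}_v)^2}}$$
where $r_{u,i}$ is user $u$'s rating of item $i$, $I_{uv}$ is the set of items rated by both users, and $\bar{r}_u$, $\bar{r}_v$ are the users' mean ratings over $I_{uv}$.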
The Pearson coefficient is calculated in the same way as the modified cosine similarity. The difference is that the denominator of the Pearson coefficient is computed only over the set of items rated by both users, whereas the modified cosine similarity uses each user's full rating set.

Specific steps:
1. Find a collection of users with similar interests to the target user
2. Calculate the similarity
3. Find items that the users in this collection like and that the target user has not used yet, and recommend them to the target user

import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd

# Sample data: each user name maps to a dict of {movie title: rating}.
data = {
    "Tom": {
        "The Avengers": 3.0,
        "The Martin": 4.0,
        "Guardians of the Galaxy": 3.5,
        "Edge of Tomorrow": 5.0,
        "The Maze Runner": 3.0,
    },
    "Jane": {
        "The Avengers": 3.0,
        "The Martin": 4.0,
        "Guardians of the Galaxy": 4.0,
        "Edge of Tomorrow": 3.0,
        "The Maze Runner": 3.0,
        "Unbroken": 4.5,
    },
    "Jim": {
        "The Martin": 5.0,
        "Guardians of the Galaxy": 4.0,
        "Edge of Tomorrow": 1.0,
        "The Maze Runner": 2.0,
        "Unbroken": 4.0,
    },
}

# Look up one user's ratings; the string below shows the resulting dict.
data.get("Tom")
'''
{'The Avengers': 3.0,
 'The Martin': 4.0,
 'Guardians of the Galaxy': 3.5,
 'Edge of Tomorrow': 5.0,
 'The Maze Runner': 3.0}
'''

cosine similarity

def user_similarity_on_cosine(data, user1, user2):
    """Return the cosine similarity between the ratings of two users.

    The numerator sums the products of both users' ratings over the movies
    that both have rated. The denominator multiplies the Euclidean norms of
    each user's complete rating set, so movies rated by only one user still
    contribute to that user's norm.

    Args:
        data: Mapping of user name -> {movie title: rating}.
        user1: Key of the first user in ``data``.
        user2: Key of the second user in ``data``.

    Returns:
        The cosine similarity as a float, or 0 when the users have no rated
        movie in common or when either user's ratings are all zero (the
        similarity is undefined there and the division would fail).

    Raises:
        KeyError: If ``user1`` or ``user2`` is not a key of ``data``.
    """
    ratings1 = data[user1]
    ratings2 = data[user2]

    # Movies rated by both users (iterating a dict yields its keys).
    common = [movie for movie in ratings1 if movie in ratings2]
    if not common:
        return 0

    # Numerator: dot product over the commonly rated movies.
    multiply_sum = sum(ratings1[movie] * ratings2[movie] for movie in common)

    # Denominator: squared norms over each user's full rating set.
    pow_sum1 = sum(math.pow(rating, 2) for rating in ratings1.values())
    pow_sum2 = sum(math.pow(rating, 2) for rating in ratings2.values())
    denominator = math.sqrt(pow_sum1 * pow_sum2)

    # A zero norm means one rating vector is all zeros; avoid dividing by 0.
    if denominator == 0:
        return 0
    return float(multiply_sum) / denominator
# Cosine similarity between Tom and Jane.
r1 = user_similarity_on_cosine(data, "Tom", "Jane")
print('Tom与Jane的相似性:', r1)

modified cosine similarity


def user_similarity_on_modified_cosine(data, user1, user2):
    """Return the modified (mean-centered) cosine similarity of two users.

    Each user's ratings are centered on that user's mean over ALL movies the
    user rated. The numerator sums the products of the centered ratings over
    the movies both users rated; the denominator uses the centered squared
    sums over each user's complete rating set.

    Args:
        data: Mapping of user name -> {movie title: rating}.
        user1: Key of the first user in ``data``.
        user2: Key of the second user in ``data``.

    Returns:
        The similarity as a float, or 0 when the users have no rated movie
        in common or when either user's ratings are all identical (zero
        variance makes the similarity undefined and the division would fail).

    Raises:
        KeyError: If ``user1`` or ``user2`` is not a key of ``data``.
    """
    ratings1 = data[user1]
    ratings2 = data[user2]

    # Movies rated by both users (iterating a dict yields its keys).
    common = [movie for movie in ratings1 if movie in ratings2]
    if not common:
        return 0

    # Mean rating of each user over that user's full rating set.
    average1 = float(sum(ratings1.values())) / len(ratings1)
    average2 = float(sum(ratings2.values())) / len(ratings2)

    # Numerator: products of centered ratings over the common movies.
    multiply_sum = sum(
        (ratings1[movie] - average1) * (ratings2[movie] - average2)
        for movie in common
    )

    # Denominator: centered squared sums over each user's full rating set.
    pow_sum1 = sum(math.pow(rating - average1, 2) for rating in ratings1.values())
    pow_sum2 = sum(math.pow(rating - average2, 2) for rating in ratings2.values())
    denominator = math.sqrt(pow_sum1 * pow_sum2)

    # Zero here means a user gave every movie the same rating.
    if denominator == 0:
        return 0
    return float(multiply_sum) / denominator
# Modified cosine similarity between Tom and Jane.
r2 = user_similarity_on_modified_cosine(data, "Tom", "Jane")
print('Tom与Jane的相似性:', r2)

Pearson coefficient

def similarUserWithPearson(data, user1, user2):
    """Return the Pearson correlation coefficient between two users.

    Only the movies rated by BOTH users are used: the means, the numerator
    and the denominator are all computed over that common set.

    Args:
        data: Mapping of user name -> {movie title: rating}.
        user1: Key of the first user in ``data``.
        user2: Key of the second user in ``data``.

    Returns:
        The Pearson coefficient as a float, or 0 when the users have no rated
        movie in common or when either user's common ratings are all
        identical (this includes the single-common-movie case; the
        coefficient is undefined there and the division would fail).

    Raises:
        KeyError: If ``user1`` or ``user2`` is not a key of ``data``.
    """
    ratings1 = data[user1]
    ratings2 = data[user2]

    # Movies rated by both users (iterating a dict yields its keys).
    common = [movie for movie in ratings1 if movie in ratings2]
    if not common:
        return 0

    # Mean rating of each user, restricted to the common movies.
    average1 = float(sum(ratings1[movie] for movie in common)) / len(common)
    average2 = float(sum(ratings2[movie] for movie in common)) / len(common)

    # Numerator: products of centered ratings over the common movies.
    multiply_sum = sum(
        (ratings1[movie] - average1) * (ratings2[movie] - average2)
        for movie in common
    )

    # Denominator: centered squared sums over the common movies.
    pow_sum1 = sum(math.pow(ratings1[movie] - average1, 2) for movie in common)
    pow_sum2 = sum(math.pow(ratings2[movie] - average2, 2) for movie in common)
    denominator = math.sqrt(pow_sum1 * pow_sum2)

    # Zero here means one user's common ratings have no variance.
    if denominator == 0:
        return 0
    return float(multiply_sum) / denominator
# Pearson coefficient between Tom and Jane.
r3 = similarUserWithPearson(data, "Tom", "Jane")
print('Tom与Jane的相似性:', r3)

Visual analysis to find users with the highest similarity

# Similarity of Jane to Jim (rX1) and to Tom (rX2) under each measure:
# 1 = cosine, 2 = modified cosine, 3 = Pearson.
r11 = user_similarity_on_cosine(data, "Jane", "Jim")
r12 = user_similarity_on_cosine(data, "Jane", "Tom")
r21 = user_similarity_on_modified_cosine(data, "Jane", "Jim")
r22 = user_similarity_on_modified_cosine(data, "Jane", "Tom")
r31 = similarUserWithPearson(data, "Jane", "Jim")
r32 = similarUserWithPearson(data, "Jane", "Tom")

# NOTE(review): 'SimHei' is presumably chosen so the Chinese labels below
# render correctly -- confirm the font is installed on the target machine.
matplotlib.rcParams['font.family'] = 'SimHei'
plt.figure(figsize=(6, 4))

r1 = [r11, r21, r31]  # Jane vs. Jim, one value per measure
r2 = [r12, r22, r32]  # Jane vs. Tom, one value per measure
l = len(r1)
width = 0.3  # bar width
x1 = np.arange(l)
x2 = x1 + width  # shift the second series so its bars do not overlap the first
s = ['余弦法', '修正余弦法', '皮尔森法']

plt.bar(x1, r1, width=width, label='Jane,Jim')
plt.bar(x2, r2, width=width, label='Jane,Tom')

plt.xticks(x1 + width, s)
plt.ylim(0, 1.2)  # extend the y-axis range
plt.yticks([])  # hide the y-axis tick marks
plt.ylabel(u'相似性系数值', fontproperties='SimHei')
plt.legend()  # draw the legend inside the figure
plt.title('Jane与其他用户的相似系数值')

insert image description here

# Titles Jane has rated, reused for both comparisons below.
jane_movies = set(data.get("Jane"))

# Movies Jane has rated that Jim has not.
recommendation = list(jane_movies.difference(data.get("Jim")))
print("Jane看过而Jim没看过的电影:", recommendation)  # list printed: ['The Avengers']

# Movies Jane has rated that Tom has not.
recommendation = list(jane_movies.difference(data.get("Tom")))
print("Jane看过而Tom没看过的电影:", recommendation)  # list printed: ['Unbroken']

Guess you like

Origin blog.csdn.net/weixin_56260304/article/details/130353041