python数据挖掘入门与实践-----------通过亲和力分析推荐电影(Apriori)

嘤~本节代码是对照书上的源码看了一遍,并加上了自己理解的注释,但并没有运行成功,因为运行时会报警告,我还不会改错。

亲和力分析:从频繁出现的商品中选取共同出现的商品组成频繁项集,并据此生成关联规则。

import os
import pandas as pd
import sys
# Load the data: location of the raw ratings file.
ratings_filename = "D:\\python27\\study\\code\\Chapter4\\ml-1M\\ratings.dat"

# Normalize the data: name the four columns and convert the epoch-second
# timestamps into datetimes.
# NOTE(review): the file is parsed as tab-separated, but the path suggests the
# MovieLens 1M ratings.dat -- confirm the separator against the actual file.
all_ratings = pd.read_csv(
    ratings_filename,
    sep="\t",
    header=None,
    names=["UserID", "MovieID", "Rating", "Datetime"],
)
all_ratings["Datetime"] = pd.to_datetime(all_ratings["Datetime"], unit="s")
all_ratings.head(5)  # notebook-style preview; the result is discarded in a script

# A rating strictly above 3 marks the movie as liked by that user.
all_ratings["Favorable"] = all_ratings["Rating"].gt(3)
all_ratings.iloc[10:15]  # notebook-style preview; the result is discarded in a script

# Work on a subset of the data: ratings whose UserID is in range(200).
ratings = all_ratings[all_ratings["UserID"].isin(range(200))]

# Keep only the rows in which the user liked the movie.
favorable_ratings = ratings[ratings["Favorable"]]
favorable_ratings[:5]  # notebook-style preview; the result is discarded in a script

# Map every user to the frozenset of movies they liked.
favorable_reviews_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in favorable_ratings.groupby("UserID")["MovieID"]
}
len(favorable_reviews_by_users)

# Summing the boolean Favorable column per movie gives each movie's fan count.
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()

# Preview the five most-liked movies. DataFrame.sort() no longer exists in
# current pandas releases; sort_values() is its replacement.
num_favorable_by_movie.sort_values("Favorable", ascending=False)[:5]

# Frequent itemsets discovered so far, keyed by itemset length.
frequent_itemsets = {}
# Minimum support threshold.
min_support = 50

# Seed level 1: one single-movie itemset per movie, kept only when its number
# of favorable reviews is strictly greater than min_support.
# NOTE(review): this filter uses ">" while find_frequent_itemsets uses ">=".
frequent_itemsets[1] = {
    frozenset([movie_id]): row["Favorable"]
    for movie_id, row in num_favorable_by_movie.iterrows()
    if row["Favorable"] > min_support
}

from collections import defaultdict
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    """Grow frequent itemsets of length k-1 into frequent itemsets of length k.

    For every user, each (k-1)-itemset fully contained in the user's liked
    movies is extended by one further liked movie, and the resulting
    superset is counted once for that user.

    Args:
        favorable_reviews_by_users: mapping of user -> frozenset of the
            movies that user liked.
        k_1_itemsets: iterable of frozensets (e.g. a dict keyed by itemset)
            holding the frequent itemsets of the previous length.
        min_support: minimum number of users that must like every movie in
            an itemset for it to be kept.

    Returns:
        dict mapping each superset (frozenset) to the number of users who
        liked all of its movies, restricted to counts >= min_support.
    """
    counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        # A superset of size k can be reached from each of its k subsets of
        # size k-1. Collecting into a set first makes every user contribute
        # at most 1 to a superset's support instead of up to k.
        user_supersets = set()
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    user_supersets.add(itemset | frozenset((other_reviewed_movie,)))
        for superset in user_supersets:
            counts[superset] += 1
    return {itemset: frequency for itemset, frequency in counts.items()
            if frequency >= min_support}
# Report how many single-movie itemsets were seeded.
print("There are {} movies with more than {} favorable reviews".format(len(frequent_itemsets[1]), min_support))
sys.stdout.flush()

# Grow the itemsets one length at a time; stop at the first length that
# yields no frequent itemset (or after length 19).
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(
        favorable_reviews_by_users, frequent_itemsets[k - 1], min_support)
    if not cur_frequent_itemsets:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
    sys.stdout.flush()
    frequent_itemsets[k] = cur_frequent_itemsets

# Drop the itemsets that contain only a single movie.
del frequent_itemsets[1]
print("Found a total of {0} frequent itemsets".format(sum(map(len, frequent_itemsets.values()))))

# Extract association rules: every movie of a frequent itemset takes a turn as
# the conclusion, with the remaining movies of that itemset as the premise.
candidate_rules = [
    (itemset - {conclusion}, conclusion)
    for itemset_counts in frequent_itemsets.values()
    for itemset in itemset_counts
    for conclusion in itemset
]
print("There are {} candidate rules".format(len(candidate_rules)))
print(candidate_rules[:5])    # show the first five rules

# Per-rule tallies over users whose liked movies contain the rule's premise:
# correct_counts   -> the user also liked the conclusion
# incorrect_counts -> the user did not like the conclusion
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
min_confidence = 0.9

for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if not premise.issubset(reviews):
            # The rule does not apply to this user.
            continue
        if conclusion in reviews:
            correct_counts[candidate_rule] += 1
        else:
            incorrect_counts[candidate_rule] += 1

# Confidence of a rule = correct / (correct + incorrect).
rule_confidence = {
    rule: correct_counts[rule] / float(correct_counts[rule] + incorrect_counts[rule])
    for rule in candidate_rules
}
# Keep only the rules whose confidence is strictly above the threshold.
rule_confidence = dict(
    (rule, confidence)
    for rule, confidence in rule_confidence.items()
    if confidence > min_confidence
)
print(len(rule_confidence))
# Print the (up to) five rules with the highest confidence.
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
# Iterate over a slice rather than range(5): when fewer than five rules pass
# the confidence filter, indexing sorted_confidence[index] raises IndexError.
for index, ((premise, conclusion), confidence) in enumerate(sorted_confidence[:5]):
    print("Rule #{0}".format(index + 1))
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence: {0:.3f}".format(confidence))
    print("")

# Load the movie metadata so that movie IDs can be mapped to titles.
# NOTE(review): the path points at ml-1M movies.dat, while the "|" separator
# and the 24 column names look like a different MovieLens layout -- confirm
# against the actual file; a column-count mismatch makes the assignment fail.
movie_name_filename = "D:\\python27\\study\\code\\Chapter4\\ml-1M\\movies.dat"
movie_name_data = pd.read_csv(
    movie_name_filename,
    sep="|",
    header=None,
    encoding="mac-roman",
)
movie_name_data.columns = [
    "MovieID",
    "Title",
    "Release Date",
    "Video Release",
    "IMDB",
    "<UNK>",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
 

猜你喜欢

转载自blog.csdn.net/qq_39065788/article/details/82313744