#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time    : 2021/10/28 11:32
# @Author  : 邱天衡
# @FileName: main.py
# @Software: PyCharm
# @Blog    : https://blog.csdn.net/Mr_Clutch
import itertools
import json

import pandas as pd

# Module globals written by Apriori() (declared there via `global`):
#   frequency_d     -- frequent itemsets of the latest pass
#   non_frequency_d -- infrequent itemsets of the latest pass
def Apriori(d, support, confidence):
    """Mine frequent itemsets with the Apriori algorithm and print strong rules.

    Parameters
    ----------
    d : pandas.DataFrame
        Needs an 'items' column of strings such as "{milk,bread}"; the first
        and last characters are stripped and the remainder split on ','.
    support : int
        Minimum support count (absolute number of transactions).
    confidence : float
        Rules are printed only when their confidence is strictly above this.

    Returns
    -------
    dict
        Maps itemset size -> {comma-joined sorted itemset: support count}.
        (The original returned None; returning the table is backward
        compatible and makes the function testable.)

    Side effects: prints per-level counts, the frequent itemsets as JSON and
    the strong rules, and updates the module globals ``frequency_d`` /
    ``non_frequency_d``.
    """
    global non_frequency_d
    global frequency_d

    confidence_dic = {}   # itemset size -> frequent itemsets of that size
    dir_data = {}         # transaction index -> raw "a,b,c" string
    item_d = {}           # candidate support counts for the current pass

    # Pass 1: count individual items.
    for index, rows in d.iterrows():
        dir_data[index] = rows['items'][1:-1]   # strip the surrounding braces
        for i in dir_data[index].split(','):
            item_d[i] = item_d.get(i, 0) + 1

    frequency_d = {k: v for k, v in item_d.items() if v >= support}
    confidence_dic[1] = frequency_d
    print(1, len(frequency_d))
    non_frequency_d = [k for k, v in item_d.items() if v < support]

    count = 2
    while True:
        # Candidate items are those appearing in some frequent (count-1)-set.
        base_items = set()
        for k in frequency_d:
            base_items.update(k.split(','))

        item_d = {}
        # Sorting makes candidate keys canonical ('bread,milk', never
        # 'milk,bread'), so subset-key lookups during rule generation cannot
        # miss due to set-iteration order (a latent KeyError in the original)
        # and the output is deterministic across runs.
        for cand in itertools.combinations(sorted(base_items), count):
            if _has_infrequent_subset(cand, non_frequency_d, count):
                continue
            key = ','.join(cand)
            for v in dir_data.values():
                if all(i in v.split(',') for i in cand):
                    item_d[key] = item_d.get(key, 0) + 1

        frequency_d = {k: v for k, v in item_d.items() if v >= support}
        non_frequency_d = [k for k, v in item_d.items() if v < support]
        print(count, len(frequency_d))
        if not frequency_d:
            break
        # (The original had separate len==1 / else branches doing the same.)
        confidence_dic[count] = frequency_d
        count += 1

    print('----------频繁项目集----------')
    print(json.dumps(confidence_dic, indent=4, ensure_ascii=False,
                     sort_keys=False, separators=(',', ':')))
    if count not in confidence_dic:   # last pass was empty -> step back one
        count -= 1
    print('----------强关联规则----------')
    _print_rules(confidence_dic, count, confidence)
    return confidence_dic


def _has_infrequent_subset(cand, infrequent, count):
    """Apriori pruning: True if some (count-1)-subset of *cand* is infrequent.

    Compares item lists, not joined strings -- the original's substring test
    ('milk' in 'buttermilk' was a hit) could wrongly prune valid candidates.
    """
    for sub in itertools.combinations(cand, count - 1):
        for key in infrequent:
            if all(i in key.split(',') for i in sub):
                return True
    return False


def _print_rules(levels, top, confidence):
    """Print every rule A -> B whose confidence is strictly above the bound."""
    for size in range(top, 1, -1):
        for key, sup in levels[size].items():
            items = key.split(',')
            for take in range(1, size):
                for consequent in itertools.combinations(items, take):
                    antecedent = [i for i in items if i not in consequent]
                    ant_key = ','.join(antecedent)
                    # confidence = support(A ∪ B) / support(A)
                    percent = round(sup / levels[len(antecedent)][ant_key], 2)
                    if percent > confidence:
                        print(ant_key + ' -> ' + ','.join(consequent),
                              '\t|confidence|\t', percent)


if __name__ == '__main__':
    minsup_count = 150     # minimum support count
    min_confidence = 0.3   # minimum confidence threshold
    data = pd.read_csv('./Groceries.csv', header=0, index_col=0)
    Apriori(data, minsup_count, min_confidence)