import numpy as np
from collections import defaultdict
# Sample transactions: each string is one basket of comma-separated items.
d = np.array(
    [
        "milk,water,juice",
        "milk,water",
        "milk,rice,water",
        "water",
        "tomato,juice",
        "juice,cucumber",
    ]
)
def Apriori(data, min_spport=2):
    """Mine frequent itemsets from comma-separated transactions (Apriori).

    Args:
        data: Iterable of strings. Each string is one transaction whose
            items are separated by ",".
        min_spport: Minimum number of transactions an itemset must occur in
            to be reported. The misspelled name is kept so existing keyword
            callers keep working.

    Returns:
        A list with one entry per itemset size, smallest size first. Each
        entry is a list of ``[label, count]`` pairs, where ``label`` is the
        itemset's items in sorted order joined by "," and ``count`` is the
        number of transactions containing every item. Sizes with no frequent
        itemset are omitted, so the result is ``[]`` when nothing qualifies.
    """
    # Each transaction becomes a set of item strings so that "itemset is
    # contained in transaction" is a single subset test.
    transactions = [frozenset(row.split(",")) for row in data]

    def count_support(candidates):
        """Return ``(itemset, count)`` for candidates meeting ``min_spport``."""
        frequent = []
        for itemset in candidates:
            members = frozenset(itemset)
            support = sum(1 for basket in transactions if members <= basket)
            # An itemset that occurs in no transaction is never reported,
            # even when min_spport <= 0.
            if support > 0 and support >= min_spport:
                frequent.append((itemset, support))
        return frequent

    def join(frequent_itemsets):
        """Build (k+1)-item candidates from sorted frequent k-itemsets."""
        known = set(frequent_itemsets)
        candidates = []
        for pos, left in enumerate(frequent_itemsets):
            for right in frequent_itemsets[pos + 1:]:
                if left[:-1] != right[:-1]:
                    # The input is sorted, so itemsets sharing a prefix are
                    # contiguous; nothing further can match ``left``.
                    break
                candidate = left + right[-1:]
                # Pruning step: a candidate can only be frequent if every
                # subset obtained by dropping one item is itself frequent.
                subsets = (
                    candidate[:skip] + candidate[skip + 1:]
                    for skip in range(len(candidate))
                )
                if all(subset in known for subset in subsets):
                    candidates.append(candidate)
        return candidates

    levels = []
    # Itemsets are kept as sorted tuples so the join can compare prefixes
    # and the output order does not depend on set iteration order.
    candidates = [(item,) for item in sorted(set().union(*transactions))]
    while candidates:
        frequent = count_support(candidates)
        if not frequent:
            break
        levels.append(
            [[",".join(itemset), support] for itemset, support in frequent]
        )
        candidates = join([itemset for itemset, _ in frequent])
    return levels
# Mine the sample transactions and print every level of frequent itemsets.
_frequent_levels = Apriori(d)
print(_frequent_levels)
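# Sample output of the call above (the order of items inside a label such as
# 'water,milk' is not significant):
# [[['juice', 3], ['milk', 3], ['water', 4]], [['water,milk', 3]]]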