1.Apriori算法发现频繁集
1.1生成候选项集
对数据集中的每条交易记录tran:
    对每个候选项集can:
        检查一下can是否是tran的子集:
        如果是,则增加can的计数值
对每个候选项集:
    如果其支持度不低于最小值,则保留该项集
返回所有频繁项集列表
def loadDataSet():
    """Return a small sample data set: a list of transactions.

    Each transaction is a list of integer item ids. A fresh list is
    built on every call.
    """
    transactions = [
        [1, 3, 4],
        [2, 3, 5],
        [1, 2, 3, 5],
        [2, 5],
    ]
    return transactions
def createC1(dataSet):
    """Build C1, the list of all candidate itemsets of size 1.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list with one ``frozenset`` per distinct item, ordered by
        ascending item value. ``frozenset`` is used so the itemsets can
        serve as dictionary keys.

    Example:
        createC1([[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]) returns
        [frozenset({1}), frozenset({2}), frozenset({3}),
         frozenset({4}), frozenset({5})]
    """
    # A set de-duplicates in O(1) per item; the previous list membership
    # test rescanned every collected item for each new one.
    unique_items = {item for transaction in dataSet for item in transaction}
    return [frozenset([item]) for item in sorted(unique_items)]
def scanD(D, CK, minSupport):
    """Count candidate itemsets in the transactions and keep the frequent ones.

    Args:
        D: sized collection of transactions (each usable as the argument
            of ``issubset``, e.g. a ``set``).
        CK: iterable of candidate itemsets (``frozenset`` objects).
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A tuple ``(retList, supportData)``:
        * ``retList`` - candidates whose support is >= ``minSupport``,
          in reverse order of first appearance in the scan.
        * ``supportData`` - dict mapping each candidate that occurs in at
          least one transaction to its support, including candidates
          that fall below ``minSupport``.
    """
    counts = {}
    for transaction in D:
        for candidate in CK:
            if candidate.issubset(transaction):
                counts[candidate] = counts.get(candidate, 0) + 1

    num_transactions = float(len(D))
    retList = []
    supportData = {}
    # When D is empty, counts is empty too, so no division by zero occurs.
    for candidate, count in counts.items():
        support = count / num_transactions
        supportData[candidate] = support
        if support >= minSupport:
            retList.append(candidate)
    # Appending and reversing once yields the same order as inserting every
    # hit at the front of the list, without the quadratic shifting.
    retList.reverse()
    return retList, supportData
1.2 组织完整的Apriori算法
伪代码如下:
当集合中项的个数大于0时
构建一个k项组成的候选项集的列表
检查数据以确认每个项集都是频繁的
保留频繁项集并构建k+1项组成的候选项集的列表
def apriori(dataSet, minSupport=0.5):
    """Find the frequent itemsets of ``dataSet``, level by level.

    Args:
        dataSet: list of transactions (each an iterable of items).
        minSupport: minimum support an itemset needs to be kept.

    Returns:
        A tuple ``(L, supportData)``. ``L[i]`` holds the frequent
        itemsets of size ``i + 1`` as returned by ``scanD``; the loop
        stops once a level comes back empty, so the last entry of ``L``
        is always an empty list. ``supportData`` merges the support
        dictionaries returned by every ``scanD`` call.
    """
    size_one_candidates = createC1(dataSet)
    transactions = [set(record) for record in dataSet]
    frequent, supportData = scanD(transactions, size_one_candidates, minSupport)
    levels = [frequent]
    size = 2
    # Keep growing while the most recently found level is non-empty,
    # e.g. [frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]
    # is the previous level when size == 2.
    while levels[-1]:
        # aprioriGen is defined outside this block; presumably it builds
        # the size-`size` candidates from the previous level - confirm.
        candidates = aprioriGen(levels[-1], size)
        frequent, level_support = scanD(transactions, candidates, minSupport)
        supportData.update(level_support)
        levels.append(frequent)
        size += 1
    return levels, supportData
# =============================================================================
# x=[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]
# pprint.pprint(list(x[1])) #>>>[2]
# pprint.pprint(list(x[1])[0]) #>>>2
# pprint.pprint(list(x[1])[:0]) #>>>[]
# =============================================================================
2.从频繁集中挖掘关联规则
def generateRules(L, supportData, minConf=0.7):
    """Derive association rules from the frequent itemsets.

    Args:
        L: list of frequent-itemset levels as produced by ``apriori``;
            ``L[i]`` holds itemsets of size ``i + 1``. ``L[0]`` is
            skipped because a rule needs at least two items.
        supportData: dict mapping itemsets to their support.
        minConf: minimum confidence a rule needs to be reported.

    Returns:
        List of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    for i in range(1, len(L)):
        # e.g. L[1]: [frozenset({2, 3}), frozenset({3, 5}),
        #             frozenset({2, 5}), frozenset({1, 3})]
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            # Always score the single-item consequents. Previously itemsets
            # with three or more items went straight to rulesFromConseq,
            # which starts at two-item consequents, so rules such as
            # {2, 3} -> {5} were never evaluated.
            H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            # A larger consequent can only lower the confidence, so only the
            # surviving consequents are worth combining further, and at
            # least two are needed to form a bigger consequent.
            if i > 1 and len(H1) > 1:
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Score candidate consequents of ``freqSet`` and keep the confident ones.

    For each consequent in ``H`` the confidence of the rule
    ``(freqSet - consequent) -> consequent`` is
    ``support(freqSet) / support(freqSet - consequent)``.

    Args:
        freqSet: the frequent itemset the rules are built from.
        H: iterable of candidate consequents (subsets of ``freqSet``).
        supportData: dict mapping itemsets to their support.
        br1: list that receives ``(antecedent, consequent, confidence)``
            for every rule that passes; mutated in place.
        minConf: minimum confidence a rule needs to be kept.

    Returns:
        The consequents from ``H`` whose rule reached ``minConf``.
    """
    kept = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf < minConf:
            continue
        print(antecedent, '--->', conseq, 'conf:', conf)
        br1.append((antecedent, conseq, conf))
        kept.append(conseq)
    return kept
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Recursively generate rules with progressively larger consequents.

    Args:
        freqSet: the frequent itemset the rules are built from.
        H: list of equally sized candidate consequents.
        supportData: dict mapping itemsets to their support.
        br1: list that collects the accepted rules; mutated in place
            through ``calcConf``.
        minConf: minimum confidence a rule needs to be kept.

    Returns:
        None. Accepted rules are appended to ``br1``.
    """
    if not H:
        # Nothing to combine; also avoids indexing an empty list below.
        return
    m = len(H[0])
    # Only continue while a consequent of size m + 1 still leaves at least
    # one item for the antecedent.
    if len(freqSet) > (m + 1):
        # aprioriGen is defined outside this block; presumably it merges the
        # size-m consequents into duplicate-free size-(m + 1) ones - confirm.
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf)
        if len(Hmp1) > 1:
            # Recurse on the surviving, larger consequents with the caller's
            # threshold. Passing H again would repeat the same step forever,
            # and a hard-coded 0.7 would ignore the requested minConf.
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)
#-----------------------------------------------------------------------------------------
#freqSet=frozenset({2,3})
#
#H=[frozenset([item]) for item in freqSet]
#
#Out[45]: [frozenset({2}), frozenset({3})]
#-----------------------------------------------------------------------------------------
#