A Python implementation of the Apriori algorithm: finding frequent itemsets by limiting candidate generation

The Apriori algorithm was proposed by R. Agrawal and R. Srikant in 1994 as a seminal algorithm for mining frequent itemsets for Boolean association rules [AS94b]. The algorithm uses prior knowledge of the properties of frequent itemsets and employs an iterative, level-wise search in which frequent k-itemsets are used to explore (k+1)-itemsets. To improve the efficiency of generating frequent itemsets level by level, the algorithm uses the Apriori property to reduce the search space.

Apriori property: every non-empty subset of a frequent itemset must also be frequent.

The Apriori algorithm mainly consists of two steps: the join step and the prune step.
Reference link: Association rules, Apriori algorithm and python implementation
The referenced article has no pruning step; this article mainly adds the code for the pruning part. For each candidate itemset, its subsets are enumerated; if any subset is not among the frequent itemsets found so far, the candidate cannot be frequent and is deleted.

# -*- coding: utf-8 -*-
import copy

def PowerSetsBinary(items):
    """
    Lazily yield every subset of ``items`` as a list.

    Each integer from 0 to 2**len(items) - 1 is read as a bit mask: when
    bit ``pos`` is set, ``items[pos]`` belongs to the subset. The empty
    subset is therefore produced first and the full set last, and the
    elements of each subset keep the order they have in ``items``.
    """
    size = len(items)
    for mask in range(1 << size):
        # Keep exactly the elements whose position bit is set in the mask.
        yield [items[pos] for pos in range(size) if (mask >> pos) & 1]


def loadDataSet():
    """
        Return a small hard-coded list of transactions for testing.

        Each transaction is a list of integer item ids; a new list is built
        on every call.
    """
    transactions = [
        [1, 2, 5],
        [2, 4],
        [2, 3],
        [1, 2, 4],
        [1, 3],
        [2, 3],
        [1, 3],
        [1, 2, 3, 5],
        [1, 2, 3],
    ]
    return transactions


def createC1(dataSet):
    """
        Build C1, the list of all candidate itemsets of size 1.

        Args:
            dataSet: iterable of transactions, each an iterable of items.
                Items must be hashable and mutually orderable.

        Returns:
            A list with one single-element frozenset per distinct item,
            sorted by item.
    """
    # Collect distinct items in a set: O(1) per membership test instead of
    # scanning an ever-growing list for every item of every transaction.
    unique_items = set()
    for transaction in dataSet:
        unique_items.update(transaction)
    # frozenset makes each candidate hashable so it can be used as a dict key.
    return [frozenset([item]) for item in sorted(unique_items)]


def scanDataSet(D, Ck, minSupport):
    """
        Count how often each candidate in Ck occurs in the transactions D.

        Args:
            D: list of transactions, each a set of items.
            Ck: list of candidate itemsets (frozensets).
            minSupport: minimum fraction of transactions a candidate must
                be contained in to be kept.

        Returns:
            A pair ``(frequent, supportData)``. ``frequent`` lists the
            candidates whose support is at least ``minSupport``, in reverse
            order of when they were first counted. ``supportData`` maps every
            candidate found in at least one transaction to its support
            (occurrences divided by the number of transactions).
    """
    occurrences = {}
    for transaction in D:
        for candidate in Ck:
            # Only candidates fully contained in the transaction are counted.
            if not candidate.issubset(transaction):
                continue
            if candidate in occurrences:
                occurrences[candidate] += 1
            else:
                occurrences[candidate] = 1

    transactionCount = float(len(D))
    supportData = {
        itemset: hits / transactionCount
        for itemset, hits in occurrences.items()
    }

    # Filter by the threshold; the Apriori property is not applied here.
    frequent = [
        itemset
        for itemset, support in supportData.items()
        if support >= minSupport
    ]
    # The most recently discovered itemset goes first.
    frequent.reverse()
    return frequent, supportData


def aprioriGen(Lk, k):  # Apriori join step
    """
        Generate candidate k-itemsets by joining the itemsets in Lk.

        Two itemsets are joined (set union) when their first k - 2 items, in
        sorted order, are identical.

        Args:
            Lk: list of frozensets to join (the frequent (k - 1)-itemsets).
                Items must be mutually orderable.
            k: number of items each generated candidate should contain.

        Returns:
            A list of frozensets, one per joinable pair, in pair order
            (i, j) with i < j.
    """
    # Sort BEFORE slicing: a frozenset has no defined iteration order, so
    # taking a prefix of list(frozenset) and sorting afterwards compares
    # arbitrary items and can miss valid joins or create duplicates.
    # Computing the prefixes once also avoids re-sorting in the inner loop.
    prefixes = [sorted(itemset)[: k - 2] for itemset in Lk]

    returnList = []
    for i, left in enumerate(Lk):
        for j in range(i + 1, len(Lk)):
            if prefixes[i] == prefixes[j]:
                # Union of two itemsets that differ only in their last item.
                returnList.append(left | Lk[j])
    return returnList

def has_infrequent_subset(L, Ck, k):
    """
        Prune step: drop candidates that have an infrequent subset.

        A candidate is kept only if every non-empty proper subset with
        fewer than k items appears among the itemsets in L.

        Args:
            L: list of levels, each a list of frequent itemsets.
            Ck: list of candidate itemsets to filter; it is not modified.
            k: size of the candidates; only subsets smaller than k are checked.

        Returns:
            A new list with the surviving candidates, in their original order.
    """
    # Function-local import so the file's top-level imports need no change.
    from itertools import combinations

    # Flatten all levels into a set once, rather than rebuilding a flat
    # list and scanning it linearly for every subset tested.
    frequent = {frozenset(itemset) for level in L for itemset in level}

    def _all_subsets_frequent(candidate):
        """Return True when no checked subset of ``candidate`` is infrequent."""
        members = list(candidate)
        # Sizes 1 .. min(len, k) - 1: non-empty, proper, and smaller than k.
        for size in range(1, min(len(members), k)):
            for subset in combinations(members, size):
                if frozenset(subset) not in frequent:
                    return False
        return True

    return [candidate for candidate in Ck if _all_subsets_frequent(candidate)]


def apriori(dataSet, minSupport):
    """
        Run the Apriori algorithm over ``dataSet``.

        Args:
            dataSet: list of transactions, each an iterable of items.
            minSupport: minimum support threshold, passed to scanDataSet.

        Returns:
            A pair ``(levels, suppData)``. ``levels[i]`` holds the frequent
            itemsets of size i + 1, and ``suppData`` accumulates the support
            dictionaries returned by scanDataSet for every level.
    """
    # scanDataSet expects every transaction to be a set.
    D = [set(transaction) for transaction in dataSet]

    # Level 1: the frequent single-item itemsets.
    firstLevel, suppData = scanDataSet(D, createC1(dataSet), minSupport)
    levels = [firstLevel]

    # The next itemsets to generate contain two items.
    size = 2
    # Stop once the most recent level came back empty.
    while levels[-1]:
        # Join step: build candidates from the previous level.
        candidates = aprioriGen(levels[-1], size)
        # Prune step: drop candidates that have an infrequent subset.
        pruned = has_infrequent_subset(levels, candidates, size)
        # Count support and keep candidates that meet the threshold.
        frequent, levelSupport = scanDataSet(D, pruned, minSupport)
        suppData.update(levelSupport)
        levels.append(frequent)
        size += 1

    # The last level is always the empty one that ended the loop; drop it.
    return levels[:-1], suppData


if __name__ == '__main__':
    # Demo: mine the sample transactions with a minimum support of 0.22.
    frequentItemsets, supportByItemset = apriori(loadDataSet(), 0.22)
    print("频繁项集L:", frequentItemsets)

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325282854&siteId=291194637