动手实现Apriori关联规则算法(Python版)

直接暴力实现!

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time    : 2021/10/28 11:32
# @Author  : 邱天衡
# @FileName: main.py
# @Software: PyCharm
# @Blog    :https://blog.csdn.net/Mr_Clutch

import itertools
import json
import pandas as pd

# Infrequent itemsets of the most recently evaluated candidate level.
# NOTE(review): `global` at module level is a no-op in Python — these names
# are actually created when Apriori() assigns them under its own `global`
# declarations; these two statements merely document intent.
global non_frequency_d
# Frequent itemsets of the most recently evaluated level.
global frequency_d


def Apriori(d, support, confidence):
    """Mine frequent itemsets and print strong association rules.

    Parameters
    ----------
    d : pandas.DataFrame
        Must have an 'items' column holding strings like '{a,b,c}'
        (leading/trailing brace stripped, items comma-separated).
    support : int
        Minimum support COUNT (absolute number of transactions, not a ratio).
    confidence : float
        Minimum confidence; only rules strictly above it are printed.

    Side effects: prints the frequent-itemset count per level, the frequent
    itemsets as JSON, and the strong rules; also updates the module-level
    `frequency_d` / `non_frequency_d` globals (kept for backward compat).
    """
    global non_frequency_d
    global frequency_d

    # Parse transactions: '{a,b,c}' -> ['a', 'b', 'c'].
    baskets = [row['items'][1:-1].split(',') for _, row in d.iterrows()]
    # Pre-built sets so candidate counting is subset inclusion, not substring
    # matching (the original `i in j` string test could over-count items
    # whose names contain each other, e.g. 'tea' in 'iced tea').
    basket_sets = [set(b) for b in baskets]

    def canonical(itemset):
        # Sorted comma-joined key: the same itemset always maps to one key,
        # making subset lookups during rule generation reliable (the original
        # could KeyError when item order differed between levels).
        return ','.join(sorted(itemset))

    confidence_dic = {}   # level -> {canonical key: support count}
    support_of = {}       # canonical key -> support count, across all levels

    # Level 1: count every single item occurrence (duplicates inside one
    # basket count, matching the original behaviour).
    item_counts = {}
    for b in baskets:
        for i in b:
            item_counts[i] = item_counts.get(i, 0) + 1
    frequency_d = {k: v for k, v in item_counts.items() if v >= support}
    non_frequent = {frozenset([k]) for k, v in item_counts.items() if v < support}
    non_frequency_d = non_frequent
    confidence_dic[1] = frequency_d
    support_of.update(frequency_d)
    print(1, len(frequency_d))

    count = 2
    frequent_sets = [frozenset([k]) for k in frequency_d]
    while frequent_sets:
        # Candidate generation from the items of the previous level.
        pool = sorted({i for s in frequent_sets for i in s})
        candidates = []
        for combo in itertools.combinations(pool, count):
            # Apriori property: a candidate with any known-infrequent
            # (count-1)-subset cannot be frequent — prune it.
            if any(frozenset(sub) in non_frequent
                   for sub in itertools.combinations(combo, count - 1)):
                continue
            candidates.append(frozenset(combo))

        # Support counting via set inclusion over pre-parsed baskets.
        counts = {c: 0 for c in candidates}
        for t in basket_sets:
            for c in candidates:
                if c <= t:
                    counts[c] += 1

        frequent = {c: v for c, v in counts.items() if v >= support}
        non_frequent = {c for c, v in counts.items() if v < support}
        non_frequency_d = non_frequent
        frequency_d = {canonical(c): v for c, v in frequent.items()}
        print(count, len(frequency_d))
        if not frequency_d:
            break
        confidence_dic[count] = frequency_d
        support_of.update(frequency_d)
        frequent_sets = list(frequent)
        # Always advance the level: the original looped forever when a level
        # produced exactly one frequent itemset (its `elif len == 1` branch
        # neither broke nor incremented `count`).
        count += 1

    print('----------频繁项目集----------')
    print(json.dumps(confidence_dic, indent=4, ensure_ascii=False,
                     sort_keys=False, separators=(',', ':')))
    print('----------强关联规则----------')
    # Emit rules A -> B from every frequent itemset of size >= 2,
    # largest itemsets first (same order as the original).
    for level in range(max(confidence_dic), 1, -1):
        for key, union_support in confidence_dic[level].items():
            members = key.split(',')
            for consequent_size in range(1, level):
                for consequent in itertools.combinations(members, consequent_size):
                    antecedent = [m for m in members if m not in consequent]
                    a_support = support_of[canonical(antecedent)]
                    # confidence(A -> B) = support(A ∪ B) / support(A)
                    percent = round(union_support / a_support, 2)
                    if percent > confidence:
                        print(','.join(antecedent) + ' -> ' + ','.join(consequent),
                              '\t|confidence|\t', percent)


if __name__ == '__main__':
    # Minimum support as an absolute transaction count.
    min_support_count = 150
    # Minimum confidence threshold for printing a rule.
    min_conf = 0.3
    # Load the transaction table; first column is the row index.
    groceries = pd.read_csv('./Groceries.csv', header=0, index_col=0)
    Apriori(groceries, min_support_count, min_conf)

运行结果

(运行结果截图略 — 原文中的两张结果图片未能随文本导出,输出为各级频繁项集数目、频繁项目集 JSON 以及强关联规则列表。)

猜你喜欢

转载自blog.csdn.net/Mr_Clutch/article/details/121042002