FP-Tree(二)

/**
 * FP-tree built from weighted word lists, plus an FP-growth style miner that
 * collects the discovered word sets as {@link LogTemplate}s.
 *
 * <p>Each input line is a list of words with an associated count (weight).
 * Words whose total weight is below the minimum support are discarded; the
 * remaining words of every line are ordered by descending total weight and
 * inserted into a prefix tree. Nodes carrying the same word are chained
 * together (see {@link #firstNodeTable}) so that all occurrences of a word
 * can be visited without traversing the whole tree.
 *
 * <p>NOTE: mined templates are accumulated in a static list shared by every
 * instance of this class (the recursive conditional trees rely on that), so
 * results of successive mining runs in the same JVM pile up in one list.
 */
public class FPTree {
    /** Tie-break used for words with equal frequency, so every line uses the same global order. */
    private static final Comparator<String> WORD_ORDER = Comparator.nullsFirst(Comparator.naturalOrder());

    // Root of the FP-tree: a sentinel node that carries no real word.
    FPNode root = new FPNode("Root", -1);

    // Head of the per-word node chain: word -> first tree node created for that word.
    Map<String, FPNode> firstNodeTable = new HashMap<>();

    // Tail of the per-word node chain: word -> most recently created tree node for that word.
    Map<String, FPNode> lastNodeTable = new HashMap<>();

    // Minimum support (total weight) a word or word set needs in order to be kept.
    private int support = 1;

    // Header table: one node per frequent word, ordered by descending total count.
    public List<FPNode> table = new ArrayList<>();

    // Mined templates; shared by all trees so recursive conditional trees report into one list.
    private static List<LogTemplate> templates = new ArrayList<>();

    /**
     * Builds a tree from unweighted lines; every line counts once.
     *
     * @param data    the lines, each a list of words
     * @param support minimum support
     */
    public FPTree(List<List<String>> data, int support) {
        int size = data.size();
        List<Integer> count = new ArrayList<>();
        for (int i = 0; i < size; i++) {
            count.add(1);
        }
        buildTree(data, count, support, false);
    }

    /**
     * Builds a tree from weighted templates.
     *
     * @param templates templates supplying the words and the weight of each line
     * @param support   minimum support
     * @param sorted    {@code true} if the words of every template are already filtered and
     *                  ordered, in which case they are inserted as-is
     */
    public FPTree(List<LogTemplate> templates, int support, boolean sorted) {
        List<List<String>> data = new ArrayList<>();
        List<Integer> count = new ArrayList<>();
        for (LogTemplate template : templates) {
            data.add(template.getWords());
            count.add(template.getCount());
        }
        buildTree(data, count, support, sorted);
    }

    /**
     * Inserts every line into the tree and maintains the per-word node chains.
     *
     * @param data    the lines, each a list of words
     * @param count   weight of each line; {@code count.get(i)} belongs to {@code data.get(i)}
     * @param support minimum support, stored on this tree
     * @param sorted  when {@code false} the lines are first filtered and ordered by {@code sort}
     */
    public void buildTree(List<List<String>> data, List<Integer> count, int support, boolean sorted) {
        this.support = support;
        if (!sorted) {
            data = sort(data, count);
        }
        int i = 0;
        for (List<String> line : data) {
            Integer weight = count.get(i);
            FPNode curNode = root;
            for (String word : line) {
                FPNode child = curNode.getChildren().get(word);
                if (child != null) {
                    // Existing child: add this line's weight to it.
                    child.increase(weight);
                } else {
                    // No such child yet: create it with this line's weight.
                    child = new FPNode(word, weight);
                    curNode.getChildren().put(word, child);
                    child.setFather(curNode);
                }
                curNode = child;
                // A node that is already part of its word chain must not be linked again.
                if (curNode.isVisited()) {
                    continue;
                }
                // Append the node to the chain of nodes carrying the same word.
                if (firstNodeTable.containsKey(word)) {
                    lastNodeTable.get(word).setNext(curNode);
                } else {
                    firstNodeTable.put(word, curNode);
                }
                lastNodeTable.put(word, curNode);
                curNode.setVisited(true);
            }
            i++;
        }
    }

    /**
     * Computes the total weight of every word, fills the header {@link #table} with the
     * frequent words, and returns the lines with infrequent words removed and the remaining
     * words ordered by descending total weight.
     *
     * <p>The returned list has exactly one entry per input line, in the same order, so that
     * index {@code i} of the result still corresponds to {@code count.get(i)}. Lines that end
     * up empty are kept for that reason; inserting an empty line into the tree is a no-op.
     *
     * @param data  the lines, each a list of words
     * @param count weight of each line
     * @return the filtered and reordered lines, index-aligned with {@code count}
     */
    private List<List<String>> sort(List<List<String>> data, List<Integer> count) {
        Map<String, Integer> wordCount = new HashMap<>();
        // Accumulate the weighted number of occurrences of each word.
        int i = 0;
        for (List<String> line : data) {
            for (String word : line) {
                wordCount.merge(word, count.get(i), Integer::sum);
            }
            i++;
        }

        for (Map.Entry<String, Integer> entry : wordCount.entrySet()) {
            if (entry.getValue() >= this.support) {
                table.add(new FPNode(entry.getKey(), entry.getValue()));
            }
        }
        if (!table.isEmpty()) {
            // Equal counts are ordered by word so the header order does not depend on hash order.
            table = table.stream()
                    .sorted(Comparator.comparing(FPNode::getCount).reversed()
                            .thenComparing(FPNode::getWord, WORD_ORDER))
                    .collect(Collectors.toList());
        }

        // Every line must use the same global word order; without a tie-break, words with equal
        // counts would keep their per-line order and the same word set could end up on
        // different branches.
        Comparator<String> byFrequencyDesc = Comparator
                .comparing((String word) -> wordCount.get(word))
                .reversed()
                .thenComparing(WORD_ORDER);

        List<List<String>> result = new ArrayList<>();
        for (List<String> line : data) {
            List<String> newLine = line.stream()
                    .filter(word -> wordCount.get(word) >= support)
                    .sorted(byFrequencyDesc)
                    .collect(Collectors.toList());
            // Always add, even when empty, to stay index-aligned with count.
            result.add(newLine);
        }
        return result;
    }

    /** Prints the tree starting at the root with indentation level 0. */
    public void print() {
        root.print(0);
    }

    /**
     * Mines {@code fpTree} and appends the discovered word sets to the shared template list.
     *
     * <p>If the tree is a single path, every non-empty subset of the path's nodes whose count
     * reaches the minimum support is emitted, extended by {@code last}. Otherwise each header
     * entry is emitted together with {@code last}, the prefix paths of all its tree nodes are
     * collected into a conditional tree, and that tree is mined recursively.
     *
     * @param fpTree the tree to mine
     * @param last   words of the suffix already fixed by the enclosing recursion levels;
     *               empty at the top level
     * @param table  header entries of {@code fpTree}, ordered by descending count; the list is
     *               not modified
     */
    public void growth(FPTree fpTree, List<String> last, List<FPNode> table) {
        FPNode tree = fpTree.getRoot();
        int minSupport = fpTree.support;
        if (isSingleTree(tree)) {
            // Collect the nodes on the single path, top to bottom.
            List<FPNode> path = new ArrayList<>();
            for (FPNode child = getFirstChild(tree); child != null; child = getFirstChild(child)) {
                path.add(child);
            }
            // Every sufficiently frequent non-empty subset of the path becomes a template.
            for (LogTemplate subset : getSonSet(path)) {
                if (subset.getCount() >= minSupport) {
                    subset.getWords().addAll(last);
                    templates.add(subset);
                }
            }
            return;
        }

        // Walk the header entries from least to most frequent, on a copy so the caller's
        // list keeps its order.
        List<FPNode> leastFrequentFirst = new ArrayList<>(table);
        Collections.reverse(leastFrequentFirst);
        for (FPNode node : leastFrequentFirst) {
            List<String> pre = new ArrayList<>();
            pre.add(node.getWord());
            pre.addAll(last);

            // The entry combined with the current suffix is itself a template.
            LogTemplate template = new LogTemplate();
            template.setCount(node.getCount());
            template.setWords(new ArrayList<>(pre));
            templates.add(template);

            // Each node in the word's chain contributes the path above it (excluding the
            // root), weighted by that node's count.
            List<LogTemplate> linkTemplates = new ArrayList<>();
            for (FPNode link = fpTree.firstNodeTable.get(node.getWord()); link != null; link = link.getNext()) {
                List<String> prefix = new ArrayList<>();
                for (FPNode up = link.getFather(); up != null && up.getFather() != null; up = up.getFather()) {
                    prefix.add(up.getWord());
                }
                Collections.reverse(prefix);
                LogTemplate prefixTemplate = new LogTemplate();
                prefixTemplate.setWords(prefix);
                prefixTemplate.setCount(link.getCount());
                linkTemplates.add(prefixTemplate);
            }

            // Build the conditional tree with the real minimum support so that infrequent
            // words are pruned and the recursion keeps using the same threshold. Building it
            // also fills newTree.table with the frequent words, ordered by descending count.
            FPTree newTree = new FPTree(linkTemplates, minSupport, false);
            if (!newTree.table.isEmpty()) {
                newTree.growth(newTree, pre, newTree.table);
            }
        }
    }

    /**
     * Enumerates the subsets of the given path nodes as templates.
     *
     * <p>Each subset's count is the count of its last (highest-index) member; the caller passes
     * the nodes top to bottom. Subsets whose resulting count is 0 are dropped, which is how the
     * empty subset is excluded (this relies on a fresh {@code LogTemplate} reporting count 0 —
     * NOTE(review): confirm against {@code LogTemplate}).
     *
     * <p>NOTE: the enumeration uses an {@code int} bit mask, so it only works for fewer than
     * 31 nodes.
     *
     * @param wordCount nodes of a single path, top to bottom
     * @return one template per retained subset
     */
    private List<LogTemplate> getSonSet(List<FPNode> wordCount) {
        List<LogTemplate> result = new ArrayList<>();
        int length = wordCount.size();
        int nEnd = 1 << length;
        // Each value of mask selects one subset: bit i set means node i is included.
        for (int mask = 0; mask < nEnd; mask++) {
            LogTemplate template = new LogTemplate();
            for (int i = 0; i < length; i++) {
                if (((1 << i) & mask) != 0) {
                    FPNode member = wordCount.get(i);
                    template.getWords().add(member.getWord());
                    template.setCount(member.getCount());
                }
            }
            if (template.getCount() != 0) {
                result.add(template);
            }
        }
        return result;
    }

    /**
     * Tells whether the subtree rooted at {@code tree} is a single path, i.e. no node in it
     * has more than one child. A {@code null} or childless node counts as a single path.
     */
    private boolean isSingleTree(FPNode tree) {
        FPNode current = tree;
        while (current != null && current.getChildren() != null && !current.getChildren().isEmpty()) {
            if (current.getChildren().size() > 1) {
                return false;
            }
            current = getFirstChild(current);
        }
        return true;
    }

    /**
     * Returns the first child of {@code tree} in the iteration order of its children map,
     * or {@code null} if the node is {@code null} or has no children.
     */
    private FPNode getFirstChild(FPNode tree) {
        if (null == tree || null == tree.getChildren() || tree.getChildren().isEmpty()) {
            return null;
        }
        return tree.getChildren().values().iterator().next();
    }

    /** Small demo: builds a tree from four lines, prints it, mines it and prints the templates. */
    public static void main(String[] args) {
        List<String> line1 = new ArrayList<>();
        line1.add("C");
        line1.add("A");
        line1.add("B");
        List<String> line2 = new ArrayList<>();
        line2.add("A");
        line2.add("B");
        line2.add("D");
        List<String> line3 = new ArrayList<>();
        line3.add("A");
        line3.add("B");
        List<String> line4 = new ArrayList<>();
        line4.add("C");
        line4.add("E");
        List<List<String>> data = new ArrayList<>();
        data.add(line1);
        data.add(line2);
        data.add(line3);
        data.add(line4);

        FPTree tree = new FPTree(data, 1);
        tree.print();
        tree.growth(tree, new ArrayList<>(), tree.table);
        for (LogTemplate template : templates) {
            template.print();
        }
    }

    /** Returns the root node of this tree. */
    public FPNode getRoot() {
        return root;
    }

    /** Replaces the root node of this tree. */
    public void setRoot(FPNode root) {
        this.root = root;
    }
}

猜你喜欢

转载自www.cnblogs.com/coshaho/p/12163496.html