数据结构之字典树

  Trie Tree,又名字典树,是一种树形结构。常用于统计、排序和保存大量的字符串,比如说,在自然语言处理中,常用来统计词频。优点是查询效率高。

  Trie Tree 具有以下三个性质:
1. 根节点不包含字符,除根节点以外每个节点只包含一个字符;
2. 从根节点到某一个节点,路径上经过的字符连接起来,为该节点对应的字符串;
3. 每个节点的所有子节点包含的字符各不相同。

  • Trie Tree 的基本实现
package com.feng.nlp.changedalgorithm;

import lombok.Data;
import org.apache.commons.lang3.StringUtils;

import java.util.HashMap;
import java.util.Map;

/**
 * Trie (prefix tree) that stores words case-insensitively and counts how many
 * times each word has been inserted.
 * <p>
 * Words are lower-cased before they are stored, so every word returned by this
 * class is in lower case. Each node keeps its children in a hash map keyed by
 * character, so a word may contain any {@code char} (spaces, punctuation,
 * CJK characters, ...) and a node only pays for the children it actually has.
 * <p>
 * Created by lionel on 17/4/15.
 */
public class TrieTree {
    private final TrieNode root;

    public TrieTree() {
        root = new TrieNode();
    }

    /**
     * Inserts a word into the trie; inserting the same word again increases
     * its count. Blank input ({@code null}, empty or whitespace only) is ignored.
     *
     * @param word the word to insert
     */
    public void insert(String word) {
        if (isBlank(word)) {
            return;
        }
        TrieNode node = root;
        for (char ch : word.toLowerCase().toCharArray()) {
            TrieNode child = node.children.get(ch);
            if (child == null) {
                child = new TrieNode();
                node.children.put(ch, child);
            }
            // Every node on the path is a prefix of the inserted word.
            child.prefixNum++;
            node = child;
        }
        // The last node on the path marks the end of the word.
        node.num++;
    }

    /**
     * Walks the whole trie and collects every stored word with its count.
     *
     * @return a new map from each stored (lower-cased) word to the number of
     *         times it was inserted; empty if nothing has been inserted
     */
    public HashMap<String, Integer> getAllWords() {
        HashMap<String, Integer> words = new HashMap<String, Integer>();
        collectWords(root, new StringBuilder(), words);
        return words;
    }

    /**
     * Tells whether the given word has been inserted into the trie. The
     * comparison is case-insensitive. A string that is only a prefix of a
     * stored word (and was never inserted itself) is not considered present.
     *
     * @param word the word to look up
     * @return {@code true} if the word was inserted; {@code false} otherwise,
     *         including for blank input
     */
    public boolean isExist(String word) {
        if (isBlank(word)) {
            return false;
        }
        TrieNode node = findNode(word.toLowerCase());
        return node != null && node.num > 0;
    }

    /**
     * Returns every stored word that starts with the given prefix, including
     * the prefix itself if it is a stored word (similar to input-method
     * auto-completion). The prefix is matched case-insensitively.
     *
     * @param prefix the prefix to search for
     * @return a new map from each matching (lower-cased) word to its count;
     *         empty, never {@code null}, when the prefix is blank or nothing
     *         matches
     */
    public Map<String, Integer> getWordsFroPrefix(String prefix) {
        Map<String, Integer> words = new HashMap<String, Integer>();
        if (isBlank(prefix)) {
            return words;
        }
        // Rebuild results from the lower-cased prefix so the returned keys
        // match the words as they are stored.
        String normalized = prefix.toLowerCase();
        TrieNode start = findNode(normalized);
        if (start != null) {
            collectWords(start, new StringBuilder(normalized), words);
        }
        return words;
    }

    /**
     * Follows {@code key} character by character from the root.
     *
     * @param key an already lower-cased string
     * @return the node reached after consuming the whole key, or {@code null}
     *         if the path does not exist
     */
    private TrieNode findNode(String key) {
        TrieNode node = root;
        for (int i = 0; i < key.length() && node != null; i++) {
            node = node.children.get(key.charAt(i));
        }
        return node;
    }

    /**
     * Depth-first traversal that adds every word found at or below
     * {@code node} to {@code out}.
     *
     * @param node the subtree root to start from
     * @param path the characters leading to {@code node}; restored to its
     *             original content before this method returns
     * @param out  the map receiving word-to-count entries
     */
    private static void collectWords(TrieNode node, StringBuilder path, Map<String, Integer> out) {
        if (node.num > 0) {
            out.put(path.toString(), node.num);
        }
        for (Map.Entry<Character, TrieNode> entry : node.children.entrySet()) {
            path.append(entry.getKey().charValue());
            collectWords(entry.getValue(), path, out);
            // Backtrack so sibling branches see the same prefix.
            path.setLength(path.length() - 1);
        }
    }

    /**
     * Blank check: {@code true} for {@code null}, the empty string, or a
     * string consisting only of whitespace characters.
     */
    private static boolean isBlank(String text) {
        if (text == null) {
            return true;
        }
        for (int i = 0; i < text.length(); i++) {
            if (!Character.isWhitespace(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /** A single trie node; the character it represents is its key in the parent's map. */
    private static final class TrieNode {
        /** Child nodes keyed by the next character. */
        private final Map<Character, TrieNode> children = new HashMap<Character, TrieNode>();
        /** Number of times the word ending at this node was inserted; 0 if no word ends here. */
        private int num;
        /** Number of insertions whose path passes through (or ends at) this node. */
        private int prefixNum;
    }
}

猜你喜欢

转载自blog.csdn.net/lionel_fengj/article/details/74147575