字典树的运用,实现字符串的精确匹配与模糊匹配

背景

     实现 项目里面的一个 字符串检索功能(无关业务,只写实现,后续
 需要融入项目,会把树生成一段二进制blob 放到内存,变长的字节储存listsize
 以及一些偏移 会重新构造搜索算法,预计一周后会继续与大家分享)

需求

    现有一棵树(普通树)有10w个字符串,索引为ID,现要实现2个功能
1: 给定一字符串快速查找该字符串的位置
2:输入一字符串  每输入一字符,显示与之相关的所有字符串,直到该字符串
所有的字符输入完毕,查找唯一索引。大概意思就是搜索李白,输入李显示李
白,李时珍,李。。。继续输入白,出现唯一值李白

分析

    首先想到再建一棵字典树,也叫前缀树,以字符为索引,返回value(ID),
字典树的好处就是索引快:对于一个字符串,时间复杂度为O(m),m是该字符串的
长度。缺点是空间开销较大,为O(26的m次方),所以考虑到项目的实际运用,下面
给出了一种改进版的字典树,利用pFirstChild、pNextBrother指针节省空间
开销,由于树的高度不深(<26)宽度较大,每一层利用二分查找,时间效率还
说得过去!

思路

     建立另一棵空间消耗改进版的字典树, 索引为字符, value(返回值为id  
 上一棵树的索引)存在叶子节点,未来还可能生成一段blob放到内存,这样建
 树的开销就省去了。下面为建树的实现,字符串精确匹配与模糊匹配

 开始时间 2018 -04- 19

 完成时间 2018 -04- 20

trie树

    trie树又称为字典树、单词查找树。是一种树形结构,哈希树的变种。
典型应用是用于统计、排序和保存大量的字符串(不仅仅限于字符串),经常
被搜索引擎系统用于文本词频统计。trie树是用空间换取时间的典型数据结构。

这里写图片描述

代码

    树节点的结构,“叶子节点”存储nodeid(也就是字符串的id)和字符串,
其他节点的这两个字段分别为0和NULL。此处的叶子节点并非真正意义上的叶子节点,
而是存储每个字符串最后一个字符的节点
typedef struct STreeNode* pSTreeNode;

// One node of the trie. Children are reachable two ways: through the
// pFirstChild / pNextBrother chain and through the pChildList vector.
struct STreeNode
{
    int nodeid;                          // id of the string that ends at this node; 0 when none does
    int namevalue;                       // character slot of this node (the value given to the constructor)
    char* word;                          // copy of the string ending here, or NULL; must be new[]-allocated, owned by the node
    pSTreeNode pFirstChild;              // head of this node's child chain, or NULL
    pSTreeNode pNextBrother;             // next node in the parent's child chain, or NULL
    std::vector<pSTreeNode> pChildList;  // the same children as a vector, for indexed access

    // Creates a node for character slot `name` with no id, no word and no links.
    STreeNode(int name)
        : nodeid(0),
          namevalue(name),
          word(NULL),
          pFirstChild(NULL),
          pNextBrother(NULL)
    {
    }

    // Releases the owned string copy. Linked nodes are not touched here;
    // freeing the rest of the tree stays the job of the tree's cleanup code.
    ~STreeNode() {
        delete[] word;
    }

private:
    // Copying would make two nodes own the same `word` buffer, so it is
    // blocked (declared, never defined). Nodes are handled through pointers.
    STreeNode(const STreeNode&);
    STreeNode& operator=(const STreeNode&);
};
建树,用pFirstChild, pNextBrother节省树空间,用pChildList便于搜索
void CTree::InsertNode(int Nodeid, char* name)
{
    if (CTree::GetInstance() == NULL) {
        return;
    }

    pSTreeNode Root_tmp = CTree::GetInstance();
    int idx = 0;
    //int value_tmp;
    int nameLen = strlen(name);
    //char* name_t
    for (int i = 0; i < nameLen; i++) {

        idx = get_index(name[i]);
        if (0 <= idx && idx <= MAX) {
            if (Root_tmp->pFirstChild == NULL) {
                Root_tmp->pFirstChild = new STreeNode(idx);
                Root_tmp->pChildList.push_back(Root_tmp->pFirstChild);
                //value_tmp = Root_tmp->namevalue;
                Root_tmp = Root_tmp->pFirstChild;
                //value_tmp = Root_tmp->namevalue;
            }

            else if (idx == Root_tmp->pFirstChild->namevalue) {
                Root_tmp = Root_tmp->pFirstChild;
            }

            else {
                bool PushFlag = 0;
                pSTreeNode tmp = InsertBrother(Root_tmp->pFirstChild, idx, PushFlag);

                if (1 == PushFlag) {
                    Root_tmp = tmp;
                }

                else {
                    Root_tmp->pChildList.push_back(tmp);
                    //int value_tmp = Root_tmp->namevalue;
                    Root_tmp = tmp;
                }
            }
        }
        else {

        }
    }
    Root_tmp->nodeid = Nodeid;

    if (NULL == Root_tmp->word) {
        Root_tmp->word = new char[strlen(name) + 1];
        if (NULL == Root_tmp->word) {
            cout<<"alloc root_tmp failed"<<endl;
        }

        strcpy(Root_tmp->word, name);  
        //printf("name %s  Root_tmp->word %s\n", name, Root_tmp->word);
    }
}
// Looks for character slot `name` among the siblings that FOLLOW
// pBrotherNode; pBrotherNode itself is not compared.
//
// Returns the matching sibling and sets a_flag to true when one exists.
// Otherwise appends a new node at the end of the chain and returns it,
// leaving a_flag untouched. Returns NULL when pBrotherNode is NULL.
//
// The walk is iterative so that every path returns a value; the earlier
// recursive form dropped the result of the recursive call.
pSTreeNode CTree::InsertBrother(pSTreeNode pBrotherNode, int name, bool &a_flag)
{
    if (pBrotherNode == NULL) {
        return NULL;
    }

    pSTreeNode pTail = pBrotherNode;
    while (pTail->pNextBrother != NULL) {
        pTail = pTail->pNextBrother;
        if (name == pTail->namevalue) {
            a_flag = true;
            return pTail;
        }
    }

    pTail->pNextBrother = new STreeNode(name);
    return pTail->pNextBrother;
}
以树形输出字典树
// Prints the subtree below pNode as an indented tree.
//
// Every node except the root prints its character (namevalue + 'a') followed
// by a space; a node with nodeid > 0 then ends the line. Before a child is
// printed, `counts` levels of two-space indentation are emitted unless that
// child continues the current line (first child of a node that did not end
// the line).
void CTree::printAll(pSTreeNode pNode, int counts)
{
    if (pNode == NULL) {
        return;
    }

    if (pNode != pRoot) {
        printf("%c ", pNode->namevalue + 'a');
    }

    const bool endsLine = pNode->nodeid > 0;
    if (endsLine) {
        printf("\n");
    }

    const std::size_t childCount = pNode->pChildList.size();
    for (std::size_t i = 0; i < childCount; ++i) {
        if (endsLine || i > 0) {
            for (int k = 0; k < counts; ++k) {
                printf("  ");
            }
        }
        printAll(pNode->pChildList[i], counts + 1);
    }
}
精确匹配返回字符串索引
int CTree::Fullmatch(char* name)
{
    pSTreeNode Root_tmp = CTree::GetInstance();
    int idx = 0;  
    if(NULL == Root_tmp || NULL == name) {
        return 0;
    }
    while(NULL != Root_tmp && '\0' != *name) {
        idx = get_index(*name);  
        if(IDXERR == idx) {
            return INVALID;  
        }
        pSTreeNode tmp = SearchNodeThVal(Root_tmp, idx);

        if (NULL == tmp) {
            return INVALID;
        }
        else {
            Root_tmp = tmp;
        }
        name++;  
    }  
    if(Root_tmp != NULL) {
        return Root_tmp->nodeid;
    }  
    else {
        return INVALID;
    }  
}
    模糊匹配,每输入一个字符要返回该字符对应的节点下所有“叶子节点”的
nodeid,并且要返回此节点(记忆化搜索),避免搜索此字符串的下个字符的
时候从根节点开始搜索。为了方便看到过程,下面代码传参数char* 形,中间过
程会输出每个字符对应的符合条件的nodeid(也就是字符串,id跟word是绑定
的),同时下一个字符搜索要用到上一个字符对应的节点
// Incremental lookup for one typed character.
//
// Steps from pNode to its child for `name` and returns that child so the next
// character can continue from there. The matching ids are appended to
// `nodeids`: the child's own id when it has one, otherwise the ids found
// below it.
// NOTE(review): when the child itself ends a word, longer words below it are
// not collected - confirm this is the intended behaviour.
//
// Returns NULL when pNode is NULL, `name` is '\0' or cannot be mapped, or no
// such child exists. The missing-child case is now checked before the child
// is dereferenced.
pSTreeNode CTree::Fullmatch(char name, vector<int>&nodeids, pSTreeNode pNode)
{
    if (NULL == pNode || '\0' == name) {
        return NULL;
    }

    const int idx = get_index(name);
    if (IDXERR == idx) {
        return NULL;
    }

    pSTreeNode pMatch = SearchNodeThVal(pNode, idx);
    if (NULL == pMatch) {
        return NULL;
    }

    if (0 < pMatch->nodeid) {
        nodeids.push_back(pMatch->nodeid);
    }
    else {
        ReFuzzymatch(pMatch, nodeids);
    }
    return pMatch;
}

// Appends to `nodeids` the id of every node below pNode whose nodeid is
// positive, visiting children depth-first in pChildList order. pNode's own
// id is not added. A NULL pNode is ignored.
void CTree::ReFuzzymatch(pSTreeNode pNode, vector<int>&nodeids)
{
    if (NULL == pNode) {
        return;
    }

    const std::size_t total = pNode->pChildList.size();
    for (std::size_t pos = 0; pos != total; ++pos) {
        pSTreeNode pChild = pNode->pChildList[pos];
        if (pChild->nodeid > 0) {
            nodeids.push_back(pChild->nodeid);
        }
        ReFuzzymatch(pChild, nodeids);
    }
}

完整代码

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <vector>

using namespace std;


// Number of character slots; get_index() yields values in [0, MAX).
#define  MAX 26
// Value get_index() returns for a character it cannot map.
#define IDXERR -1  
// "No match" result. It is 0, so it is also what the node-returning lookups
// hand back as a null pointer.
#define INVALID 0  

typedef struct STreeNode* pSTreeNode;

// One node of the trie. Children are reachable two ways: through the
// pFirstChild / pNextBrother chain and through the pChildList vector.
struct STreeNode
{
    int nodeid;                          // id of the string that ends at this node; 0 when none does
    int namevalue;                       // character slot of this node (the value given to the constructor)
    char* word;                          // copy of the string ending here, or NULL; must be new[]-allocated, owned by the node
    pSTreeNode pFirstChild;              // head of this node's child chain, or NULL
    pSTreeNode pNextBrother;             // next node in the parent's child chain, or NULL
    std::vector<pSTreeNode> pChildList;  // the same children as a vector, for indexed access

    // Creates a node for character slot `name` with no id, no word and no links.
    STreeNode(int name)
        : nodeid(0),
          namevalue(name),
          word(NULL),
          pFirstChild(NULL),
          pNextBrother(NULL)
    {
    }

    // Releases the owned string copy. Linked nodes are not touched here;
    // freeing the rest of the tree stays the job of the tree's cleanup code.
    ~STreeNode() {
        delete[] word;
    }

private:
    // Copying would make two nodes own the same `word` buffer, so it is
    // blocked (declared, never defined). Nodes are handled through pointers.
    STreeNode(const STreeNode&);
    STreeNode& operator=(const STreeNode&);
};


// Trie over the letters a-z (case-insensitive). All instances operate on one
// shared tree whose root is the static member pRoot.
class CTree
{
public:
    // Allocates the root node and stores it in pRoot.
    CTree();
    // Empty; the shared tree is not released here.
    ~CTree();


public:

    // Returns the shared root, creating it first when pRoot is NULL.
    static pSTreeNode GetInstance();

    // Frees the shared tree through FreeMemory(); does nothing when pRoot is NULL.
    static void DestroyInstance();

    // Recursively deletes pNode together with its children and following siblings.
    static void FreeMemory(pSTreeNode pNode);   

    // Inserts `name` and stores Nodeid on the node of its last character.
    void InsertNode(int Nodeid, char* name); 

    // Finds slot `name` among the siblings after pParentNode (sets a_flag),
    // or appends a new sibling for it.
    pSTreeNode InsertBrother(pSTreeNode pParentNode, int name, bool &a_flag);

    // Sorts pChildList by namevalue on pNode and on every node reachable from it.
    void ReSerachName(pSTreeNode pNode);      

    // NOTE(review): declared only - no definition appears in this listing.
    int NonReSerachName( pSTreeNode pNode);

    // Prints the subtree below pNode as an indented tree; `counts` is the indent depth.
    void printAll(pSTreeNode pNode, int counts);

    // Prints word and nodeid of every node below pNode that has a positive nodeid.
    void printfWord(pSTreeNode pNode);

    // Binary search in pNode->pChildList for the child whose namevalue equals
    // `value`; NULL when absent.
    pSTreeNode SearchNodeThVal(pSTreeNode pNode, int value);

    // Exact lookup: nodeid stored at the end of `name`'s path, or INVALID.
    int Fullmatch(char* name);

    //int Fuzzymatch(char* name);

    // Appends to `nodeids` every positive nodeid found below pNode.
    void ReFuzzymatch(pSTreeNode pNode, vector<int>&nodeids);

    // Incremental lookup for one character starting at pNode; returns the
    // child reached and appends the matching ids to `nodeids`.
    pSTreeNode Fullmatch(char name, vector<int>&nodeids, pSTreeNode pNode = pRoot);

    // Incremental lookup for a whole string starting at pNode; `nodeids`
    // holds the matches for the last character processed.
    pSTreeNode Fullmatch(char* name, vector<int>&nodeids, pSTreeNode pNode = pRoot);

    //void SortChildList(pSTreeNode pNode);

    // NOTE(review): the only definition in this listing is commented out.
    pSTreeNode SearchNode(pSTreeNode pNode, int id);

    // Maps 'A'-'Z' / 'a'-'z' to 0..25; returns IDXERR for any other character.
    static int get_index(char ch); 

private:

    // Root of the tree shared by all instances; NULL until it is created.
    static pSTreeNode pRoot;
};

// Ordering predicate for child lists: true when pNode's character slot is
// smaller than qNode's. Both pointers must be non-NULL.
bool Cmpvalue(pSTreeNode pNode, pSTreeNode qNode)
{
    const int lhs = pNode->namevalue;
    const int rhs = qNode->namevalue;
    return rhs > lhs;
}

// Root of the tree shared by every CTree object. Starts out NULL; the
// constructor and GetInstance() allocate it.
pSTreeNode CTree::pRoot = NULL;

// Makes sure the shared root exists.
//
// The root is created only when pRoot is still NULL, so constructing a second
// CTree (or constructing one after GetInstance() already built the tree) no
// longer replaces, and thereby leaks, the existing tree. The root's slot
// value 100 lies outside the 0..25 range produced by get_index().
CTree::CTree()
{
    if (pRoot == NULL) {
        // new throws on failure, so no NULL check is needed afterwards.
        pRoot = new STreeNode(100);
    }
}

// Returns the shared root node, allocating it on first use.
pSTreeNode CTree::GetInstance()
{
    if (pRoot != NULL) {
        return pRoot;
    }

    pRoot = new STreeNode(100);
    return pRoot;
}

// Releases the whole shared tree. Does nothing when no tree exists.
//
// pRoot is reset afterwards: FreeMemory() receives the pointer by value and
// cannot clear it, and a stale pRoot would be handed out by GetInstance()
// and freed again by a second DestroyInstance().
void CTree::DestroyInstance()
{
    if (NULL == pRoot) {
        return;
    }

    FreeMemory(pRoot);
    pRoot = NULL;
}

// Does nothing: the shared tree is not released when a CTree goes away
// (DestroyInstance() is the function that frees it).
CTree::~CTree()
{
}

// Maps a letter to its slot: 'A'-'Z' and 'a'-'z' both give ch minus the first
// letter of their range. Any other character prints "get_index failed" and
// yields IDXERR; a slot outside [0, MAX) is also turned into IDXERR.
int CTree::get_index(char ch)
{
    int slot = -1;

    if (ch >= 'A' && ch <= 'Z') {
        slot = ch - 'A';
    }
    else if (ch >= 'a' && ch <= 'z') {
        slot = ch - 'a';
    }
    else {
        printf("get_index failed\n");
    }

    const bool inRange = (slot >= 0) && (slot < MAX);
    return inRange ? slot : IDXERR;
}

// Deletes pNode, everything below it and every sibling that follows it.
// A NULL pNode is a no-op.
//
// Each node's `word` buffer (allocated with new[] on insert) is released as
// well; previously only the nodes themselves were deleted. Siblings are
// handled in a loop and only children recurse, so the recursion depth equals
// the tree depth.
void CTree::FreeMemory(pSTreeNode pNode)
{
    while (pNode != NULL) {
        // Remember the next sibling before this node goes away.
        pSTreeNode pSibling = pNode->pNextBrother;

        FreeMemory(pNode->pFirstChild);

        delete[] pNode->word;
        // Cleared so the buffer cannot be released a second time while the
        // node itself is being destroyed.
        pNode->word = NULL;
        delete pNode;

        pNode = pSibling;
    }
}

void CTree::InsertNode(int Nodeid, char* name)
{
    if (CTree::GetInstance() == NULL) {
        return;
    }

    pSTreeNode Root_tmp = CTree::GetInstance();
    int idx = 0;
    //int value_tmp;
    int nameLen = strlen(name);
    //char* name_t
    for (int i = 0; i < nameLen; i++) {

        idx = get_index(name[i]);
        if (0 <= idx && idx <= MAX) {
            if (Root_tmp->pFirstChild == NULL) {
                Root_tmp->pFirstChild = new STreeNode(idx);
                Root_tmp->pChildList.push_back(Root_tmp->pFirstChild);
                //value_tmp = Root_tmp->namevalue;
                Root_tmp = Root_tmp->pFirstChild;
                //value_tmp = Root_tmp->namevalue;
            }

            else if (idx == Root_tmp->pFirstChild->namevalue) {
                Root_tmp = Root_tmp->pFirstChild;
            }

            else {
                bool PushFlag = 0;
                pSTreeNode tmp = InsertBrother(Root_tmp->pFirstChild, idx, PushFlag);

                if (1 == PushFlag) {
                    Root_tmp = tmp;
                }

                else {
                    Root_tmp->pChildList.push_back(tmp);
                    //int value_tmp = Root_tmp->namevalue;
                    Root_tmp = tmp;
                }
            }
        }
        else {

        }
    }
    Root_tmp->nodeid = Nodeid;

    if (NULL == Root_tmp->word) {
        Root_tmp->word = new char[strlen(name) + 1];
        if (NULL == Root_tmp->word) {
            cout<<"alloc root_tmp failed"<<endl;
        }

        strcpy(Root_tmp->word, name);  
        //printf("name %s  Root_tmp->word %s\n", name, Root_tmp->word);
    }
}

// Looks for character slot `name` among the siblings that FOLLOW
// pBrotherNode; pBrotherNode itself is not compared.
//
// Returns the matching sibling and sets a_flag to true when one exists.
// Otherwise appends a new node at the end of the chain and returns it,
// leaving a_flag untouched. Returns NULL when pBrotherNode is NULL.
//
// The walk is iterative so that every path returns a value; the earlier
// recursive form dropped the result of the recursive call.
pSTreeNode CTree::InsertBrother(pSTreeNode pBrotherNode, int name, bool &a_flag)
{
    if (pBrotherNode == NULL) {
        return NULL;
    }

    pSTreeNode pTail = pBrotherNode;
    while (pTail->pNextBrother != NULL) {
        pTail = pTail->pNextBrother;
        if (name == pTail->namevalue) {
            a_flag = true;
            return pTail;
        }
    }

    pTail->pNextBrother = new STreeNode(name);
    return pTail->pNextBrother;
}

/*
pSTreeNode CTree::SearchNode(pSTreeNode pNode, int id)
{
    if (pNode == NULL) {
        return NULL;
    }

    if (pNode->nodeid == id) {
        return pNode;
    }

    if (pNode->pFirstChild == NULL && pNode->pNextBrother == NULL) {
        return NULL;
    }

    else {
        if ( pNode->pFirstChild != NULL) {
            pSTreeNode pNodeTemp = SearchNode( pNode->pFirstChild, id);
            if (pNodeTemp != NULL) {
                return pNodeTemp;
            }
            else {
                return SearchNode(pNode->pNextBrother, id);
            }
        }
        else {
            return SearchNode( pNode->pNextBrother, id);
        }
    }
}
*/

// Prints the subtree below pNode as an indented tree.
//
// Every node except the root prints its character (namevalue + 'a') followed
// by a space; a node with nodeid > 0 then ends the line. Before a child is
// printed, `counts` levels of two-space indentation are emitted unless that
// child continues the current line (first child of a node that did not end
// the line).
void CTree::printAll(pSTreeNode pNode, int counts)
{
    if (pNode == NULL) {
        return;
    }

    if (pNode != pRoot) {
        printf("%c ", pNode->namevalue + 'a');
    }

    const bool endsLine = pNode->nodeid > 0;
    if (endsLine) {
        printf("\n");
    }

    const std::size_t childCount = pNode->pChildList.size();
    for (std::size_t i = 0; i < childCount; ++i) {
        if (endsLine || i > 0) {
            for (int k = 0; k < counts; ++k) {
                printf("  ");
            }
        }
        printAll(pNode->pChildList[i], counts + 1);
    }
}

// Prints "word / nodeid" for every node below pNode whose nodeid is positive,
// visiting children depth-first in pChildList order. pNode itself is not
// printed. A NULL pNode is ignored.
void CTree::printfWord(pSTreeNode pNode)
{
    if (NULL == pNode) {
        return;
    }

    const std::size_t total = pNode->pChildList.size();
    for (std::size_t pos = 0; pos != total; ++pos) {
        pSTreeNode pChild = pNode->pChildList[pos];
        if (pChild->nodeid > 0) {
            printf("word is: %s           nodeid is: %d\n", pChild->word, pChild->nodeid);
        }
        printfWord(pChild);
    }
}


// Sorts pChildList by namevalue (using Cmpvalue) on pNode, on each sibling
// that follows it and on everything below them. Lists with fewer than two
// entries are left alone. A NULL pNode is ignored.
void CTree::ReSerachName(pSTreeNode pNode)
{
    // Siblings are walked in a loop; only the step into children recurses.
    for (pSTreeNode pCur = pNode; pCur != NULL; pCur = pCur->pNextBrother) {
        if (pCur->pChildList.size() > 1) {
            sort(pCur->pChildList.begin(), pCur->pChildList.end(), Cmpvalue);
        }
        ReSerachName(pCur->pFirstChild);
    }
}

//精确匹配
int CTree::Fullmatch(char* name)
{
    pSTreeNode Root_tmp = CTree::GetInstance();
    int idx = 0;  
    if(NULL == Root_tmp || NULL == name) {
        return 0;
    }
    while(NULL != Root_tmp && '\0' != *name) {
        idx = get_index(*name);  
        if(IDXERR == idx) {
            return INVALID;  
        }
        pSTreeNode tmp = SearchNodeThVal(Root_tmp, idx);

        if (NULL == tmp) {
            return INVALID;
        }
        else {
            Root_tmp = tmp;
        }
        name++;  
    }  
    if(Root_tmp != NULL) {
        return Root_tmp->nodeid;
    }  
    else {
        return INVALID;
    }  
}

//模糊匹配  test
/*
int CTree::Fuzzymatch(char* name)
{

    pSTreeNode Root_tmp = pRoot;
    int idx = 0;  
    if(NULL == Root_tmp || NULL == name) {
        return 0;
    }
    while(NULL != Root_tmp && '\0' != *name) {
        idx = get_index(*name);  
        if(IDXERR == idx) {
            return INVALID;  
        }
        pSTreeNode tmp = SearchNodeThVal(Root_tmp, idx);

        if (0 < tmp->nodeid) {
            printf("idx: %c:  ", idx + 'a');
            cout<<tmp->nodeid<<endl;
        }

        else {
            vector<int> nodeids;
            ReFuzzymatch(tmp, nodeids);
            printf("idx: %c:  ", idx + 'a');
            for (int k = 0; k < nodeids.size(); ++k) {
                cout<<nodeids[k]<<" ";
            }
            cout<< endl;
        }

        if (NULL == tmp) {
            return INVALID;
        }
        else {
            Root_tmp = tmp;
        }
        name++;  
    }  
    if(Root_tmp != NULL) {
        return Root_tmp->nodeid;    
    }  
    else {
        return INVALID;
    }  

}
*/

// Appends to `nodeids` the id of every node below pNode whose nodeid is
// positive, visiting children depth-first in pChildList order. pNode's own
// id is not added. A NULL pNode is ignored.
void CTree::ReFuzzymatch(pSTreeNode pNode, vector<int>&nodeids)
{
    if (NULL == pNode) {
        return;
    }

    const std::size_t total = pNode->pChildList.size();
    for (std::size_t pos = 0; pos != total; ++pos) {
        pSTreeNode pChild = pNode->pChildList[pos];
        if (pChild->nodeid > 0) {
            nodeids.push_back(pChild->nodeid);
        }
        ReFuzzymatch(pChild, nodeids);
    }
}

// Incremental lookup for one typed character.
//
// Steps from pNode to its child for `name` and returns that child so the next
// character can continue from there. The matching ids are appended to
// `nodeids`: the child's own id when it has one, otherwise the ids found
// below it.
// NOTE(review): when the child itself ends a word, longer words below it are
// not collected - confirm this is the intended behaviour.
//
// Returns NULL when pNode is NULL, `name` is '\0' or cannot be mapped, or no
// such child exists. The missing-child case is now checked before the child
// is dereferenced.
pSTreeNode CTree::Fullmatch(char name, vector<int>&nodeids, pSTreeNode pNode)
{
    if (NULL == pNode || '\0' == name) {
        return NULL;
    }

    const int idx = get_index(name);
    if (IDXERR == idx) {
        return NULL;
    }

    pSTreeNode pMatch = SearchNodeThVal(pNode, idx);
    if (NULL == pMatch) {
        return NULL;
    }

    if (0 < pMatch->nodeid) {
        nodeids.push_back(pMatch->nodeid);
    }
    else {
        ReFuzzymatch(pMatch, nodeids);
    }
    return pMatch;
}

// Incremental lookup for a whole string, starting at pNode.
//
// Processes `name` one character at a time. For each character `nodeids` is
// cleared and refilled with the matches for the node just reached (its own id
// when it has one, otherwise the ids found below it), so on return it holds
// the matches for the last character processed. Returns the node reached by
// the last character; for an empty string that is pNode itself and `nodeids`
// is left untouched.
//
// Returns NULL when pNode or `name` is NULL, a character cannot be mapped, or
// the path does not exist. The missing-child case is now checked before the
// child is dereferenced.
pSTreeNode CTree::Fullmatch(char* name, vector<int>&nodeids, pSTreeNode pNode)
{
    if (NULL == pNode || NULL == name) {
        return NULL;
    }

    for (; '\0' != *name; ++name) {
        nodeids.clear();

        const int idx = get_index(*name);
        if (IDXERR == idx) {
            return NULL;
        }

        pSTreeNode pNext = SearchNodeThVal(pNode, idx);
        if (NULL == pNext) {
            return NULL;
        }

        if (0 < pNext->nodeid) {
            nodeids.push_back(pNext->nodeid);
        }
        else {
            ReFuzzymatch(pNext, nodeids);
        }

        pNode = pNext;
    }

    return pNode;
}


// Binary search in pNode->pChildList for the child whose namevalue equals
// `value`.
//
// The list has to be sorted by namevalue (ReSerachName() does that);
// otherwise existing children can be missed. Returns the child, or NULL after
// printing a diagnostic when there is no such child. A NULL pNode is now
// treated as "not found" instead of being dereferenced.
pSTreeNode CTree::SearchNodeThVal(pSTreeNode pNode, int value)
{
    if (pNode != NULL) {
        int low = 0;
        int high = static_cast<int>(pNode->pChildList.size()) - 1;

        while (low <= high) {
            // Midpoint written so that low + high is never formed.
            const int mid = low + (high - low) / 2;
            pSTreeNode pCandidate = pNode->pChildList[mid];

            if (value < pCandidate->namevalue) {
                high = mid - 1;
            }
            else if (value > pCandidate->namevalue) {
                low = mid + 1;
            }
            else {
                return pCandidate;
            }
        }
    }

    printf("SearchNodeThVal failed value is: %d\n", value);
    return NULL;
}

ps:主函数大家可以自行构造测试,欢迎评论交流!

猜你喜欢

转载自blog.csdn.net/breakpoints_/article/details/80023445
今日推荐