背景
实现 项目里面的一个 字符串检索功能(无关业务,只写实现,后续
需要融入项目,会把树生成一段二进制blob 放到内存,变长的字节储存listsize
以及一些偏移 会重新构造搜索算法,预计一周后会继续与大家分享)
需求
现有一棵树(普通树)有10w个字符串,索引为ID,现要实现2个功能
1: 给定一字符串快速查找该字符串的位置
2:输入一字符串 每输入一字符,显示与之相关的所有字符串,直到该字符串
所有的字符输入完毕,查找唯一索引。大概意思就是搜索李白,输入李显示李
白,李时珍,李。。。继续输入白,出现唯一值李白
分析
首先想到再建一棵字典树(也叫前缀树),以字符为索引,返回 value(ID)。
字典树的好处是检索快:对于一个字符串,时间复杂度为 O(m),m 是该字符串的
长度。缺点是空间开销较大,最坏为 O(26^m)。考虑到项目的实际运用,下面
给出一种改进版的字典树,利用 pFirstChild、pNextBrother 指针节省空间
开销;由于树的高度不深(<26)而宽度较大,每一层利用二分查找,时间效率还
说得过去!
思路
建立另一棵空间消耗改进版的字典树, 索引为字符, value(返回值为id
上一棵树的索引)存在叶子节点,未来还可能生成一段blob放到内存,这样建
树的开销就省去了。下面为建树的实现,字符串精确匹配与模糊匹配
开始时间 2018 -04- 19
完成时间 2018 -04- 20
trie树
trie树又称为字典树、单词查找树。是一种树形结构,哈希树的变种。
典型应用是用于统计、排序和保存大量的字符串(不仅仅限于字符串),经常
被搜索引擎系统用于文本词频统计。trie树是用空间换取时间的典型数据结构。
代码
树节点的结构:“叶子节点”存储 nodeid(也就是字符串的 id)和字符串本身,
其他节点对应的值为 0 和 NULL。此处的“叶子节点”并非真正意义上的叶子节点,
而是存储每个字符串最后一个字符的节点。
// Pointer alias used for every node handle in the trie code.
struct STreeNode;
typedef STreeNode* pSTreeNode;

// One trie node, linked both as first-child/next-brother and through a
// per-node vector of children.
struct STreeNode
{
    int nodeid;                     // id of the word ending at this node; 0 if none
    int namevalue;                  // value given to the constructor
    char* word;                     // text of the word ending here, or NULL
    pSTreeNode pFirstChild;         // first child created under this node
    pSTreeNode pNextBrother;        // next sibling in the parent's chain
    vector<pSTreeNode> pChildList;  // all children of this node

    // Builds an empty node carrying `name`; every link starts out NULL.
    STreeNode(int name)
        : nodeid(0),
          namevalue(name),
          word(NULL),
          pFirstChild(NULL),
          pNextBrother(NULL),
          pChildList()
    {
    }
};
建树,用pFirstChild, pNextBrother节省树空间,用pChildList便于搜索
void CTree::InsertNode(int Nodeid, char* name)
{
if (CTree::GetInstance() == NULL) {
return;
}
pSTreeNode Root_tmp = CTree::GetInstance();
int idx = 0;
//int value_tmp;
int nameLen = strlen(name);
//char* name_t
for (int i = 0; i < nameLen; i++) {
idx = get_index(name[i]);
if (0 <= idx && idx <= MAX) {
if (Root_tmp->pFirstChild == NULL) {
Root_tmp->pFirstChild = new STreeNode(idx);
Root_tmp->pChildList.push_back(Root_tmp->pFirstChild);
//value_tmp = Root_tmp->namevalue;
Root_tmp = Root_tmp->pFirstChild;
//value_tmp = Root_tmp->namevalue;
}
else if (idx == Root_tmp->pFirstChild->namevalue) {
Root_tmp = Root_tmp->pFirstChild;
}
else {
bool PushFlag = 0;
pSTreeNode tmp = InsertBrother(Root_tmp->pFirstChild, idx, PushFlag);
if (1 == PushFlag) {
Root_tmp = tmp;
}
else {
Root_tmp->pChildList.push_back(tmp);
//int value_tmp = Root_tmp->namevalue;
Root_tmp = tmp;
}
}
}
else {
}
}
Root_tmp->nodeid = Nodeid;
if (NULL == Root_tmp->word) {
Root_tmp->word = new char[strlen(name) + 1];
if (NULL == Root_tmp->word) {
cout<<"alloc root_tmp failed"<<endl;
}
strcpy(Root_tmp->word, name);
//printf("name %s Root_tmp->word %s\n", name, Root_tmp->word);
}
}
// Finds the sibling carrying `name` among the nodes that follow
// `pBrotherNode` in its pNextBrother chain, or appends a new one at the end.
//
// pBrotherNode: start of the chain; must not be NULL. Its own namevalue is
//               not compared (InsertNode checks the first child itself).
// a_flag:       set to true when an existing sibling is returned; left
//               untouched when a new node is appended.
// Returns the existing or newly created sibling.
//
// Written as a loop so that every path returns a node; the recursive form
// dropped the result of the inner call and fell off the end of the function.
pSTreeNode CTree::InsertBrother(pSTreeNode pBrotherNode, int name, bool &a_flag)
{
    pSTreeNode pTail = pBrotherNode;
    while (pTail->pNextBrother != NULL) {
        pTail = pTail->pNextBrother;
        if (pTail->namevalue == name) {
            a_flag = true;
            return pTail;
        }
    }
    pTail->pNextBrother = new STreeNode(name);
    return pTail->pNextBrother;
}
以树形结构输出字典树
// Prints the subtree under `pNode` to stdout as an indented tree.
//
// Every node except pRoot prints its letter (namevalue + 'a') followed by a
// space; a node with nodeid > 0 then ends the line. Before a child is
// printed, `counts` padding strings are emitted when the child starts a new
// line, i.e. when the parent ended a word or the child is not the first one.
void CTree::printAll(pSTreeNode pNode, int counts)
{
    if (pNode == NULL) {
        return;
    }
    if (pNode != pRoot) {
        printf("%c ", pNode->namevalue + 'a');
    }

    const bool endsWord = pNode->nodeid > 0;
    if (endsWord) {
        printf("\n");
    }

    const vector<pSTreeNode>& kids = pNode->pChildList;
    for (vector<pSTreeNode>::size_type pos = 0; pos < kids.size(); ++pos) {
        const bool startsNewLine = endsWord || pos > 0;
        for (int pad = 0; startsNewLine && pad < counts; ++pad) {
            printf(" ");
        }
        printAll(kids[pos], counts + 1);
    }
}
精确匹配返回字符串索引
int CTree::Fullmatch(char* name)
{
pSTreeNode Root_tmp = CTree::GetInstance();
int idx = 0;
if(NULL == Root_tmp || NULL == name) {
return 0;
}
while(NULL != Root_tmp && '\0' != *name) {
idx = get_index(*name);
if(IDXERR == idx) {
return INVALID;
}
pSTreeNode tmp = SearchNodeThVal(Root_tmp, idx);
if (NULL == tmp) {
return INVALID;
}
else {
Root_tmp = tmp;
}
name++;
}
if(Root_tmp != NULL) {
return Root_tmp->nodeid;
}
else {
return INVALID;
}
}
模糊匹配:每输入一个字符,要返回该字符对应节点下的所有“叶子节点”的
nodeid,同时返回该节点本身(记忆化搜索),避免搜索下一个字符时又从根节点
开始。为了方便观察过程,下面的代码以 char* 型传参,中间过程会输出每个
字符对应的符合条件的 nodeid(也就是字符串,id 与 word 是绑定的),同时
下一个字符的搜索要用到上一个字符对应的节点。
// Incremental (one character) prefix match.
//
// Looks up the child of `pNode` that matches `name` and appends to `nodeids`
// either that child's own nodeid (when it is > 0) or, otherwise, the nodeids
// of all word nodes below it. `nodeids` is only appended to, never cleared.
//
// Returns the matched child so the caller can continue from it with the next
// character, or NULL when `pNode` is NULL, `name` is '\0', the character is
// rejected by get_index(), or no child matches.
//
// NOTE(review): when the matched node is itself a word, words below it are
// not reported; this looks deliberate (unique hit on a complete word) -
// confirm against the intended UI behaviour.
pSTreeNode CTree::Fullmatch(char name, vector<int>&nodeids, pSTreeNode pNode)
{
    if (pNode == NULL || name == '\0') {
        return NULL;
    }

    const int idx = get_index(name);
    if (idx == IDXERR) {
        return NULL;
    }

    pSTreeNode pMatch = SearchNodeThVal(pNode, idx);
    if (pMatch == NULL) {
        // Must be checked before touching pMatch->nodeid.
        return NULL;
    }

    if (pMatch->nodeid > 0) {
        nodeids.push_back(pMatch->nodeid);
    }
    else {
        ReFuzzymatch(pMatch, nodeids);
    }
    return pMatch;
}
// Appends to `nodeids` the nodeid of every node below `pNode` whose nodeid
// is > 0, visiting each child before that child's own descendants.
// `pNode` itself is not reported. A NULL `pNode` is ignored.
void CTree::ReFuzzymatch(pSTreeNode pNode, vector<int>&nodeids)
{
    if (pNode == NULL) {
        return;
    }

    const vector<pSTreeNode>& kids = pNode->pChildList;
    for (vector<pSTreeNode>::const_iterator it = kids.begin(); it != kids.end(); ++it) {
        pSTreeNode pChild = *it;
        if (pChild->nodeid > 0) {
            nodeids.push_back(pChild->nodeid);
        }
        ReFuzzymatch(pChild, nodeids);
    }
}
完整代码
#include <cstdio>
#include <cstring>

#include <algorithm>
#include <iostream>
#include <vector>
// The code below refers to vector, sort, cout and endl without the std:: prefix.
using namespace std;
// Exclusive upper bound on the letter index produced by CTree::get_index().
#define MAX 26
// Value returned by CTree::get_index() for a character it does not accept.
#define IDXERR -1
// "No match" result of the lookup functions; also the nodeid a node has when
// no word ends on it (STreeNode's constructor sets nodeid to 0).
#define INVALID 0
// Pointer alias used for every node handle in the trie code.
struct STreeNode;
typedef STreeNode* pSTreeNode;

// One trie node, linked both as first-child/next-brother and through a
// per-node vector of children.
struct STreeNode
{
    int nodeid;                     // id of the word ending at this node; 0 if none
    int namevalue;                  // value given to the constructor
    char* word;                     // text of the word ending here, or NULL
    pSTreeNode pFirstChild;         // first child created under this node
    pSTreeNode pNextBrother;        // next sibling in the parent's chain
    vector<pSTreeNode> pChildList;  // all children of this node

    // Builds an empty node carrying `name`; every link starts out NULL.
    STreeNode(int name)
        : nodeid(0),
          namevalue(name),
          word(NULL),
          pFirstChild(NULL),
          pNextBrother(NULL),
          pChildList()
    {
    }
};
// Trie over the letters accepted by get_index(). All tree state lives in the
// single static root pointer pRoot, so every CTree object operates on the
// same tree.
class CTree
{
public:
// Allocates the root node and stores it in pRoot.
CTree();
// Empty; the tree is released through DestroyInstance(), not here.
~CTree();
public:
// Returns the root node, creating it first if pRoot is NULL.
static pSTreeNode GetInstance();
// Frees the tree hanging off pRoot.
static void DestroyInstance();
// Recursively deletes pNode, its children and its following siblings.
static void FreeMemory(pSTreeNode pNode);
// Inserts `name` and stores Nodeid (and a copy of name) on its last node.
void InsertNode(int Nodeid, char* name);
// Finds or appends the sibling with value `name` after the given node;
// a_flag is set to 1 when an existing sibling was found.
pSTreeNode InsertBrother(pSTreeNode pParentNode, int name, bool &a_flag);
// Sorts pChildList by namevalue for pNode and every node reachable from it.
void ReSerachName(pSTreeNode pNode);
// NOTE(review): declared only; no definition appears in this file.
int NonReSerachName( pSTreeNode pNode);
// Prints the subtree under pNode as an indented tree; counts is the indent depth.
void printAll(pSTreeNode pNode, int counts);
// Prints word and nodeid for every word node below pNode.
void printfWord(pSTreeNode pNode);
// Binary search of pNode->pChildList for the child whose namevalue == value.
pSTreeNode SearchNodeThVal(pSTreeNode pNode, int value);
// Exact match: returns the nodeid stored at the end of `name`'s path.
int Fullmatch(char* name);
//int Fuzzymatch(char* name);
// Appends the nodeids of all word nodes below pNode to nodeids.
void ReFuzzymatch(pSTreeNode pNode, vector<int>&nodeids);
// Prefix match for one character, starting at pNode; returns the matched child.
pSTreeNode Fullmatch(char name, vector<int>&nodeids, pSTreeNode pNode = pRoot);
// Prefix match for a whole string, starting at pNode; returns the last matched node.
pSTreeNode Fullmatch(char* name, vector<int>&nodeids, pSTreeNode pNode = pRoot);
//void SortChildList(pSTreeNode pNode);
// NOTE(review): the only definition in this file is commented out.
pSTreeNode SearchNode(pSTreeNode pNode, int id);
// Maps A-Z / a-z to 0..25; returns IDXERR for any other character.
static int get_index(char ch);
private:
// Root of the one shared tree; NULL until created.
static pSTreeNode pRoot;
};
// Ordering predicate for child lists: true when pNode's namevalue is strictly
// smaller than qNode's. Neither argument may be NULL.
bool Cmpvalue(pSTreeNode pNode, pSTreeNode qNode)
{
    const int lhs = pNode->namevalue;
    const int rhs = qNode->namevalue;
    return lhs < rhs;
}
// Definition of the shared root pointer; starts out NULL and is filled in by
// the CTree constructor or by the first GetInstance() call.
pSTreeNode CTree::pRoot = NULL;
// Makes sure the shared root node exists.
//
// pRoot is static, so an existing tree is kept as it is: replacing the
// pointer here would leak every node already inserted and silently discard
// the tree for all other users. The root is created with the value 100,
// which is outside the 0..25 range used for letters.
CTree::CTree()
{
    if (pRoot == NULL) {
        // Plain `new` reports failure by throwing, so no NULL check follows.
        pRoot = new STreeNode(100);
    }
}
// Returns the shared root node, allocating it (with value 100) on first use.
pSTreeNode CTree::GetInstance()
{
    if (pRoot != NULL) {
        return pRoot;
    }
    pRoot = new STreeNode(100);
    return pRoot;
}
// Frees the whole tree and resets the shared root pointer.
//
// pRoot is cleared after the nodes are deleted; otherwise it would keep
// pointing at freed memory and the next GetInstance() call would hand that
// dangling pointer out instead of building a fresh root.
void CTree::DestroyInstance()
{
    if (pRoot == NULL) {
        return;
    }
    FreeMemory(pRoot);
    pRoot = NULL;
}
// Empty on purpose as written: destroying a CTree object does not free the
// tree; the nodes are only released by DestroyInstance().
CTree::~CTree()
{
}
// Maps a letter to its 0-based alphabet position, ignoring case.
//
// 'A'..'Z' and 'a'..'z' map to ch - 'A' / ch - 'a'. Any other character
// prints "get_index failed" and yields IDXERR. A computed index outside
// [0, MAX) is also turned into IDXERR.
int CTree::get_index(char ch)
{
    int idx = IDXERR;
    if (ch >= 'A' && ch <= 'Z') {
        idx = ch - 'A';
    }
    else if (ch >= 'a' && ch <= 'z') {
        idx = ch - 'a';
    }
    else {
        printf("get_index failed\n");
    }
    return (0 <= idx && idx < MAX) ? idx : IDXERR;
}
// Deletes `pNode`, everything below it and every sibling that follows it in
// the pNextBrother chain. A NULL `pNode` is a no-op.
//
// Each node's `word` buffer (allocated with new[] in InsertNode) is released
// here as well; deleting only the node would leak it. Siblings are handled
// by the loop and children by recursion, so recursion depth follows the
// depth of the tree rather than the length of sibling chains.
void CTree::FreeMemory(pSTreeNode pNode)
{
    while (pNode != NULL) {
        // Save the link before the node is destroyed.
        pSTreeNode pNext = pNode->pNextBrother;
        FreeMemory(pNode->pFirstChild);
        delete[] pNode->word;
        pNode->word = NULL;
        delete pNode;
        pNode = pNext;
    }
}
void CTree::InsertNode(int Nodeid, char* name)
{
if (CTree::GetInstance() == NULL) {
return;
}
pSTreeNode Root_tmp = CTree::GetInstance();
int idx = 0;
//int value_tmp;
int nameLen = strlen(name);
//char* name_t
for (int i = 0; i < nameLen; i++) {
idx = get_index(name[i]);
if (0 <= idx && idx <= MAX) {
if (Root_tmp->pFirstChild == NULL) {
Root_tmp->pFirstChild = new STreeNode(idx);
Root_tmp->pChildList.push_back(Root_tmp->pFirstChild);
//value_tmp = Root_tmp->namevalue;
Root_tmp = Root_tmp->pFirstChild;
//value_tmp = Root_tmp->namevalue;
}
else if (idx == Root_tmp->pFirstChild->namevalue) {
Root_tmp = Root_tmp->pFirstChild;
}
else {
bool PushFlag = 0;
pSTreeNode tmp = InsertBrother(Root_tmp->pFirstChild, idx, PushFlag);
if (1 == PushFlag) {
Root_tmp = tmp;
}
else {
Root_tmp->pChildList.push_back(tmp);
//int value_tmp = Root_tmp->namevalue;
Root_tmp = tmp;
}
}
}
else {
}
}
Root_tmp->nodeid = Nodeid;
if (NULL == Root_tmp->word) {
Root_tmp->word = new char[strlen(name) + 1];
if (NULL == Root_tmp->word) {
cout<<"alloc root_tmp failed"<<endl;
}
strcpy(Root_tmp->word, name);
//printf("name %s Root_tmp->word %s\n", name, Root_tmp->word);
}
}
// Finds the sibling carrying `name` among the nodes that follow
// `pBrotherNode` in its pNextBrother chain, or appends a new one at the end.
//
// pBrotherNode: start of the chain; must not be NULL. Its own namevalue is
//               not compared (InsertNode checks the first child itself).
// a_flag:       set to true when an existing sibling is returned; left
//               untouched when a new node is appended.
// Returns the existing or newly created sibling.
//
// Written as a loop so that every path returns a node; the recursive form
// dropped the result of the inner call and fell off the end of the function.
pSTreeNode CTree::InsertBrother(pSTreeNode pBrotherNode, int name, bool &a_flag)
{
    pSTreeNode pTail = pBrotherNode;
    while (pTail->pNextBrother != NULL) {
        pTail = pTail->pNextBrother;
        if (pTail->namevalue == name) {
            a_flag = true;
            return pTail;
        }
    }
    pTail->pNextBrother = new STreeNode(name);
    return pTail->pNextBrother;
}
/*
pSTreeNode CTree::SearchNode(pSTreeNode pNode, int id)
{
if (pNode == NULL) {
return NULL;
}
if (pNode->nodeid == id) {
return pNode;
}
if (pNode->pFirstChild == NULL && pNode->pNextBrother == NULL) {
return NULL;
}
else {
if ( pNode->pFirstChild != NULL) {
pSTreeNode pNodeTemp = SearchNode( pNode->pFirstChild, id);
if (pNodeTemp != NULL) {
return pNodeTemp;
}
else {
return SearchNode(pNode->pNextBrother, id);
}
}
else {
return SearchNode( pNode->pNextBrother, id);
}
}
}
*/
// Prints the subtree under `pNode` to stdout as an indented tree.
//
// Every node except pRoot prints its letter (namevalue + 'a') followed by a
// space; a node with nodeid > 0 then ends the line. Before a child is
// printed, `counts` padding strings are emitted when the child starts a new
// line, i.e. when the parent ended a word or the child is not the first one.
void CTree::printAll(pSTreeNode pNode, int counts)
{
    if (pNode == NULL) {
        return;
    }
    if (pNode != pRoot) {
        printf("%c ", pNode->namevalue + 'a');
    }

    const bool endsWord = pNode->nodeid > 0;
    if (endsWord) {
        printf("\n");
    }

    const vector<pSTreeNode>& kids = pNode->pChildList;
    for (vector<pSTreeNode>::size_type pos = 0; pos < kids.size(); ++pos) {
        const bool startsNewLine = endsWord || pos > 0;
        for (int pad = 0; startsNewLine && pad < counts; ++pad) {
            printf(" ");
        }
        printAll(kids[pos], counts + 1);
    }
}
// Prints "word is: ... nodeid is: ..." for every node below `pNode` whose
// nodeid is > 0, visiting each child before that child's own descendants.
// `pNode` itself is not printed. A NULL `pNode` is ignored.
void CTree::printfWord(pSTreeNode pNode)
{
    if (pNode == NULL) {
        return;
    }

    const vector<pSTreeNode>& kids = pNode->pChildList;
    for (vector<pSTreeNode>::const_iterator it = kids.begin(); it != kids.end(); ++it) {
        pSTreeNode pChild = *it;
        if (pChild->nodeid > 0) {
            printf("word is: %s nodeid is: %d\n", pChild->word, pChild->nodeid);
        }
        printfWord(pChild);
    }
}
// Sorts the child list of `pNode`, of every sibling that follows it, and of
// every node below any of them, ascending by namevalue (via Cmpvalue).
// SearchNodeThVal() binary-searches these lists, so they need this ordering.
// Lists with fewer than two entries are left alone.
void CTree::ReSerachName(pSTreeNode pNode)
{
    // Siblings are walked by the loop, children by recursion.
    for (pSTreeNode pCur = pNode; pCur != NULL; pCur = pCur->pNextBrother) {
        if (pCur->pChildList.size() > 1) {
            sort(pCur->pChildList.begin(), pCur->pChildList.end(), Cmpvalue);
        }
        ReSerachName(pCur->pFirstChild);
    }
}
//精确匹配
int CTree::Fullmatch(char* name)
{
pSTreeNode Root_tmp = CTree::GetInstance();
int idx = 0;
if(NULL == Root_tmp || NULL == name) {
return 0;
}
while(NULL != Root_tmp && '\0' != *name) {
idx = get_index(*name);
if(IDXERR == idx) {
return INVALID;
}
pSTreeNode tmp = SearchNodeThVal(Root_tmp, idx);
if (NULL == tmp) {
return INVALID;
}
else {
Root_tmp = tmp;
}
name++;
}
if(Root_tmp != NULL) {
return Root_tmp->nodeid;
}
else {
return INVALID;
}
}
//模糊匹配 test
/*
int CTree::Fuzzymatch(char* name)
{
pSTreeNode Root_tmp = pRoot;
int idx = 0;
if(NULL == Root_tmp || NULL == name) {
return 0;
}
while(NULL != Root_tmp && '\0' != *name) {
idx = get_index(*name);
if(IDXERR == idx) {
return INVALID;
}
pSTreeNode tmp = SearchNodeThVal(Root_tmp, idx);
if (0 < tmp->nodeid) {
printf("idx: %c: ", idx + 'a');
cout<<tmp->nodeid<<endl;
}
else {
vector<int> nodeids;
ReFuzzymatch(tmp, nodeids);
printf("idx: %c: ", idx + 'a');
for (int k = 0; k < nodeids.size(); ++k) {
cout<<nodeids[k]<<" ";
}
cout<< endl;
}
if (NULL == tmp) {
return INVALID;
}
else {
Root_tmp = tmp;
}
name++;
}
if(Root_tmp != NULL) {
return Root_tmp->nodeid;
}
else {
return INVALID;
}
}
*/
// Appends to `nodeids` the nodeid of every node below `pNode` whose nodeid
// is > 0, visiting each child before that child's own descendants.
// `pNode` itself is not reported. A NULL `pNode` is ignored.
void CTree::ReFuzzymatch(pSTreeNode pNode, vector<int>&nodeids)
{
    if (pNode == NULL) {
        return;
    }

    const vector<pSTreeNode>& kids = pNode->pChildList;
    for (vector<pSTreeNode>::const_iterator it = kids.begin(); it != kids.end(); ++it) {
        pSTreeNode pChild = *it;
        if (pChild->nodeid > 0) {
            nodeids.push_back(pChild->nodeid);
        }
        ReFuzzymatch(pChild, nodeids);
    }
}
// Incremental (one character) prefix match.
//
// Looks up the child of `pNode` that matches `name` and appends to `nodeids`
// either that child's own nodeid (when it is > 0) or, otherwise, the nodeids
// of all word nodes below it. `nodeids` is only appended to, never cleared.
//
// Returns the matched child so the caller can continue from it with the next
// character, or NULL when `pNode` is NULL, `name` is '\0', the character is
// rejected by get_index(), or no child matches.
//
// NOTE(review): when the matched node is itself a word, words below it are
// not reported; this looks deliberate (unique hit on a complete word) -
// confirm against the intended UI behaviour.
pSTreeNode CTree::Fullmatch(char name, vector<int>&nodeids, pSTreeNode pNode)
{
    if (pNode == NULL || name == '\0') {
        return NULL;
    }

    const int idx = get_index(name);
    if (idx == IDXERR) {
        return NULL;
    }

    pSTreeNode pMatch = SearchNodeThVal(pNode, idx);
    if (pMatch == NULL) {
        // Must be checked before touching pMatch->nodeid.
        return NULL;
    }

    if (pMatch->nodeid > 0) {
        nodeids.push_back(pMatch->nodeid);
    }
    else {
        ReFuzzymatch(pMatch, nodeids);
    }
    return pMatch;
}
// Prefix match for a whole string, processed one character at a time
// starting from `pNode`.
//
// For every character `nodeids` is cleared and refilled with the result for
// the prefix read so far: the matched node's own nodeid when it is > 0,
// otherwise the nodeids of all word nodes below it. After a successful call
// `nodeids` therefore describes the complete `name`.
//
// Returns the node reached by the last character (`pNode` itself when `name`
// is empty, in which case `nodeids` is left untouched). Returns NULL, with
// `nodeids` empty, when a character is rejected by get_index() or has no
// matching child; returns NULL without touching `nodeids` when `pNode` or
// `name` is NULL.
pSTreeNode CTree::Fullmatch(char* name, vector<int>&nodeids, pSTreeNode pNode)
{
    if (pNode == NULL || name == NULL) {
        return NULL;
    }

    for (const char* p = name; *p != '\0'; ++p) {
        nodeids.clear();

        const int idx = get_index(*p);
        if (idx == IDXERR) {
            return NULL;
        }

        pSTreeNode pMatch = SearchNodeThVal(pNode, idx);
        if (pMatch == NULL) {
            // Must be checked before touching pMatch->nodeid.
            return NULL;
        }

        if (pMatch->nodeid > 0) {
            nodeids.push_back(pMatch->nodeid);
        }
        else {
            ReFuzzymatch(pMatch, nodeids);
        }
        pNode = pMatch;
    }
    return pNode;
}
// Binary search of pNode->pChildList for the child whose namevalue equals
// `value`. The list is expected to be sorted ascending by namevalue.
//
// Returns the matching child, or NULL after printing a diagnostic line when
// there is none. `pNode` must not be NULL.
pSTreeNode CTree::SearchNodeThVal(pSTreeNode pNode, int value)
{
    const vector<pSTreeNode>& kids = pNode->pChildList;
    int lo = 0;
    int hi = static_cast<int>(kids.size()) - 1;  // -1 for an empty list

    while (lo <= hi) {
        const int mid = (lo + hi) / 2;
        pSTreeNode pCandidate = kids[mid];
        if (pCandidate->namevalue == value) {
            return pCandidate;
        }
        if (value < pCandidate->namevalue) {
            hi = mid - 1;
        }
        else {
            lo = mid + 1;
        }
    }

    printf("SearchNodeThVal failed value is: %d\n", value);
    return NULL;
}
ps:主函数大家可以自行构造测试,欢迎评论交流!