导读:GNU的grep和fgrep程序内部都调用了glibc的4个函数来实现正则表达式的搜索,它们是:
int regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
int regexec (const regex_t *__restrict preg, const char *__restrict string,
size_t nmatch, regmatch_t pmatch[], int eflags)
size_t regerror (int errcode, const regex_t * preg, char *errbuf,
size_t errbuf_size)
void regfree (regex_t *preg)
这4个函数也是POSIX规范里要求的4个函数。GNU还实现了re_match函数和re_search函数,功能类似。
正则表达式的实现中使用了2个重要的概念:DFA(确定性有限自动机)和NFA(非确定性有限自动机)(至今还不理解……)
下面的例子展示了这4个函数的使用方法:
#include "types.h"
#include "mcheck.h"
#include "regex.h"
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include"regex_internal.h"
/* Subject string that main() passes to regexec(). */
static const char text[] = "This is a test; this is a test";
int main (void)
{
regex_t re;
regmatch_t rm[2];
int n;
int i;
re_token_t *tkn;
n = regcomp (&re, "a test", (REG_EXTENDED|REG_ICASE));
if (n != 0)
{
char buf[500];
regerror (n, &re, buf, sizeof (buf));
printf ("regcomp failed: %s\n", buf);
exit (1);
}
printf("re.buffer=%p\n",re.buffer);
re_dfa_t *dfa=(re_dfa_t *)re.buffer;
tkn=dfa->nodes;
for (n = 0; n < 1; ++n)
{
printf("Bin tree addr is %p\n",dfa->str_tree_storage);
printf("DFA state_table is %p\n",dfa->state_table);
printf("\tDFA state_table->(num=%d,alloc=%d,array=%p)\n",dfa->state_table->num,dfa->state_table->alloc,dfa->state_table->array);
printf("DFA init_state is %p\n",dfa->init_state);
printf("\tDFA init_state->(hash=%d,entrance_nodes=%p,nodes.nelem=%d)\n",dfa->init_state->hash,dfa->init_state->entrance_nodes,dfa->init_state->nodes.nelem);
printf("DFA init_state_begbuf is %p\n",dfa->init_state_begbuf);
printf("DFA init_state_nl is %p\n",dfa->init_state_nl);
printf("DFA init_state_word is %p\n",dfa->init_state_word);
printf("DFA node addr is %p\n",dfa->nodes);
printf("DFA next addr is %p\n",dfa->nexts);
printf("DFA nodes_alloc is %d\n",dfa->nodes_alloc);
printf("DFA nodes_len is %d\n",dfa->nodes_len);
printf("DFA edests is %p\n",dfa->edests);
printf("\tDFA edests->(nelem=%d,alloc=%d,elem=%p)\n",dfa->edests->nelem,dfa->edests->alloc,dfa->edests->elems);
printf("DFA eclosures is %p\n",dfa->eclosures);
printf("\tDFA eclosures->(nelem=%d,alloc=%d,elem=%p)\n",dfa->eclosures->nelem,dfa->eclosures->alloc,dfa->eclosures->elems);
printf("DFA inveclosures is %p\n",dfa->inveclosures);
printf("DFA inveclosures is %p\n",dfa->inveclosures);
printf("DFA org_indices is %p\n",dfa->org_indices);
printf("DFA sb_char is %p\n",dfa->sb_char);
for (i = 0; i < dfa->nodes_len; ++i)
{
printf("(%d,%c),(C:%d,D:%d)\n",tkn->type,tkn->opr.idx,tkn->constraint,tkn->duplicated);
tkn+=1;
}
regexec (&re, text, 1, rm, 0);
printf("\nrm[0].eo=%d,so=%d",rm[0].rm_eo,rm[0].rm_so);
memset((void *)&rm[0],0,sizeof(rm));
}
regfree (&re);
return 0;
}
regcomp内部调用了re_compile_internal来初始化DFA,然后构造二叉树,最后计算闭包。
基于上一个例子构造的二叉树结构如下:
暂时先分析到这里。