We all know that word segmentation in English is relatively easy because words are separated by spaces, but Chinese is different. The separation of a sentence in Chinese is in units of words, and the so-called positive Maximum matching and reverse maximum matching is a method of word segmentation matching, which is explained here with dictionary matching.
The so-called dictionary forward maximum matching is to separate a string of characters, where the length of the separation is limited, and then match the separated substring with the words in the dictionary. If the match is successful, the next round of matching is performed until all strings are matched. After processing, otherwise remove a word from the end of the substring, and then perform matching, and so on. Reverse matching is similar to this, and an example is shown below: the string to be tokenized: "study the origin of life"
Suppose the relevant content in our dictionary is as follows:
Research
Postgraduate
life
Life
of
origin
Assume that the maximum matching word count is set to 5
Forward maximum matching process:
study of life
research life
Graduate # first word match succeeded
origin of life
the beginning of life
fate
Life # The second word matches successfully, a single word
the origin
from
The #third word matches successfully
Origin#The fourth word matches successfully
Then the result of forward maximum matching is
study the origin of life
Now let's look at the process of reverse max matching:
Origin of Life
origin of life
the origin
origin#The first word matches successfully
study of life
life
life's
fate
The #2nd word matches successfully
research life
study life
Life #The third word matches successfully
Research #4th word match success
So the result after reverse maximum matching is
study the origin of life
Summary of the two word segmentation processes:
[Forward matching: from left to right, gradually remove the words on the right (bottom) for a new round of matching, reverse matching: from right to left, gradually remove the words on the left (bottom) for a new round of matching]
Because of the complexity of Chinese and the particularity of Chinese, reverse maximum matching is often more accurate than forward matching.
从上面可以看出来,其实进行匹配并不困难,当然首先我们得找到一个好的词典。比如汉语词典,搜狗词典等,然后将其加载进一个散列表里,最后从输入读取字符串进行匹配,做中文分词这个还是使用脚本语言比较好,比如python,c 语言做这个比较坑,特别是对字符串的分隔要掌握好,而且非常麻烦,下面所处理的中文是UTF-8编码的,所以如果你输入的字符串不是UTF-8编码可能该程序会无法工作。
#include <stdio.h> #include <stdlib.h> #include <string.h> #define HASH_LEN 39841 //定义散列表大小 #define STACK_LEN 100 //定义栈大小 #define MAX 5 //最大匹配字数 typedef struct hash_node { char *data; struct hash_node *next; }HASH_NODE; //散列表下拉链表结果 typedef struct { int len; HASH_NODE *data; }HASH; //散列表数据结构 typedef struct { int len; char *data[STACK_LEN]; }STACK; //栈数据结构 //哈希函数,计算哈希值 unsigned int hash_fun(char *key) { unsigned int seed=131; unsigned int hash=0; while(*key) { hash=hash*seed+*key; ++key; } return hash&0x7FFFFFFF; } //初始化散列表 void hash_init(HASH *hash) { int i; //所有表中数据为空 for(i=0;i != HASH_LEN;++i) hash[i].len=0; } //冲突时拉下链表 void hash_list_insert(HASH_NODE *data,char *key) { HASH_NODE *temp; while(data->next != NULL) data=data->next; temp=malloc(sizeof(HASH_NODE)); temp->data=key; temp->next=NULL; data->next=temp; } //插入数据 void hash_insert(HASH *hash,char *key) { int h; h=hash_fun(key)%HASH_LEN; //如果当前表中没有数据则直接插入 //否则插入链表中 if(hash[h].len > 0) hash_list_insert(hash[h].data,key); else { hash[h].data=malloc(sizeof(HASH_NODE)); hash[h].data->data=key; hash[h].data->next=NULL; } //当前表中数据值加1 ++hash[h].len; } //从该表地址中进行搜索 char *hash_node_key(HASH_NODE *node,char *key) { while(node) { if(strcmp(node->data,key) == 0) return node->data; node=node->next; } return NULL; } //从散列表查找 char *hash_get(HASH *hash,char *key) { int h; h=hash_fun(key)%HASH_LEN; if(hash[h].len == 0) return NULL; return hash_node_key(hash[h].data,key); } //判断数据是否在该表中 int hash_node_in(HASH_NODE *node,char *key) { while(node) { if(strcmp(node->data,key) == 0) return 1; node=node->next; } return 0; } //从表中搜索关键词 //如若存在则返回1 //否则返回0 int hash_in(HASH *hash,char *key) { int h; h=hash_fun(key)%HASH_LEN; if(hash[h].len == 0) return 0; return hash_node_in(hash[h].data,key); } //打印表的所有数据 void hash_list_print(HASH_NODE *data) { while(data != NULL) { printf("%s ",data->data); data=data->next; } } //打印散列表 void hash_print(HASH *hash) { int i; for(i=0;i != HASH_LEN;++i) { if(hash[i].len != 0) { hash_list_print(hash[i].data); printf("\n"); } } } //加载词典数据并存入散列表中 void load_data(HASH *hash,char *path) { char *buf=NULL; char *temp; size_t len; int s; FILE *fp; if((fp=fopen(path,"rb")) == NULL) return; //按行读取 while((s=getline(&buf,&len,fp)) > 0) { temp=malloc(sizeof(char)*s); snprintf(temp,sizeof(char)*s,"%s",buf); //去除回车符 hash_insert(hash,temp); //插入数据 free(buf); buf=NULL; } fclose(fp); } //初始化栈 void stack_init(STACK *stack) { int i; //栈数据置零 stack->len=0; for(i=0;i != STACK_LEN;++i) stack->data[i]=NULL; } //压入一个数据到栈中 int stack_push(STACK *stack,char *data) { if(stack->len >= STACK_LEN) return 0; stack->data[stack->len]=data; ++stack->len; } //从栈中弹出一个数据 char *stack_pop(STACK *stack) { if(stack->len <= 0) return NULL; --stack->len; return stack->data[stack->len]; } //正向最大匹配 int for_match(HASH *hash,STACK *stack,char *data,int *index) { int len=strlen(data); while(len) { //判断词典中是否有该词 //有则将该词压入栈中 //否则从字符串后减一个字进行循环 if(hash_in(hash,data)) { stack_push(stack,data); *index+=len; return 1; } len-=3; data=realloc(data,sizeof(char)*len+1); data[len]='\0'; } return 0; } //逆向最大匹配 int re_match(HASH *hash,STACK *stack,char *data,int *index) { int len=strlen(data); while(len) { //判断词典中是否有该词 //有则将该词压入栈中 //否则从字符串前减一个字进行循环 if(hash_in(hash,data)) { stack_push(stack,data); *index-=len; return 1; } data+=3; len-=3; } return 0; } //预处理字符串 void pre_set(char *str) { char temp[600]={0}; int i=0; int index=0; while(i < strlen(str)) { if((str[i]&0xe0) == 0xe0) { snprintf(temp+index,sizeof(char)*3+1,"%s",str+i); i+=3; index+=3; } else if((str[i]&0xc0) == 0xc0) //标点等 i+=2; else if((str[i]&0x80) == 0) //英文字符 ++i; } //重新设置字符串 memset(str,0,strlen(str)+1); snprintf(str,strlen(temp)+1,"%s",temp); } int main(int argc,char **argv) { HASH hash[HASH_LEN]; //散列表 STACK stack; //压入匹配到的词的栈 STACK res; //以顺序打印正向匹配结果的交换栈 char string[600]={0}; //输入的字符串 int i=0; int index=0; char *temp; //取出的字符串 //初始化 hash_init(hash); stack_init(&stack); stack_init(&res); load_data(hash,"./现代汉语常用词表.txt"); //hash_print(hash); printf("请输入一个字符串:"); //scanf("%s",string); fgets(string,600,stdin); //预处理字符串,去除英文和其它非中文字符 pre_set(string); //开始正向取出字符串 while(i< strlen(string)) { temp=malloc(sizeof(char)*3*MAX+1); snprintf(temp,sizeof(char)*3*MAX+1,"%s",string+i); //正向最大匹配 if(!for_match(hash,&stack,temp,&i)) { /*printf("正向匹配失败!\n"); return -1;*/ i+=3; } } //将匹配结果重新排序并打印 while(temp=stack_pop(&stack)) stack_push(&res,temp); printf("正向最大匹配:\n"); while(temp=stack_pop(&res)) printf("%s ",temp); //取出逆向匹配字符串 printf("\n\n"); i=strlen(string); while(i > 0) { int index=0; //如果当前字符串不够5个,则从头开始截取 if(i < 3*MAX) index=0; else index=i-3*MAX; temp=malloc(sizeof(char)*3*MAX+1); snprintf(temp,sizeof(char)*3*MAX+1,string+index); //开始逆向匹配 if(!re_match(hash,&stack,temp,&i)) { /*printf("逆向匹配失败!\n"); return -2;*/ i-=3; } //去除已匹配的字符串 string[i]='\0'; } //打印匹配结果 printf("逆向最大匹配:\n"); while(temp=stack_pop(&stack)) printf("%s ",temp); printf("\n"); return 0; }