[Trie树] 统计英文文本中单词出现的个数 - C语言实现 - 考虑数字、英文

版权声明:本文为博主原创文章,若有错误之处望大家批评指正!转载需附上原文链接,谢谢! https://blog.csdn.net/summer_dew/article/details/83984349

【英文文本】

   However, after reaching the shore there are plenty of challenges waiting for him."The biggest challenge now is learning to walk again! My biggest fear when I was coming out of the water and back onto the beach was that I was going to fall over. As I’ve not stepped foot on land for over five months, the tendons and ligaments in my feet have been asleep, so I basically have to learn to walk again. 
  Ross is not new to extreme challenges. He accomplished the world's longest rope climb in less than 24 hours by climbing the height equal of Mount Everest-8,848m (29,029 feet)
  He also attempted to swim 100km in the Caribbean carrying a 100lb tree.
On his Twitter account, Ross shared what his body looked like after this swim.

【结果】
在这里插入图片描述

【代码】

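/*
 *@Time:20181111
 *@Test: Count how many times each word occurs in an English text file
 *@Desc: A word is a run of ASCII letters and digits; letters are folded to
 *       lowercase before counting, so "The" and "the" are the same word
 */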
 
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
 
// Longest word (excluding the terminating NUL) that the traversal buffer can hold.
#define MAXSTRLEN 1024
 
// One trie node. Each child slot stands for one character of a word.
typedef struct node {
	int cnt;                       // number of times the word ending at this node was inserted; 0 on a fresh node
	struct node *next[40];		   // children: digits '0'-'9' use slots 0-9, letters 'a'-'z' use slots 10-35; slots 36-39 are never assigned by the insert code
}TrieTreeNode,* TrieTree;
 
TrieTree createTrieTreeNode();                        // allocate a node with cnt 0 and all children NULL
int InsertTrieTreeNode(TrieTree *pT, char *str);  // insert one word (adds 1 to its count)
int DeleteTrieTreeNode(TrieTree *pT, char *str);  // remove one occurrence of a word (subtracts 1 from its count)
int SearchTrieTree(TrieTree T, char *str);      // look up the count stored for a word
void TraverseTrieTree(TrieTree T);              // print every stored word together with its count
void DestroyTrieTree(TrieTree *pT);               // free every node and set the root pointer to NULL
 
/*
 * Allocate and initialise one trie node.
 *
 * The returned node has cnt == 0 and every child pointer set to NULL.
 * The function never returns NULL: if the allocation fails it reports the
 * error on stderr and terminates the process with EXIT_FAILURE, so that an
 * out-of-memory condition is not reported to the environment as success.
 */
TrieTree createTrieTreeNode(void) {
	TrieTreeNode *treeNode = malloc(sizeof *treeNode);
	size_t slot;

	if (treeNode == NULL) {
		perror("createTrieTreeNode: malloc");
		exit(EXIT_FAILURE);
	}

	// Assign NULL explicitly rather than relying on all-bits-zero being a null pointer.
	for (slot = 0; slot < sizeof treeNode->next / sizeof treeNode->next[0]; slot++) {
		treeNode->next[slot] = NULL;
	}
	treeNode->cnt = 0;
	return treeNode;
}
 
/*
 * Insert one word into the trie.
 *
 * Walks from the root one character at a time, creating missing nodes on
 * the way, and adds 1 to the count stored at the node of the last character.
 *
 * pT  - address of the root pointer; the root node must already exist.
 * str - NUL-terminated word made only of '0'-'9' and 'a'-'z'.
 *
 * Nothing is done when pT, *pT or str is NULL, or when str is empty.
 * A word containing any other character is rejected with a message on
 * stdout. The whole word is validated before the trie is modified, so a
 * rejected word does not leave empty prefix nodes behind.
 *
 * Returns 0 in every case.
 */
int InsertTrieTreeNode(TrieTree *pT, char *str) {
	size_t i, len;
	int index;
	TrieTreeNode *tempNode;

	if (pT == NULL || *pT == NULL || str == NULL || str[0] == '\0') {
		return 0;
	}

	// Validate first; the length is computed once instead of on every iteration.
	len = strlen(str);
	for (i = 0; i < len; i++) {
		char ch = str[i];

		if (!((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'z'))) {
			printf("单词错误,有别的类型字符\n");
			return 0;
		}
	}

	tempNode = *pT;
	for (i = 0; i < len; i++) {
		if (str[i] >= '0' && str[i] <= '9') {
			index = str[i] - '0';        // digits occupy slots 0-9
		} else {
			index = str[i] - 'a' + 10;   // letters occupy slots 10-35
		}

		if (tempNode->next[index] == NULL) {
			tempNode->next[index] = createTrieTreeNode();
		}
		tempNode = tempNode->next[index];
	}

	// The node reached by the last character carries the word's count.
	tempNode->cnt += 1;

	return 0;
}
 
 
/*
 * Look up a word in the trie.
 *
 * T   - root of the trie.
 * str - NUL-terminated word made of '0'-'9' and 'a'-'z'.
 *
 * Returns the count stored at the word's final node, or 0 when T or str is
 * NULL, when the path for the word does not exist, or when the word holds
 * a character outside the two accepted ranges (a message is then printed
 * on stdout).
 */
int SearchTrieTree(TrieTree T, char *str) {
	const TrieTreeNode *cursor = T;
	const char *p;

	if (cursor == NULL || str == NULL) {
		return 0;
	}

	for (p = str; *p != '\0'; ++p) {
		int slot;

		if (*p >= '0' && *p <= '9') {
			slot = *p - '0';            // digits: slots 0-9
		} else if (*p >= 'a' && *p <= 'z') {
			slot = *p - 'a' + 10;       // letters: slots 10-35
		} else {
			printf("单词错误,有别的类型字符\n");
			return 0;
		}

		cursor = cursor->next[slot];
		if (cursor == NULL) {
			return 0;                   // path ends here: word not stored
		}
	}

	return cursor->cnt;
}
 
/*
 * Remove one occurrence of a word from the trie.
 *
 * Only the count at the word's final node is decremented; no node is freed.
 *
 * pT  - address of the root pointer.
 * str - NUL-terminated word made of '0'-'9' and 'a'-'z'.
 *
 * Returns the count remaining after the removal. Returns 0, and changes
 * nothing, when pT, *pT or str is NULL, when the word contains another
 * character (a message is printed on stdout), when its path does not
 * exist, or when its stored count is already 0 (for example a string that
 * is only a prefix of stored words). The last case keeps counts from
 * going negative.
 */
int DeleteTrieTreeNode(TrieTree *pT, char *str) {
	size_t i, len;
	int index;
	TrieTreeNode *tempNode;

	if (pT == NULL || *pT == NULL || str == NULL) {
		return 0;
	}

	tempNode = *pT;
	len = strlen(str);
	for (i = 0; i < len; i++) {
		if (str[i] >= '0' && str[i] <= '9') {
			index = str[i] - '0';        // digits occupy slots 0-9
		} else if (str[i] >= 'a' && str[i] <= 'z') {
			index = str[i] - 'a' + 10;   // letters occupy slots 10-35
		} else {
			printf("单词错误,有别的类型字符\n");
			return 0;
		}

		if (tempNode->next[index] == NULL) {
			return 0;
		}
		tempNode = tempNode->next[index];
	}

	// The path exists but no word ends here: nothing to delete.
	if (tempNode->cnt <= 0) {
		return 0;
	}

	tempNode->cnt -= 1;
	return tempNode->cnt;
}
 
// 遍历Trie树,使用静态变量,递归时可以记录之前一层上的字符
void TraverseTrieTree(TrieTree T) {
	int i;
	static char word[MAXSTRLEN+1] = {'\0'};
	static int len=0;
	
	if(T==NULL) 
		return;

	for(i=0; i<37; i++) {
		if (T->next[i]==NULL) {
			continue;
		}
		// 赋值
		if (i>=0 && i<=9) { //数字
			word[len++] = i + '0';
		} else { //字母
			word[len++] = i - 10 + 'a';
		}
		// 如果这个字符串存在,输出
		if(T->next[i]->cnt > 0) {
			word[len] = '\0';
			printf("%-20s %-8d\n", word, T->next[i]->cnt);
		}
		// 遍历下一个
		TraverseTrieTree(T->next[i]);
	    len--;
	}
}
 
/*
 * Free the whole trie rooted at *pT and set *pT to NULL.
 *
 * Every non-NULL child is destroyed recursively before the node itself is
 * released. Does nothing when *pT is already NULL.
 */
void DestroyTrieTree(TrieTree *pT) {
	TrieTreeNode *root = *pT;
	TrieTree *slot;
	TrieTree *end;

	if (root == NULL) {
		return;
	}

	// Walk the full child array; the bound is taken from the array itself.
	end = root->next + sizeof root->next / sizeof root->next[0];
	for (slot = root->next; slot != end; ++slot) {
		if (*slot != NULL) {
			DestroyTrieTree(slot);
		}
	}

	free(root);   // all children are gone by now
	*pT = NULL;
}

int main() {
	char word[1024+1] = {'\0'}; //存放读到的单词
	char c;
	int len;
	TrieTree T = NULL;
	FILE *fp;

	T =  createTrieTreeNode(); //创建Trie树

	// 读入文件
	fp = fopen("2013-8.txt", "r");
	len = 0;
	while ( fscanf(fp, "%c", &c)!=EOF ) {
		if ( c>='0' && c<='9' ) { //数字
			word[len++] = c;
		} else if ( c>='a' && c<='z' ) { //小写字母
			word[len++] = c;
		} else if ( c>='A' && c<='Z') { //大写字母
			word[len++] = c - 'A' + 'a'; //转成小写,考虑
		} else { //其他字符
			if (len==0) { //前面没有单词
				continue; //继续
			} else { //前面有单词
				word[len] = '\0'; //附上结束串
				//printf("- %s\n", word); //debug打印读取的字符串
				InsertTrieTreeNode(&T, word); //插入Trie树
			}
			len =0;
		}
	}
	fclose(fp);

	// 输出全部单词的次数
	TraverseTrieTree(T);
	// 销毁
	DestroyTrieTree(&T);
	
	return 0;
}  

猜你喜欢

转载自blog.csdn.net/summer_dew/article/details/83984349