[编程笔记] 文件单词统计 C++ 实现


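/********************** Program description ***********************
*
* A word consists only of letters, digits and underscores, and its first
* character must be a letter or an underscore.
* Note: a run made up solely of underscores also counts as a word,
* e.g. "_", "___", "___________".
*
* If a token starts with a digit, the characters that follow the digit are
* not treated as a word. For example, in "1abc bcd def" only two words are
* recognised, bcd and def; the "abc" after the 1 is ignored.
*
*******************************************************************/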


#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>

using namespace std;

#define WORD_LENGTH 50    // size, in chars, of every word buffer

// The 63 characters that may appear in a word: the ten digits, the lower-case
// letters, the upper-case letters and the underscore. The 64th array element
// is the string literal's terminating NUL.
char c_Table[64] = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
// The ten decimal digits, followed by the terminating NUL.
char n_Table[11] = "0123456789";

// Node of a singly linked list of words.
struct word
{
	// Text of the word, held in a fixed-size buffer of WORD_LENGTH chars.
	char w[WORD_LENGTH];
	// Next node in the list.
	word * next;
};

// Pointers into the word list:
//   head - first node of the list (NULL while the list is empty);
//   end  - node that terminates the list; traversals stop when they reach it,
//          so it never contributes a word;
//   p    - scratch pointer to the node currently being filled in.
// NOTE(review): because of `using namespace std;`, the unqualified name `end`
// can clash with std::end on standard libraries whose <iostream> declares it -
// confirm on the target compiler (writing `::end` avoids the clash).
word * head = NULL, * p, * end;   // word-list pointers

/**
 * Tells whether a character may appear inside a word.
 *
 * The accepted set is the ASCII letters, the decimal digits and the
 * underscore. It is tested with direct range comparisons rather than a scan
 * over a lookup table, so there is no hard-coded table length to keep in
 * sync. (The letter ranges assume a character set in which a-z and A-Z are
 * contiguous, as in ASCII.)
 *
 * @param ch character to classify
 * @return true if ch is a letter, a digit or '_'; false for anything else,
 *         including '\0'
 */
bool check_c(char ch)
{
	const bool is_lower = ( ch >= 'a' && ch <= 'z' );
	const bool is_upper = ( ch >= 'A' && ch <= 'Z' );
	const bool is_digit = ( ch >= '0' && ch <= '9' );

	return is_lower || is_upper || is_digit || ch == '_';
}

/**
 * Tells whether a character is a decimal digit.
 *
 * Used to reject tokens whose first character is a digit. The digits
 * '0'..'9' are contiguous in every C++ execution character set, so a range
 * comparison replaces the table scan and its hard-coded length.
 *
 * @param ch character to classify
 * @return true if ch is one of '0'..'9', false otherwise
 */
bool check_n(char ch)
{
	return ch >= '0' && ch <= '9';
}

/**
 * Resets a word buffer by storing '\0' in each of its first WORD_LENGTH
 * elements.
 *
 * @param w buffer to clear; WORD_LENGTH chars are written, so the buffer must
 *          be at least that large (the 50 in the parameter declaration is
 *          informational only - the parameter is a plain char pointer)
 */
void clear_word(char w[50])
{
	char * const stop = w + WORD_LENGTH;
	char * cursor = w;

	while ( cursor != stop )
	{
		*cursor = '\0';
		++cursor;
	}
}

bool check_in_list(char w[50])   //检查是否与单词表重复
{
	word * p = head;
	while ( p != end )
	{
		if ( strcmp(p->w,w) == 0 )
		{
			return false;   //与单词表中单词重复
		}
		p = p->next;
	}
	
	return true;   //没找到相同单词(可以加入单词表)
}

int main()
{
	ifstream fin;
	char filename[200];
	char filename_cut[200];
	char read_word[WORD_LENGTH];
	char ch;
	bool flag = false, flag1 = false;
	int ctr = 0;
	int words_num = 0, different_words_num = 0;
	
	printf("程序从文件中读取英文语句,判断其中的单词数(文件拖拽)\n");
	printf("请输入文件名:");
	gets(filename);

	if ( filename[0] == '\"' )
	{
		filename[strlen(filename)-1] = '\0';
		strcpy(filename,filename+1);
	}
	for ( int j = strlen(filename); j >= 0; j-- )
	{
		if ( filename[j] == '\\' )
		{
			strcpy(filename_cut,filename+j+1);
			break;
		}
	}

	fin.open(filename,0);
	if ( fin != NULL )
	{
		clear_word(read_word);
	
		while ( !fin.eof() )
		{
			ch = '\0';
			fin.get(ch);
			if ( check_c(ch) )
			{
				if ( !flag && !check_n(ch) && !flag1 )
				{
					flag = true;
					read_word[ctr] = ch;
					ctr++;
				}
				else
				if ( !flag && check_n(ch) )
				{
					flag1 = true;
				}
				else
				if ( flag )
				{
					read_word[ctr] = ch;
					ctr++;
				}
			}
			else
			{
				if ( flag1 )
				{
					flag1 = false;
				}
				else
				if ( flag && check_in_list(read_word) )
				{
					if ( head == NULL )
					{
						head = new word;
						end = new word;
						clear_word(head->w);
						clear_word(end->w);
						head->next = end;
						p = head;
					}
					else
					{
						p = new word;
						clear_word(p->w);
						word * tmp = p;
						p = end;
						end = tmp;
						p->next = end;
						tmp = NULL;
					}
					for ( int i = 0; i < ctr; i++ )
					{
						p->w[i] = read_word[i];
					}
					ctr = 0;
					flag = false;
					clear_word(read_word);
					different_words_num++;
					words_num++;
				}
				else
				if ( flag && !check_in_list(read_word) )
				{
					ctr = 0;
					flag = false;
					clear_word(read_word);
					words_num++;
				}
			}
		}

		printf("\n%s文件中,",filename_cut);
		printf("共%d个单词,",words_num);
		printf("其中有%d个不同的单词:\n",different_words_num);
		word * h = head;
		while ( h != end )
		{
			printf("%s  ",h->w);
			h = h->next;
		}
		printf("\n\n");

		word * r, *s;
		r = head;
		while ( r != end )
		{
			s = r;
			r = r->next;
			delete s;
		}
		delete r;

		fin.close();
	}
	else
	{
		printf("打开文件失败!\n");
	}

	return 1;
}

猜你喜欢

转载自blog.csdn.net/cp_oldy/article/details/88288722
今日推荐