I have already graduated, I do not work in a programming-related job, and I will rarely write code in the future. Over the years I wrote a lot of code, from the very first A+B exercise to later programs of all kinds that seemed very cool at the time. I fell into many pitfalls, asked many people for help, and looked up a lot of material. By now I dare say I am proficient in the C language, but only in the language itself; I still know little about the algorithms used in various professional fields. While tidying up my disk in my spare time, I found a large number of programs written between my freshman year and graduate school. It would be a pity to delete them; after all, they represent several years of painstaking effort. Since work is not busy, why not publish the old code? On the one hand it is a souvenir: in the future, when I boast of having written hundreds of thousands of lines of code, I will have proof. On the other hand, it may serve as a reference for those who come later, and it would be nice to help them.
Click here to see the source code:
word frequency statistics program
1. Topic requirements
Write a simple word frequency statistics program in C that meets the following requirements:
1. Read a short English paragraph from a file and print it to the console unchanged;
2. Count the number of occurrences of each distinct word in the paragraph and print the counts to the console;
3. Print the ten most frequent words.
2. Topic analysis
Judging from the requirements, several programming techniques are needed. First, reading data from a file involves file operations; second, holding the English paragraph in memory requires an array; third, storing words of different lengths requires dynamic memory allocation; fourth, extracting words and counting their frequency involves linked-list and string operations; fifth, printing the most frequent words involves sorting.
3. Structure and function declaration
(1) Structure declaration
// Word node: one entry of the word-frequency linked list.
// The typedef lets the rest of the program write plain `WordNode`, and the
// self-reference uses `struct WordNode` so the definition is valid in C as
// well as in C++.
typedef struct WordNode WordNode;
struct WordNode
{
    char *word;            // NUL-terminated string holding the word
    int count;             // number of occurrences of this word
    struct WordNode *next; // next word node in the list (NULL at the tail)
};
// Word list: a singly linked list of word nodes with head/tail pointers.
// Members are declared as `struct WordNode *` and the list type is given a
// typedef, so the definition is valid in C as well as in C++.
typedef struct WordList WordList;
struct WordList
{
    struct WordNode *first; // first node of the list (NULL when empty)
    struct WordNode *last;  // last node of the list (NULL when empty)
    int nodeCount;          // number of nodes currently in the list
};
(2) Function declaration
a. Custom function
The custom functions of this program are as follows:
// Print the word list. max_num is the number of entries to print; -1 prints all of them.
// NOTE(review): the default argument is C++ syntax; a C compiler rejects it.
void PrintWordList(WordList* wordlist, int max_num = -1);
// Sort the word list by count. isAscend=true sorts ascending, otherwise descending.
// NOTE(review): the default argument is C++ syntax; a C compiler rejects it.
void SortWordList(WordList* wordlist, bool isAscend = true);
// Look up `word` in the word list. Returns a pointer to its node if present, otherwise NULL.
WordNode* FindWord(WordList* wordlist, const char* word);
// Read characters from a file into a string. words receives the text, count is the
// maximum number of characters to read; the return value is the number of characters stored.
int ReadWordsFromFile(FILE* fp, char* words, int count);
// Word statistics. words is the string holding the text, wordlist is the list to fill in.
// flag=true means case-sensitive counting, flag=false means case-insensitive.
void WordsStatistic(const char* words, WordList* wordlist,bool flag);
// Read one word from a string. start is the index in words at which to begin, end receives
// the index in words after the word has been read. Returns the length of the word, or -1
// once all words have been read.
int ReadWordFromString(const char* words, int start, int* end, char* word);
// Release the memory held by the word list.
void FreeWordList(WordList wordlist);
b. Library functions
The library functions used by the program itself are as follows:
// Reference listing of the standard library functions used by the program.
// The real declarations come from <stdio.h>, <stdlib.h> and <string.h>; the
// signatures below match the C standard (sizes and counts are size_t, and
// strcpy returns the destination pointer).

// Open a file
FILE* fopen(const char* filename, const char* mode);
// Set the file position indicator
int fseek(FILE* stream, long offset, int origin);
// Get the file position indicator (used here to compute the file size)
long ftell(FILE* stream);
// Read elements from a file; returns the number of elements actually read
size_t fread(void* buffer, size_t element_size, size_t element_count, FILE* stream);
// Allocate memory dynamically
void* malloc(size_t size);
// Get the length of a string
size_t strlen(const char* str);
// Copy a string into another string; returns destination
char* strcpy(char* destination, const char* source);
// Compare two strings
int strcmp(const char* str1, const char* str2);
// Close a file
int fclose(FILE* stream);
// Release allocated memory
void free(void* block);
// Print formatted data to the console
int printf(const char* format, ...);
4. The necessary header files include
#include<stdbool.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
5. Design instructions for custom functions
(1)、ReadWordsFromFile
This function reads the English text stored in the file into memory unchanged.
// Read up to `count` bytes from the beginning of a file into a string.
// fp:    an open, non-NULL file stream; it is rewound to the start before reading
// words: buffer receiving the text; must have room for count + 1 bytes
//        (at least 1 byte, since a terminating '\0' is always written)
// count: maximum number of bytes to read; if it exceeds the file size, the
//        whole file is read
// Returns the number of bytes actually stored in `words`, not counting the
// terminating '\0'. Returns 0 with an empty string if `count` is not positive
// or the file size/position cannot be determined.
int ReadWordsFromFile(FILE* fp, char* words, int count)
{
    // Determine the file size by seeking to the end.
    long file_size = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
    {
        file_size = ftell(fp);
    }

    // Rewind; bail out with an empty string if anything above failed, so a
    // negative length is never used as an index.
    if (fseek(fp, 0, SEEK_SET) != 0 || file_size < 0 || count <= 0)
    {
        words[0] = '\0';
        return 0;
    }

    size_t want = (size_t)count;
    if ((long)count > file_size)
    {
        want = (size_t)file_size;
    }

    // One bulk read; fread may deliver fewer bytes than requested (for example
    // on a text-mode stream that translates line endings), so terminate the
    // string at the number of bytes actually received.
    size_t got = fread(words, 1, want, fp);
    words[got] = '\0';
    return (int)got;
}
(2)、ReadWordFromString
Read a word from a string containing an entire English paragraph.
// Read one word from a string. Only one word is read per call, so the
// position reached is handed back through `end`; pass it as `start` on the
// next call to walk the whole string (similar to reading from a file).
// words: string containing the English paragraph
// start: index in `words` at which to start reading
// end:   receives the index at which the next call should start
// word:  receives the word that was read, NUL-terminated; the caller must
//        supply a buffer large enough for the longest run of letters/digits
//        plus one byte (this function has no way to bound its writes)
// Returns the length of the word read; 0 if the character at `start` was a
// delimiter (no word read); -1 if the end of the string was reached with no
// word pending.
// A word that runs right up to the terminating '\0' is returned normally and
// `end` is left on the terminator, so the following call reports -1.
int ReadWordFromString(const char* words, int start, int* end, char* word)
{
    int count = 0; // length of the word collected so far
    int pos = start;

    for (;;)
    {
        char ch = words[pos];
        int is_word_char = (ch >= '0' && ch <= '9') ||
                           (ch >= 'A' && ch <= 'Z') ||
                           (ch >= 'a' && ch <= 'z');

        if (is_word_char)
        {
            // Digits and letters are part of the current word.
            word[count] = ch;
            count++;
            pos++;
            continue;
        }

        if (ch == '\0')
        {
            // Never step past the terminator.
            if (count == 0)
            {
                word[0] = '\0';
                *end = pos;
                return -1;
            }
            break; // deliver the final word; the next call returns -1
        }

        // Punctuation, space, newline, etc.: consume the delimiter and stop.
        pos++;
        break;
    }

    word[count] = '\0';
    *end = pos;
    return count;
}
(3)、FindWord
Find the specified word in the word list.
// Look up a word in the word list.
// wordlist: the word list to search
// word:     the NUL-terminated word to look for (compared with strcmp, so the
//           match is exact and case-sensitive)
// Returns a pointer to the node holding the word, or NULL if it is not present.
WordNode* FindWord(WordList* wordlist, const char* word)
{
    // Simple approach: walk the list from the head (the struct member is
    // named `first`). The walk is bounded by nodeCount and also stops at a
    // NULL link.
    WordNode* node = wordlist->first;
    for (int i = 0; node != NULL && i < wordlist->nodeCount; i++)
    {
        if (strcmp(word, node->word) == 0)
        {
            return node;
        }
        node = node->next;
    }
    return NULL;
}
(4)、WordsStatistic
Count the occurrences of each word and link the words together in a linked list.
// Count the words in a text and build the word list.
// words:    string holding the English paragraph
// wordlist: the list to fill in; new words are appended at the tail and the
//           count of an existing word is incremented
// flag:     true  = case-sensitive counting,
//           false = case-insensitive (words are lower-cased before lookup,
//                   so "Is" and "is" count as the same word)
// If memory runs out, counting stops early and the list keeps whatever was
// collected up to that point.
void WordsStatistic(const char* words, WordList* wordlist,bool flag )
{
    // No word can be longer than the whole text, so a scratch buffer of that
    // size can never overflow, however long a single token is.
    char* word = (char*)malloc(strlen(words) + 1);
    if (word == NULL)
    {
        return;
    }

    int start = 0;
    for (;;)
    {
        int end;
        int word_len = ReadWordFromString(words, start, &end, word); // read one word
        start = end;
        if (word_len == -1)
        {
            break; // end of the text: every word has been processed
        }
        if (word_len == 0)
        {
            continue; // hit a delimiter, no word read: try again
        }

        if (!flag)
        {
            // Case-insensitive: fold ASCII upper-case letters to lower case.
            for (int i = 0; i < word_len; i++)
            {
                if (word[i] >= 'A' && word[i] <= 'Z')
                {
                    word[i] = (char)(word[i] - 'A' + 'a');
                }
            }
        }

        WordNode* node = FindWord(wordlist, word);
        if (node != NULL)
        {
            node->count++; // word already known: bump its count
            continue;
        }

        // New word: create a node holding its own copy of the string.
        WordNode* word_node = (WordNode*)malloc(sizeof *word_node);
        if (word_node == NULL)
        {
            break;
        }
        word_node->word = (char*)malloc((size_t)word_len + 1);
        if (word_node->word == NULL)
        {
            free(word_node);
            break;
        }
        strcpy(word_node->word, word);
        word_node->count = 1;
        word_node->next = NULL;

        if (wordlist->first == NULL)
        {
            // Empty list: the new node is both head and tail.
            wordlist->first = word_node;
            wordlist->last = word_node;
            wordlist->nodeCount = 1;
        }
        else
        {
            // Non-empty list: append after the current tail.
            wordlist->last->next = word_node;
            wordlist->last = word_node;
            wordlist->nodeCount++;
        }
    }

    free(word);
}
(5)、PrintWordList
Print the word list.
// Print the leading entries of the word list, one "word:count" pair per line.
// wordlist: the word list
// max_num:  maximum number of nodes to print; -1 prints the whole list, and a
//           value larger than the list length also prints every node
void PrintWordList(WordList* wordlist, int max_num)
{
    int limit = max_num;
    if (max_num == -1 || max_num > wordlist->nodeCount)
    {
        limit = wordlist->nodeCount;
    }

    // Walk from the head (the struct member is named `first`); also stop at a
    // NULL link so a short list is never over-run.
    WordNode* node = wordlist->first;
    for (int i = 0; i < limit && node != NULL; i++)
    {
        printf("%s:%d\n", node->word, node->count);
        node = node->next;
    }
}
(6)SortWordList
Sort the linked list of words
// Sort the word list by count using bubble sort. Because WordNode has few
// members, the payload (word pointer and count) of adjacent nodes is swapped
// instead of relinking the nodes themselves.
// wordlist: the word list
// isAscend: true sorts in ascending order, false in descending order
// Nodes with equal counts keep their relative order.
void SortWordList(WordList* wordlist, bool isAscend)
{
    if (wordlist->nodeCount <= 1)
    {
        return; // zero or one node: already sorted
    }

    for (int pass = wordlist->nodeCount - 1; pass > 0; pass--)
    {
        bool swapped = false;
        WordNode* p = wordlist->first; // the struct member is named `first`

        for (int j = 0; j < pass; j++)
        {
            WordNode* q = p->next;
            bool out_of_order = isAscend ? (p->count > q->count)
                                         : (p->count < q->count);
            if (out_of_order)
            {
                char* s = p->word;
                int c = p->count;
                p->word = q->word;
                p->count = q->count;
                q->word = s;
                q->count = c;
                swapped = true;
            }
            p = q;
        }

        // A pass without swaps means the list is already in order.
        if (!swapped)
        {
            break;
        }
    }
}
(7)FreeWordList
Release the memory space of the word list
// Release every node of the word list together with the string it owns.
// The list is passed by value, so the caller's own first/last pointers are
// left dangling afterwards and must not be used again without being reset.
void FreeWordList(WordList wordlist)
{
    WordNode* cur = wordlist.first; // the struct member is named `first`
    for (int i = 0; cur != NULL && i < wordlist.nodeCount; i++)
    {
        WordNode* next = cur->next; // save the link before the node is freed
        free(cur->word);            // release the string owned by the node
        free(cur);                  // release the node itself
        cur = next;
    }
}
(8)main
The main function opens the file, reads its contents, counts the words, sorts the word list, and prints the word list.
int main()
{
char* words;
FILE* fp;
fp = fopen("test.txt", "r");
//获取文件的字节数
int file_size;
fseek(fp, 0, SEEK_END);
file_size = ftell(fp);
//重置文件指针到开头
fseek(fp, 0, SEEK_SET);
//从文件中读取单词到字符串
words = (char*)malloc(file_size + 1);
ReadWordsFromFile(fp, words, file_size);
//打印原文
printf("原文:\n");
printf("%s\n", words);
//单词频率统计,不区分大小写
WordList wordlist;
wordlist.fisrt = NULL;
wordlist.last = NULL;
wordlist.nodeCount = 0;
WordsStatistic(words, &wordlist,false);//false表示不区分大小写
//打印词汇列表,全部打印
printf("词频统计表:\n");
PrintWordList(&wordlist,-1);//-1位打印全部链表节点
//词汇表排序,降序排序
SortWordList(&wordlist,false);//false表示降序排序
//打印频率最高的10个单词
printf("高频词汇:\n");
PrintWordList(&wordlist,10);
fclose(fp);
free(words);
FreeWordList(wordlist);
getchar();
return 0;
}