/*
 * Count the number of characters, lines, and words in "The_Holy_Bible_Res.txt",
 * compute the frequency of every word, and print the 10 most frequent words
 * together with their frequencies.
 */
#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Number of buckets in the word hash table; hash() returns a value in [0, MAXKEY). */
#define MAXKEY 10000

/*
 * Exchange two pInfo_t lvalues.
 * Wrapped in do { } while (0) so that `SWAP(x, y);` is a single statement and
 * is safe inside an unbraced if/else. The temporary has an unusual name so it
 * cannot collide with an argument (the old temporary `t` broke SWAP(x, t)).
 * Each argument is evaluated more than once, so pass side-effect-free lvalues.
 */
#define SWAP(a,b) do { pInfo_t swap_tmp_ = (a); (a) = (b); (b) = swap_tmp_; } while (0)

/*
 * Hash function: maps a NUL-terminated string to a bucket index.
 *
 * key:     the string to hash (not modified).
 * returns: an index in the range [0, MAXKEY).
 *
 * The mixing is done in unsigned arithmetic: with a signed accumulator the
 * left shift overflows for longer keys, which is undefined behaviour, and the
 * right shift of the masked high bits would sign-extend. Each byte is read as
 * unsigned char so bytes >= 0x80 cannot contribute a negative value.
 */
int hash(const char *key)
{
    unsigned int h = 0;

    while (*key) {
        h = (h << 4) + (unsigned char)*key++;
        /* Fold the top four bits back into the low bits, then clear them. */
        unsigned int g = h & 0xf0000000u;
        if (g)
            h ^= g >> 24;
        h &= ~g;
    }
    return (int)(h % MAXKEY);
}
/*
 * One entry of the word-frequency hash table.
 * Entries that land in the same bucket are linked through `next`.
 */
struct info {
    int num;            /* how many times the word has been seen */
    char *address;      /* address of the word's NUL-terminated text */
    struct info *next;  /* next node in this bucket's collision chain */
};

typedef struct info Info_t;
typedef struct info *pInfo_t;
/*
 * Report whether c is a lowercase letter.
 * Returns 1 when c lies in 'a'..'z' (codes 97..122), otherwise 0.
 */
int isLetterofAlphabet(char c)
{
    return (c >= 'a' && c <= 'z') ? 1 : 0;
}
/*
 * Sift the node at index `left` down until the min-heap property (ordered by
 * ->num) holds for the subtree rooted there.
 *
 * Array indices start at 0, so for a node at index i the left child is at
 * index 2*i+1 and the right child at index 2*i+2.
 *
 * p:     array of node pointers forming the heap.
 * left:  index of the node to sift down.
 * right: index of the last element that belongs to the heap (inclusive).
 */
void adjustMinHeap(pInfo_t *p , int left , int right)
{
    int parent = left;

    for (int child = 2 * parent + 1; child <= right; child = 2 * parent + 1) {
        /* Choose the smaller child; the right child only exists if child < right. */
        int smaller = child;
        if (child < right && p[child + 1]->num < p[child]->num)
            smaller = child + 1;

        /* Parent is already no larger than both children: nothing left to do. */
        if (p[parent]->num <= p[smaller]->num)
            return;

        pInfo_t held = p[parent];
        p[parent] = p[smaller];
        p[smaller] = held;
        parent = smaller;
    }
}
/*
 * Turn p[left..right] into a min-heap ordered by ->num.
 * Works bottom-up: starting from the last internal node, computed as
 * (right - 1) / 2, every node down to index `left` is sifted down in turn.
 */
void buildMinHeap(pInfo_t *p , int left , int right)
{
    int node = (right - 1) / 2;

    while (node >= left) {
        adjustMinHeap(p , node , right);
        node--;
    }
}
/*
 * Not used by the program; it only illustrates one way of locating the words
 * in the file.
 * Drawback: if p points at the very first character of the file, reading
 * p[-1] is invalid.
 *
 * p is expected to point at a letter. Returns 1 when the preceding character
 * is not a lowercase letter (so a word starts at p), otherwise 0.
 */
int isWord(char *p)
{
    char previous = p[-1];

    return (previous >= 97 && previous <= 122) ? 0 : 1;
}
//2
/*
 * WORD_CAP: size of the scratch buffer used to assemble one word, including
 *           the terminating NUL; letters beyond WORD_CAP - 1 are dropped.
 * TOP_N:    how many of the most frequent words are reported.
 */
enum { WORD_CAP = 100 , TOP_N = 10 };

/*
 * Count one occurrence of `word` (NUL-terminated, `len` characters long).
 * If the word is already in its bucket's chain its count is incremented;
 * otherwise a new node owning a private copy of the text is appended to the
 * end of the chain.
 * Returns 0 on success, -1 if memory could not be allocated.
 */
static int recordWord(pInfo_t *table , char *word , size_t len)
{
    /* `link` always points at the pointer that would have to reference a new node. */
    pInfo_t *link = &table[hash(word)];

    while (*link != NULL)
    {
        if (strcmp((*link)->address , word) == 0)
        {
            (*link)->num++;
            return 0;
        }
        link = &(*link)->next;
    }

    pInfo_t node = calloc(1 , sizeof *node);
    char *text = malloc(len + 1);
    if (node == NULL || text == NULL)
    {
        free(node);
        free(text);
        return -1;
    }
    memcpy(text , word , len + 1);
    node->address = text;
    node->num = 1;
    *link = node;
    return 0;
}

/* Release every node in the table together with the word text it owns. */
static void freeTable(pInfo_t *table)
{
    for (int i = 0; i < MAXKEY; i++)
    {
        pInfo_t node = table[i];
        while (node != NULL)
        {
            pInfo_t following = node->next;
            free(node->address);
            free(node);
            node = following;
        }
        table[i] = NULL;
    }
}

/*
 * Read the file named by argv[2], print its character, line and word counts,
 * then print up to TOP_N of the most frequent words with their frequencies,
 * most frequent first.
 *
 * A word is a maximal run of lowercase letters. The line count starts at 1
 * and grows by one for every '\n'.
 *
 * Returns 0 on success and -1 when the argument is missing, the file cannot
 * be opened or read, or memory runs out.
 *
 * NOTE(review): the path is taken from argv[2], as in the original code;
 * confirm that argv[1] was not the intended argument.
 */
int main(int argc , char *argv[])
{
    if (argc < 3)
    {
        fprintf(stderr , "usage: %s <arg1> <input-file>\n" , argc > 0 ? argv[0] : "program");
        return -1;
    }
    FILE *fp = fopen(argv[2] , "rb");
    if (fp == NULL)
    {
        perror("fp.fopen:");
        return -1;
    }

    pInfo_t hashTable[MAXKEY] = { NULL };
    int n1 = 0 , n2 = 1 , n3 = 0; /* characters, lines, words */
    char word[WORD_CAP];
    size_t len = 0; /* letters stored for the word in progress; 0 = not inside a word */

    for (;;)
    {
        /* fgetc returns int: a char cannot tell EOF apart from a data byte. */
        int c = fgetc(fp);
        if (c != EOF)
        {
            n1++; /* every byte of the file is a character, not only letters */
            if (isLetterofAlphabet((char) c))
            {
                if (len < WORD_CAP - 1)
                {
                    word[len++] = (char) c;
                }
                continue;
            }
        }
        /* Here c is a non-letter byte or EOF; either one ends the current word. */
        if (len > 0)
        {
            word[len] = '\0';
            n3++;
            if (recordWord(hashTable , word , len) != 0)
            {
                fprintf(stderr , "out of memory\n");
                fclose(fp);
                freeTable(hashTable);
                return -1;
            }
            len = 0;
        }
        if (c == EOF)
        {
            break;
        }
        if (c == '\n')
        {
            n2++;
        }
    }
    if (ferror(fp))
    {
        perror("fgetc");
        fclose(fp);
        freeTable(hashTable);
        return -1;
    }
    fclose(fp);
    printf("字符数%d,行数%d,单词数%d\n" , n1 , n2 , n3);

    /* Keep the TOP_N most frequent nodes in a min-heap whose root is the weakest of them. */
    pInfo_t A[TOP_N] = { NULL };
    int count = 0; /* number of valid entries in A */
    for (int i = 0; i < MAXKEY; i++)
    {
        for (pInfo_t t = hashTable[i]; t != NULL; t = t->next)
        {
            if (count < TOP_N)
            {
                A[count++] = t;
                if (count == TOP_N)
                {
                    /* Heapify as soon as A is full so the very next node is compared too. */
                    buildMinHeap(A , 0 , TOP_N - 1);
                }
            }
            else if (A[0]->num < t->num)
            {
                /* Replace the weakest entry; t itself is left alone so the chain walk continues. */
                A[0] = t;
                adjustMinHeap(A , 0 , TOP_N - 1);
            }
        }
    }

    /* With fewer than TOP_N distinct words the heap has not been built yet. */
    if (count > 0 && count < TOP_N)
    {
        buildMinHeap(A , 0 , count - 1);
    }
    /* Heap sort on a min-heap leaves A in descending order of frequency. */
    for (int end = count - 1; end > 0; end--)
    {
        SWAP(A[0] , A[end]);
        adjustMinHeap(A , 0 , end - 1);
    }
    for (int i = 0; i < count; i++)
    {
        printf("%s:%d\n" , A[i]->address , A[i]->num);
    }

    freeTable(hashTable);
    return 0;
}