Finding the 10 most frequent words in a text file

Count the number of characters, lines, and words in "The_Holy_Bible_Res.txt", compute the frequency of each word, and print the 10 most frequent words together with their frequencies.

#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <stdlib.h>	
#include <string.h>
#define MAXKEY 10000
/* Exchanges two pInfo_t lvalues. Wrapped in do/while(0) so it behaves as a
 * single statement, with the arguments parenthesised against precedence
 * surprises. */
#define SWAP(a,b) do { pInfo_t t = (a); (a) = (b); (b) = t; } while (0)

/*
 * Hash function: maps a NUL-terminated string to a bucket index.
 *
 * Each byte is folded into a 28-bit accumulator; whenever the top four bits
 * of the 32-bit window become set they are XOR-ed back into bits 4..7 and
 * then cleared.
 *
 * The arithmetic is done on an unsigned type and every byte is read as
 * unsigned char, so there is no signed overflow, no shift of a negative
 * value, and no negative intermediate for bytes >= 0x80.
 *
 * key:     string to hash; it is not modified.
 * returns: a value in the range [0, MAXKEY).
 */
int hash(const char *key) {
	unsigned long h = 0;
	unsigned long g;
	while (*key) {
		h = (h << 4) + (unsigned char) *key++;
		g = h & 0xf0000000UL;//the four bits that would fall out of the 32-bit window
		if (g)
			h ^= g >> 24;
		h &= ~g;//keeps h below 2^28, so the next shift cannot exceed 32 bits
	}
	return (int) (h % MAXKEY);
}
/* One hash-table entry: a word, its frequency, and the link to the next
 * entry that landed in the same bucket. */
struct info
{
	int num;            /* frequency of the word */
	char *address;      /* address of the word's text */
	struct info *next;  /* next node in the hash-collision chain */
};
typedef struct info Info_t;
typedef struct info *pInfo_t;

/* Returns 1 when c is a lowercase ASCII letter ('a'..'z'), otherwise 0. */
int isLetterofAlphabet(char c)
{
	return (c >= 'a' && c <= 'z') ? 1 : 0;
}

/*
 * Sifts the element at index `left` down until the sub-heap rooted there
 * satisfies the min-heap property on `num`.
 *
 * Indices are 0-based: a node at index i is node number i+1, its left child
 * is number 2*i+2 at index 2*i+1, and its right child is number 2*i+3 at
 * index 2*i+2.
 *
 * p:     array of node pointers forming the heap
 * left:  index of the root of the sub-heap to repair
 * right: index of the last element that belongs to the heap
 */
void adjustMinHeap(pInfo_t *p , int left , int right)
{
	int parent = left;
	for (;;)
	{
		int child = 2 * parent + 1;
		if (child > right)
		{
			return;//parent has no children inside the heap
		}
		//only look at the right child when it exists; pick the smaller one
		if (child < right && p[child + 1]->num < p[child]->num)
		{
			child++;
		}
		if (p[parent]->num <= p[child]->num)
		{
			return;//heap property already holds here
		}
		pInfo_t held = p[parent];
		p[parent] = p[child];
		p[child] = held;
		parent = child;
	}
}
/*
 * Turns p[left..right] into a min-heap by sifting down every branch node,
 * starting from the last one and working back towards `left`.
 */
void buildMinHeap(pInfo_t *p , int left , int right)
{
	int node = (right - 1) / 2;//last node that has at least one child
	while (node >= left)
	{
		adjustMinHeap(p , node , right);
		node--;
	}
}

/*
 * Not used by the program; kept only as an illustration of one way to
 * detect where a word starts.
 * Drawback: if p points at the very first character of the data, p[-1]
 * reads outside of it.
 *
 * p points at a letter; the function returns 1 when the preceding byte is
 * not a lowercase letter (so a word begins at p), otherwise 0.
 */
int isWord(char *p)
{
	char previous = p[-1];
	return (previous < 97 || previous > 122) ? 1 : 0;
}


//2
/* Number of highest-frequency words reported by main. */
enum { TOP_WORDS = 10 };

/*
 * Reads one run of lowercase letters into a freshly allocated,
 * NUL-terminated string. The buffer grows as needed, so arbitrarily long
 * runs cannot overflow it.
 *
 * fp:      stream to read from
 * c:       in: the first letter of the word (already consumed from fp);
 *          out: the value that ended the word (a non-letter byte or EOF)
 * length:  out: number of letters stored
 * returns: the word (caller owns it), or NULL when memory runs out
 */
static char *readWord(FILE *fp , int *c , size_t *length)
{
	size_t capacity = 32;
	size_t used = 0;
	char *word = malloc(capacity);
	if (word == NULL)
	{
		return NULL;
	}
	do
	{
		if (used + 1 == capacity)//always keep one byte free for the terminator
		{
			char *grown = realloc(word , capacity * 2);
			if (grown == NULL)
			{
				free(word);
				return NULL;
			}
			word = grown;
			capacity *= 2;
		}
		word[used++] = (char) *c;
		*c = fgetc(fp);
	} while (*c != EOF && isLetterofAlphabet((char) *c));
	word[used] = '\0';
	*length = used;
	return word;
}

/*
 * Counts one occurrence of `word` in the chained hash table.
 *
 * Takes ownership of `word`: when the word is new, the table node keeps the
 * pointer (which is why the text must live on the heap rather than in a
 * local array); when the word is already present, the duplicate text is
 * freed.
 *
 * returns: 0 on success, -1 when a node cannot be allocated (word is freed)
 */
static int recordWord(pInfo_t *table , char *word)
{
	pInfo_t *link = &table[hash(word)];
	while (*link != NULL)
	{
		if (strcmp((*link)->address , word) == 0)
		{
			(*link)->num++;
			free(word);
			return 0;
		}
		link = &(*link)->next;
	}
	pInfo_t node = calloc(1 , sizeof *node);
	if (node == NULL)
	{
		free(word);
		return -1;
	}
	node->address = word;
	node->num = 1;
	*link = node;//append at the end of the collision chain
	return 0;
}

/*
 * Walks every chain of the table and leaves the `limit` most frequent nodes
 * in `top`, using a min-heap so that the weakest kept node sits at top[0].
 *
 * The heap is built at the moment the array becomes full, so the very next
 * node is already compared against it and no node is skipped.
 *
 * returns: how many entries of `top` were filled (fewer than `limit` when
 *          the table holds fewer distinct words)
 */
static int selectTopWords(pInfo_t *table , pInfo_t *top , int limit)
{
	int filled = 0;
	for (int i = 0; i < MAXKEY; i++)
	{
		for (pInfo_t t = table[i]; t != NULL; t = t->next)
		{
			if (filled < limit)
			{
				top[filled++] = t;
				if (filled == limit)
				{
					buildMinHeap(top , 0 , limit - 1);
				}
			}
			else if (top[0]->num < t->num)
			{
				top[0] = t;//replace the weakest kept node, then restore the heap
				adjustMinHeap(top , 0 , limit - 1);
			}
		}
	}
	return filled;
}

/* qsort comparator: higher frequency first, ties broken alphabetically so
 * the output order is deterministic. */
static int compareByCountDesc(const void *lhs , const void *rhs)
{
	pInfo_t a = *(const pInfo_t *) lhs;
	pInfo_t b = *(const pInfo_t *) rhs;
	if (a->num != b->num)
	{
		return (a->num < b->num) ? 1 : -1;
	}
	return strcmp(a->address , b->address);
}

/* Releases every node of the table together with the word text it owns. */
static void freeTable(pInfo_t *table)
{
	for (int i = 0; i < MAXKEY; i++)
	{
		pInfo_t node = table[i];
		while (node != NULL)
		{
			pInfo_t next = node->next;
			free(node->address);
			free(node);
			node = next;
		}
		table[i] = NULL;
	}
}

/*
 * Reads the file named by argv[2], prints three counters, then prints the
 * most frequent words (at most TOP_WORDS) with their frequencies, highest
 * first.
 *
 * A word is a maximal run of lowercase ASCII letters.
 *
 * returns: 0 on success, -1 when the argument is missing, the file cannot
 *          be opened, or memory runs out
 */
int main(int argc , char *argv[])
{
	//the path is taken from argv[2]; argv[1] is not used by this program
	if (argc < 3)
	{
		fprintf(stderr , "usage: %s <arg> <input-file>\n" , argc > 0 ? argv[0] : "program");
		return -1;
	}
	FILE *fp = fopen(argv[2] , "rb");
	if (fp == NULL)
	{
		perror("fp.fopen:");
		return -1;
	}
	pInfo_t hashTable[MAXKEY] = { NULL };
	int c;//int, not char: fgetc returns EOF outside the range of valid bytes
	//NOTE(review): n1 is printed as the character count but only lowercase
	//letters are added to it - confirm whether all bytes should be counted
	int n1 = 0 , n2 = 1 , n3 = 0;//characters, lines, words
	int failed = 0;
	while ((c = fgetc(fp)) != EOF)
	{
		if (isLetterofAlphabet((char) c))
		{
			size_t length = 0;
			char *word = readWord(fp , &c , &length);
			if (word == NULL || recordWord(hashTable , word) != 0)
			{
				failed = 1;
				break;
			}
			n3++;
			n1 += (int) length;
			if (c == EOF)//the word ran up to the end of the file
			{
				break;
			}
		}
		if (c == '\n')//also covers the byte that terminated a word
		{
			n2++;
		}
	}
	fclose(fp);
	if (failed)
	{
		fprintf(stderr , "out of memory while reading words\n");
		freeTable(hashTable);
		return -1;
	}
	printf("字符数%d,行数%d,单词数%d\n" , n1 , n2 , n3);

	pInfo_t A[TOP_WORDS] = { NULL };//pointers to the most frequent nodes
	int found = selectTopWords(hashTable , A , TOP_WORDS);
	qsort(A , (size_t) found , sizeof A[0] , compareByCountDesc);

	for (int i = 0; i < found; i++)
	{//print only the slots that were filled
		printf("%s:%d\n" , A[i]->address , A[i]->num);
	}
	freeTable(hashTable);
	return 0;
}

Guess you like

Origin blog.csdn.net/qq_43496435/article/details/113768920