Implementing a text word count (with run-time statistics) in C on Linux

Implementing a text word count (with run-time statistics) in C on Linux

由于WHUT的云计算课程实验要求,采用C lang实现word count功能。不得不说,用C语言处理字符串,真有一种想砸了电脑的冲动。。。

Here is the source code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>


int* findn(int A[], int N);

/* Capacity limits of the in-memory word table. */
enum {
	MAX_WORDS = 60,		/* maximum number of distinct words tracked */
	MAX_WORD_LEN = 15,	/* bytes per word buffer, including the terminating '\0' */
	TOP_N = 3		/* findn() always returns exactly three indices */
};

/* Returns non-zero if c ends the current word (whitespace or the '#' end marker). */
static int is_delimiter(int c) {
	return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '#';
}

/* Returns non-zero if c is a punctuation mark that is dropped from words. */
static int is_punctuation(int c) {
	return c == '.' || c == ',' || c == '!' || c == '?' || c == ':' || c == '#';
}

/* Maps an ASCII upper-case letter to lower case; every other value is returned unchanged. */
static int fold_ascii(int c) {
	return (c >= 'A' && c <= 'Z') ? c + ('a' - 'A') : c;
}

/*
 * Returns non-zero if a and b are the same word. Only the case of the first
 * letter is ignored ("The" == "the"); the remaining characters must match exactly.
 */
static int same_word(const char *a, const char *b) {
	if (a[0] == '\0' || b[0] == '\0') {
		return a[0] == b[0];
	}
	return fold_ascii((unsigned char)a[0]) == fold_ascii((unsigned char)b[0])
		&& strcmp(a + 1, b + 1) == 0;
}

/*
 * Counts one occurrence of key: increments the weight of a matching stored
 * word, or appends key as a new word. When the table is already full the
 * word is not stored and *dropped is incremented instead.
 */
static void record_word(char store[][MAX_WORD_LEN], int weight[], int *word_count,
			const char *key, int *dropped) {
	for (int i = 0; i < *word_count; i++) {
		if (same_word(store[i], key)) {
			weight[i]++;
			return;
		}
	}

	if (*word_count >= MAX_WORDS) {	/* table full: never write past the arrays */
		(*dropped)++;
		return;
	}

	snprintf(store[*word_count], MAX_WORD_LEN, "%s", key);
	weight[*word_count] = 1;
	(*word_count)++;
}

/*
 * Reads the text file, counts how often each word occurs, prints the (up to)
 * three most frequent words with their counts, and reports the elapsed CPU time.
 *
 * Words are separated by whitespace or '#'; the characters . , ! ? : are
 * removed from words. Words longer than MAX_WORD_LEN - 1 characters are
 * truncated, and at most MAX_WORDS distinct words are tracked.
 *
 * Returns 0 on success, EXIT_FAILURE if the file cannot be opened or read.
 */
int main() {

	char fileName[256] = "/usr/local/data.dat";

	FILE* fp = fopen(fileName, "r");	/* open the file read-only */

	if (fp == NULL) {	/* without this check fgetc(NULL) would crash */
		perror(fileName);
		return EXIT_FAILURE;
	}

	clock_t start = clock();

	char store[MAX_WORDS][MAX_WORD_LEN] = { { '\0' } };	/* one distinct word per row */
	int weight[MAX_WORDS] = { 0 };				/* weight[i] = occurrences of store[i] */
	int word_count = 0;					/* number of distinct words stored */
	int dropped = 0;					/* words ignored because the table was full */

	char key[MAX_WORD_LEN] = { '\0' };			/* word currently being assembled */
	int key_length = 0;
	int value;

	while ((value = fgetc(fp)) != EOF) {

		if (is_delimiter(value)) {

			/* Consecutive delimiters produce no word, so skip empty keys. */
			if (key_length > 0) {
				key[key_length] = '\0';
				record_word(store, weight, &word_count, key, &dropped);
				key_length = 0;
			}

		} else if (!is_punctuation(value)) {

			/* Keep one byte free for the terminator; excess characters are dropped. */
			if (key_length < MAX_WORD_LEN - 1) {
				key[key_length++] = (char)value;
			}

		}

	}

	/* The last word may end at EOF without a trailing delimiter. */
	if (key_length > 0) {
		key[key_length] = '\0';
		record_word(store, weight, &word_count, key, &dropped);
	}

	int read_failed = ferror(fp);

	fclose(fp);

	if (read_failed) {
		fprintf(stderr, "%s: read error\n", fileName);
		return EXIT_FAILURE;
	}

	if (dropped > 0) {
		fprintf(stderr, "warning: %d word occurrence(s) ignored, table holds at most %d distinct words\n",
			dropped, MAX_WORDS);
	}

	if (word_count > 0) {

		int* index = findn(weight, word_count);

		/* findn() returns three slots; only the first word_count are meaningful. */
		int shown = word_count < TOP_N ? word_count : TOP_N;

		for (int i = 0; i < shown; i++) {

			int order = index[i];

			printf("%s%4d\n", store[order], weight[order]);

		}

	}

	clock_t stop = clock();

	double duration = (double)(stop - start) / CLOCKS_PER_SEC;

	printf("word_count time: %fs\n",  duration);

	return 0;

}


/*
 * findn - find the positions of the three largest values in A.
 *
 * A: array of N values (word frequencies in this program); it is not modified.
 * N: number of elements in A.
 *
 * Returns a pointer to a static array of three indices into A, ordered from
 * the largest value to the third largest. Among equal values the lower index
 * comes first. If N is smaller than 3, the unused trailing slots are set to 0.
 *
 * The result array is static so that it remains valid after the function
 * returns; each call overwrites the result of the previous call.
 */
int* findn(int A[], int N) {

	static int index[3] = { 0 };

	int filled = 0;	/* number of result slots already decided */

	/* The array is static, so clear what a previous call left behind. */
	for (int j = 0; j < 3; j++) {
		index[j] = 0;
	}

	while (filled < 3) {

		int best = -1;	/* index of the largest value not chosen yet */

		for (int i = 0; i < N; i++) {

			int taken = 0;

			for (int k = 0; k < filled; k++) {
				if (index[k] == i) {
					taken = 1;
					break;
				}
			}

			/* Compare values with values; 'best' itself is only a position. */
			if (!taken && (best < 0 || A[i] > A[best])) {
				best = i;
			}

		}

		if (best < 0) {	/* fewer than three elements available */
			break;
		}

		index[filled++] = best;

	}

	return index;

}


Some explanation :

  1. The program does not handle every English punctuation mark, but you can extend the punctuation check at the point where the word key is built in your own version of the program.

  2. The program uses # as the end marker: put it after the last word of the input text to terminate that word.

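  3. The program compiles with GCC and runs on Linux. (Note: make sure the input file can actually be opened. I learned the hard way that a file path containing Chinese characters did not work for me and produced 段错误(核心转储), i.e. "Segmentation fault (core dumped)".)
    As shown below: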
    Here Insert Picture Description

  4. The findn function only finds the indices of the three most frequent words; returning the indices of the top n words for an arbitrary n is not implemented. Also note that the array findn returns must be declared static inside the function, because a pointer to an ordinary local array is no longer valid after the function returns.

  5. The implementations of the header files <string.h> and <time.h> differ between GCC on Linux and the C compiler of Visual Studio on Windows.
    I will not list all the differences here, only the ones that affected this program:

    1. With the Visual Studio C compiler I used strcpy() to extract part of a string, whereas on Linux I used strncpy() for this instead.
    2. With the Visual Studio C compiler the number of clock ticks per second is available as CLK_TCK, whereas on Linux the standard macro CLOCKS_PER_SEC is used instead.

Finally, I have to say that string handling in C is really painful! I can only sigh: Java's String is great!

Published 69 original articles · won praise 11 · views 8440

Guess you like

Origin blog.csdn.net/qq_40994260/article/details/105091051