Write a program to count the number of occurrences of each word in an English text file

Table of contents

Brief description of the topic

Code writing

Python code implementation

C language code implementation

C++ code implementation


Brief description of the topic


[Problem Description] Write a program to count the number of occurrences of each word in an English text file (word frequency statistics), and output the statistical results to the screen in dictionary order of words. Note: Here a word is a sequence of characters consisting only of letters. Words containing uppercase letters should be counted after converting the uppercase letters to lowercase letters.

[Input form] Open the file "article.txt" in the current directory, read English words from it, and perform word frequency statistics.

[Output form] The program outputs the word statistics to the screen in dictionary order. Each line contains one word and its number of occurrences, separated by a single space. There is no space after the number of occurrences; it is followed directly by a newline.

[Sample input] The content of the file article.txt in the current directory is as follows: "Do not take to heart every thing you hear." "Do not spend all that you have." "Do not sleep as long as you want,"

[Sample output]

all 1

as 2

do 3

every 1

have 1

……

Code writing


Python code implementation

import re

def word_count(file_path):
    """Print the frequency of every word in a text file, in dictionary order.

    A word is a maximal run of ASCII letters. Uppercase letters are folded to
    lowercase before counting, so "Do" and "do" are the same word. Digits,
    underscores, punctuation and whitespace all act as separators.

    Args:
        file_path: Path of the UTF-8 encoded text file to analyse.

    Returns:
        None. One line per distinct word, formatted as "<word> <count>", is
        written to standard output.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Maps each lowercase word to the number of times it has been seen.
    word_dict = {}

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Match letters only: r'\w+' would also accept digits and
            # underscores ("42", "do_not"), which the task does not treat as
            # part of a word. Matching happens before lowercasing so that
            # non-ASCII characters cannot be folded into ASCII letters.
            for match in re.findall(r'[A-Za-z]+', line):
                word = match.lower()
                word_dict[word] = word_dict.get(word, 0) + 1

    # Sorting the (word, count) pairs orders them by word.
    for word, count in sorted(word_dict.items()):
        print(f'{word} {count}')

if __name__ == "__main__":
    # The task specifies "article.txt" in the current working directory, so a
    # relative path is used rather than a machine-specific absolute path.
    file_path = "article.txt"

    # Print the word-frequency report for that file.
    word_count(file_path)

Note that the program processes the file line by line: it folds the text to lowercase, uses a regular expression to extract the words, and tallies them in a dictionary. Finally, the statistical results are sorted in dictionary order of the words and printed to the screen.

C language code implementation

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define MAX_WORD_LENGTH 100

// Holds one distinct word together with the number of times it occurred.
struct WordFrequency {
    char word[MAX_WORD_LENGTH];  // NUL-terminated word text (at most MAX_WORD_LENGTH - 1 characters)
    int count;                   // number of occurrences of this word
};

// 比较函数用于排序
int compare(const void *a, const void *b) {
    return strcmp(((struct WordFrequency *)a)->word, ((struct WordFrequency *)b)->word);
}

// 函数用于统计单词频率并按字典序输出
void word_count(const char *file_path) {
    FILE *file = fopen(file_path, "r");
    if (file == NULL) {
        perror("Error opening file");
        exit(EXIT_FAILURE);
    }

    struct WordFrequency *word_freq_array = NULL;
    int array_size = 0;

    // 读取文件内容
    char line[256];
    while (fgets(line, sizeof(line), file) != NULL) {
        char *token = strtok(line, " \t\n");

        while (token != NULL) {
            // 将单词转换为小写
            for (int i = 0; token[i]; i++) {
                token[i] = tolower(token[i]);
            }

            // 在 word_freq_array 中查找单词
            int found = 0;
            for (int i = 0; i < array_size; i++) {
                if (strcmp(word_freq_array[i].word, token) == 0) {
                    word_freq_array[i].count++;
                    found = 1;
                    break;
                }
            }

            // 如果单词未找到,则添加到 word_freq_array 中
            if (!found) {
                array_size++;
                word_freq_array = realloc(word_freq_array, array_size * sizeof(struct WordFrequency));
                if (word_freq_array == NULL) {
                    perror("Memory allocation error");
                    exit(EXIT_FAILURE);
                }
                strcpy(word_freq_array[array_size - 1].word, token);
                word_freq_array[array_size - 1].count = 1;
            }

            token = strtok(NULL, " \t\n");
        }
    }

    fclose(file);

    // 按字典序对单词进行排序
    qsort(word_freq_array, array_size, sizeof(struct WordFrequency), compare);

    // 输出排序后的单词及其频率
    for (int i = 0; i < array_size; i++) {
        printf("%s %d\n", word_freq_array[i].word, word_freq_array[i].count);
    }

    // 释放动态分配的内存
    free(word_freq_array);
}

// Entry point: prints the word-frequency report for "article.txt".
int main(void) {
    // The task specifies "article.txt" in the current working directory, so a
    // relative path is used rather than a machine-specific absolute path.
    const char *file_path = "article.txt";
    word_count(file_path);

    return 0;
}

C++ code implementation


#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

// One row of the report: a word and the number of times it occurred.
struct WordFrequency {
    std::string word;  // the word text
    int count;         // number of occurrences of the word
};

// Ordering predicate for sorting: true when a's word sorts strictly before
// b's word in lexicographic order.
bool compare(const WordFrequency& a, const WordFrequency& b) {
    return a.word.compare(b.word) < 0;
}

// Counts how often each word occurs in the file at `file_path` and prints the
// result to standard output, one "<word> <count>" line per distinct word, in
// dictionary order.
//
// A word is a maximal run of letters as classified by std::isalpha; uppercase
// letters are folded to lowercase. Every other character (punctuation, digits,
// whitespace) separates words.
//
// If the file cannot be opened, an error is written to standard error and the
// program exits with EXIT_FAILURE.
void word_count(const std::string& file_path) {
    std::ifstream file(file_path);
    if (!file.is_open()) {
        std::cerr << "Error opening file." << std::endl;
        std::exit(EXIT_FAILURE);
    }

    std::map<std::string, int> word_freq_map;

    // Letters of the word currently being read; empty between words.
    std::string word;
    const auto flush_word = [&]() {
        if (!word.empty()) {
            ++word_freq_map[word];
            word.clear();
        }
    };

    // Scan character by character so that punctuation ends a word instead of
    // becoming part of it, as whitespace-only tokenising would allow.
    char raw;
    while (file.get(raw)) {
        // The <cctype> functions require a value representable as unsigned
        // char; passing a negative plain char is undefined behaviour.
        const unsigned char ch = static_cast<unsigned char>(raw);
        if (std::isalpha(ch)) {
            word.push_back(static_cast<char>(std::tolower(ch)));
        } else {
            flush_word();
        }
    }
    flush_word();  // the file may end in the middle of a word

    // std::map iterates in ascending key order, so the entries are already in
    // dictionary order and no separate sort is needed.
    for (const auto& entry : word_freq_map) {
        std::cout << entry.first << ' ' << entry.second << '\n';
    }
}

// Entry point: prints the word-frequency report for "article.txt".
int main() {
    // The task specifies "article.txt" in the current working directory, so a
    // relative path is used rather than a machine-specific absolute path.
    const std::string file_path = "article.txt";
    word_count(file_path);

    return 0;
}

Guess you like

Origin blog.csdn.net/qq_50942093/article/details/134936490