8.2 Word frequency statistics for English text (project)

Table of contents

Level 1: Read the file

Level 2: Count the number of words

Level 3: Count the number of word occurrences

Level 4: Count the number of occurrences of non-excluded words


Level 1: Read the file

The task for this level: write a small program that reads a file.

Problem description

"Who Moved My Cheese?" is a fable by the American writer Spencer Johnson, first published in 1998. The book is mainly about four "characters" searching for cheese: two little mice, "Sniff" and "Scurry", and two littlepeople, "Hem" and "Haw".

import string


def read_file(file):
    """Receive a file name, read the file's contents into a string,
    keeping only English letters and Western symbols (Chinese filtered out),
    convert all characters to lowercase,
    replace all punctuation and symbols with spaces, and return the string."""
    ########## Begin ##########
    with open(file, 'r', encoding='utf-8') as f:
        txt = f.read()
    txt = ''.join(c for c in txt if ord(c) < 256).lower()  # drop non-Western characters
    for ch in string.punctuation:
        txt = txt.replace(ch, ' ')
    return txt
    ########## End ##########


if __name__ == '__main__':
    filename = 'Who Moved My Cheese.txt'  # file name
    content = read_file(filename)  # returns the cleaned text as a string
    n = int(input())
    print(content[:n])
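The character-by-character replacement loop above works, but the same cleanup can be done in one pass with `str.translate`. This is just an alternative sketch, not part of the exercise; the `sample` string is made up for illustration:

```python
import string

def strip_punctuation(txt):
    # Map every punctuation character to a space, then lowercase the rest.
    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return txt.lower().translate(table)

sample = 'Who Moved My "Cheese"?'  # made-up sample line
print(strip_punctuation(sample).split())  # ['who', 'moved', 'my', 'cheese']
```

`str.translate` applies the whole mapping in a single scan of the string, so it avoids rebuilding the string once per punctuation character.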

Level 2: Count the number of words

The task for this level: write a small program that counts the number of words.

import string


def count_of_words(txt):
    """Receive a string with punctuation and symbols removed;
    return the total number of words and the number of distinct words."""
    ########## Begin ##########
    words = txt.split()
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return len(words), len(counts)
    ########## End ##########

def read_file(file):
    """Receive a file name, read the file's contents into a string,
    keeping only English letters and Western symbols (Chinese filtered out),
    convert all characters to lowercase,
    replace all punctuation and symbols with spaces, and return the string."""
    with open(file, 'r', encoding='utf-8') as novel:
        txt = novel.read()
    english_only_txt = ''.join(x for x in txt if ord(x) < 256)
    english_only_txt = english_only_txt.lower()
    for character in string.punctuation:
        english_only_txt = english_only_txt.replace(character, ' ')
    return english_only_txt

if __name__ == '__main__':
    filename = 'Who Moved My Cheese.txt'  # file name
    content = read_file(filename)  # returns the cleaned text as a string
    amount_results = count_of_words(content)
    print('The text has {} words in total, {} of them distinct'.format(*amount_results))
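The manual dict-with-`get` tally above is the classic idiom; the standard library's `collections.Counter` does the same job in one call. A minimal equivalent sketch (the sample sentence is made up):

```python
from collections import Counter

def count_of_words(txt):
    # Split on whitespace and tally; Counter is a dict subclass
    # whose keys are the distinct words.
    words = txt.split()
    counts = Counter(words)
    return len(words), len(counts)

total, unique = count_of_words('the cheese the maze the cheese')
print(total, unique)  # 6 3
```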
    

Level 3: Count the number of word occurrences

Expected output:

  1. the 369
  2. he 337
  3. to 333
  4. and 312
  5. cheese 214
  6. it 187
  7. they 166
  8. of 158
  9. a 146
  10. had 142

import string


def word_frequency(txt):
    """Receive a string with punctuation and symbols removed;
    count and return the number of occurrences of each word.
    The return value is a dict: word -> occurrence count."""
    ########## Begin ##########
    counts = {}
    for word in txt.split():
        counts[word] = counts.get(word, 0) + 1
    return counts
    ########## End ##########


def top_ten_words(frequency, cnt):
    """Receive the frequency dict; print the cnt most frequent words and their counts."""
    ########## Begin ##########
    ranked = sorted(frequency.items(), key=lambda x: x[1], reverse=True)
    for word, num in ranked[:cnt]:
        print(word, num)
    ########## End ##########

def read_file(file):
    """Receive a file name, read the file's contents into a string,
    keeping only English letters and Western symbols (Chinese filtered out),
    convert all characters to lowercase,
    replace all punctuation and symbols with spaces, and return the string."""
    with open(file, 'r', encoding='utf-8') as novel:
        txt = novel.read()
    english_only_txt = ''.join(x for x in txt if ord(x) < 256)
    english_only_txt = english_only_txt.lower()
    for character in string.punctuation:
        english_only_txt = english_only_txt.replace(character, ' ')
    return english_only_txt

if __name__ == '__main__':
    filename = 'Who Moved My Cheese.txt'  # file name
    content = read_file(filename)  # returns the cleaned text as a string
    frequency_result = word_frequency(content)  # count word frequencies
    n = int(input())
    top_ten_words(frequency_result, n)
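The sort-then-slice step in `top_ten_words` is exactly what `collections.Counter.most_common` provides. A sketch of the same ranking with a made-up sample string:

```python
from collections import Counter

def top_words(txt, cnt):
    # most_common(cnt) returns the cnt (word, count) pairs
    # sorted by count, highest first.
    for word, num in Counter(txt.split()).most_common(cnt):
        print(word, num)

top_words('to the maze to the cheese to cheese', 2)
# to 3
# the 2
```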
    

Level 4: Count the number of occurrences of non-excluded words

Test input: 8

Expected output:

  1. cheese 214
  2. haw 113
  3. what 105
  4. change 86
  5. hem 83
  6. new 70
  7. said 60
  8. maze 46

import string


def top_ten_words_no_excludes(frequency, cnt):
    """Receive the frequency dict; remove common articles, pronouns, linking
    verbs and conjunctions, then print the cnt most frequent remaining words
    and their counts. The words to exclude are:
    excludes_words = ['a', 'an', 'the', 'i', 'he', 'she', 'his', 'my', 'we', 'or', 'is', 'was', 'do',
                      'and', 'at', 'to', 'of', 'it', 'on', 'that', 'her', 'c', 'in', 'you', 'had',
                      's', 'with', 'for', 't', 'but', 'as', 'not', 'they', 'be', 'were', 'so', 'our',
                      'all', 'would', 'if', 'him', 'from', 'no', 'me', 'could', 'when', 'there',
                      'them', 'about', 'this', 'their', 'up', 'been', 'by', 'out', 'did', 'have']
    """
    ########## Begin ##########
    excludes_words = ['a', 'an', 'the', 'i', 'he', 'she', 'his', 'my', 'we', 'or', 'is', 'was', 'do',
                      'and', 'at', 'to', 'of', 'it', 'on', 'that', 'her', 'c', 'in', 'you', 'had',
                      's', 'with', 'for', 't', 'but', 'as', 'not', 'they', 'be', 'were', 'so', 'our',
                      'all', 'would', 'if', 'him', 'from', 'no', 'me', 'could', 'when', 'there',
                      'them', 'about', 'this', 'their', 'up', 'been', 'by', 'out', 'did', 'have']
    for word in excludes_words:
        frequency.pop(word, None)  # the default avoids a KeyError for words absent from the text
    ranked = sorted(frequency.items(), key=lambda x: x[1], reverse=True)
    for word, num in ranked[:cnt]:
        print(word, num)
    ########## End ##########


def read_file(file):
    """Receive a file name, read the file's contents into a string,
    keeping only English letters and Western symbols (Chinese filtered out),
    convert all characters to lowercase,
    replace all punctuation and symbols with spaces, and return the string."""
    with open(file, 'r', encoding='utf-8') as novel:
        txt = novel.read()
    english_only_txt = ''.join(x for x in txt if ord(x) < 256)
    english_only_txt = english_only_txt.lower()
    for character in string.punctuation:
        english_only_txt = english_only_txt.replace(character, ' ')
    return english_only_txt

def word_frequency(txt):
    """Receive a string with punctuation and symbols removed;
    count and return the number of occurrences of each word.
    The return value is a dict: word -> occurrence count."""
    frequency = dict()
    words_list = txt.split()
    for word in words_list:
        frequency[word] = frequency.get(word, 0) + 1
    return frequency



if __name__ == '__main__':
    filename = 'Who Moved My Cheese.txt'  # file name
    content = read_file(filename)  # returns the cleaned text as a string
    frequency_result = word_frequency(content)  # count word frequencies
    n = int(input())
    top_ten_words_no_excludes(frequency_result, n)
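Note that `top_ten_words_no_excludes` mutates the caller's dict by popping entries. A side-effect-free variant builds a filtered copy and tests membership against a set, which is O(1) per lookup; a sketch with a small made-up frequency dict:

```python
def top_words_excluding(frequency, excludes, cnt):
    # Build a filtered copy instead of pop()ing from the original dict,
    # so excluded words absent from the text cause no error and the
    # caller's dict is left intact.
    stop = set(excludes)
    filtered = {w: n for w, n in frequency.items() if w not in stop}
    return sorted(filtered.items(), key=lambda x: x[1], reverse=True)[:cnt]

freq = {'the': 369, 'cheese': 214, 'haw': 113, 'a': 146}  # made-up counts
print(top_words_excluding(freq, ['a', 'the', 'of'], 2))
# [('cheese', 214), ('haw', 113)]
```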

 


Origin blog.csdn.net/m0_70456205/article/details/130716386