统计数据集(txt文档)中的单词种类

# Example arguments: (seq.in, ./vocab/in_vocab) -- seq.in holds the input sentences, in_vocab is the vocabulary (dictionary) file.
def createVocabulary(input_path: str, output_path: str,
                     no_pad: bool = False, no_unk: bool = False) -> None:
    """Write a frequency-ordered vocabulary file for a whitespace-tokenised corpus.

    Every line of ``input_path`` is split on whitespace and the occurrences of
    each token are counted. The vocabulary is written to ``output_path`` one
    token per line: first the reserved tokens, then the corpus tokens from
    most to least frequent (ties keep first-seen order).

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 vocabulary file to create/overwrite.
        no_pad: When true, the reserved ``_PAD`` entry is not written.
        no_unk: When true, the reserved ``_UNK`` entry is not written.

    Raises:
        TypeError: If ``input_path`` or ``output_path`` is not a ``str``.
    """
    from collections import Counter

    if not isinstance(input_path, str):
        raise TypeError('input_path should be string')

    if not isinstance(output_path, str):
        raise TypeError('output_path should be string')

    # Reserved tokens always come first, in this fixed order.
    reserved = []
    if not no_pad:
        reserved.append('_PAD')
    if not no_unk:
        reserved.append('_UNK')

    counts = Counter()
    with open(input_path, 'r', encoding='UTF-8') as fd:
        for line in fd:
            # split() with no argument also discards the trailing newline.
            for token in line.split():
                # A reserved token that appears literally in the corpus must
                # not be emitted a second time after the header entries.
                if token in reserved:
                    continue
                # Collapse every all-digit token into the single entry '0'.
                if token.isdigit():
                    token = '0'
                counts[token] += 1

    # sorted() is stable (also with reverse=True), so equally frequent tokens
    # stay in the order in which they were first seen.
    ordered = reserved + sorted(counts, key=counts.get, reverse=True)

    # The output is opened only after the input was read completely, so a
    # failure while reading does not leave a truncated vocabulary behind.
    with open(output_path, 'w', encoding='UTF-8') as out:
        out.writelines(token + '\n' for token in ordered)

发布了41 篇原创文章 · 获赞 44 · 访问量 7665

猜你喜欢

转载自blog.csdn.net/tailonh/article/details/105038660