版权声明:转载请注明出处。 https://blog.csdn.net/Xin_101/article/details/83154357
1 统计词频,并按词频从高到低排序生成词汇表
import codecs
import collections
from operator import itemgetter
RAW_DATA = "vocabulary.txt"
VOCAB_OUTPUT = "ptb.vocab"
# Reserved tokens written at the top of the vocabulary, each exactly once.
SPECIAL_TOKENS = ("<unk>", "<sos>", "<eos>")


def count_words(lines):
    """Count whitespace-separated tokens across an iterable of text lines.

    Args:
        lines: Iterable of strings (e.g. an open text file).

    Returns:
        A ``collections.Counter`` mapping each token to its occurrence count.
    """
    counter = collections.Counter()
    for line in lines:
        counter.update(line.strip().split())
    return counter


def rank_words(counter):
    """Return ``(word, count)`` pairs ordered from most to least frequent.

    The sort is stable, so words with equal counts keep the order in which
    they were first seen.
    """
    return sorted(counter.items(), key=itemgetter(1), reverse=True)


def build_vocab(sorted_word_to_cnt, special_tokens=SPECIAL_TOKENS):
    """Build the vocabulary list: special tokens first, then ranked words.

    Every entry appears exactly once. The line number of a word in the
    vocabulary file is later used as its ID, so a duplicated entry would
    leave one ID unused. Corpus words that coincide with a special token
    are therefore dropped from the ranked part.

    Args:
        sorted_word_to_cnt: ``(word, count)`` pairs, already ranked.
        special_tokens: Reserved tokens to place at the top.

    Returns:
        List of vocabulary entries in output order.
    """
    reserved = set(special_tokens)
    ranked_words = [
        word for word, _ in sorted_word_to_cnt if word not in reserved
    ]
    return list(special_tokens) + ranked_words


def write_vocab(words, path):
    """Write one vocabulary entry per line to *path*, encoded as UTF-8."""
    with codecs.open(path, "w", "utf-8") as file_output:
        for word in words:
            file_output.write(word + "\n")


def build_vocab_file(raw_path=RAW_DATA, vocab_path=VOCAB_OUTPUT):
    """Read the raw corpus, rank its words by frequency, write the vocabulary.

    Prints the counter, the ranked pairs and the final vocabulary, then
    returns the vocabulary list.
    """
    with codecs.open(raw_path, "r", "utf-8") as f:
        counter = count_words(f)
    print(counter)
    sorted_word_to_cnt = rank_words(counter)
    print(sorted_word_to_cnt)
    sorted_words = build_vocab(sorted_word_to_cnt)
    print(sorted_words)
    write_vocab(sorted_words, vocab_path)
    return sorted_words


if __name__ == "__main__":
    build_vocab_file()
2 将文本中的单词转换为词汇表中对应的编号
import codecs
import sys
RAW_DATA = "vocabulary.txt"
VOCAB = "ptb.vocab"
OUTPUT_DATA = "ptb.train"

# Maps each vocabulary word to its zero-based line number in the vocabulary
# file. Filled in place by encode_corpus() so that get_id() can read it.
word_to_id = {}


def load_vocab(vocab_path):
    """Read the vocabulary file and return a ``{word: line_number}`` dict.

    Line numbers are zero-based. If a word occurs on several lines, the
    last occurrence determines its ID.
    """
    with codecs.open(vocab_path, "r", "utf-8") as f_vocab:
        vocab = [w.strip() for w in f_vocab.readlines()]
    return {word: idx for idx, word in enumerate(vocab)}


def get_id(word):
    """Return the ID of *word*, or the ID of ``"<unk>"`` if it is unknown.

    Raises:
        KeyError: If *word* is unknown and ``"<unk>"`` is not in the
            vocabulary either.
    """
    if word in word_to_id:
        return word_to_id[word]
    return word_to_id["<unk>"]


def encode_line(line):
    """Convert one text line to space-separated IDs ending with ``<eos>``."""
    words = line.strip().split() + ["<eos>"]
    return " ".join(str(get_id(w)) for w in words) + "\n"


def encode_corpus(raw_path=RAW_DATA, vocab_path=VOCAB, output_path=OUTPUT_DATA):
    """Replace every word of the raw corpus by its vocabulary ID.

    Loads the vocabulary from *vocab_path*, then writes one line of IDs to
    *output_path* for every line of *raw_path*.
    """
    word_to_id.clear()
    word_to_id.update(load_vocab(vocab_path))
    # Context managers close both files even if a lookup or write fails
    # part-way through the corpus.
    with codecs.open(raw_path, "r", "utf-8") as fin, \
            codecs.open(output_path, "w", "utf-8") as fout:
        for line in fin:
            fout.write(encode_line(line))


if __name__ == "__main__":
    encode_corpus()