Natural Language Processing -- Computing Document Vectors with Doc2vec

Just as with training word vectors, you can use the gensim package to train document vectors.

import multiprocessing
# The gensim Doc2vec model contains word vector embeddings as well as a document vector for each document in the corpus
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
# gensim's simple_preprocess utility is a crude tokenizer that drops one-letter tokens and all punctuation
from gensim.utils import simple_preprocess

# Count the CPU cores so the number of worker threads can be set later
num_cores = multiprocessing.cpu_count()

# Provide an object that iterates over the document strings one at a time
corpus = ["This class provides a uniform interface to fast distance metric functions. ",
          "The various metrics can be accessed via the get_metric class method and the metric string identifier (see below).",
          "Though intended for integer-valued vectors, these are also valid metrics in the case of real-valued vectors.",
          "Here func is a function which takes two one-dimensional numpy arrays, and returns a distance."]

training_corpus = []
# Tag each document
for i, text in enumerate(corpus):
    tagged_doc = TaggedDocument(simple_preprocess(text), [i])
    training_corpus.append(tagged_doc)
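# With simple_preprocess, the first tagged document looks roughly like:
# TaggedDocument(words=['this', 'class', 'provides', 'uniform', ...], tags=[0])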

# Instantiate a Doc2vec object with a sliding window of 10 words and
# 100-dimensional word and document vectors; min_count is the minimum
# document frequency for a token to be kept in the vocabulary
model = Doc2Vec(size=100, window=10, min_count=2, workers=num_cores, iter=10)
# The vocabulary must be compiled before training can start
model.build_vocab(training_corpus)
# Training stops after 10 epochs
model.train(training_corpus, total_examples=model.corpus_count, epochs=model.iter)

# Inference
# Doc2vec requires an extra training step when inferring a vector for a new document;
# here the vector is updated over 10 steps (iterations)
result = model.infer_vector(simple_preprocess('Convert the true distance to the reduced distance.'), steps=10)
print(result)
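
The listing above uses gensim 3.x parameter names (size, iter, and infer_vector's steps). In gensim 4.0 and later these were renamed to vector_size and epochs, and the trained document vectors are exposed through model.dv. Below is a minimal sketch of the same workflow, assuming gensim >= 4.0 is installed, which also looks up the training document most similar to the newly inferred vector:

from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.utils import simple_preprocess

# Same tagged corpus as above (reuses corpus and num_cores from the earlier listing)
training_corpus = [TaggedDocument(simple_preprocess(text), [i])
                   for i, text in enumerate(corpus)]

# gensim >= 4.0: size -> vector_size, iter -> epochs
model = Doc2Vec(vector_size=100, window=10, min_count=2,
                workers=num_cores, epochs=10)
model.build_vocab(training_corpus)
model.train(training_corpus, total_examples=model.corpus_count, epochs=model.epochs)

# infer_vector's steps argument is also named epochs in gensim >= 4.0
new_vec = model.infer_vector(
    simple_preprocess('Convert the true distance to the reduced distance.'),
    epochs=10)

# Trained document vectors live in model.dv (a KeyedVectors object);
# find the tag of the training document closest to the inferred vector
print(model.dv.most_similar([new_vec], topn=1))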

Reposted from blog.csdn.net/fgg1234567890/article/details/112975073