Using fastText on Windows

Copyright notice: this is the blogger's original article; do not repost without the blogger's permission. https://blog.csdn.net/s1162276945/article/details/85231923

gensim-fastText
Just run pip install gensim.

UserWarning: C extension not loaded, training will be slow. Install a C compiler and reinstall gensim for fast training.

https://radimrehurek.com/gensim/models/fasttext.html
https://www.cs.mcgill.ca/~mxia3/FastText-for-Windows/
Since fixing this means installing VS2017, I gave up; if training is slow, so be it.
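As an aside (my own addition, not from the original post): you can check whether gensim's compiled training routines are loaded via its FAST_VERSION flag; a value of -1 means only the slow pure-Python path is available. A minimal sketch, assuming gensim 3.x:

# Check whether the compiled (Cython/C) training routines are available.
# FAST_VERSION is -1 when only the pure-Python fallback is loaded.
from gensim.models.word2vec import FAST_VERSION

if FAST_VERSION == -1:
    print("C extension not loaded, training will use the slow Python path")
else:
    print("C extension loaded, FAST_VERSION =", FAST_VERSION)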

gensim-fastText programming exercises
https://blog.csdn.net/sinat_26917383/article/details/83041424
https://radimrehurek.com/gensim/models/fasttext.html

The relationship between fastText and Word2Vec
model: Training architecture. Allowed values: cbow, skipgram (Default cbow). Word2Vec has the same two forms, cbow and skipgram (in gensim's native API this choice is the sg flag; see the sketch after this parameter list).

word_ngrams ({1,0}, optional)
word_ngrams = 1: train with subword (character n-gram) information.
word_ngrams = 0: no subword information; this is equivalent to Word2Vec.
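A minimal sketch tying these two parameters to gensim's native API (my own illustration, assuming the gensim 3.x API used in this post: the cbow/skipgram choice is the sg flag for both FastText and Word2Vec, and word_ngrams controls whether character n-grams are learned on top of the word vectors):

from gensim.models import FastText, Word2Vec

sentences = [["你", "是", "谁"], ["我", "是", "中国人"]]

# cbow vs skipgram is the same switch in both models: sg=0 -> cbow, sg=1 -> skipgram.
ft = FastText(sentences, size=4, window=3, min_count=1, iter=10, sg=1, word_ngrams=1)
w2v = Word2Vec(sentences, size=4, window=3, min_count=1, iter=10, sg=1)

print(ft.wv["中国人"])          # fastText word vector (word + n-gram information)
print(w2v.wv["中国人"])         # Word2Vec word vector (word only)
print(ft.wv.num_ngram_vectors)  # n-gram vectors learned in addition to the word vectors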

fastText can be called directly from gensim.models.

# encoding:utf8
"""参考:https://blog.csdn.net/sinat_26917383/article/details/83041424"""

# Main training code
from gensim.models import FastText

# sentences = [["你", "是", "谁"], ["我", "是", "中国人"]]
# model = FastText(sentences, size=5, window=3, min_count=1, iter=10, min_n=3, max_n=6, word_ngrams=0)
# fasttext_model = FastText(sentences, size=5, window=3, min_count=1, iter=10, min_n=3, max_n=6, word_ngrams=0)

# print(model.wv["你"])
# print(model.wv.word_vec("你"))
# fname = "D:\\pycode\\SharedData\\test"

# Save the fastText model
# model.save(fname)

# Save the fastText word vectors in word2vec format
# fasttext_model.wv.save_word2vec_format('../SharedData/test_fasttext.txt', binary=False)
# fasttext_model.wv.save_word2vec_format('../SharedData/test_fasttext.bin', binary=True)

# Load the fastText model
# model = FastText.load(fname)
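Not in the original post, but as a companion to the word2vec-format export above: those files can be read back with KeyedVectors.load_word2vec_format. Note that the plain word2vec format keeps only the full-word vectors, so the subword (n-gram) information is lost. A sketch, assuming the file names used above:

# # Load the exported vectors back (word2vec format, no subword information)
# from gensim.models import KeyedVectors
# wv_txt = KeyedVectors.load_word2vec_format('../SharedData/test_fasttext.txt', binary=False)
# wv_bin = KeyedVectors.load_word2vec_format('../SharedData/test_fasttext.bin', binary=True)
# print(wv_txt["中国人"])  # full-word vector only; OOV/subword lookup is not available here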


# # Online (incremental) training of fastText
# from gensim.models import FastText
# sentences_1 = [["cat", "say", "meow"], ["dog", "say", "woof"]]
# sentences_2 = [["dude", "say", "wazzup!"]]
# model = FastText(min_count=1)
#
# model.build_vocab(sentences_1)
# model.train(sentences_1, total_examples=model.corpus_count, epochs=model.epochs)
#
# model.build_vocab(sentences_2, update=True)
# model.train(sentences_2, total_examples=model.corpus_count, epochs=model.epochs)
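A small check I would add here (my own addition, assuming the gensim 3.x attributes used elsewhere in this post): after the incremental update, the new words from sentences_2 should have entered the vocabulary.

# # After the update, the newly seen word should be in the vocabulary:
# print("dude" in model.wv.vocab)  # expected: True
# print(model.wv["dude"])          # its vector (meaningless on this toy corpus)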

# Get the word vectors
sentences = [["你", "是", "谁"], ["我", "是", "中国人"]]
fasttext_model = FastText(sentences, size=4, window=3, min_count=1, iter=10, min_n=3, max_n=6, word_ngrams=0)

# word vector
# print(fasttext_model.wv.vectors)  # (5, 4)
# print(fasttext_model.wv.vectors_vocab)  # (5, 4)

# ngram vector
# print(fasttext_model.wv.min_n)  
# print(fasttext_model.wv.max_n)
# min_n=3, max_n=6: each n-gram contains at least 3 and at most 6 characters; here one Chinese character counts as one character
# 3gram,4gram,5gram,6gram
# print(fasttext_model.wv.num_ngram_vectors)  # 10
# print(fasttext_model.wv.vectors_ngrams)  # (10, 4)

# ['<中国', '中国人', '国人>', '<中国人', '中国人>', '<中国人>', '<你>', '<我>', '<是>', '<谁>']

# NOTE: this is a private gensim helper; its exact name and module have changed
# across gensim versions (e.g. it may be exposed as _compute_ngrams elsewhere).
from gensim.models.utils_any2vec import _compute_ngrams_py
ngrams = _compute_ngrams_py("吃了吗", min_n=3, max_n=6)
print(len(ngrams))  # 6
print(ngrams)
# ['<吃了', '吃了吗', '了吗>', '<吃了吗', '吃了吗>', '<吃了吗>']

# Find words similar to a given word
# fastText's built-in OOV capability
# Get fastText's n-gram vectors
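A minimal sketch of these three points (my own illustration, assuming the gensim 3.x API used above; the model is retrained here with word_ngrams=1 so that subword/OOV lookups are available):

from gensim.models import FastText

sentences = [["你", "是", "谁"], ["我", "是", "中国人"]]
# Retrain with word_ngrams=1 so that subword information is kept.
oov_model = FastText(sentences, size=4, window=3, min_count=1, iter=10,
                     min_n=3, max_n=6, word_ngrams=1)

# 1) Similar words (the scores are meaningless on this toy corpus, but the call works):
print(oov_model.wv.most_similar("中国人", topn=3))

# 2) Built-in OOV handling: "中国" is not in the vocabulary but shares n-grams with "中国人":
print("中国" in oov_model.wv.vocab)  # False
print(oov_model.wv["中国"])          # vector composed from the matching n-gram vectors

# 3) The n-gram vectors themselves:
print(oov_model.wv.vectors_ngrams.shape)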
