文本表示方法--单词嵌入向量(word2vec)

在这里插入图片描述

# 下载相应数据集
# wget https://storage.googleapis.com/cluebenchmark/tasks/tnews_public.zip

import pandas as pd
import json
import jieba


## 0.gensim词向量实战
#   1.读取预处理的数据集
#   2.训练
#   3.结果

# 1.1数据预处理
def get_sentence(data_file):
    """Read a JSON-lines file and return the ``sentence`` field of each record.

    Args:
        data_file: Path to a UTF-8 text file with one JSON object per line;
            every object must contain a ``sentence`` key.

    Returns:
        A list of the ``sentence`` values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``sentence`` key.
    """
    sentence = []
    # The context manager closes the handle even when parsing fails; the
    # previous version opened the file and never closed it.
    with open(data_file, encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            line = line.strip()
            if not line:
                # Blank or whitespace-only lines (e.g. an extra empty line at
                # the end of the file) carry no record; json.loads('') would
                # otherwise raise.
                continue
            sentence.append(json.loads(line)['sentence'])
    return sentence


# Read the raw sentences of the three dataset splits (train / test / dev),
# in that order.
train_sentence, test_sentence, dev_sentence = (
    get_sentence(split_file)
    for split_file in (
        r'tnews_public/train.json',
        r'tnews_public/test.json',
        r'tnews_public/dev.json',
    )
)

# Full corpus: pool every split and tokenize each sentence with jieba, so
# train_data holds one list of tokens per sentence.
train_data = []
for raw_text in train_sentence + test_sentence + dev_sentence:
    train_data.append(list(jieba.cut(raw_text)))

# Dump the tokenized corpus followed by its sentence count.
print(train_data)
print(len(train_data))

# 3.构建词向量模型
from gensim.models.word2vec import  LineSentence
from gensim.models import word2vec
import gensim

import logging

# Format applied to every record emitted through the root logger.
# Each %(name) field needs a conversion type: the previous string ended with a
# bare '%(message)', which makes %-formatting of every record raise ValueError,
# so no log line was ever rendered.
LOG_FORMAT = '%(asctime)s:%(levelname)s:%(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
# 构建模型
from gensim.models import FastText
# print(help(FastText))
# Reference note: the bare string below is an expression statement that is
# never used at runtime. It presumably records the keyword arguments and
# defaults printed by help(FastText) -- verify against the installed gensim.
"""
sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100, alpha=0.025,
window=5, min_count=5,
max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6,
sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
max_final_vocab=None
"""
# Disabled alternative: train a FastText model on the same tokenized corpus.
# model = FastText(train_data,vector_size=4,window=3,min_count=1,epochs=10)

# Reference note: a second unused bare string. Its parameter list (it has
# compute_loss and comment, but no min_n/max_n/bucket) presumably mirrors the
# Word2Vec constructor called further down -- TODO confirm.
"""
 sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5,
            max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
            sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
            trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
            comment=None, max_final_vocab=None,
"""
## skip-gram vs. CBOW: sg=1 is passed here (gensim documents 1 as skip-gram,
## otherwise CBOW).
w2v_params = dict(sg=1, workers=4, min_count=4, vector_size=200, epochs=1)
model = word2vec.Word2Vec(train_data, **w2v_params)

# Look up and print the 10 vocabulary entries most similar to the query word.
nearest = model.wv.most_similar(['金融'], topn=10)
print(nearest)

# Persist the trained model to disk.
model_save_path = 'word2vec.model'
model.save(model_save_path)

# Load the model back from the same path, rebinding `model` to the loaded copy.
model = word2vec.Word2Vec.load(model_save_path)


猜你喜欢

转载自blog.csdn.net/Cocktail_py/article/details/119857792