1、语料准备
将每条文本切词并保存至txt文件中,输出文本格式如下:
代码如下(如有需要可以制定自己的停用词表,从而提高数据质量):
import jieba

# Tokenize the raw corpus with jieba and emit one space-separated line per
# input line, ready for gensim's LineSentence loader.
# FIX: the original reopened './data/train.txt' in append mode for every
# single line — O(n) file opens, and re-running the script duplicated the
# whole corpus.  Open each file once; write mode makes the run idempotent.
# Streaming `for line in src` also avoids slurping the file via readlines().
with open('./data/data.txt', 'r', encoding='utf-8') as src, \
        open('./data/train.txt', 'w', encoding='utf-8') as dst:
    for line in src:
        dst.write(' '.join(jieba.lcut(line.strip())) + '\n')
2、词向量模型训练
import time
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import numpy as np
# Path of the whitespace-tokenized training corpus (one sentence per line).
train_file = './data/train.txt'
class EpochLogger:
    """Training callback that reports per-epoch loss, wall time and learning rate.

    NOTE(review): gensim documents callbacks as subclasses of
    ``gensim.models.callbacks.CallbackAny2Vec``; a plain class happens to work
    because gensim invokes the hook methods by name — confirm against the
    installed gensim version.  ``model.get_latest_training_loss()`` returns the
    *cumulative* loss, so the per-epoch figure is the difference from the
    previous reading, and it only reports non-zero values when training runs
    with ``compute_loss=True``.
    """

    def __init__(self):
        self.epoch = 0               # 1-based epoch counter, bumped in on_epoch_begin
        self.start_time = 0          # wall-clock start of the current epoch
        self.loss_previous_step = 0  # cumulative loss at the end of the previous epoch

    def on_train_begin(self, model):
        print("Training started.")

    def on_batch_begin(self, model):
        pass

    def on_batch_end(self, model):
        pass

    def on_epoch_begin(self, model):
        self.epoch += 1
        print(f'Epoch{self.epoch}......')
        self.start_time = time.time()

    def on_train_end(self, model):
        print('---------训练结束-----')

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        # BUG FIX: the original tested `self.epoch == 0`, which can never be
        # true here — on_epoch_begin has already incremented the counter — so
        # the first-epoch branch was dead code.  Epoch numbering starts at 1.
        if self.epoch == 1:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        self.loss_previous_step = loss
        end_time = time.time()
        run_time = end_time - self.start_time
        learning_rate = model.alpha
        print(f"Epoch {self.epoch} completed, Model loss:{model.get_latest_training_loss()}, Run Time: {run_time:.2f} seconds, Learning Rate: {learning_rate}")
# Build a memory-friendly, restartable sentence iterator over the tokenized file.
callback = EpochLogger()
# BUG FIX: the original passed 32 as LineSentence's second positional
# argument, which is `max_sentence_length` — it silently truncated every
# sentence to its first 32 tokens.  The default (10000) keeps lines intact.
sentences = LineSentence(train_file)

# gensim 3.x API (`size`; gensim 4 renamed it to `vector_size`).
model = Word2Vec(size=300, window=3, min_count=4, workers=4, sg=1, compute_loss=True)
model.build_vocab(sentences)
# Train once for 5 epochs and let gensim apply its own linear learning-rate
# decay.  BUG FIX: the original looped `for epoch in range(5)` around
# train(..., epochs=model.epochs) — 5 x 5 = 25 epochs — while also decrementing
# model.alpha by hand, which fights gensim's internal schedule.
# BUG FIX: train() has its own `compute_loss` parameter defaulting to False,
# which overrides the constructor's setting — this is why the epoch losses
# printed as 0.  Pass compute_loss=True explicitly.
model.train(sentences, total_examples=model.corpus_count, epochs=5,
            compute_loss=True, callbacks=[callback])
# Persist only the embedding matrix for the downstream classification model.
np.save('embedding_goods.npy', model.wv.vectors)
上面代码使用了gensim自带的数据加载器,分批次训练数据。因为一次性加载全部数据到内存,内存容易爆炸:本人测试时600M的数据就用了10几个G内存,并且训练速度极慢。
最终为了满足分类模型的需求,将权重保存成了npy格式。
注:EpochLogger类是打印信息用的。loss打印不出来的原因很可能是:model.train() 自带的 compute_loss 参数默认为 False,会覆盖构造函数里设置的 compute_loss=True;在调用 train() 时显式传入 compute_loss=True 即可(另外官方建议回调类继承 gensim.models.callbacks.CallbackAny2Vec)。