版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/yangfengling1023/article/details/82384908
当我们训练神经网络时,输入数据有时是训练好的词向量,有时是字向量。我们自己训练向量时,可以借用gensim中的word2vec,下面的代码可以同时实现词向量、字向量的训练。
from gensim.models import Word2Vec
import os
import jieba
def h1():
    """Merge the raw corpus files into a single ``data.txt``.

    Every entry listed in ``./qisu_yijianshu`` and then ``./xunwen_bilu`` is
    read as UTF-8. Each line is stripped, blank lines are dropped, and the
    remaining lines are written one per line to ``data.txt`` in the current
    working directory. This is corpus-specific preprocessing and is meant to
    be adapted to other data sets.
    """
    source_dirs = ('./qisu_yijianshu', './xunwen_bilu')
    # Keep the directory each name was listed from, rather than guessing it
    # from whether the file name happens to contain "qisu".
    paths = [
        os.path.join(directory, name)
        for directory in source_dirs
        for name in os.listdir(directory)
    ]
    # data.txt is written with the platform default encoding, which is also
    # how h_zi/h_ci open it by default when reading it back.
    with open('data.txt', 'w') as out:
        for path in paths:
            with open(path, encoding='utf-8') as fp:
                for line in fp:
                    line = line.strip()
                    if line:
                        out.write(line + '\n')
def h_zi(file, encoding=None):
    """Split a corpus file into character-level training sentences.

    Args:
        file: Path of a text file with one sentence per line.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A ``(sentence, word)`` tuple. ``sentence`` has one entry per
        non-blank line: the list of characters of the stripped line, in
        order. ``word`` is the sorted list of distinct characters seen.
    """
    sentence = []
    vocabulary = set()
    with open(file, encoding=encoding) as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            chars = list(line)
            sentence.append(chars)
            # Collect into a set right away instead of de-duplicating a list
            # that holds every character of the corpus.
            vocabulary.update(chars)
    # Sorted so the vocabulary order is reproducible between runs.
    return sentence, sorted(vocabulary)
def h_ci(file, encoding=None):
    """Split a corpus file into word-level training sentences using jieba.

    Args:
        file: Path of a text file with one sentence per line.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A ``(sentence, word)`` tuple. ``sentence`` has one token list per
        line that yields at least one token after segmentation with
        ``jieba.cut(line, cut_all=True)``. ``word`` is the sorted list of
        distinct tokens seen.
    """
    sentence = []
    vocabulary = set()
    with open(file, encoding=encoding) as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            tokens = []
            for piece in jieba.cut(line, cut_all=True):
                # split() drops empty segments and breaks up any segment that
                # contains whitespace, so no token is blank or has a space.
                tokens.extend(piece.split())
            if tokens:
                sentence.append(tokens)
                vocabulary.update(tokens)
    # Sorted so the vocabulary order is reproducible between runs.
    return sentence, sorted(vocabulary)
def train_model(sentence, name):
    """Train a Word2Vec model on ``sentence`` and save it to ``name``.

    The model is built with ``sg=1``, 100-dimensional vectors, ``window=5``,
    ``min_count=1``, ``negative=3``, ``sample=0.001``, ``hs=1`` and
    ``workers=4``.

    Args:
        sentence: Tokenised corpus, e.g. the first value returned by
            ``h_zi``/``h_ci`` (a list of token lists).
        name: Path the trained model is saved to.

    Returns:
        ``name``, so the result can be passed straight to a loader.
    """
    vector_size = 100
    options = dict(sg=1, window=5, min_count=1, negative=3,
                   sample=0.001, hs=1, workers=4)
    try:
        # gensim >= 4.0 names the dimensionality argument ``vector_size``.
        model = Word2Vec(sentence, vector_size=vector_size, **options)
    except TypeError:
        # gensim 3.x rejects that keyword and expects ``size`` instead.
        model = Word2Vec(sentence, size=vector_size, **options)
    model.save(name)
    return name
def write_to_file(model_name, word, file_name):
    """Export the vectors of ``word`` from a saved model to a text file.

    The output is UTF-8 text: a header line ``"<count> <dimension>"``
    followed by one line per entry of ``word`` holding the token and its
    vector components, all separated by single spaces.

    Args:
        model_name: Path of a model previously saved with ``Word2Vec.save``.
        word: Sized iterable of tokens whose vectors are written, in order.
        file_name: Path of the text file to create or overwrite.

    Note:
        A token missing from the model's vocabulary is not handled here; the
        lookup error (presumably ``KeyError`` -- confirm against the installed
        gensim version) propagates to the caller.
    """
    model = Word2Vec.load(model_name)
    # Look vectors up through ``model.wv``; indexing the model object itself
    # is deprecated in gensim 3.x and removed in gensim 4.
    vectors = model.wv
    with open(file_name, 'w', encoding='utf-8') as out:
        # Take the dimension from the model so the header cannot drift from
        # the size the model was actually trained with.
        out.write(str(len(word)) + ' ' + str(model.vector_size) + '\n')
        for token in word:
            components = ' '.join(str(value) for value in vectors[token])
            out.write(token + ' ' + components + '\n')
if __name__ == '__main__':
    # Script entry point. Set ``mode`` to pick the pipeline:
    #   0 -> character vectors, 1 -> word vectors.
    mode = 1
    # mode -> (tokenizer, saved-model path, exported-vector path)
    pipelines = {
        0: (h_zi, 'dict_data_model_zi', 'data_vec_zi.txt'),
        1: (h_ci, 'dict_data_model_ci', 'data_vec_ci.txt'),
    }
    if mode in pipelines:
        tokenize, model_path, vector_path = pipelines[mode]
        corpus, vocabulary = tokenize('data.txt')
        saved_model = train_model(corpus, model_path)
        write_to_file(saved_model, vocabulary, vector_path)