Preprocessing Code
Word segmentation
Input: a sentence
Output: a list of its words
import os

from pyltp import Segmentor
from zhon.hanzi import punctuation

def word_cut(sentence):
    LTP_DATA_DIR = 'C:\\Users\\d84105613\\ltp_data'
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    segmentor = Segmentor()                                # initialize the segmenter
    segmentor.load_with_lexicon(cws_model_path, 'lexicon') # load the model plus a user lexicon
    words = list(segmentor.segment(sentence))              # copy out of the native container before release
    segmentor.release()
    words = [w for w in words if w not in punctuation]     # drop Chinese punctuation tokens
    return words
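A quick usage sketch (the sample sentence is illustrative; the actual segmentation depends on the loaded cws.model and lexicon):

print(word_cut('我爱自然语言处理。'))
# e.g. ['我', '爱', '自然', '语言', '处理'] (punctuation stripped)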
Loading word vectors
Input: the first line gives voc_size and emb_size; the remaining lines are the word vectors
Output: the vocabulary and the embedding matrix
import numpy as np

def loadWord2Vec(filename):
    vocab = []
    embd = []
    with open(filename, 'r', encoding='utf-8') as fr:
        header = fr.readline().strip()       # first line: "voc_size emb_size"
        word_dim = int(header.split(' ')[1])
        vocab.append("unk")                  # reserve index 0 for unknown words
        embd.append([0] * word_dim)          # zero vector for "unk"
        for line in fr:
            row = line.strip().split(' ')
            vocab.append(row[0])
            embd.append(row[1:])
    print("loaded word2vec")
    embd = np.asarray(embd).astype(np.float32)   # np.float is deprecated; use float32
    return vocab, embd
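A quick usage sketch, assuming a vectors file in the format described above (the file name vectors.txt is an assumption):

vocab, embd = loadWord2Vec('vectors.txt')
word2idx = {w: i for i, w in enumerate(vocab)}   # word -> row index into embd
print(embd.shape)   # (voc_size + 1, emb_size); row 0 is the zero vector for "unk"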
Reading the text data
Input: a txt file (or similar), one record per line in the form sentence1 \t sentence2 \t label
Output: a list of sentence1, a list of sentence2, and the label array
from random import random
import numpy as np

def getTsvData(filepath):
    # Each line holds three tab-separated fields:
    # the two sentences to compare, then the class label.
    print("Loading training data from " + filepath)
    x1 = []
    x2 = []
    y = []
    for line in open(filepath, encoding='utf-8'):
        l = line.strip().split("\t")
        if len(l) < 3:          # skip malformed lines; the label in l[2] is required
            continue
        # randomly swap the positions of the two sentences
        if random() > 0.5:
            x1.append(l[0])
            x2.append(l[1])
        else:
            x1.append(l[1])
            x2.append(l[0])
        y.append(int(l[2]))
    return np.asarray(x1), np.asarray(x2), np.asarray(y)
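For reference, a minimal sketch of the expected input and call; the placeholder line below is illustrative, and the file name matches the full example that follows:

# each line in the file looks like:  sentence1\tsentence2\t1
x1, x2, y = getTsvData('atec_nlp_sim_data.txt')
print(x1[0], x2[0], y[0])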
A complete example of converting sentences to word ids with TensorFlow
from random import random

import numpy as np
import os
from pyltp import Segmentor
from zhon.hanzi import punctuation
from tensorflow.contrib import learn

LTP_DATA_DIR = 'C:\\Users\\d84105613\\ltp_data'
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
segmentor = Segmentor()                                # initialize the segmenter once, globally
segmentor.load_with_lexicon(cws_model_path, 'lexicon') # load the model plus a user lexicon

def getTsvData(filepath):
    # Each line holds three tab-separated fields:
    # the two sentences to compare, then the class label.
    print("Loading training data from " + filepath)
    x1 = []
    x2 = []
    y = []
    for line in open(filepath, encoding='utf-8'):
        l = line.strip().split("\t")
        if len(l) < 3:
            continue
        # randomly swap the positions of the two sentences
        if random() > 0.5:
            x1.append(l[0])
            x2.append(l[1])
        else:
            x1.append(l[1])
            x2.append(l[0])
        y.append(int(l[2]))
    return np.asarray(x1), np.asarray(x2), np.asarray(y)

def word_cut(sentences):
    # generator tokenizer for VocabularyProcessor: yields one token list per document
    for sentence in sentences:
        words = list(segmentor.segment(sentence))
        words = [w for w in words if w not in punctuation]
        yield words

if __name__ == '__main__':
    x1_text, x2_text, y = getTsvData("atec_nlp_sim_data.txt")
    vocab = learn.preprocessing.VocabularyProcessor(max_document_length=15,
                                                    min_frequency=2,
                                                    tokenizer_fn=word_cut)
    # fit_transform returns a generator; materialize it once, then reuse
    doc = list(vocab.fit_transform(np.concatenate((x2_text, x1_text), axis=0)))
    print(doc)  # the transformed word ids
    for i in vocab.reverse(doc):
        print(i)  # the original segmented sentences, padded with <UNK> up to length 15
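Continuing the example, the id matrix can be split back into the two sentence groups; x2_text came first in the concatenation above (the split below is a minimal sketch using the names from the example):

ids = np.asarray(doc)
x2_ids = ids[:len(x2_text)]     # first half: ids for x2_text
x1_ids = ids[len(x2_text):]     # second half: ids for x1_text
print(x1_ids.shape, x2_ids.shape)   # both (num_pairs, 15)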
Generating the embedding layer from pre-trained word vectors
initW = np.random.uniform(-0.25, 0.25, (len(vocab_processor.vocabulary_), FLAGS.embedding_dim))
# initW = np.zeros(shape=(len(vocab_processor.vocabulary_), FLAGS.embedding_dim))
# overwrite the random rows with any vectors found in the pre-trained word2vec;
# inpH.pre_emb maps word -> pre-trained vector (loaded elsewhere, e.g. via loadWord2Vec)
print("initializing initW with pre-trained word2vec embeddings")
for w in vocab_processor.vocabulary_._mapping:
    if w in inpH.pre_emb:
        arr = inpH.pre_emb[w]
        idx = vocab_processor.vocabulary_.get(w)
        initW[idx] = np.asarray(arr).astype(np.float32)
print("Done assigning initW. len=" + str(len(initW)))