Implementing BiLSTM+CRF sequence labeling with TensorFlow RNNs
Bidirectional LSTM + CRF for the sequence labeling problem
Source code
Since around the end of last year I have been working on an NLP task, a sequence labeling problem. Sequence labeling is a classic NLP problem; I first thought of an HMM but went with a CRF as the baseline instead, using CRF++, by the way.
I won't rehash CRF theory here, it is covered everywhere. As a side note, CRFs beat HMMs both theoretically and in practice. Still, the CRF did not do well on my task: precision was around 0.6 and recall was absurdly low, so F1 was far from encouraging. My mentor said the features were insufficient; a senior labmate said the task itself is just hard, so a low F1 is to be expected.
With the CRF baseline done, I set out to run BiLSTM+CRF on the sequence labeling task, but with too many projects in flight I never had the spare capacity to build it on schedule. Bit by bit, following the steps laid out by the experts and referring to existing code, I eventually got the BiLSTM+CRF implementation working. As it turned out, the results were still not ideal; maybe the task really is that perverse, or the model needs further strengthening.
Let me compare the CRF with the LSTM cell. Start with the RNN: an RNN is better suited than a CNN for sequence problems, because part of the hidden layer's input at the current time step is the hidden output of the previous step. Through this recurrent feedback connection it can see earlier information, capturing the preceding context of the sequence and bringing it into the current computation, and it also has nonlinear fitting capacity; a CRF can match neither. The LSTM cell largely fixes the RNN's vanishing-gradient problem: it tells the gates, in effect, "forget the unimportant stuff so it doesn't take up resources now." A bidirectional LSTM goes further still: it sees not only the past but also the future of the sequence, so context on both sides is fully exploited. A CRF, by contrast, cannot use long-range context the way an LSTM does; it mostly works with linear combinations of local features over the sentence (scanned via feature templates). Its distinctive strength is that it models the joint probability of the whole label sequence, optimizing the sequence globally rather than stitching together per-step optima. Combining a BiLSTM with a CRF therefore makes a rather strong pair, and it is currently the standard approach in the literature.
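Concretely, the hybrid model scores a tag sequence $y$ for an input $x$ with the BiLSTM's per-step emission scores $P_{t, y_t}$ plus the CRF's transition scores $A_{y_{t-1}, y_t}$, and trains on the negative log-likelihood of the gold path $y^*$ (in the code below these quantities are target_path_score, total_path_score, and loss):

$$s(x, y) = \sum_{t=1}^{T} P_{t, y_t} + \sum_{t=1}^{T+1} A_{y_{t-1}, y_t}, \qquad \mathcal{L} = -\Big( s(x, y^*) - \log \sum_{y'} \exp s(x, y') \Big)$$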
Beyond the current working version, a few improvement ideas:
1. +CNN: use convolutions over the letters of each English word to pick up sub-word detail (see the sketch after this list).
2. +char representation: same purpose as the above, capturing finer-grained detail.
3. More joint models to explore.
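As an illustration of point 1, here is a minimal sketch of a character-level CNN feature extractor, written against the same TF 0.x-era API as the code below. All names and sizes (char_ids, char_emb_dim, num_filters, the window width win) are my own assumptions for illustration, not part of the original model:

import tensorflow as tf

# hypothetical sizes: 70 letters in the alphabet, words padded to 20 chars
num_letters, max_word_len, char_emb_dim, num_filters, win = 70, 20, 25, 30, 3

char_ids = tf.placeholder(tf.int32, [None, max_word_len])       # one word per row
char_emb = tf.get_variable("char_emb", [num_letters, char_emb_dim])
x = tf.nn.embedding_lookup(char_emb, char_ids)                  # [batch, len, emb]
x = tf.reshape(x, [-1, max_word_len, char_emb_dim, 1])          # NHWC for conv2d

# slide a window of win letters over each word
w = tf.get_variable("char_w", [win, char_emb_dim, 1, num_filters])
b = tf.get_variable("char_b", [num_filters])
conv = tf.nn.relu(tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding="VALID") + b)

# max-over-time pooling yields one fixed-size vector per word, which could be
# concatenated with the word embedding before feeding the BiLSTM
char_feat = tf.reduce_max(conv, reduction_indices=1)            # [batch, 1, filters]
char_feat = tf.reshape(char_feat, [-1, num_filters])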
Fine, enough rambling. Code time:
For the complete code and the preprocessed data, head to GitHub: scofiled's github/bilstm+crf
requirements:
Ubuntu 14
Python 2.7
TensorFlow 0.8
numpy
pandas 0.15
BILSTM_CRF.py
import math
import helper
import numpy as np
import tensorflow as tf
from tensorflow.models.rnn import rnn, rnn_cell


class BILSTM_CRF(object):

    def __init__(self, num_chars, num_classes, num_steps=200, num_epochs=100, embedding_matrix=None, is_training=True, is_crf=True, weight=False):
        # hyperparameters
        self.max_f1 = 0
        self.learning_rate = 0.002
        self.dropout_rate = 0.5
        self.batch_size = 128
        self.num_layers = 1
        self.emb_dim = 100
        self.hidden_dim = 100
        self.num_epochs = num_epochs
        self.num_steps = num_steps
        self.num_chars = num_chars
        self.num_classes = num_classes

        # placeholders for x, y and weight
        self.inputs = tf.placeholder(tf.int32, [None, self.num_steps])
        self.targets = tf.placeholder(tf.int32, [None, self.num_steps])
        self.targets_weight = tf.placeholder(tf.float32, [None, self.num_steps])
        self.targets_transition = tf.placeholder(tf.int32, [None])

        # char embedding (frozen if a pre-trained matrix is given)
        if embedding_matrix is not None:
            self.embedding = tf.Variable(embedding_matrix, trainable=False, name="emb", dtype=tf.float32)
        else:
            self.embedding = tf.get_variable("emb", [self.num_chars, self.emb_dim])
        self.inputs_emb = tf.nn.embedding_lookup(self.embedding, self.inputs)
        self.inputs_emb = tf.transpose(self.inputs_emb, [1, 0, 2])
        self.inputs_emb = tf.reshape(self.inputs_emb, [-1, self.emb_dim])
        self.inputs_emb = tf.split(0, self.num_steps, self.inputs_emb)

        # lstm cells
        lstm_cell_fw = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_dim)
        lstm_cell_bw = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_dim)

        # dropout
        if is_training:
            lstm_cell_fw = tf.nn.rnn_cell.DropoutWrapper(lstm_cell_fw, output_keep_prob=(1 - self.dropout_rate))
            lstm_cell_bw = tf.nn.rnn_cell.DropoutWrapper(lstm_cell_bw, output_keep_prob=(1 - self.dropout_rate))

        lstm_cell_fw = tf.nn.rnn_cell.MultiRNNCell([lstm_cell_fw] * self.num_layers)
        lstm_cell_bw = tf.nn.rnn_cell.MultiRNNCell([lstm_cell_bw] * self.num_layers)

        # get the true length of each sample (id 0 is the padding token)
        self.length = tf.reduce_sum(tf.sign(self.inputs), reduction_indices=1)
        self.length = tf.cast(self.length, tf.int32)

        # forward and backward passes
        self.outputs, _, _ = rnn.bidirectional_rnn(
            lstm_cell_fw,
            lstm_cell_bw,
            self.inputs_emb,
            dtype=tf.float32,
            sequence_length=self.length
        )
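        # note: the length trick above relies on id 0 being reserved for
        # padding (see buildMap in the helper module); tf.sign maps every
        # real id to 1 and every pad to 0, so the row sum is the true
        # length, e.g. np.sign([4, 7, 9, 0, 0]).sum() == 3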
        # softmax / emission scores
        self.outputs = tf.reshape(tf.concat(1, self.outputs), [-1, self.hidden_dim * 2])
        self.softmax_w = tf.get_variable("softmax_w", [self.hidden_dim * 2, self.num_classes])
        self.softmax_b = tf.get_variable("softmax_b", [self.num_classes])
        self.logits = tf.matmul(self.outputs, self.softmax_w) + self.softmax_b

        if not is_crf:
            pass
        else:
            self.tags_scores = tf.reshape(self.logits, [self.batch_size, self.num_steps, self.num_classes])
            # transition matrix over the tag set plus one extra begin/end tag
            self.transitions = tf.get_variable("transitions", [self.num_classes + 1, self.num_classes + 1])

            dummy_val = -1000
            class_pad = tf.Variable(dummy_val * np.ones((self.batch_size, self.num_steps, 1)), dtype=tf.float32)
            self.observations = tf.concat(2, [self.tags_scores, class_pad])

            begin_vec = tf.Variable(np.array([[dummy_val] * self.num_classes + [0] for _ in range(self.batch_size)]), trainable=False, dtype=tf.float32)
            end_vec = tf.Variable(np.array([[0] + [dummy_val] * self.num_classes for _ in range(self.batch_size)]), trainable=False, dtype=tf.float32)
            begin_vec = tf.reshape(begin_vec, [self.batch_size, 1, self.num_classes + 1])
            end_vec = tf.reshape(end_vec, [self.batch_size, 1, self.num_classes + 1])

            self.observations = tf.concat(1, [begin_vec, self.observations, end_vec])

            self.mask = tf.cast(tf.reshape(tf.sign(self.targets), [self.batch_size * self.num_steps]), tf.float32)

            # point score: emission score of the gold tag at each position
            self.point_score = tf.gather(tf.reshape(self.tags_scores, [-1]), tf.range(0, self.batch_size * self.num_steps) * self.num_classes + tf.reshape(self.targets, [self.batch_size * self.num_steps]))
            self.point_score *= self.mask

            # transition score of the gold tag bigrams
            self.trans_score = tf.gather(tf.reshape(self.transitions, [-1]), self.targets_transition)

            # score of the gold path
            self.target_path_score = tf.reduce_sum(self.point_score) + tf.reduce_sum(self.trans_score)

            # log-partition function over all paths
            self.total_path_score, self.max_scores, self.max_scores_pre = self.forward(self.observations, self.transitions, self.length)

            # negative log-likelihood loss
            self.loss = - (self.target_path_score - self.total_path_score)

        # summary
        self.train_summary = tf.scalar_summary("loss", self.loss)
        self.val_summary = tf.scalar_summary("loss", self.loss)

        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
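    # layout of self.observations after the padding above, shape
    # [batch_size, num_steps + 2, num_classes + 1]:
    #
    #   position:      begin    t = 1 .. T     end
    #   real tags:     -1000    tags_scores    -1000
    #   extra tag:         0    -1000              0
    #
    # the large negative dummy_val forbids the extra begin/end tag at real
    # positions and forbids real tags at the two artificial boundary steps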
    def logsumexp(self, x, axis=None):
        # numerically stable log-sum-exp: subtract the max before exponentiating
        x_max = tf.reduce_max(x, reduction_indices=axis, keep_dims=True)
        x_max_ = tf.reduce_max(x, reduction_indices=axis)
        return x_max_ + tf.log(tf.reduce_sum(tf.exp(x - x_max), reduction_indices=axis))

    def forward(self, observations, transitions, length, is_viterbi=True, return_best_seq=True):
        # note: the padded tag-space size (num_classes + 1) is hard-coded
        # as 6 here, i.e. the code assumes 5 classes (B/M/E/O plus <PAD>)
        length = tf.reshape(length, [self.batch_size])
        transitions = tf.reshape(tf.concat(0, [transitions] * self.batch_size), [self.batch_size, 6, 6])
        observations = tf.reshape(observations, [self.batch_size, self.num_steps + 2, 6, 1])
        observations = tf.transpose(observations, [1, 0, 2, 3])
        previous = observations[0, :, :, :]
        max_scores = []
        max_scores_pre = []
        alphas = [previous]
        for t in range(1, self.num_steps + 2):
            previous = tf.reshape(previous, [self.batch_size, 6, 1])
            current = tf.reshape(observations[t, :, :, :], [self.batch_size, 1, 6])
            alpha_t = previous + current + transitions
            if is_viterbi:
                max_scores.append(tf.reduce_max(alpha_t, reduction_indices=1))
                max_scores_pre.append(tf.argmax(alpha_t, dimension=1))
            alpha_t = tf.reshape(self.logsumexp(alpha_t, axis=1), [self.batch_size, 6, 1])
            alphas.append(alpha_t)
            previous = alpha_t

        alphas = tf.reshape(tf.concat(0, alphas), [self.num_steps + 2, self.batch_size, 6, 1])
        alphas = tf.transpose(alphas, [1, 0, 2, 3])
        alphas = tf.reshape(alphas, [self.batch_size * (self.num_steps + 2), 6, 1])

        last_alphas = tf.gather(alphas, tf.range(0, self.batch_size) * (self.num_steps + 2) + length)
        last_alphas = tf.reshape(last_alphas, [self.batch_size, 6, 1])

        max_scores = tf.reshape(tf.concat(0, max_scores), (self.num_steps + 1, self.batch_size, 6))
        max_scores_pre = tf.reshape(tf.concat(0, max_scores_pre), (self.num_steps + 1, self.batch_size, 6))
        max_scores = tf.transpose(max_scores, [1, 0, 2])
        max_scores_pre = tf.transpose(max_scores_pre, [1, 0, 2])

        return tf.reduce_sum(self.logsumexp(last_alphas, axis=1)), max_scores, max_scores_pre
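    # in log space the loop in forward() is the standard forward recursion
    # over the padded tag set:
    #     alpha_t(j) = o_t(j) + logsumexp_i(alpha_{t-1}(i) + A[i][j])
    # and the returned total path score is logsumexp_j(alpha_{T+1}(j)),
    # gathered at each sample's true length and summed over the batch;
    # with is_viterbi the same loop also records each tag's best predecessor
    # (max_scores_pre), which viterbi() later backtracks through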
    def train(self, sess, save_file, X_train, y_train, X_val, y_val):
        saver = tf.train.Saver()

        char2id, id2char = helper.loadMap("char2id")
        label2id, id2label = helper.loadMap("label2id")

        merged = tf.merge_all_summaries()
        summary_writer_train = tf.train.SummaryWriter('loss_log/train_loss', sess.graph)
        summary_writer_val = tf.train.SummaryWriter('loss_log/val_loss', sess.graph)

        num_iterations = int(math.ceil(1.0 * len(X_train) / self.batch_size))

        cnt = 0
        for epoch in range(self.num_epochs):
            # shuffle the training set in each epoch
            sh_index = np.arange(len(X_train))
            np.random.shuffle(sh_index)
            X_train = X_train[sh_index]
            y_train = y_train[sh_index]
            print "current epoch: %d" % (epoch)
            for iteration in range(num_iterations):
                # train
                X_train_batch, y_train_batch = helper.nextBatch(X_train, y_train, start_index=iteration * self.batch_size, batch_size=self.batch_size)
                # up-weight B and E tags (note: targets_weight is fed but
                # not used by the CRF loss above)
                y_train_weight_batch = 1 + np.array((y_train_batch == label2id['B']) | (y_train_batch == label2id['E']), float)
                transition_batch = helper.getTransition(y_train_batch)

                _, loss_train, max_scores, max_scores_pre, length, train_summary = sess.run(
                    [
                        self.optimizer,
                        self.loss,
                        self.max_scores,
                        self.max_scores_pre,
                        self.length,
                        self.train_summary
                    ],
                    feed_dict={
                        self.targets_transition: transition_batch,
                        self.inputs: X_train_batch,
                        self.targets: y_train_batch,
                        self.targets_weight: y_train_weight_batch
                    })

                predicts_train = self.viterbi(max_scores, max_scores_pre, length, predict_size=self.batch_size)
                if iteration % 10 == 0:
                    cnt += 1
                    precision_train, recall_train, f1_train = self.evaluate(X_train_batch, y_train_batch, predicts_train, id2char, id2label)
                    summary_writer_train.add_summary(train_summary, cnt)
                    print "iteration: %5d, train loss: %5d, train precision: %.5f, train recall: %.5f, train f1: %.5f" % (iteration, loss_train, precision_train, recall_train, f1_train)

                # validation
                if iteration % 100 == 0:
                    X_val_batch, y_val_batch = helper.nextRandomBatch(X_val, y_val, batch_size=self.batch_size)
                    y_val_weight_batch = 1 + np.array((y_val_batch == label2id['B']) | (y_val_batch == label2id['E']), float)
                    transition_batch = helper.getTransition(y_val_batch)

                    loss_val, max_scores, max_scores_pre, length, val_summary = sess.run(
                        [
                            self.loss,
                            self.max_scores,
                            self.max_scores_pre,
                            self.length,
                            self.val_summary
                        ],
                        feed_dict={
                            self.targets_transition: transition_batch,
                            self.inputs: X_val_batch,
                            self.targets: y_val_batch,
                            self.targets_weight: y_val_weight_batch
                        })

                    predicts_val = self.viterbi(max_scores, max_scores_pre, length, predict_size=self.batch_size)
                    precision_val, recall_val, f1_val = self.evaluate(X_val_batch, y_val_batch, predicts_val, id2char, id2label)
                    summary_writer_val.add_summary(val_summary, cnt)
                    print "iteration: %5d, valid loss: %5d, valid precision: %.5f, valid recall: %.5f, valid f1: %.5f" % (iteration, loss_val, precision_val, recall_val, f1_val)

                    # keep the checkpoint with the best validation f1
                    if f1_val > self.max_f1:
                        self.max_f1 = f1_val
                        save_path = saver.save(sess, save_file)
                        print "saved the best model with f1: %.5f" % (self.max_f1)
    def test(self, sess, X_test, X_test_str, output_path):
        char2id, id2char = helper.loadMap("char2id")
        label2id, id2label = helper.loadMap("label2id")
        num_iterations = int(math.ceil(1.0 * len(X_test) / self.batch_size))
        print "number of iteration: " + str(num_iterations)
        with open(output_path, "wb") as outfile:
            for i in range(num_iterations):
                print "iteration: " + str(i + 1)
                results = []
                X_test_batch = X_test[i * self.batch_size : (i + 1) * self.batch_size]
                X_test_str_batch = X_test_str[i * self.batch_size : (i + 1) * self.batch_size]
                if i == num_iterations - 1 and len(X_test_batch) < self.batch_size:
                    # pad the last batch up to batch_size, then drop the padded rows
                    X_test_batch = list(X_test_batch)
                    X_test_str_batch = list(X_test_str_batch)
                    last_size = len(X_test_batch)
                    X_test_batch += [[0 for j in range(self.num_steps)] for k in range(self.batch_size - last_size)]
                    X_test_str_batch += [['x' for j in range(self.num_steps)] for k in range(self.batch_size - last_size)]
                    X_test_batch = np.array(X_test_batch)
                    X_test_str_batch = np.array(X_test_str_batch)
                    results = self.predictBatch(sess, X_test_batch, X_test_str_batch, id2label)
                    results = results[:last_size]
                else:
                    X_test_batch = np.array(X_test_batch)
                    results = self.predictBatch(sess, X_test_batch, X_test_str_batch, id2label)

                for idx in range(len(results)):
                    doc = ''.join(X_test_str_batch[idx])
                    outfile.write(doc + "<@>" + " ".join(results[idx]).encode("utf-8") + "\n")
    def viterbi(self, max_scores, max_scores_pre, length, predict_size=128):
        best_paths = []
        for m in range(predict_size):
            path = []
            # start from the best tag at the last real position, then backtrack
            last_max_node = np.argmax(max_scores[m][length[m]])
            for t in range(1, length[m] + 1)[::-1]:
                last_max_node = max_scores_pre[m][t][last_max_node]
                path.append(last_max_node)
            path = path[::-1]
            best_paths.append(path)
        return best_paths

    def predictBatch(self, sess, X, X_str, id2label):
        results = []
        length, max_scores, max_scores_pre = sess.run([self.length, self.max_scores, self.max_scores_pre], feed_dict={self.inputs: X})
        predicts = self.viterbi(max_scores, max_scores_pre, length, self.batch_size)
        for i in range(len(predicts)):
            x = ''.join(X_str[i]).decode("utf-8")
            # drop the extra begin/end tag (5) and the padding tag (0)
            y_pred = ''.join([id2label[val] for val in predicts[i] if val != 5 and val != 0])
            entitys = helper.extractEntity(x, y_pred)
            results.append(entitys)
        return results
    def evaluate(self, X, y_true, y_pred, id2char, id2label):
        precision = -1.0
        recall = -1.0
        f1 = -1.0
        hit_num = 0
        pred_num = 0
        true_num = 0
        for i in range(len(y_true)):
            x = ''.join([str(id2char[val].encode("utf-8")) for val in X[i]])
            y = ''.join([str(id2label[val].encode("utf-8")) for val in y_true[i]])
            y_hat = ''.join([id2label[val] for val in y_pred[i] if val != 5])
            true_labels = helper.extractEntity(x, y)
            pred_labels = helper.extractEntity(x, y_hat)
            hit_num += len(set(true_labels) & set(pred_labels))
            pred_num += len(set(pred_labels))
            true_num += len(set(true_labels))
        if pred_num != 0:
            precision = 1.0 * hit_num / pred_num
        if true_num != 0:
            recall = 1.0 * hit_num / true_num
        if precision > 0 and recall > 0:
            f1 = 2.0 * (precision * recall) / (precision + recall)
        return precision, recall, f1
helper.py
# encoding:utf-8
import re
import os
import csv
import time
import pickle
import numpy as np
import pandas as pd


def getEmbedding(infile_path="embedding"):
    # read a word2vec-style text file: the first line holds the vocabulary
    # size and the embedding dimension, each following line a char vector
    char2id, id_char = loadMap("char2id")
    row_index = 0
    with open(infile_path, "rb") as infile:
        for row in infile:
            row = row.strip()
            row_index += 1
            if row_index == 1:
                num_chars = int(row.split()[0])
                emb_dim = int(row.split()[1])
                emb_matrix = np.zeros((len(char2id.keys()), emb_dim))
                continue
            items = row.split()
            char = items[0]
            emb_vec = [float(val) for val in items[1:]]
            if char in char2id:
                emb_matrix[char2id[char]] = emb_vec
    return emb_matrix
def nextBatch(X, y, start_index, batch_size=128):
    last_index = start_index + batch_size
    X_batch = list(X[start_index:min(last_index, len(X))])
    y_batch = list(y[start_index:min(last_index, len(X))])
    if last_index > len(X):
        # fill the last batch up to batch_size with random samples
        left_size = last_index - (len(X))
        for i in range(left_size):
            index = np.random.randint(len(X))
            X_batch.append(X[index])
            y_batch.append(y[index])
    X_batch = np.array(X_batch)
    y_batch = np.array(y_batch)
    return X_batch, y_batch


def nextRandomBatch(X, y, batch_size=128):
    X_batch = []
    y_batch = []
    for i in range(batch_size):
        index = np.random.randint(len(X))
        X_batch.append(X[index])
        y_batch.append(y[index])
    X_batch = np.array(X_batch)
    y_batch = np.array(y_batch)
    return X_batch, y_batch


# use id 0 to pad each sentence up to seq_max_len
def padding(sample, seq_max_len):
    for i in range(len(sample)):
        if len(sample[i]) < seq_max_len:
            sample[i] += [0 for _ in range(seq_max_len - len(sample[i]))]
    return sample
def prepare(chars, labels, seq_max_len, is_padding=True):
    X = []
    y = []
    tmp_x = []
    tmp_y = []

    for record in zip(chars, labels):
        c = record[0]
        l = record[1]
        # an id of -1 marks the empty line between two sentences
        if c == -1:
            if len(tmp_x) <= seq_max_len:
                X.append(tmp_x)
                y.append(tmp_y)
            tmp_x = []
            tmp_y = []
        else:
            tmp_x.append(c)
            tmp_y.append(l)
    if is_padding:
        X = np.array(padding(X, seq_max_len))
    else:
        X = np.array(X)
    y = np.array(padding(y, seq_max_len))

    return X, y


def extractEntity(sentence, labels):
    entitys = []
    re_entity = re.compile(r'BM*E')
    m = re_entity.search(labels)
    while m:
        entity_labels = m.group()
        start_index = labels.find(entity_labels)
        entity = sentence[start_index:start_index + len(entity_labels)]
        labels = list(labels)
        # blank out the matched "BM*E" with "OO*O" so it is not matched again
        labels[start_index:start_index + len(entity_labels)] = ['O' for i in range(len(entity_labels))]
        entitys.append(entity)
        labels = ''.join(labels)
        m = re_entity.search(labels)
    return entitys
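# a quick illustration of extractEntity with made-up strings (not from the
# dataset): each maximal "BM*E" run in the label string is paired with the
# characters at the same positions, then blanked out so the search continues:
#     extractEntity(u"abcdefg", "BMEOOBE")  ->  [u'abc', u'fg']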
def loadMap(token2id_filepath):
    if not os.path.isfile(token2id_filepath):
        print "file not exist, building map"
        buildMap()

    token2id = {}
    id2token = {}
    with open(token2id_filepath) as infile:
        for row in infile:
            row = row.rstrip().decode("utf-8")
            token = row.split('\t')[0]
            token_id = int(row.split('\t')[1])
            token2id[token] = token_id
            id2token[token_id] = token
    return token2id, id2token


def saveMap(id2char, id2label):
    with open("char2id", "wb") as outfile:
        for idx in id2char:
            outfile.write(id2char[idx] + "\t" + str(idx) + "\r\n")
    with open("label2id", "wb") as outfile:
        for idx in id2label:
            outfile.write(id2label[idx] + "\t" + str(idx) + "\r\n")
    print "saved map between token and id"


def buildMap(train_path="train.in"):
    df_train = pd.read_csv(train_path, delimiter='\t', quoting=csv.QUOTE_NONE, skip_blank_lines=False, header=None, names=["char", "label"])
    chars = list(set(df_train["char"][df_train["char"].notnull()]))
    labels = list(set(df_train["label"][df_train["label"].notnull()]))
    char2id = dict(zip(chars, range(1, len(chars) + 1)))
    label2id = dict(zip(labels, range(1, len(labels) + 1)))
    id2char = dict(zip(range(1, len(chars) + 1), chars))
    id2label = dict(zip(range(1, len(labels) + 1), labels))
    # id 0 is reserved for padding, the last char id for unseen chars
    id2char[0] = "<PAD>"
    id2label[0] = "<PAD>"
    char2id["<PAD>"] = 0
    label2id["<PAD>"] = 0
    id2char[len(chars) + 1] = "<NEW>"
    char2id["<NEW>"] = len(chars) + 1

    saveMap(id2char, id2label)

    return char2id, id2char, label2id, id2label
def getTrain(train_path, val_path, train_val_ratio=0.99, use_custom_val=False, seq_max_len=200):
    char2id, id2char, label2id, id2label = buildMap(train_path)
    df_train = pd.read_csv(train_path, delimiter='\t', quoting=csv.QUOTE_NONE, skip_blank_lines=False, header=None, names=["char", "label"])

    # map chars and labels to ids (-1 marks the blank lines between sentences)
    df_train["char_id"] = df_train.char.map(lambda x: -1 if str(x) == str(np.nan) else char2id[x])
    df_train["label_id"] = df_train.label.map(lambda x: -1 if str(x) == str(np.nan) else label2id[x])

    # convert the data into matrices
    X, y = prepare(df_train["char_id"], df_train["label_id"], seq_max_len)

    # shuffle the samples
    num_samples = len(X)
    indexs = np.arange(num_samples)
    np.random.shuffle(indexs)
    X = X[indexs]
    y = y[indexs]

    if val_path is not None:
        X_train = X
        y_train = y
        X_val, y_val = getTest(val_path, is_validation=True, seq_max_len=seq_max_len)
    else:
        # split the data into train and validation sets
        X_train = X[:int(num_samples * train_val_ratio)]
        y_train = y[:int(num_samples * train_val_ratio)]
        X_val = X[int(num_samples * train_val_ratio):]
        y_val = y[int(num_samples * train_val_ratio):]

    print "train size: %d, validation size: %d" % (len(X_train), len(y_val))

    return X_train, y_train, X_val, y_val


def getTest(test_path="test.in", is_validation=False, seq_max_len=200):
    char2id, id2char = loadMap("char2id")
    label2id, id2label = loadMap("label2id")

    df_test = pd.read_csv(test_path, delimiter='\t', quoting=csv.QUOTE_NONE, skip_blank_lines=False, header=None, names=["char", "label"])

    def mapFunc(x, char2id):
        if str(x) == str(np.nan):
            return -1
        elif x.decode("utf-8") not in char2id:
            return char2id["<NEW>"]
        else:
            return char2id[x.decode("utf-8")]

    df_test["char_id"] = df_test.char.map(lambda x: mapFunc(x, char2id))
    df_test["label_id"] = df_test.label.map(lambda x: -1 if str(x) == str(np.nan) else label2id[x])

    if is_validation:
        X_test, y_test = prepare(df_test["char_id"], df_test["label_id"], seq_max_len)
        return X_test, y_test
    else:
        df_test["char"] = df_test.char.map(lambda x: -1 if str(x) == str(np.nan) else x)
        X_test, _ = prepare(df_test["char_id"], df_test["char_id"], seq_max_len)
        X_test_str, _ = prepare(df_test["char"], df_test["char_id"], seq_max_len, is_padding=False)
        print "test size: %d" % (len(X_test))
        return X_test, X_test_str


def getTransition(y_train_batch):
    # encode each gold tag bigram (i, j) as a flat index i * 6 + j into the
    # (num_classes + 1) x (num_classes + 1) transition matrix; tag 5 is the
    # extra begin tag, tag 0 the stop/padding tag
    transition_batch = []
    for m in range(len(y_train_batch)):
        y = [5] + list(y_train_batch[m]) + [0]
        for t in range(len(y)):
            if t + 1 == len(y):
                continue
            i = y[t]
            j = y[t + 1]
            if i == 0:
                break
            transition_batch.append(i * 6 + j)
    transition_batch = np.array(transition_batch)
    return transition_batch
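# a quick sanity check of getTransition, assuming five tag classes so the
# padded tag space has size 6 (5 is the begin tag, 0 the stop/pad tag):
# the gold sequence [1, 2, 3] is bracketed to [5, 1, 2, 3, 0], giving
#     getTransition(np.array([[1, 2, 3]]))  ->  [31  8 15 18]
# i.e. the flat indices 5*6+1, 1*6+2, 2*6+3 and 3*6+0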
train.py
import time
import helper
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
from BILSTM_CRF import BILSTM_CRF

# usage: python train.py train.in model -v validation.in -c char_emb -e 10 -g 2

parser = argparse.ArgumentParser()
parser.add_argument("train_path", help="the path of the train file")
parser.add_argument("save_path", help="the path of the saved model")
parser.add_argument("-v", "--val_path", help="the path of the validation file", default=None)
parser.add_argument("-e", "--epoch", help="the number of epochs", default=100, type=int)
parser.add_argument("-c", "--char_emb", help="the char embedding file", default=None)
parser.add_argument("-g", "--gpu", help="the id of gpu, the default is 0", default=0, type=int)

args = parser.parse_args()

train_path = args.train_path
save_path = args.save_path
val_path = args.val_path
num_epochs = args.epoch
emb_path = args.char_emb
gpu_config = "/cpu:0"
# gpu_config = "/gpu:" + str(args.gpu)
num_steps = 200  # must be consistent with the test script

start_time = time.time()
print "preparing train and validation data"
X_train, y_train, X_val, y_val = helper.getTrain(train_path=train_path, val_path=val_path, seq_max_len=num_steps)
char2id, id2char = helper.loadMap("char2id")
label2id, id2label = helper.loadMap("label2id")
num_chars = len(id2char.keys())
num_classes = len(id2label.keys())
if emb_path is not None:
    embedding_matrix = helper.getEmbedding(emb_path)
else:
    embedding_matrix = None

print "building model"
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
    with tf.device(gpu_config):
        initializer = tf.random_uniform_initializer(-0.1, 0.1)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = BILSTM_CRF(num_chars=num_chars, num_classes=num_classes, num_steps=num_steps, num_epochs=num_epochs, embedding_matrix=embedding_matrix, is_training=True)

        print "training model"
        tf.initialize_all_variables().run()
        model.train(sess, save_path, X_train, y_train, X_val, y_val)

        print "final best f1 is: %f" % (model.max_f1)

        end_time = time.time()
        print "time used %f(hour)" % ((end_time - start_time) / 3600)
test.py
import time
import helper
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf
from BILSTM_CRF import BILSTM_CRF

# usage: python test.py model test.in test.out -c char_emb -g 2

parser = argparse.ArgumentParser()
parser.add_argument("model_path", help="the path of the model file")
parser.add_argument("test_path", help="the path of the test file")
parser.add_argument("output_path", help="the path of the output file")
parser.add_argument("-c", "--char_emb", help="the char embedding file", default=None)
parser.add_argument("-g", "--gpu", help="the id of gpu, the default is 0", default=0, type=int)
args = parser.parse_args()

model_path = args.model_path
test_path = args.test_path
output_path = args.output_path
gpu_config = "/cpu:0"
emb_path = args.char_emb
num_steps = 200  # must be consistent with the train script

start_time = time.time()

print "preparing test data"
X_test, X_test_str = helper.getTest(test_path=test_path, seq_max_len=num_steps)
char2id, id2char = helper.loadMap("char2id")
label2id, id2label = helper.loadMap("label2id")
num_chars = len(id2char.keys())
num_classes = len(id2label.keys())
if emb_path is not None:
    embedding_matrix = helper.getEmbedding(emb_path)
else:
    embedding_matrix = None

print "building model"
config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=config) as sess:
    with tf.device(gpu_config):
        initializer = tf.random_uniform_initializer(-0.1, 0.1)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = BILSTM_CRF(num_chars=num_chars, num_classes=num_classes, num_steps=num_steps, embedding_matrix=embedding_matrix, is_training=False)

        print "loading model parameters"
        saver = tf.train.Saver()
        saver.restore(sess, model_path)

        print "testing"
        model.test(sess, X_test, X_test_str, output_path)

        end_time = time.time()
        print "time used %f(hour)" % ((end_time - start_time) / 3600)
For the preprocessed data, see GitHub: scofiled's github/bilstm+crf
Reposted from: Scofield's blog [http://blog.csdn.net/scotfield_msn] https://blog.csdn.net/Scotfield_msn/article/details/60339415