环境:python2.7,tensorflow1.12.0
1.数据预处理部分。
数据样例如下:
包 B-PER 公 I-PER 毕 O 竟 O 是 O 包 B-PER 公 I-PER , O 若 O 是 O
标注数据采用BIO形式标注,标注类型有PER,ORG,LOC。
tag2label = {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4, "B-ORG": 5, "I-ORG": 6 }
首先应该读取数据,依照数据样例进行如下处理:
def read_corpus(data_path):
    """Read a `char tag` corpus (one pair per line, whitespace separated).

    Sentences are separated by blank lines.  Returns a list of
    (words, tags) tuples, one per sentence.
    """
    ret_datas = []
    words, tags = [], []
    # `with` guarantees the handle is closed (the original leaked it)
    with codecs.open(data_path, "r", "utf8") as r_file:
        for line in r_file:
            if line.strip():
                word, tag = line.split()
                words.append(word)
                tags.append(tag.strip())
            else:
                # guard: consecutive blank lines must not emit empty sentences
                if words:
                    ret_datas.append((words, tags))
                words, tags = [], []
    # fix: keep the final sentence even when the file has no trailing blank line
    if words:
        ret_datas.append((words, tags))
    return ret_datas
把数据字符转换为字符ID。
def word2id(dict_path):
    """Load the pickled word->id vocabulary and build the inverse map.

    Returns (word->id dict, id->word dict); note the inverse map's keys
    are the *string* form of the ids, matching the original behaviour.
    """
    # fix: pickle data is binary -- open in "rb" (the original used a
    # codecs text stream, which corrupts the byte stream) and close the file
    with open(dict_path, "rb") as r_file:
        ret_dict = pickle.load(r_file)
    id2word = {str(idx): word for word, idx in ret_dict.items()}
    return ret_dict, id2word
这样就把数据转换为{"我":1,"是":2...},这个是用来将[我,是]变成[1,2]这种数值型数据。
由于是用了biLSTM,输入句子的长度需要等长(当然,如果你训练时候每次输入一个句子就不需要这么做):
如下我们把输入的一批句子补齐为等长:长度不足的句子在末尾补 0(标签序列同理,0 对应标签 "O")。
def padding_squence(squences, pad_mark=0):
    """Right-pad every sequence to the longest length in the batch.

    Returns (padded sequences, true lengths).  The true length is what
    the model later uses to mask out the padded positions.
    """
    # fix: the original crashed with ValueError on an empty batch
    if not squences:
        return [], []
    max_len = max(len(s) for s in squences)
    sque_list, sque_len_list = [], []
    for sque in squences:
        sque = list(sque)
        # max_len >= len(sque) by construction, so plain subtraction is safe
        sque_list.append(sque[:max_len] + [pad_mark] * (max_len - len(sque)))
        sque_len_list.append(min(len(sque), max_len))
    return sque_list, sque_len_list
如上三步,我们读取数据,把字符转换成数字,然后把一批数字数据做长度上的对齐。这样三步结合起来我们就能得到一批对齐的数字形数据:
def sentence2id(sent_, word2id):
    """Map each character of a sentence to its vocabulary id.

    Digits collapse to "<NUM>", ASCII letters to "<ENG>", and anything
    not in the vocabulary to "<UNK>".
    """
    ret_sent = []
    for word in sent_:
        if word.isdigit():
            word = "<NUM>"
        # fix: the original compared against '\u0041' etc., which in a
        # Python 2 byte-string literal is the *six characters* backslash-u-0-0-4-1
        # (no \u escape), so ASCII letters were never normalized to <ENG>
        elif u'A' <= word <= u'Z' or u'a' <= word <= u'z':
            word = "<ENG>"
        if word not in word2id:
            word = "<UNK>"
        ret_sent.append(word2id.get(word))
    return ret_sent
然后把所有的数据转换为对齐的数字形数据:
def batch_yield(datas, batch_size, vocab, tag2label, shuffle=False):
    """Generator over (word-id sequences, label-id sequences) batches.

    Emits full batches of `batch_size` sentences and then one final,
    possibly smaller, batch.  NOTE: `shuffle=True` reorders `datas`
    in place (caller's list is mutated).
    """
    if shuffle:
        random.shuffle(datas)
    seqs, labels = [], []
    for sent_, tag_ in datas:
        seqs.append(sentence2id(sent_, vocab))
        labels.append([tag2label.get(tag) for tag in tag_])
        if len(seqs) == batch_size:
            yield seqs, labels
            seqs, labels = [], []
    # flush the trailing partial batch, if any
    if seqs:
        yield seqs, labels
如上我们就能得到一个数据的迭代器
2.然后我们有了数据就可以进行训练了
训练参数如下:
# Training configuration (hyper-parameters and file-system paths).
args = {
    "batch_size": 60,                       # sentences per training batch
    "epoch_num": 40,                        # number of passes over the training set
    "train_data": "./data_path/train_data",  # training corpus path
    "test_data": "./data_path/test_data",    # evaluation corpus path
    "CRF": True,                            # use CRF loss/decoding (else softmax)
    "hidden_dim": 300,                      # LSTM hidden state size (per direction)
    "opt": "Adam",                          # optimizer name
    "lr": 0.001,                            # learning rate
    "dropout": 0.5,                         # keep_prob fed to tf.nn.dropout
    "clip": 5.0,                            # gradient clipping bound
    "embedding_dim": 300,                   # word embedding size
    "shuffle": True,                        # shuffle training data each epoch
    "model": 'train',                       # run mode
    "model_path": "./model_save/",          # checkpoint directory
    "embedding_train": True,                # fine-tune the embedding matrix
    "num_tags": 7                           # |tag2label| -- number of BIO labels
}
初始化占位符
def init_placeholder(self):
    """Declare the three graph inputs: word ids, tag ids, true lengths."""
    # [batch, max_len] padded word-id matrix
    self.word2ids = tf.placeholder(
        dtype=tf.int32, shape=[None, None], name="word2ids")
    # [batch, max_len] padded gold-label matrix
    self.tag2ids = tf.placeholder(
        dtype=tf.int32, shape=[None, None], name="tag2ids")
    # [batch] true (unpadded) sentence lengths, used as the mask
    self.squence_length = tf.placeholder(
        dtype=tf.int32, shape=[None], name="squence_length")
初始化embedding层:
def init_lookup_layer(self):
    """Embedding lookup over the input word ids, followed by dropout."""
    with tf.variable_scope("embedding"):
        emb_matrix = tf.Variable(
            initial_value=self.embeddings,
            dtype=tf.float32,
            trainable=self.embedding_train,
            name="_word_embeddings")
        # shape: [batch_size, sequence_length, embedding_size]
        looked_up = tf.nn.embedding_lookup(
            emb_matrix, self.word2ids, name="word_embedding")
        self.word_embedding = tf.nn.dropout(looked_up, keep_prob=self.dropout)
定义BiLSTM
def init_BiLSTM_CRF_layer(self):
    """BiLSTM encoder plus a linear projection to per-tag scores.

    Produces self.logists with shape [batch, seq_len, num_tags]; the
    CRF loss / viterbi decoder consume these unnormalized scores.
    """
    with tf.variable_scope("bilstm"):
        # one LSTM cell per direction
        cell_fw = LSTMCell(self.hidden_size)
        cell_bw = LSTMCell(self.hidden_size)
        # fix: removed leftover debug `print` statements that dumped the
        # embedding/length tensors to stdout on every graph build
        (outputs, states) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            inputs=self.word_embedding,
            sequence_length=self.squence_length,
            dtype=tf.float32)
        # merge fw/bw states: [batch, seq_len, 2*hidden_size]
        output = tf.concat(outputs, name="output", axis=2)
        output = tf.nn.dropout(output, self.dropout)
    # fully-connected projection onto the tag space
    with tf.variable_scope("proj"):
        W = tf.get_variable("proj_w",
                            shape=[2 * self.hidden_size, self.num_tags],
                            initializer=tf.contrib.layers.xavier_initializer(),
                            dtype=tf.float32)
        b = tf.get_variable("proj_b",
                            shape=[self.num_tags],
                            initializer=tf.zeros_initializer(),
                            dtype=tf.float32)
        s = tf.shape(output)
        # flatten to [batch*seq_len, 2*hidden_size] for the matmul,
        # then restore [batch, seq_len, num_tags]
        output = tf.reshape(output, [-1, 2 * self.hidden_size])
        pred = tf.matmul(output, W) + b
        self.logists = tf.reshape(pred, [-1, s[1], self.num_tags])
定义loss:
def init_loss(self):
    """Build the training loss.

    CRF mode: negative mean log-likelihood from crf_log_likelihood,
    which also creates the tag transition matrix (self.transition) that
    viterbi decoding reuses at test time.  Non-CRF mode: per-token
    softmax cross entropy, masked so padded positions do not contribute.
    """
    with tf.variable_scope("loss"):
        if self.CRF:
            log_likelihood, self.transition = crf_log_likelihood(
                inputs=self.logists,
                tag_indices=self.tag2ids,
                sequence_lengths=self.squence_length)
            self.loss = -tf.reduce_mean(log_likelihood)
        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.tag2ids, logits=self.logists)
            # mask out padding before averaging
            seq_mask = tf.sequence_mask(self.squence_length)
            losses = tf.boolean_mask(losses, seq_mask)
            self.loss = tf.reduce_mean(losses)
定义训练节点:
def init_train_op(self):
    """Adam optimizer with element-wise gradient clipping to [-clip, clip]."""
    with tf.variable_scope("train_op"):
        self.global_step = tf.Variable(0, trainable=False, name="global_step")
        if self.opt == "Adam":
            opt_ = tf.train.AdamOptimizer(self.lr)
            grads_and_vars = opt_.compute_gradients(self.loss)
            # fix: variables not on the loss path come back with g is None,
            # which would crash tf.clip_by_value -- skip those pairs
            grads_and_vars_clip = [
                (tf.clip_by_value(g, -self.clip, self.clip), v)
                for g, v in grads_and_vars if g is not None]
            self.train_op = opt_.apply_gradients(
                grads_and_vars_clip, global_step=self.global_step)
训练:
def train(self, train_datas):
    """Train for self.epochs epochs; checkpointing happens in run_one_epoch."""
    saver = tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        # fix: initialize_all_variables() is deprecated since TF 0.12
        # (removed in later releases) -- use the supported initializer
        sess.run(tf.global_variables_initializer())
        for epoch in range(self.epochs):
            # NOTE(review): tag2label here is a module-level global, not an
            # attribute of self -- confirm it is defined at import time
            self.run_one_epoch(sess, train_datas, tag2label, epoch, saver)
训练节点中的run_one_epoch:一个EPOCH的操作:
def run_one_epoch(self, sess, train_datas, tag2ids, epoch, saver):
    """Run one training epoch: batch, pad, feed, and checkpoint on the last batch."""
    # ceil(len/batch_size) without floating point
    num_batchs = (len(train_datas) + self.batch_size - 1) // self.batch_size
    start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print start_time
    # get the training batches
    batchs = batch_yield(train_datas, self.batch_size, self.vocab, tag2ids)
    for step, (seqs, labels) in enumerate(batchs):
        # pad words and labels to the batch max length; keep true lengths
        wordids, seqlength = padding_squence(seqs)
        labels_, _ = padding_squence(labels)
        feed_datas = {
            self.word2ids: wordids,
            self.tag2ids: labels_,
            self.squence_length: seqlength
        }
        # run the training node
        _, loss_, step_num = sess.run(
            [self.train_op, self.loss, self.global_step], feed_dict=feed_datas)
        if (step + 1) % 100 == 0:
            print "epoch %d for step %d get loss : %f" % (epoch, step, loss_)
        # checkpoint once per epoch, on its final batch
        if (step + 1) == num_batchs:
            saver.save(sess, self.model_save_path, global_step=step_num)
训练完毕进行test
def test(self, test_datas):
    """Restore the newest checkpoint and viterbi-decode the test set."""
    saver = tf.train.Saver()
    with tf.Session() as sess:
        latest = tf.train.latest_checkpoint(self.model_save_path)
        saver.restore(sess, latest)
        self.viterbi(sess, test_datas, self.tag2label)
test中的viterbi函数(如果不用viterbi动态规划解码,直接使用logits输出也可以得到结果,但是观察输出可以发现在缺少转移矩阵的情况下,会出现如下这种情况
中 B-ORG
国 I-LOC
共 I-ORG
产 I-ORG
党 I-ORG
中 O
央 I-ORG
委 I-ORG
员 I-ORG
会 I-ORG)
很明显模型识别出了这是一个组织机构名称,但神经网络没有学到 I 标签必须跟在 B 标签之后(这是人为定义的约束),也就是说在转移矩阵中 O -> I 的概率应该几乎为 0。CRF 的转移矩阵配合维特比动态规划解码就能很好地解决这个问题。
维特比求解函数:
def viterbi(self, sess, test_datas, tag2ids):
    """CRF decoding of the test set.

    Runs the batch logits together with the learned transition matrix
    through viterbi_decode, then prints one `char predicted_tag` pair
    per line for every sentence.
    """
    # invert tag2ids so predicted indices map back to tag strings
    label2tags = {}
    for key in tag2ids.keys():
        label2tags[tag2ids.get(key)] = key
    batchs = batch_yield(test_datas, self.batch_size, self.vocab, tag2ids)
    for step, (seq, labels) in enumerate(batchs):
        wordids, wordslen = padding_squence(seq)
        feed_datas = {
            self.word2ids: wordids,
            self.squence_length: wordslen
        }
        # trasiton: CRF transition matrix learned by crf_log_likelihood
        predict, trasiton = sess.run(
            [self.logists, self.transition], feed_dict=feed_datas)
        if self.CRF:
            label_list = []
            for logit, seq_len in zip(predict, wordslen):
                # decode only the valid (unpadded) prefix of each sentence
                viterbi_seq, _ = viterbi_decode(logit[:seq_len], trasiton)
                label_list.append(viterbi_seq)
            pred_list = []
            # NOTE(review): this slice assumes batch_yield ran with
            # shuffle=False so batches align with test_datas order -- confirm
            normal_datas = test_datas[step * self.batch_size:(step + 1) * self.batch_size]
            for ii in label_list:
                i = []
                for tag_index in ii:
                    tag = label2tags.get(tag_index)
                    i.append(tag)
                pred_list.append(i)
            for idx, sen in enumerate(normal_datas):
                for i in range(len(sen[0])):
                    print sen[0][i], pred_list[idx][i]
普通对logits求解函数:
def pred_one_epoch(self,sess,test_datas,tag2ids): label2tags = {} for key in tag2ids.keys(): label2tags[tag2ids.get(key)] = key batchs = batch_yield(test_datas, self.batch_size, self.vocab, tag2ids) for step,(seq,labels) in enumerate(batchs): wordids,wordslen = padding_squence(seq) feed_datas = { self.word2ids:wordids, self.squence_length:wordslen } predict = sess.run(self.logists,feed_dict=feed_datas) predict = tf.convert_to_tensor(predict) pred_max = tf.squeeze(tf.argmax(predict,axis=-1)) normal_datas = test_datas[step*self.batch_size:(step+1)*self.batch_size] pred_list = [] for ii in pred_max.eval(): i = [] for tag_index in ii: tag = label2tags.get(tag_index) i.append(tag) pred_list.append(i) for idx,sen in enumerate(normal_datas): for i in range(len(sen[0])): print sen[0][i],pred_list[idx][i]
源码和数据地址:链接: https://pan.baidu.com/s/19kKvfSjO8QZQSX2hyQnCAw 提取码: f99s 。
数据里面有训练40 EPOCH的模型可以直接调用,如果想自己训练直接把model.py下的train_变量改成True