Notes on Relation Extraction (Part 2) — RNN-based Models

Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification

Background

Prior approaches depend on lexical resources such as WordNet and on NLP tools such as dependency parsing and named entity recognition (NER); moreover, the important information may appear anywhere in the sentence.

Model

(figure: overall model architecture)

  1. Input layer

  2. Embedding layer

  3. BiLSTM layer

  4. Attention layer: aggregates the word-level features into a sentence-level feature vector (the attention and output-layer formulas are reconstructed right after this list).

  5. Output layer
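
The images that originally showed the per-layer formulas did not survive extraction. As a reconstruction that matches the attention() implementation in the code below (H is the matrix of BiLSTM output vectors, w corresponds to the trainable vector u_omega, and W^{(S)}, b^{(S)} are the output-layer parameters), the attention and output layers compute:

    \begin{aligned} M &= tanh(H) \\ \alpha &= softmax(w^TM) \\ r &= H\alpha^T \\ h^* &= tanh(r) \\ p(y|S) &= softmax(W^{(S)}h^*+b^{(S)}) \end{aligned}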

Code

    def __init__(self, sequence_length, num_classes, vocab_size, embedding_size,
                 hidden_size, l2_reg_lambda=0.0):
        # Placeholders for input, output and dropout
        self.input_text = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_text')
        self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
        self.emb_dropout_keep_prob = tf.placeholder(tf.float32, name='emb_dropout_keep_prob')
        self.rnn_dropout_keep_prob = tf.placeholder(tf.float32, name='rnn_dropout_keep_prob')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

        initializer = tf.keras.initializers.glorot_normal

        # Word Embedding Layer
        with tf.device('/cpu:0'), tf.variable_scope("word-embeddings"):
            self.W_text = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -0.25, 0.25), name="W_text")
            self.embedded_chars = tf.nn.embedding_lookup(self.W_text, self.input_text)

        # Dropout for Word Embedding
        with tf.variable_scope('dropout-embeddings'):
            self.embedded_chars = tf.nn.dropout(self.embedded_chars, self.emb_dropout_keep_prob)

        # Bidirectional LSTM
        with tf.variable_scope("bi-lstm"):
            self.rnn_outputs = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hidden_size, return_sequences=True), merge_mode='sum')(self.embedded_chars)
        # Attention
        with tf.variable_scope('attention'):
            self.attn, self.alphas = attention(self.rnn_outputs)

        # Dropout
        with tf.variable_scope('dropout'):
            self.h_drop = tf.nn.dropout(self.attn, self.dropout_keep_prob)

        # Fully connected layer
        with tf.variable_scope('output'):
            self.logits = tf.layers.dense(self.h_drop, num_classes, kernel_initializer=initializer())
            self.predictions = tf.argmax(self.logits, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.variable_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.input_y)
            self.l2 = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * self.l2

        # Accuracy
        with tf.variable_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name="accuracy")

The attention function is as follows:

def attention(inputs):
    # Trainable parameters
    hidden_size = inputs.shape[2].value
    u_omega = tf.get_variable("u_omega", [hidden_size], initializer=tf.keras.initializers.glorot_normal())

    with tf.name_scope('v'):
        v = tf.tanh(inputs)

    # For each time step, the hidden vector in `v` is reduced to a scalar score with the `u_omega` vector
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
    alphas = tf.nn.softmax(vu, name='alphas')  # (B,T) shape

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

    # Final output with tanh
    output = tf.tanh(output)

    return output, alphas
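
A minimal usage sketch for attention() follows (a shape check, not part of the original post; it assumes TensorFlow 1.x, that attention() above is in scope, and arbitrary toy dimensions B=2, T=10, D=8):

import numpy as np
import tensorflow as tf

tf.reset_default_graph()
rnn_outputs = tf.placeholder(tf.float32, shape=[None, 10, 8])  # (B, T, D)
attn_output, alphas = attention(rnn_outputs)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out, a = sess.run([attn_output, alphas],
                      feed_dict={rnn_outputs: np.random.rand(2, 10, 8)})
    print(out.shape)  # (2, 8)  sentence-level feature
    print(a.shape)    # (2, 10) per-word attention weights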

Semantic Relation Classification via Bidirectional LSTM Networks with Entity-aware Attention using Latent Entity Typing

Background

Model

(figure: overall model architecture)

  1. Word Representation
  2. Self Attention: Q=K=V; each head captures different information in a different representation subspace, and the head outputs are concatenated at the end.
    \begin{aligned}&Attention(Q, K, V)=softmax(\frac{QK^T}{\sqrt{d_w}})V\\ &MultiHead(Q,K,V)=W^M[head_1;...;head_h]\\ &head_i=Attention(W_i^QQ, W_i^KK, W_i^VV)\end{aligned}
  3. BiLSTM: a bidirectional LSTM layer.
  4. Entity-aware Attention: p_i^{e_1} and p_i^{e_2} are position embeddings, h_{e_1} and h_{e_2} are the BiLSTM outputs at the two entity positions, h_i is the BiLSTM output for the current word, and c_i is the i-th latent type vector (a small NumPy sketch of the latent typing step follows this list).
    \begin{aligned} u_i &= tanh(W^H[h_i;p_i^{e_1};p_i^{e_2}]+W^E[h_{e_1};t_1;h_{e_2};t_2]) \\ \alpha_i &= \frac{exp(v^Tu_i)}{\sum_{j=1}^{n}exp(v^Tu_j)} \\ z &= \sum_{i=1}^n\alpha_ih_i \\ a_i^j &= \frac{exp((h_{e_j})^Tc_i)}{\sum_{k=1}^K exp((h_{e_j})^Tc_k)} \\ t_{j\in\{1,2\}} &= \sum_{i=1}^Ka_i^jc_i \end{aligned}
  5. Loss function
    L=-\sum_{i=1}^{|D|}log\,p(y^{(i)}|S^{(i)}, \theta)+\lambda\|\theta\|^2_2
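
As referenced in item 4, here is a small NumPy sketch of the latent entity typing step (toy shapes and untrained random vectors, for illustration only; it mirrors the latent_type_attention() function shown later in the code):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

batch, hidden, num_type = 2, 4, 3
e1_h = np.random.rand(batch, hidden)            # BiLSTM output at entity 1: h_{e_1}
latent_type = np.random.rand(num_type, hidden)  # latent type vectors c_1..c_K

a1 = softmax(e1_h @ latent_type.T)              # (batch, num_type): weights a_i^1
t1 = a1 @ latent_type                           # (batch, hidden):   type vector t_1
print(a1.shape, t1.shape)  # (2, 3) (2, 4)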

Code

    def __init__(self, sequence_length, num_classes,
                 vocab_size, embedding_size, pos_vocab_size, pos_embedding_size,
                 hidden_size, num_heads, attention_size,
                 use_elmo=False, l2_reg_lambda=0.0):
        # Placeholders for input, output and dropout
        self.input_x = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
        self.input_text = tf.placeholder(tf.string, shape=[None, ], name='input_text')
        self.input_e1 = tf.placeholder(tf.int32, shape=[None, ], name='input_e1')
        self.input_e2 = tf.placeholder(tf.int32, shape=[None, ], name='input_e2')
        self.input_p1 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p1')
        self.input_p2 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p2')
        self.emb_dropout_keep_prob = tf.placeholder(tf.float32, name='emb_dropout_keep_prob')
        self.rnn_dropout_keep_prob = tf.placeholder(tf.float32, name='rnn_dropout_keep_prob')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

        if use_elmo:
            # Contextual Embedding Layer
            with tf.variable_scope("elmo-embeddings"):
                elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
                self.embedded_chars = elmo_model(self.input_text, signature="default", as_dict=True)["elmo"]
        else:
            # Word Embedding Layer
            with tf.device('/cpu:0'), tf.variable_scope("word-embeddings"):
                self.W_text = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -0.25, 0.25), name="W_text")
                self.embedded_chars = tf.nn.embedding_lookup(self.W_text, self.input_x)

        # Position Embedding Layer
        with tf.device('/cpu:0'), tf.variable_scope("position-embeddings"):
            self.W_pos = tf.get_variable("W_pos", [pos_vocab_size, pos_embedding_size], initializer=initializer())
            self.p1 = tf.nn.embedding_lookup(self.W_pos, self.input_p1)[:, :tf.shape(self.embedded_chars)[1]]
            self.p2 = tf.nn.embedding_lookup(self.W_pos, self.input_p2)[:, :tf.shape(self.embedded_chars)[1]]

        # Dropout for Word Embedding
        with tf.variable_scope('dropout-embeddings'):
            self.embedded_chars = tf.nn.dropout(self.embedded_chars,  self.emb_dropout_keep_prob)

        # Self Attention
        with tf.variable_scope("self-attention"):
            self.self_attn, self.self_alphas = multihead_attention(self.embedded_chars, self.embedded_chars,
                                                                   num_units=embedding_size, num_heads=num_heads)

        # Bidirectional LSTM
        with tf.variable_scope("bi-lstm"):
            _fw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
            fw_cell = tf.nn.rnn_cell.DropoutWrapper(_fw_cell, self.rnn_dropout_keep_prob)
            _bw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
            bw_cell = tf.nn.rnn_cell.DropoutWrapper(_bw_cell, self.rnn_dropout_keep_prob)
            self.rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell,
                                                                  cell_bw=bw_cell,
                                                                  inputs=self.self_attn,
                                                                  sequence_length=self._length(self.input_x),
                                                                  dtype=tf.float32)
            self.rnn_outputs = tf.concat(self.rnn_outputs, axis=-1)

        # Attention
        with tf.variable_scope('attention'):
            self.attn, self.alphas, self.e1_alphas, self.e2_alphas = attention(self.rnn_outputs,
                                                                               self.input_e1, self.input_e2,
                                                                               self.p1, self.p2,
                                                                               attention_size=attention_size)

        # Dropout
        with tf.variable_scope('dropout'):
            self.h_drop = tf.nn.dropout(self.attn, self.dropout_keep_prob)

        # Fully connected layer
        with tf.variable_scope('output'):
            self.logits = tf.layers.dense(self.h_drop, num_classes, kernel_initializer=initializer())
            self.predictions = tf.argmax(self.logits, 1, name="predictions")

        # Calculate mean cross-entropy loss
        with tf.variable_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.input_y)
            self.l2 = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * self.l2

        # Accuracy
        with tf.variable_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name="accuracy")

The multi-head attention function is as follows:

def multihead_attention(queries, keys, num_units, num_heads,
                        dropout_rate=0, scope="multihead_attention", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # Linear projections
        Q = tf.layers.dense(queries, num_units, kernel_initializer=initializer())  # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, kernel_initializer=initializer())  # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, kernel_initializer=initializer())  # (N, T_k, C)

        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)

        # Scale
        outputs /= K_.get_shape().as_list()[-1] ** 0.5

        # Key Masking
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

        paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Activation
        alphas = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

        # Query Masking
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))  # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
        alphas *= query_masks  # (h*N, T_q, T_k)

        # Dropouts
        alphas = tf.layers.dropout(alphas, rate=dropout_rate, training=tf.convert_to_tensor(True))

        # Weighted sum
        outputs = tf.matmul(alphas, V_)  # ( h*N, T_q, C/h)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)

        # Linear
        outputs = tf.layers.dense(outputs, num_units, activation=tf.nn.relu, kernel_initializer=initializer())

        # Residual connection
        outputs += queries

        # Normalize
        outputs = layer_norm(outputs)  # (N, T_q, C)

    return outputs, alphas
    
def layer_norm(inputs, epsilon=1e-8, scope="layer_norm", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.Variable(tf.zeros(params_shape))
        gamma = tf.Variable(tf.ones(params_shape))
        normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
        outputs = gamma * normalized + beta

    return outputs
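
A minimal shape check for multihead_attention() follows (not part of the original post). It assumes TensorFlow 1.x, that the functions above are in the same module, and that a module-level initializer is defined, as in the first code block:

import numpy as np
import tensorflow as tf

initializer = tf.keras.initializers.glorot_normal  # assumed, matching the first code block

tf.reset_default_graph()
x = tf.placeholder(tf.float32, shape=[None, 12, 8])  # (N, T, C)
out, alphas = multihead_attention(x, x, num_units=8, num_heads=2)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    o, a = sess.run([out, alphas], feed_dict={x: np.random.rand(3, 12, 8)})
    print(o.shape)  # (3, 12, 8)   same shape as the input
    print(a.shape)  # (6, 12, 12)  heads are stacked along the batch axis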

The entity-aware attention function is as follows:

def attention(inputs, e1, e2, p1, p2, attention_size):
    # inputs = (batch, seq_len, hidden)
    # e1, e2 = (batch, seq_len)
    # p1, p2 = (batch, seq_len, dist_emb_size)
    # attention_size = scalar(int)
    def extract_entity(x, e):
        e_idx = tf.concat([tf.expand_dims(tf.range(tf.shape(e)[0]), axis=-1), tf.expand_dims(e, axis=-1)], axis=-1)
        return tf.gather_nd(x, e_idx)  # (batch, hidden)
    seq_len = tf.shape(inputs)[1]  # dynamic, known only at run-time
    hidden_size = inputs.shape[2].value  # static, known at graph-construction time
    latent_size = hidden_size

    # Latent Relation Variable based on Entities
    e1_h = extract_entity(inputs, e1)  # (batch, hidden)
    e2_h = extract_entity(inputs, e2)  # (batch, hidden)
    e1_type, e2_type, e1_alphas, e2_alphas = latent_type_attention(e1_h, e2_h,
                                                                   num_type=3,
                                                                   latent_size=latent_size)  # (batch, hidden)
    e1_h = tf.concat([e1_h, e1_type], axis=-1)  # (batch, hidden+latent)
    e2_h = tf.concat([e2_h, e2_type], axis=-1)  # (batch, hidden+latent)

    # v * tanh(W*[h;p1;p2] + W*[e1;e2])
    e_h = tf.layers.dense(tf.concat([e1_h, e2_h], -1), attention_size, use_bias=False, kernel_initializer=initializer())
    e_h = tf.reshape(tf.tile(e_h, [1, seq_len]), [-1, seq_len, attention_size])
    v = tf.layers.dense(tf.concat([inputs, p1, p2], axis=-1), attention_size, use_bias=False, kernel_initializer=initializer())
    v = tf.tanh(tf.add(v, e_h))

    u_omega = tf.get_variable("u_omega", [attention_size], initializer=initializer())
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (batch, seq_len)
    alphas = tf.nn.softmax(vu, name='alphas')  # (batch, seq_len)
    # output
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)  # (batch, hidden)

    return output, alphas, e1_alphas, e2_alphas

def latent_type_attention(e1, e2, num_type, latent_size):
    # Latent Entity Type Vectors
    latent_type = tf.get_variable("latent_type", shape=[num_type, latent_size], initializer=initializer())

    # e1_h = tf.layers.dense(e1, latent_size, kernel_initializer=initializer())
    # e2_h = tf.layers.dense(e2, latent_size, kernel_initializer=initializer())

    e1_sim = tf.matmul(e1, tf.transpose(latent_type))  # (batch, num_type)
    e1_alphas = tf.nn.softmax(e1_sim, name='e1_alphas')  # (batch, num_type)
    e1_type = tf.matmul(e1_alphas, latent_type, name='e1_type')  # (batch, hidden)

    e2_sim = tf.matmul(e2, tf.transpose(latent_type))  # (batch, num_type)
    e2_alphas = tf.nn.softmax(e2_sim, name='e2_alphas')  # (batch, num_type)
    e2_type = tf.matmul(e2_alphas, latent_type, name='e2_type')  # (batch, hidden)

    return e1_type, e2_type, e1_alphas, e2_alphas