Let ChatGPT use TensorFlow Keras to assemble BERT, GPT, and Transformer

This article mainly shows the relationships and differences between the neural network structures of Transformer, BERT, and GPT. There is a lot of information on the Internet, but little of it shows this relationship clearly. This article is organized as supplementary material; it also uses ChatGPT to assemble the corresponding mini code with TensorFlow Keras to assist understanding.

From this assembly, it can be seen intuitively that:

Transformer: both the Encoder and the Decoder modules are used
BERT: only the Transformer's Encoder is used for module assembly
GPT: only the Transformer's Decoder is used for module assembly

Implementing the Transformer model with TensorFlow Keras

insert image description here

There is plenty of material on the Internet that explains what each layer of the Transformer does. That can be broken down in a separate article to deepen my own understanding. This document assumes the reader already understands it.

import tensorflow as tf

class Transformer(tf.keras.Model):
    """Encoder-decoder model that maps source and target token ids to logits.

    The model chains an ``Encoder``, a ``Decoder`` and a final linear
    projection (a ``Dense`` layer without activation) of width
    ``target_vocab_size``.

    Args:
        num_layers: Number of layers in both the encoder and the decoder.
        d_model: Feature size used throughout the model.
        num_heads: Number of attention heads per attention layer.
        d_ff: Size handed to the feed-forward sub-layers.
        input_vocab_size: Vocabulary size for the encoder embedding.
        target_vocab_size: Vocabulary size for the decoder embedding and
            for the output projection.
        dropout_rate: Dropout rate forwarded to the encoder and decoder.
    """

    def __init__(self, num_layers, d_model, num_heads, d_ff, input_vocab_size, target_vocab_size, dropout_rate=0.1):
        super().__init__()

        shared_args = (num_layers, d_model, num_heads, d_ff)
        self.encoder = Encoder(*shared_args, input_vocab_size, dropout_rate)
        self.decoder = Decoder(*shared_args, target_vocab_size, dropout_rate)
        # No activation: the model returns raw scores over the target vocabulary.
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs, targets, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        """Run the encoder, then the decoder, then the output projection.

        Args:
            inputs: Source token ids fed to the encoder.
            targets: Target token ids fed to the decoder.
            enc_padding_mask: Mask passed to the encoder.
            look_ahead_mask: Mask for the decoder's self-attention.
            dec_padding_mask: Mask for the decoder's attention over the
                encoder output.

        Returns:
            The output of the final dense layer applied to the decoder output.
        """
        memory = self.encoder(inputs, enc_padding_mask)
        decoded = self.decoder(targets, memory, look_ahead_mask, dec_padding_mask)
        return self.final_layer(decoded)

class Encoder(tf.keras.layers.Layer):
    """Stack of ``EncoderLayer`` blocks on top of an embedded, position-encoded input.

    NOTE(review): ``PositionalEncoding`` is not defined in this snippet. It is
    given ``vocab_size`` as its first argument here -- confirm whether that
    argument is really meant to be a maximum sequence length.

    Args:
        num_layers: Number of ``EncoderLayer`` blocks to stack.
        d_model: Embedding / feature size.
        num_heads: Number of attention heads in every block.
        d_ff: Size handed to each block's feed-forward network.
        vocab_size: Vocabulary size of the token embedding.
        dropout_rate: Dropout rate for the input dropout and for every block.
    """

    def __init__(self, num_layers, d_model, num_heads, d_ff, vocab_size, dropout_rate=0.1):
        super().__init__()

        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(vocab_size, d_model)
        self.encoder_layers = [
            EncoderLayer(d_model, num_heads, d_ff, dropout_rate)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs, padding_mask):
        """Embed ``inputs``, add positions, apply dropout and run every block.

        Args:
            inputs: Token ids to encode.
            padding_mask: Mask forwarded unchanged to every encoder block.

        Returns:
            The output of the last encoder block.
        """
        hidden = self.dropout(self.positional_encoding(self.embedding(inputs)))
        # The list holds exactly ``num_layers`` blocks, so iterate it directly.
        for block in self.encoder_layers:
            hidden = block(hidden, padding_mask)
        return hidden

class Decoder(tf.keras.layers.Layer):
    """Stack of ``DecoderLayer`` blocks on top of an embedded, position-encoded input.

    NOTE(review): ``PositionalEncoding`` is not defined in this snippet. It is
    given ``vocab_size`` as its first argument here -- confirm whether that
    argument is really meant to be a maximum sequence length.

    Args:
        num_layers: Number of ``DecoderLayer`` blocks to stack.
        d_model: Embedding / feature size.
        num_heads: Number of attention heads in every block.
        d_ff: Size handed to each block's feed-forward network.
        vocab_size: Vocabulary size of the token embedding.
        dropout_rate: Dropout rate for the input dropout and for every block.
    """

    def __init__(self, num_layers, d_model, num_heads, d_ff, vocab_size, dropout_rate=0.1):
        super().__init__()

        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(vocab_size, d_model)
        self.decoder_layers = [
            DecoderLayer(d_model, num_heads, d_ff, dropout_rate)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs, encoder_output, look_ahead_mask, padding_mask):
        """Embed ``inputs``, add positions, apply dropout and run every block.

        Args:
            inputs: Target token ids to decode.
            encoder_output: Encoder result that every block attends over.
            look_ahead_mask: Mask forwarded to each block's self-attention.
            padding_mask: Mask forwarded to each block's attention over
                ``encoder_output``.

        Returns:
            The output of the last decoder block.
        """
        hidden = self.dropout(self.positional_encoding(self.embedding(inputs)))
        # The list holds exactly ``num_layers`` blocks, so iterate it directly.
        for block in self.decoder_layers:
            hidden = block(hidden, encoder_output, look_ahead_mask, padding_mask)
        return hidden

class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network.

    Each sub-layer's output goes through dropout, is added to the sub-layer's
    input, and the sum is layer-normalized.

    Args:
        d_model: Feature size of the block.
        num_heads: Number of attention heads.
        d_ff: Size handed to ``FeedForwardNetwork``.
        dropout_rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout_rate=0.1):
        super().__init__()

        self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs, padding_mask):
        """Apply the attention sub-layer and then the feed-forward sub-layer.

        Args:
            inputs: Block input; used as query, key and value.
            padding_mask: Mask handed to the attention layer.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        # Sub-layer 1: self-attention -> dropout -> residual add -> layer norm.
        attended = self.multi_head_attention(inputs, inputs, inputs, padding_mask)
        normed_attention = self.layer_norm1(inputs + self.dropout1(attended))

        # Sub-layer 2: feed-forward -> dropout -> residual add -> layer norm.
        transformed = self.dropout2(self.ffn(normed_attention))
        return self.layer_norm2(normed_attention + transformed)

class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: self-attention, encoder attention and feed-forward.

    Each of the three sub-layers is followed by dropout, a residual add with
    the sub-layer's input, and layer normalization.

    Args:
        d_model: Feature size of the block.
        num_heads: Number of attention heads in both attention layers.
        d_ff: Size handed to ``FeedForwardNetwork``.
        dropout_rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout_rate=0.1):
        super().__init__()

        self.multi_head_attention1 = MultiHeadAttention(d_model, num_heads)
        self.multi_head_attention2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout3 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs, encoder_output, look_ahead_mask, padding_mask):
        """Run the three sub-layers in order.

        Args:
            inputs: Block input; query, key and value of the first attention.
            encoder_output: Key and value of the second attention.
            look_ahead_mask: Mask handed to the first attention.
            padding_mask: Mask handed to the second attention.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        # Sub-layer 1: self-attention over the decoder input.
        self_attended = self.multi_head_attention1(inputs, inputs, inputs, look_ahead_mask)
        self_normed = self.layer_norm1(inputs + self.dropout1(self_attended))

        # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
        cross_attended = self.multi_head_attention2(
            self_normed, encoder_output, encoder_output, padding_mask
        )
        cross_normed = self.layer_norm2(self_normed + self.dropout2(cross_attended))

        # Sub-layer 3: feed-forward network.
        transformed = self.dropout3(self.ffn(cross_normed))
        return self.layer_norm3(cross_normed + transformed)

class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    Queries, keys and values are each projected to ``d_model`` features,
    split into ``num_heads`` heads of ``d_model // num_heads`` features,
    attended per head, merged back and passed through a final projection.

    NOTE(review): ``scaled_dot_product_attention`` is not defined in this
    snippet; it is assumed to accept ``(q, k, v, mask)`` and to return
    ``(output, attention_weights)`` -- confirm against its definition.

    Args:
        d_model: Feature size of the projections and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()

        # Fail early: otherwise the reshape in ``split_heads`` fails later
        # with a much less helpful shape error.
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )

        self.num_heads = num_heads
        self.d_model = d_model

        self.depth = d_model // num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to ``(batch_size, num_heads, seq_len, depth)``.

        The last axis of ``x`` is split into ``(num_heads, depth)`` and the
        head axis is moved in front of the sequence axis.
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, query, key, value, mask):
        """Attend ``query`` over ``key``/``value`` and project the result.

        Args:
            query: Tensor whose first axis is the batch axis.
            key: Tensor supplying the attention keys.
            value: Tensor supplying the attention values.
            mask: Mask forwarded unchanged to ``scaled_dot_product_attention``.

        Returns:
            A single tensor of shape ``(batch_size, query_len, d_model)``.
            Only the projected output is returned because the callers in
            this file (``EncoderLayer`` / ``DecoderLayer``) feed the result
            straight into dropout and a residual add.
        """
        batch_size = tf.shape(query)[0]

        q = self.split_heads(self.wq(query), batch_size)
        k = self.split_heads(self.wk(key), batch_size)
        v = self.split_heads(self.wv(value), batch_size)

        # The attention weights are not needed by any caller; discard them.
        scaled_attention, _ = scaled_dot_product_attention(q, k, v, mask)

        # (batch, heads, seq, depth) -> (batch, seq, heads, depth) -> (batch, seq, d_model)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))

        # The original returned ``output, attention``; ``attention`` was never
        # defined and raised NameError on every call.
        return self.dense(concat_attention)

Implementing the BERT model with TensorFlow Keras

insert image description here

Each Trm on the left, when enlarged, is the structure shown on the right, which is the Encoder part of the original Transformer. It can also be seen on the left that BERT assembles the Transformer blocks in both directions (bidirectionally). BERT's training tasks include MLM (Masked Language Model) and NSP (Next Sentence Prediction). BERT's training is usually described as unsupervised, because MLM simply masks some tokens of the corpus, so the answers the output has to predict (the annotation information) are already contained in the corpus itself. From this point of view, it is actually supervised.

import tensorflow as tf

class BERT(tf.keras.Model):
    """Minimal BERT-style encoder: token embedding plus stacked ``TransformerLayer`` blocks.

    Args:
        vocab_size: Vocabulary size of the token embedding.
        hidden_size: Embedding / feature size.
        num_attention_heads: Number of attention heads per layer.
        num_transformer_layers: Number of ``TransformerLayer`` blocks.
        intermediate_size: Size handed to each layer's feed-forward part.
    """

    def __init__(self, vocab_size, hidden_size, num_attention_heads, num_transformer_layers, intermediate_size):
        super().__init__()

        self.embedding = tf.keras.layers.Embedding(vocab_size, hidden_size)
        self.transformer_layers = [
            TransformerLayer(hidden_size, num_attention_heads, intermediate_size)
            for _ in range(num_transformer_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(0.1)

    def call(self, inputs, attention_mask):
        """Embed ``inputs``, run every transformer layer, then apply dropout.

        Args:
            inputs: Token ids.
            attention_mask: Mask forwarded unchanged to every layer.

        Returns:
            The hidden states of the last layer after dropout.
        """
        states = self.embedding(inputs)
        for layer in self.transformer_layers:
            states = layer(states, attention_mask)
        # Dropout is applied once, after the final layer.
        return self.dropout(states)

class TransformerLayer(tf.keras.layers.Layer):
    """One encoder-style block: self-attention followed by a feed-forward part.

    Each sub-layer's output goes through dropout (rate 0.1), is added to the
    sub-layer's input, and the sum is layer-normalized.

    Args:
        hidden_size: Feature size of the block.
        num_attention_heads: Number of attention heads.
        intermediate_size: Size handed to ``FeedForward``.
    """

    def __init__(self, hidden_size, num_attention_heads, intermediate_size):
        super().__init__()

        self.attention = MultiHeadAttention(hidden_size, num_attention_heads)
        self.feed_forward = FeedForward(hidden_size, intermediate_size)
        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(0.1)
        self.dropout2 = tf.keras.layers.Dropout(0.1)

    def call(self, inputs, attention_mask):
        """Apply the attention sub-layer and then the feed-forward sub-layer.

        Args:
            inputs: Block input; used as query, key and value.
            attention_mask: Mask handed to the attention layer.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        # Sub-layer 1: self-attention -> dropout -> residual add -> layer norm.
        attended = self.dropout1(self.attention(inputs, inputs, inputs, attention_mask))
        normed_attention = self.layer_norm1(inputs + attended)

        # Sub-layer 2: feed-forward -> dropout -> residual add -> layer norm.
        transformed = self.dropout2(self.feed_forward(normed_attention))
        return self.layer_norm2(normed_attention + transformed)

class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention with an optional additive mask.

    Args:
        hidden_size: Feature size of the projections and of the output.
        num_attention_heads: Number of heads; must divide ``hidden_size``.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by
            ``num_attention_heads``.
    """

    def __init__(self, hidden_size, num_attention_heads):
        super(MultiHeadAttention, self).__init__()

        # Fail early: otherwise the reshape in ``_split_heads`` fails later
        # with a much less helpful shape error.
        if hidden_size % num_attention_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by "
                f"num_attention_heads ({num_attention_heads})"
            )

        # Kept on the instance because ``call`` needs it to merge the heads;
        # the original referenced the constructor-only name and raised NameError.
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = hidden_size // num_attention_heads

        self.query = tf.keras.layers.Dense(hidden_size)
        self.key = tf.keras.layers.Dense(hidden_size)
        self.value = tf.keras.layers.Dense(hidden_size)
        self.dense = tf.keras.layers.Dense(hidden_size)

    def call(self, query, key, value, attention_mask):
        """Attend ``query`` over ``key``/``value`` and project the result.

        Args:
            query: Tensor of shape ``(batch, query_len, features)``.
            key: Tensor of shape ``(batch, key_len, features)``.
            value: Tensor of shape ``(batch, key_len, features)``.
            attention_mask: ``None`` or a tensor that is *added* to the
                scaled scores before the softmax, so it must broadcast to
                ``(batch, heads, query_len, key_len)``.

        Returns:
            Tensor of shape ``(batch, query_len, hidden_size)``.
        """
        query = self._split_heads(self.query(query))
        key = self._split_heads(self.key(key))
        value = self._split_heads(self.value(value))

        # (batch, heads, q_len, head) x (batch, heads, k_len, head)^T
        attention_scores = tf.matmul(query, key, transpose_b=True)
        attention_scores /= tf.math.sqrt(tf.cast(self.attention_head_size, attention_scores.dtype))

        if attention_mask is not None:
            attention_scores += attention_mask

        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
        context_layer = tf.matmul(attention_probs, value)

        # (batch, heads, q_len, head) -> (batch, q_len, heads, head) -> (batch, q_len, hidden)
        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
        context_layer = tf.reshape(
            context_layer, (tf.shape(context_layer)[0], -1, self.hidden_size)
        )

        return self.dense(context_layer)

    def _split_heads(self, x):
        """Reshape ``(batch, length, hidden)`` to ``(batch, heads, length, head_size)``."""
        batch_size = tf.shape(x)[0]
        length = tf.shape(x)[1]
        x = tf.reshape(
            x, (batch_size, length, self.num_attention_heads, self.attention_head_size)
        )
        return tf.transpose(x, perm=[0, 2, 1, 3])

Implementing the GPT model with TensorFlow Keras

insert image description here

Each Trm on the left, when enlarged, is the structure shown on the right, which is the Decoder part of the original Transformer. It can also be seen on the left that GPT assembles the Transformer blocks in one direction only. The training task of GPT is to generate the next token. GPT is usually described as unsupervised, because from a machine learning perspective the "label information" (the next token) needed for the output is already provided by the corpus. From this point of view, it is actually supervised.

import tensorflow as tf

class GPT(tf.keras.Model):
    """Minimal GPT-style model: embedding, positions, stacked blocks, softmax head.

    NOTE(review): ``PositionalEncoding`` is not defined in this snippet; it is
    constructed with ``(max_seq_length, hidden_size)`` -- confirm its contract.

    Args:
        vocab_size: Vocabulary size of the embedding and of the output layer.
        hidden_size: Embedding / feature size.
        num_layers: Number of ``TransformerBlock`` blocks.
        num_heads: Number of attention heads per block.
        intermediate_size: Size handed to each block's feed-forward part.
        max_seq_length: First argument passed to ``PositionalEncoding``.
    """

    def __init__(self, vocab_size, hidden_size, num_layers, num_heads, intermediate_size, max_seq_length):
        super().__init__()

        self.embedding = tf.keras.layers.Embedding(vocab_size, hidden_size)
        self.positional_encoding = PositionalEncoding(max_seq_length, hidden_size)
        self.transformer_blocks = [
            TransformerBlock(hidden_size, num_heads, intermediate_size)
            for _ in range(num_layers)
        ]
        self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.final_dense = tf.keras.layers.Dense(vocab_size, activation='softmax')

    def call(self, inputs):
        """Map token ids to a softmax distribution over the vocabulary.

        Args:
            inputs: Token ids.

        Returns:
            The output of the softmax-activated dense layer applied to the
            layer-normalized hidden states of the last block.
        """
        states = self.positional_encoding(self.embedding(inputs))
        for block in self.transformer_blocks:
            states = block(states)
        return self.final_dense(self.final_layer_norm(states))

class TransformerBlock(tf.keras.layers.Layer):
    """One GPT block: self-attention followed by a feed-forward part.

    Each sub-layer's output is added to the sub-layer's input and the sum is
    layer-normalized. No dropout is used in this block.

    Args:
        hidden_size: Feature size of the block.
        num_heads: Number of attention heads.
        intermediate_size: Size handed to ``FeedForward``.
    """

    def __init__(self, hidden_size, num_heads, intermediate_size):
        super().__init__()

        self.attention = MultiHeadAttention(hidden_size, num_heads)
        self.feed_forward = FeedForward(hidden_size, intermediate_size)
        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        """Apply the attention sub-layer and then the feed-forward sub-layer.

        Args:
            inputs: Block input; used as query, key and value.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        # Sub-layer 1: self-attention -> residual add -> layer norm.
        normed_attention = self.layer_norm1(inputs + self.attention(inputs, inputs, inputs))
        # Sub-layer 2: feed-forward -> residual add -> layer norm.
        return self.layer_norm2(normed_attention + self.feed_forward(normed_attention))

class MultiHeadAttention(tf.keras.layers.Layer):
    """Causal multi-head scaled dot-product attention for the GPT blocks.

    A look-ahead mask is built inside ``call`` so that each position can only
    attend to itself and to earlier positions. Without it the block would be
    a bidirectional encoder, not the one-way decoder GPT is built from.

    Args:
        hidden_size: Feature size of the projections and of the output.
        num_heads: Number of heads; must divide ``hidden_size``.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by ``num_heads``.
    """

    def __init__(self, hidden_size, num_heads):
        super(MultiHeadAttention, self).__init__()

        # Fail early: otherwise the reshape in ``_split_heads`` fails later
        # with a much less helpful shape error.
        if hidden_size % num_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
            )

        # Kept on the instance because ``call`` needs it to merge the heads;
        # the original referenced the constructor-only name and raised NameError.
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.attention_head_size = hidden_size // num_heads

        self.query = tf.keras.layers.Dense(hidden_size)
        self.key = tf.keras.layers.Dense(hidden_size)
        self.value = tf.keras.layers.Dense(hidden_size)
        self.dense = tf.keras.layers.Dense(hidden_size)

    def call(self, query, key, value):
        """Attend ``query`` over ``key``/``value`` causally and project the result.

        Args:
            query: Tensor of shape ``(batch, query_len, features)``.
            key: Tensor of shape ``(batch, key_len, features)``.
            value: Tensor of shape ``(batch, key_len, features)``.

        Returns:
            Tensor of shape ``(batch, query_len, hidden_size)``.
        """
        query = self._split_heads(self.query(query))
        key = self._split_heads(self.key(key))
        value = self._split_heads(self.value(value))

        # (batch, heads, q_len, head) x (batch, heads, k_len, head)^T
        attention_scores = tf.matmul(query, key, transpose_b=True)
        attention_scores /= tf.math.sqrt(tf.cast(self.attention_head_size, attention_scores.dtype))

        # Look-ahead mask: query position i may only see key positions <= i.
        query_len = tf.shape(attention_scores)[-2]
        key_len = tf.shape(attention_scores)[-1]
        allowed = tf.range(query_len)[:, tf.newaxis] >= tf.range(key_len)[tf.newaxis, :]
        # Use the dtype's most negative finite value so that blocked positions
        # get ~0 probability without producing NaN/inf in reduced precision.
        blocked_score = tf.constant(attention_scores.dtype.min, dtype=attention_scores.dtype)
        attention_scores = tf.where(allowed, attention_scores, blocked_score)

        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
        context_layer = tf.matmul(attention_probs, value)

        # (batch, heads, q_len, head) -> (batch, q_len, heads, head) -> (batch, q_len, hidden)
        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
        context_layer = tf.reshape(
            context_layer, (tf.shape(context_layer)[0], -1, self.hidden_size)
        )

        return self.dense(context_layer)

    def _split_heads(self, x):
        """Reshape ``(batch, length, hidden)`` to ``(batch, heads, length, head_size)``."""
        batch_size = tf.shape(x)[0]
        length = tf.shape(x)[1]
        x = tf.reshape(x, (batch_size, length, self.num_heads, self.attention_head_size))
        return tf.transpose(x, perm=[0, 2, 1, 3])