Tip: this article defines a total of 3 classes:
1. TransformerEncoderLayer — the basic multi-head encoder layer.
2. TransformerEncoder — a stack of encoder layers; its forward ultimately calls the class above.
3. Transformer — the overall structure, which wires together the stacked encoders and decoders.
1. First, define the multi-head Transformer encoder layer
Two notes before the code:
1. nn.MultiheadAttention is a built-in PyTorch module: you only pass the model dimension and the number of heads when constructing it, and the masks are optional at call time (a short standalone sketch follows these notes).
2. The forward pass normally goes through forward_post, where the LayerNorm is applied after the attention (post-norm).
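A minimal sketch of the built-in module on its own (the (768, 2, 256) shape mirrors the example used later in this post; the dummy tensors are mine, not from the original code):

import torch
import torch.nn as nn

attn = nn.MultiheadAttention(embed_dim=256, num_heads=8, dropout=0.1)

# nn.MultiheadAttention expects (seq_len, batch, embed_dim) by default
src = torch.randn(768, 2, 256)                 # a flattened feature map: 768 positions, batch 2, 256 channels
out, attn_weights = attn(query=src, key=src, value=src)   # attn_mask / key_padding_mask are optional
print(out.shape)                               # torch.Size([768, 2, 256])

With that in mind, the full encoder layer is as follows (example):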
import copy
import math
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor


def _get_activation_fn(activation):
    # helper from the same file in the DETR repo: maps the string name to the function
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise RuntimeError(f"activation should be relu/gelu, not {activation}.")


class TransformerEncoderLayer(nn.Module):

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of the feed-forward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        # the positional encoding is simply added to the input (skipped if pos is None)
        return tensor if pos is None else tensor + pos

    def forward_post(self,
                     src,
                     src_mask: Optional[Tensor] = None,
                     src_key_padding_mask: Optional[Tensor] = None,
                     pos: Optional[Tensor] = None):
        q = k = self.with_pos_embed(src, pos)   # src + pos, e.g. (768, 2, 256)
        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)          # residual connection
        src = self.norm1(src)                    # LayerNorm *after* attention (post-norm)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

    def forward_pre(self, src,
                    src_mask: Optional[Tensor] = None,
                    src_key_padding_mask: Optional[Tensor] = None,
                    pos: Optional[Tensor] = None):
        src2 = self.norm1(src)                   # LayerNorm *before* attention (pre-norm)
        q = k = self.with_pos_embed(src2, pos)
        src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

    def forward(self, src,
                src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)
So how is this layer actually called?
src = self.forward_post(src, src_mask, src_key_padding_mask, pos)
# pos is the positional encoding (covered below); src is the input sequence
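A minimal, self-contained sketch of running one layer on dummy data (shapes follow the (768, 2, 256) comment above; the dummy tensors are my own, not from the original code):

import torch

layer = TransformerEncoderLayer(d_model=256, nhead=8)

src = torch.randn(768, 2, 256)   # (sequence length, batch, d_model)
pos = torch.randn(768, 2, 256)   # positional encoding, same shape as src

out = layer(src, pos=pos)        # normalize_before=False -> forward_post is used
print(out.shape)                 # torch.Size([768, 2, 256])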
2. Stacking multiple encoder layers (TransformerEncoder)
The code is as follows (example):
class TransformerEncoder(nn.Module):

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        ## ----------------- the layers that actually get called ----------------- ##
        self.layers = _get_clones(encoder_layer, num_layers)   # _get_clones is defined at the end of this post
        ## ------------------------------------------------------------------------ ##
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src,
                mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                pos: Optional[Tensor] = None):
        output = src
        # ----- main forward loop: mask is None, padding_mask has shape (bs, h*w) -----
        for layer in self.layers:
            output = layer(output, src_mask=mask,
                           src_key_padding_mask=src_key_padding_mask, pos=pos)
        # ------------------------------------------------------------------------------
        if self.norm is not None:
            output = self.norm(output)
        return output
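A small sketch of stacking 6 layers and passing a padding mask (this assumes _get_clones from the end of the post is already defined; the (bs, h*w) mask here is my own illustration of the comment above):

import torch

encoder_layer = TransformerEncoderLayer(d_model=256, nhead=8)
encoder = TransformerEncoder(encoder_layer, num_layers=6)

src = torch.randn(768, 2, 256)                         # (h*w, bs, d_model)
pos = torch.randn(768, 2, 256)
padding_mask = torch.zeros(2, 768, dtype=torch.bool)   # (bs, h*w), True marks padded positions

memory = encoder(src, src_key_padding_mask=padding_mask, pos=pos)
print(memory.shape)                                    # torch.Size([768, 2, 256])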
The encoder_layer cloned here is the multi-head TransformerEncoderLayer from section 1, and the resulting encoder stack is in turn called by the following Transformer class.
class Transformer(nn.Module):

    def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,
                 activation="relu", normalize_before=False,
                 return_intermediate_dec=False):
        super().__init__()
        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                                dropout, activation, normalize_before)
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
        # self.decoder is built analogously (TransformerDecoderLayer / TransformerDecoder)
        # in the full models/transformer.py; that part is not shown in this post

    def forward(self, src, mask, query_embed, pos_embed):
        # flatten the (bs, c, h, w) feature map into the (h*w, bs, c) sequence layout
        bs, c, h, w = src.shape
        src = src.flatten(2).permute(2, 0, 1)
        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
        query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
        mask = mask.flatten(1)
        tgt = torch.zeros_like(query_embed)

        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)   # e.g. ([768, 2, 256])
        hs = self.decoder(tgt, memory, memory_key_padding_mask=mask,
                          pos=pos_embed, query_pos=query_embed)                # (6, 100, 2, 256)
        return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
3. How the stacked encoders are used
memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)   # e.g. ([768, 2, 256])
# self.encoder is defined at the end of the __init__ of the Transformer class above
So how is this large Transformer structure finally built and used? In DETR's detection model (models/detr.py) the relevant lines look like this:
from .transformer import build_transformer                 # import in models/detr.py
self.transformer = transformer                              # stored on the DETR model in __init__
hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0]   # called in forward
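For reference, a hedged sketch of what the two attributes used in that call are (this mirrors the DETR repo; the concrete channel counts are example values):

import torch.nn as nn

hidden_dim = 256                                     # args.hidden_dim
num_queries = 100                                    # number of learned object queries

# 1x1 convolution projecting the backbone's 2048 channels down to hidden_dim
input_proj = nn.Conv2d(2048, hidden_dim, kernel_size=1)

# one learned embedding per object query; its .weight (100, 256) is passed
# to the Transformer as query_embed
query_embed = nn.Embedding(num_queries, hidden_dim)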
4. Finally, two important helper functions
Both come from models/transformer.py:
def _get_clones(module, N):
    # N independent deep copies of the layer (parameters are NOT shared)
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
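A quick check of what _get_clones produces (my own illustration): the copies are deep copies, so every encoder layer gets its own parameters rather than sharing weights.

import torch
import torch.nn as nn

clones = _get_clones(nn.Linear(4, 4), 3)
print(clones[0].weight is clones[1].weight)              # False -> independent parameters
print(torch.equal(clones[0].weight, clones[1].weight))   # True  -> identical initial values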
def build_transformer(args):
    return Transformer(
        d_model=args.hidden_dim,
        dropout=args.dropout,
        nhead=args.nheads,
        dim_feedforward=args.dim_feedforward,
        num_encoder_layers=args.enc_layers,
        num_decoder_layers=args.dec_layers,
        normalize_before=args.pre_norm,
        return_intermediate_dec=True,
    )
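A hedged usage sketch: in the repo, args comes from the command-line parser in main.py; here a SimpleNamespace stands in for it, filled with DETR's default values.

from types import SimpleNamespace

args = SimpleNamespace(hidden_dim=256, dropout=0.1, nheads=8,
                       dim_feedforward=2048, enc_layers=6, dec_layers=6,
                       pre_norm=False)

transformer = build_transformer(args)
print(transformer.encoder.num_layers)   # 6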
5. mask and pos in DETR
mask = F.interpolate(tensor_list.mask[None].float(), size=x.shape[-2:]).bool()[0]   # ([2, 38, 25])
# Taking an input of (2, 3, 1194, 800) as an example: tensor_list.mask also has spatial size 1194x800,
# and is downsampled here to the size of the backbone feature map x.
# tensor_list.mask is the per-pixel padding mask produced when images of different sizes are
# batched together (True marks padded pixels), not an instance-segmentation label.
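To make the downsampling step concrete, a standalone sketch with made-up shapes (the padding pattern here is hypothetical):

import torch
import torch.nn.functional as F

# batch of 2 images padded to 1194x800; True marks padded pixels
mask = torch.zeros(2, 1194, 800, dtype=torch.bool)
mask[1, :, 600:] = True                      # pretend image 1 was narrower and got padded on the right
feat = torch.randn(2, 2048, 38, 25)          # backbone output

small_mask = F.interpolate(mask[None].float(), size=feat.shape[-2:]).bool()[0]
print(small_mask.shape)                      # torch.Size([2, 38, 25])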
pos = self[1](x).to(x.tensors.dtype)   # x is the backbone feature map wrapped in a NestedTensor, (2, 2048, 38, 25)
self[1] = position_embedding = build_position_encoding(args)
The positional encoding used here is the sine/cosine version; it is a standalone class in models/position_encoding.py:
position_embedding = PositionEmbeddingSine(N_steps, normalize=True)
# N_steps = args.hidden_dim // 2
import math

class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """
    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, tensor_list):
        x = tensor_list.tensors
        mask = tensor_list.mask
        not_mask = ~mask
        # cumulative sums over valid (non-padded) pixels give each pixel its y / x position index
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        # interleave sin on even channels and cos on odd channels
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos
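A short sketch of calling the positional encoding on dummy data (the SimpleNamespace stand-in is mine; in the repo the input is a util.misc.NestedTensor with the same .tensors / .mask attributes):

import torch
from types import SimpleNamespace

dummy = SimpleNamespace(tensors=torch.randn(2, 2048, 38, 25),
                        mask=torch.zeros(2, 38, 25, dtype=torch.bool))

pos_enc = PositionEmbeddingSine(num_pos_feats=128, normalize=True)   # 128 = hidden_dim // 2
pos = pos_enc(dummy)
print(pos.shape)   # torch.Size([2, 256, 38, 25])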
Summary
Tip: with the classes above, you can build a Transformer encoder on your own, independent of the rest of the DETR code.