1. Project configuration instructions
Parameter Description:
data set:
--name cifar10-100_500
--dataset cifar10
Which version of the model:
--model_type ViT-B_16
Pretrained weights:
--pretrained_dir checkpoint/ViT-B_16.npz
2.patch embeding与position_embedding
For image coding, take VIT-B/16 as an example, first use the convolution with a convolution kernel size of 16*16 and a step size of 16 to transform the image. At this time, the image dimension becomes 16 * 768 * 14 * 14 , and then transform the dimension to [16, 196, 768], and then connect the 0patch with the dimension of 16*1*768.
For positional encoding, construct a vector of 1*197*768
Finally, the encoding is completed by adding the image encoding and position encoding.
code show as below:
class Embeddings(nn.Module):
"""Construct the embeddings from patch, position embeddings.
"""
def __init__(self, config, img_size, in_channels=3):
super(Embeddings, self).__init__()
self.hybrid = None
img_size = _pair(img_size)
# patch_size 大小 与 patch数量 n_patches
if config.patches.get("grid") is not None:
grid_size = config.patches["grid"]
patch_size = (img_size[0] // 16 // grid_size[0], img_size[1] // 16 // grid_size[1])
n_patches = (img_size[0] // 16) * (img_size[1] // 16)
self.hybrid = True
else:
patch_size = _pair(config.patches["size"])
n_patches = (img_size[0] // patch_size[0]) * (img_size[1] // patch_size[1])
self.hybrid = False
# 使用混合模型
if self.hybrid:
self.hybrid_model = ResNetV2(block_units=config.resnet.num_layers,
width_factor=config.resnet.width_factor)
in_channels = self.hybrid_model.width * 16
# patch_embeding 16 * 768 * 14 * 14
self.patch_embeddings = Conv2d(in_channels=in_channels,
out_channels=config.hidden_size,
kernel_size=patch_size,
stride=patch_size)
# 初始化 position_embeddings: 1 * 197 * 768
self.position_embeddings = nn.Parameter(torch.zeros(1, n_patches+1, config.hidden_size))
# 初始化第 0 个patch,表示分类特征 1*1*768
self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
# dropout层
self.dropout = Dropout(config.transformer["dropout_rate"])
def forward(self, x):
print(x.shape)
B = x.shape[0]
# 拓展cls_tokens的维度:16 *1*768
cls_tokens = self.cls_token.expand(B, -1, -1)
print(cls_tokens.shape)
# 混合模型
if self.hybrid:
x = self.hybrid_model(x)
# 编码:16 * 768 * 14 * 14
x = self.patch_embeddings(x)
print(x.shape)
# 变换维度:16 * 768 * 14 * 14-->[16, 768, 196]
x = x.flatten(2)
print(x.shape)
# [16, 768, 196] --> [16, 196, 768]
x = x.transpose(-1, -2)
print(x.shape)
# 加入分类特征patch
x = torch.cat((cls_tokens, x), dim=1)
print(x.shape)
# 加入位置编码
embeddings = x + self.position_embeddings
print(embeddings.shape)
# dropout层
embeddings = self.dropout(embeddings)
print(embeddings.shape)
return embeddings
3.ecoder
Multi-head attention module:
First construct three auxiliary vectors of q, k, and v, because we use a multi-head attention mechanism (12), first, we need to convert the dimensions of q, k, and v from 16, 197, 768 to 16, 12, 197, 64 , and then obtain the similarity qk of q and k, because the relationship between the two is obtained, so the dimensions are 16, 12, 197, 197, and the dimension is eliminated. After softmax, the extracted feature vector qkv is obtained, and the dimension 16, 12, 197, 64, and then restore the dimension to 16, 197, 768
class Attention(nn.Module):
def __init__(self, config, vis):
super(Attention, self).__init__()
self.vis = vis
# heads数量
self.num_attention_heads = config.transformer["num_heads"]
# 每个head的向量维度
self.attention_head_size = int(config.hidden_size / self.num_attention_heads)
# 总head_size
self.all_head_size = self.num_attention_heads * self.attention_head_size
# query向量
self.query = Linear(config.hidden_size, self.all_head_size)
# key向量
self.key = Linear(config.hidden_size, self.all_head_size)
# value向量
self.value = Linear(config.hidden_size, self.all_head_size)
# 全连接层
self.out = Linear(config.hidden_size, config.hidden_size)
# dropout层
self.attn_dropout = Dropout(config.transformer["attention_dropout_rate"])
self.proj_dropout = Dropout(config.transformer["attention_dropout_rate"])
self.softmax = Softmax(dim=-1)
def transpose_for_scores(self, x):
# 维度:16, 197, 768-->16,197,12,64
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
# print(new_x_shape)
x = x.view(*new_x_shape)
# print(x.shape)
# print(x.permute(0, 2, 1, 3).shape)
# 16,197,12,64 --> 16, 12, 197, 64
return x.permute(0, 2, 1, 3)
def forward(self, hidden_states):
# print(hidden_states.shape)
# q,k,v:16, 197, 768
mixed_query_layer = self.query(hidden_states)
# print(mixed_query_layer.shape)
mixed_key_layer = self.key(hidden_states)
# print(mixed_key_layer.shape)
mixed_value_layer = self.value(hidden_states)
# print(mixed_value_layer.shape)
# q,k,v:16, 197, 768-->16, 12, 197, 64
query_layer = self.transpose_for_scores(mixed_query_layer)
# print(query_layer.shape)
key_layer = self.transpose_for_scores(mixed_key_layer)
# print(key_layer.shape)
value_layer = self.transpose_for_scores(mixed_value_layer)
# print(value_layer.shape)
# q,k的相似性:16, 12, 197, 197
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
# print(attention_scores.shape)
# 消除量纲
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# print(attention_scores.shape)
attention_probs = self.softmax(attention_scores)
# print(attention_probs.shape)
weights = attention_probs if self.vis else None
attention_probs = self.attn_dropout(attention_probs)
# print(attention_probs.shape)
# print(value_layer.shape)
# 特征向量:qkv:16, 12, 197, 64
context_layer = torch.matmul(attention_probs, value_layer)
# print(context_layer.shape)
# 16, 12, 197, 64-->16, 12, 197, 64
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
# print(context_layer.shape)
# 16, 12, 197, 64-->16, 197, 768
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
# print(context_layer.shape)
# 全连接层:16, 197, 768
attention_output = self.out(context_layer)
# print(attention_output.shape)
# dropout层
attention_output = self.proj_dropout(attention_output)
# print(attention_output.shape)
return attention_output, weights
transformer encoder
For the input x, firstly, after layer normalization, input the multi-head attention mechanism, perform residual connection on the result, then go through layer normalization, after two layers of full connection, after residual connection, get a module result, stack L layer, output the final result
class Block(nn.Module):
def __init__(self, config, vis):
super(Block, self).__init__()
# 序列的大小:768
self.hidden_size = config.hidden_size
# 层归一化
self.attention_norm = LayerNorm(config.hidden_size, eps=1e-6)
self.ffn_norm = LayerNorm(config.hidden_size, eps=1e-6)
# MLP层
self.ffn = Mlp(config)
# 多头注意力机制
self.attn = Attention(config, vis)
def forward(self, x):
# print(x.shape)
# 16, 197, 768
h = x
# 层归一化
x = self.attention_norm(x)
# print(x.shape)
# 多头注意力机制
x, weights = self.attn(x)
# 残差连接
x = x + h
# print(x.shape)
h = x
# 层归一化
x = self.ffn_norm(x)
# print(x.shape)
# MLP层
x = self.ffn(x)
# print(x.shape)
# 残差连接
x = x + h
# print(x.shape)
return x, weights
Overall structure
For the input x, after patch embedding and position embedding, the dimension is 16*197*768 at this time, input to the encoder, after the encoding module of the L layer, the encoding result of the 0th patch (representing the classification feature) is taken out, and input to the classification layer , to get the predicted result.
class VisionTransformer(nn.Module):
def __init__(self, config, img_size=224, num_classes=21843, zero_head=False, vis=False):
super(VisionTransformer, self).__init__()
self.num_classes = num_classes
self.zero_head = zero_head
self.classifier = config.classifier
self.transformer = Transformer(config, img_size, vis)
self.head = Linear(config.hidden_size, num_classes)
def forward(self, x, labels=None):
x, attn_weights = self.transformer(x)
print(x.shape)
# X.shape:16, 197, 768 logits.shape:16, 10
logits = self.head(x[:, 0])
print(logits.shape)
# 交叉熵
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_classes), labels.view(-1))
return loss
else:
return logits, attn_weights