Mecanismo de Atención S2-MLPv2
La atención de S2-MLPv2 es un mecanismo de atención para tareas de visión por computador. Es una mejora del modelo S2-MLP (Spatial-Shift MLP, un perceptrón multicapa con desplazamiento espacial).
La idea principal de la atención S2-MLPv2 es utilizar un perceptrón multicapa (MLP) para calcular los pesos de atención para capturar información importante en la secuencia de entrada. En comparación con el mecanismo de atención tradicional, la atención de S2-MLPv2 introduce una transformación no lineal al calcular los pesos de atención, lo que mejora la expresividad del modelo.
Enlace al artículo: https://arxiv.org/pdf/2108.01072.pdf
El código es el siguiente:
import numpy as np
import torch
from torch import nn
from torch.nn import init
# https://arxiv.org/abs/2108.01072
def spatial_shift1(x):
    """Shift four channel groups of ``x`` by one step, each in its own direction.

    ``x`` is indexed as ``(b, w, h, c)``. The channel axis is cut into four
    groups at ``c // 4``, ``c // 2`` and ``c * 3 // 4``:

    * group 1 moves one step towards higher indices along axis 1,
    * group 2 moves one step towards lower indices along axis 1,
    * group 3 moves one step towards higher indices along axis 2,
    * group 4 moves one step towards lower indices along axis 2.

    The border slice that has no incoming neighbour keeps its original values.

    Args:
        x: 4-D tensor; it is modified in place.

    Returns:
        The same tensor ``x`` after shifting.
    """
    _, w, h, c = x.size()
    # Source and destination of every assignment are overlapping views of the
    # same storage. Cloning the source first makes the result independent of
    # the order in which the copy kernel visits elements.
    x[:, 1:, :, :c // 4] = x[:, :w - 1, :, :c // 4].clone()
    x[:, :w - 1, :, c // 4:c // 2] = x[:, 1:, :, c // 4:c // 2].clone()
    x[:, :, 1:, c // 2:c * 3 // 4] = x[:, :, :h - 1, c // 2:c * 3 // 4].clone()
    x[:, :, :h - 1, 3 * c // 4:] = x[:, :, 1:, 3 * c // 4:].clone()
    return x
def spatial_shift2(x):
    """Shift four channel groups of ``x`` by one step, each in its own direction.

    Same scheme as ``spatial_shift1`` but with the two spatial axes swapped.
    ``x`` is indexed as ``(b, w, h, c)`` and the channel axis is cut into four
    groups at ``c // 4``, ``c // 2`` and ``c * 3 // 4``:

    * group 1 moves one step towards higher indices along axis 2,
    * group 2 moves one step towards lower indices along axis 2,
    * group 3 moves one step towards higher indices along axis 1,
    * group 4 moves one step towards lower indices along axis 1.

    The border slice that has no incoming neighbour keeps its original values.

    Args:
        x: 4-D tensor; it is modified in place.

    Returns:
        The same tensor ``x`` after shifting.
    """
    _, w, h, c = x.size()
    # Source and destination of every assignment are overlapping views of the
    # same storage. Cloning the source first makes the result independent of
    # the order in which the copy kernel visits elements.
    x[:, :, 1:, :c // 4] = x[:, :, :h - 1, :c // 4].clone()
    x[:, :, :h - 1, c // 4:c // 2] = x[:, :, 1:, c // 4:c // 2].clone()
    x[:, 1:, :, c // 2:c * 3 // 4] = x[:, :w - 1, :, c // 2:c * 3 // 4].clone()
    x[:, :w - 1, :, 3 * c // 4:] = x[:, 1:, :, 3 * c // 4:].clone()
    return x
class SplitAttention(nn.Module):
    """Fuse ``k`` parallel feature maps with per-channel softmax weights.

    The branches are summed over the branch and spatial axes, passed through
    a two-layer bias-free MLP with a GELU in between, and the resulting
    ``k * channel`` logits are normalised with a softmax across the ``k``
    branches. The output is the weighted sum of the branches.

    Args:
        channel: size of the last (channel) axis of every branch.
        k: number of branches to fuse.
    """

    def __init__(self, channel=512, k=3):
        super().__init__()
        self.channel = channel
        self.k = k
        self.mlp1 = nn.Linear(channel, channel, bias=False)
        self.gelu = nn.GELU()
        self.mlp2 = nn.Linear(channel, channel * k, bias=False)
        # Normalise across the branch axis (dim 1 of a (b, k, c) tensor).
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x_all):
        """Return the attention-weighted sum of the stacked branches.

        Args:
            x_all: tensor of shape ``(b, k, h, w, c)``.

        Returns:
            Tensor of shape ``(b, h, w, c)``.
        """
        batch, branches, height, width, chans = x_all.shape
        # Collapse the two spatial axes into one: (b, k, h*w, c).
        flat = x_all.reshape(batch, branches, -1, chans)
        # Sum over branches, then over positions -> (b, c).
        pooled = flat.sum(1).sum(1)
        logits = self.mlp2(self.gelu(self.mlp1(pooled)))
        weights = self.softmax(logits.reshape(batch, self.k, chans))
        # Insert a position axis so the weights broadcast over h*w.
        weighted = weights.unsqueeze(-2) * flat
        return weighted.sum(1).reshape(batch, height, width, chans)
class S2Attention(nn.Module):
    """S2-MLPv2 attention block over a channels-first feature map.

    The input is expanded to three times its channel width, split into three
    branches (two spatially shifted, one left as is), fused by
    ``SplitAttention`` and projected back to ``channels``.

    Args:
        channels: number of channels of the input and output feature map.
    """

    def __init__(self, channels=512):
        super().__init__()
        self.mlp1 = nn.Linear(channels, channels * 3)
        self.mlp2 = nn.Linear(channels, channels)
        # Pass the width through explicitly; the SplitAttention default of
        # 512 would break any other value of ``channels``.
        self.split_attention = SplitAttention(channel=channels, k=3)

    def forward(self, x):
        """Apply the block.

        Args:
            x: tensor of shape ``(b, c, w, h)``.

        Returns:
            Tensor of shape ``(b, c, w, h)``.
        """
        b, c, w, h = x.size()
        # Move channels last so the linear layers act on the channel axis.
        x = x.permute(0, 2, 3, 1)
        x = self.mlp1(x)
        # The shift helpers write into these views of ``x`` in place; the
        # third slice covers different channels and is not affected.
        x1 = spatial_shift1(x[:, :, :, :c])
        x2 = spatial_shift2(x[:, :, :, c:c * 2])
        x3 = x[:, :, :, c * 2:]
        x_all = torch.stack([x1, x2, x3], 1)
        a = self.split_attention(x_all)
        x = self.mlp2(a)
        # Back to channels-first layout.
        x = x.permute(0, 3, 1, 2)
        return x