Introduction to the Transformer
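The SublayerConnection module below wraps an arbitrary sublayer (for example self-attention or a feed-forward network) with a residual connection: the input is layer-normalized, passed through the sublayer, regularized with dropout, and added back to the original input, i.e. out = x + Dropout(Sublayer(LayerNorm(x))).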
import torch
import torch.nn as nn

class SublayerConnection(nn.Module):
    def __init__(self, size, dropout=0.1):
        super(SublayerConnection, self).__init__()
        # size: the feature dimension (d_model) normalized by the layer norm
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        # Residual connection: x + Dropout(Sublayer(LayerNorm(x)))
        return x + self.dropout(sublayer(self.norm(x)))
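SublayerConnection relies on a LayerNorm class defined earlier in the tutorial and not repeated in this section. For reference, a minimal sketch compatible with the LayerNorm(size) call is shown below; the parameter names a_2 and b_2 and the eps default are illustrative assumptions, not necessarily the tutorial's exact definition.

class LayerNorm(nn.Module):
    # Normalizes over the last (feature) dimension with a learnable scale and shift.
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # illustrative name for the scale
        self.b_2 = nn.Parameter(torch.zeros(features))  # illustrative name for the shift
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2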
# Instantiation parameters
size = 512
dropout = 0.2
head = 8
d_model = 512

# Input: out_pe is the positional-encoding output from the previous step, shape [2, 4, 512]
x = out_pe
# Attention mask of zeros
mask = torch.zeros(8, 4, 4)

# Wrap the multi-head self-attention module (defined earlier) in a lambda,
# so SublayerConnection can call it as a single-argument sublayer
self_attn = MultiHeadedAttention(head, d_model, dropout)
sublayer = lambda x: self_attn(x, x, x, mask)

# Apply the residual sublayer connection
sc = SublayerConnection(size, dropout)
out_sc = sc(x, sublayer)
print(out_sc)
print(out_sc.shape)
tensor([[[ 0.2420, 0.4013, 6.0859, ..., -21.6484, 35.4178, 39.0050],
[ 0.1984, 23.6501, -10.4857, ..., 10.2693, 31.9493, -22.9047],
[-16.8140, -4.8281, -10.9470, ..., 0.0000, -26.1868, 10.2506],
[ 3.6501, 50.1923, 45.1169, ..., 6.3982, 14.3296, -0.1648]],
[[ 0.2490, 0.0000, 0.0000, ..., -9.2741, -0.0000, 13.0224],
[-15.2243, 8.4786, 4.4173, ..., -5.1241, 13.1085, 15.1671],
[ -7.2062, -6.4299, -36.1564, ..., -11.9902, 0.2249, -13.1845],
[ 0.1943, 38.0378, 16.3203, ..., -0.4592, -1.9879, -24.8804]]],
grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])
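The printed shape torch.Size([2, 4, 512]) matches the input shape (batch size 2, sequence length 4, d_model 512): the residual connection leaves the tensor dimensions unchanged, which is what allows these sublayer connections to be stacked inside encoder and decoder layers.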