Natural Language Processing (21): The Transformer Sublayer Connection Structure

Natural Language Processing Notes: Master Table of Contents


Introduction to the Transformer
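As the code below shows, the sublayer connection (also called the residual connection structure) wraps every sublayer in the encoder and decoder: the input is first layer-normalized, passed through the sublayer (multi-head attention or feed-forward), regularized with dropout, and finally added back to the original input, i.e. output = x + dropout(sublayer(norm(x))). Because of the residual addition, the output keeps exactly the same shape as the input.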

import torch
import torch.nn as nn


class SublayerConnection(nn.Module):
    def __init__(self, size, dropout=0.1):
        # size: the embedding dimension d_model; dropout: the drop probability
        super(SublayerConnection, self).__init__()
        # LayerNorm is the normalization class built in an earlier section of this series
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        # x: the input tensor; sublayer: a callable wrapping the sublayer to connect
        # pre-norm residual connection: normalize, apply the sublayer,
        # apply dropout, then add the original input back
        return x + self.dropout(sublayer(self.norm(x)))
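SublayerConnection depends on the LayerNorm class implemented in an earlier section of this series. For reference, below is a minimal sketch consistent with how it is used here; the parameter names a2, b2 and the eps default are illustrative assumptions rather than the series' exact code.

class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        # learnable scale and shift applied after normalization
        # (the names a2/b2 are illustrative)
        self.a2 = nn.Parameter(torch.ones(features))
        self.b2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        # normalize over the last (feature) dimension
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a2 * (x - mean) / (std + self.eps) + self.b2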
# instantiation parameters
size = 512
dropout = 0.2
head = 8
d_model = 512

# x is the output of the positional encoding from the previous section,
# with shape [2, 4, 512]
x = out_pe
mask = torch.zeros(8, 4, 4)

# assume the sublayer holds a multi-head attention layer
# (MultiHeadedAttention was implemented in an earlier section); instantiate it
self_attn = MultiHeadedAttention(head, d_model, dropout)

# use a lambda to obtain a function-type sublayer
sublayer = lambda x: self_attn(x, x, x, mask)

sc = SublayerConnection(size, dropout)
out_sc = sc(x, sublayer)

print(out_sc)
print(out_sc.shape)
tensor([[[  0.2420,   0.4013,   6.0859,  ..., -21.6484,  35.4178,  39.0050],
         [  0.1984,  23.6501, -10.4857,  ...,  10.2693,  31.9493, -22.9047],
         [-16.8140,  -4.8281, -10.9470,  ...,   0.0000, -26.1868,  10.2506],
         [  3.6501,  50.1923,  45.1169,  ...,   6.3982,  14.3296,  -0.1648]],

        [[  0.2490,   0.0000,   0.0000,  ...,  -9.2741,  -0.0000,  13.0224],
         [-15.2243,   8.4786,   4.4173,  ...,  -5.1241,  13.1085,  15.1671],
         [ -7.2062,  -6.4299, -36.1564,  ..., -11.9902,   0.2249, -13.1845],
         [  0.1943,  38.0378,  16.3203,  ...,  -0.4592,  -1.9879, -24.8804]]],
       grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])
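In a full encoder layer, two SublayerConnection instances are used back to back: one around the multi-head self-attention sublayer and one around the position-wise feed-forward sublayer. Below is a minimal sketch of that composition, assuming the MultiHeadedAttention and PositionwiseFeedForward classes from earlier sections of this series; it is an illustration of the pattern, not the series' exact encoder code.

class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn          # multi-head self-attention sublayer
        self.feed_forward = feed_forward    # position-wise feed-forward sublayer
        # two independent sublayer connections, one per sublayer
        self.sublayers = nn.ModuleList(
            [SublayerConnection(size, dropout) for _ in range(2)]
        )
        self.size = size

    def forward(self, x, mask):
        # first residual block: self-attention
        x = self.sublayers[0](x, lambda x: self.self_attn(x, x, x, mask))
        # second residual block: feed-forward network
        return self.sublayers[1](x, self.feed_forward)

Because every residual block preserves the input shape, the encoder layer's output shape is again [batch, seq_len, d_model], matching the [2, 4, 512] seen above.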

Reposted from blog.csdn.net/weixin_45707277/article/details/122635640