转载自：PyTorch 70.einops：优雅地操作张量维度

einops 三大操作

from einops import rearrange, reduce, repeat

einops通过rearrange, reduce, repeat这3个方法，可以起到 stacking, reshape, transposition, squeeze/unsqueeze, repeat, tile, concatenate, view 以及各种reduction操作的效果，大大简化了原来复杂的高维张量操作，提高了代码可读性，支持 numpy、pytorch、tensorflow 等等。

导入6张测试图片：

# There are 6 images of shape 96x96 with 3 color channels packed into tensor
print(ims.shape, ims.dtype)
# (6, 96, 96, 3) float64
# (b,h,w,c)

ims[0]

ims[1]

转换张量维度
旧：

y = x.transpose(0, 2, 3, 1)

新：

y = rearrange(x, 'b c h w -> b h w c')

rearrange：重新安排维度

交换维度、压缩维度、分解维度、

rearrange(ims[0], 'h w c -> w h c') # 交换维度

在这里插入图片描述

rearrange(ims, 'b h w c -> (b h) w c') # 压缩维度 (b和h)

在这里插入图片描述

扫描二维码关注公众号，回复： 15137247 查看本文章

# or compose a new dimension of batch and width
rearrange(ims, 'b h w c -> h (b w) c') # 压缩维度 (b和w)
# length of newly composed axis is a product of components
# [6, 96, 96, 3] -> [96, (6 * 96), 3] -> [96, 576, 3]
rearrange(ims, 'b h w c -> h (b w) c').shape

在这里插入图片描述

# decomposition is the inverse process - represent an axis as a combination of new axes
# several decompositions possible, so b1=2 is to decompose 6 to b1=2 and b2=3
rearrange(ims, '(b1 b2) h w c -> b1 b2 h w c ', b1=2).shape  # 分解维度，原始b=6, 指定b1=2, b2自动计算=6/b1
# (2, 3, 96, 96, 3)

# finally, combine composition and decomposition:
rearrange(ims, '(b1 b2) h w c -> (b1 h) (b2 w) c ', b1=2) # 分解维度b，并将b1和h、b2和w进行压缩

在这里插入图片描述

#height变为2倍，width变为1半：
rearrange(ims, 'b h (w w2) c -> (h w2) (b w) c', w2=2)  # 分解维度w(w1=w/2)，将h和w2合并，即h*2，将b和w1合并，即6张图横向排列，宽为w1

在这里插入图片描述

rearrange(ims, 'b h w c -> h (b w) c')

在这里插入图片描述
注意前面我们压缩维度时，只要涉及b都是b在前，如(b w)、(b h)、(b w1)等，括号里的顺序是有意义的。

# 与上例做个对比, leftmost digit is the most significant
rearrange(ims, 'b h w c -> h (w b) c')  # b在

在这里插入图片描述

# what if b1 and b2 are reordered before composing to width?
rearrange(ims, '(b1 b2) h w c -> h (b1 b2 w) c ', b1=2) # 原始batch中图片顺序 'einops'
rearrange(ims, '(b1 b2) h w c -> h (b2 b1 w) c ', b1=2) # 重排序为 'eoipns'

在这里插入图片描述

reduce

# average over batch
reduce(ims, 'b h w c -> h w c', 'mean')  # 对张量的b维度取平均值

在这里插入图片描述

# the previous is identical to familiar:
ims.mean(axis=0) # 使用老方法应该这么写
# but is so much more readable

# besides mean, there are also min, max, sum, prod
reduce(ims, 'b h w c -> h w', 'min') # 除了mean 还可以求min, max, sum, prod

在这里插入图片描述

# this is mean-pooling with 2x2 kernel
# image is split into 2x2 patches, each patch is averaged
reduce(ims, 'b (h h2) (w w2) c -> h (b w) c', 'mean', h2=2, w2=2) # 图像分割为2x2的patch，每个patch分别进行mean，并压缩b和w显示
# 效果就是变小了

在这里插入图片描述

# yet another example. Can you compute result shape?
reduce(ims, '(b1 b2) h w c -> (b2 h) (b1 w)', 'mean', b1=2)

在这里插入图片描述

将张量放入list中，einops 会默认按batch划分，即6张图像，对该list操作效果等同于对原始张量操作

# rearrange can also take care of lists of arrays with the same shape
x = list(ims)
# that's how we can stack inputs
# "list axis" becomes first ("b" in this case), and we left it there
rearrange(x, 'b h w c -> b h w c').shape
# (6, 96, 96, 3)

rearrange(x, 'b h w c -> h w c b').shape
# (96, 96, 3, 6)

numpy.array_equal(rearrange(x, 'b h w c -> h w c b'), numpy.stack(x, axis=3)) # rearrange交换维度 等价于 numpy.stack
# True

增加或删除张量维度

x = rearrange(ims, 'b h w c -> b 1 h w 1 c') # functionality of numpy.expand_dims
#(6, 1, 96, 96, 1, 3)

rearrange(x, 'b 1 h w 1 c -> b h w c').shape # functionality of numpy.squeeze
#(6, 96, 96, 3)

# compute max in each image individually, then show a difference 
x = reduce(ims, 'b h w c -> b () () c', 'max') - ims
rearrange(x, 'b h w c -> h (b w) c')

在这里插入图片描述

# interweaving pixels of different pictures
# all letters are observable
rearrange(ims, '(b1 b2) h w c -> (h b1) (w b2) c ', b1=2)

在这里插入图片描述

# interweaving along vertical for couples of images
rearrange(ims, '(b1 b2) h w c -> (h b1) (b2 w) c', b1=2)

在这里插入图片描述

# disproportionate resize
reduce(ims, 'b (h 4) (w 3) c -> (h) (b w)', 'mean')

在这里插入图片描述

# interweaving lines for couples of images
# exercise: achieve the same result without einops in your favourite framework
reduce(ims, '(b1 b2) h w c -> h (b2 w) c', 'max', b1=2)

在这里插入图片描述

# spilt each image in two halves, compute mean of the two
reduce(ims, 'b (h1 h2) w c -> h2 (b w)', 'mean', h1=2)

在这里插入图片描述

# split in small patches and transpose each patch
rearrange(ims, 'b (h1 h2) (w1 w2) c -> (h1 w2) (b w1 h2) c', h2=8, w2=8)

在这里插入图片描述

rearrange(ims, '(b1 b2) (h1 h2) (w1 w2) c -> (h1 b1 h2) (w1 b2 w2) c', h1=3, w1=3, b2=3)

在这里插入图片描述

repeat

# pixelate: first downscale by averaging, then upscale back using the same pattern
averaged = reduce(ims, 'b (h h2) (w w2) c -> b h w c', 'mean', h2=6, w2=8)
repeat(averaged, 'b h w c -> (h h2) (b w w2) c', h2=6, w2=8)

在这里插入图片描述

rearrange(ims, 'b h w c -> w (b h) c')

在这里插入图片描述

einops操作PyTorch张量

生成张量

from einops import rearrange, reduce
import numpy as np
x = np.random.RandomState(42).normal(size=[10, 32, 100, 200])
from utils import guess

# select one from 'chainer', 'gluon', 'tensorflow', 'pytorch' 

flavour = 'pytorch'

print('selected {} backend'.format(flavour))
if flavour == 'tensorflow':
    import tensorflow as tf
    tape = tf.GradientTape(persistent=True)
    tape.__enter__()
    x = tf.Variable(x) + 0
elif flavour == 'pytorch':
    import torch
    x = torch.from_numpy(x)
    x.requires_grad = True
elif flavour == 'chainer':
    import chainer
    x = chainer.Variable(x)
else:
    assert flavour == 'gluon'
    import mxnet as mx
    mx.autograd.set_recording(True)
    x = mx.nd.array(x, dtype=x.dtype)
    x.attach_grad()

type(x), x.shape
# (torch.Tensor, torch.Size([10, 32, 100, 200]))
y = rearrange(x, 'b c h w -> b h w c')
guess(y.shape)
# (torch.Tensor, torch.Size([10, 100, 200, 32]))

y = rearrange(x, 'b c h w -> b (c h w)')
y.shape
# (torch.Tensor, torch.Size([10, 640000]))

y = rearrange(x, 'b c (h h1) (w w1) -> b (h1 w1 c) h w', h1=2, w1=2)
y.shape
# (torch.Tensor, torch.Size([10, 128, 50, 100]))

y = rearrange(x, 'b (c h1 w1) h w -> b c (h h1) (w w1)', h1=2, w1=2)
y.shape
# (torch.Tensor, torch.Size([10, 8, 200, 400]))

反向传播

y0 = x
y1 = reduce(y0, 'b c h w -> b c', 'max')
y2 = rearrange(y1, 'b c -> c b')
y3 = reduce(y2, 'c b -> ', 'sum')

if flavour == 'tensorflow':
    print(reduce(tape.gradient(y3, x), 'b c h w -> ', 'sum'))
else:   # pytorch等
    y3.backward()
    print(reduce(x.grad, 'b c h w -> ', 'sum'))

转化成numpy

from einops import asnumpy
y3_numpy = asnumpy(y3)

print(type(y3_numpy), y3_numpy.shape)
# <class 'numpy.ndarray'>

张量增加与删除维度

Squeeze and unsqueeze (expand_dims)：

# models typically work only with batches, 
# so to predict a single image ...
image = rearrange(x[0, :3], 'c h w -> h w c')
# ... create a dummy 1-element axis ...
y = rearrange(image, 'h w c -> () c h w')
# ... imagine you predicted this with a convolutional network for classification,
# we'll just flatten axes ...
predictions = rearrange(y, 'b c h w -> b (c h w)')
# ... finally, decompose (remove) dummy axis
predictions = rearrange(predictions, '() classes -> classes')

per-channel mean-normalization for each image：

y = x - reduce(x, 'b c h w -> b c 1 1', 'mean')
y.shape
(torch.Tensor, torch.Size([10, 32, 100, 200]))

per-channel mean-normalization for whole batch：

y = x - reduce(y, 'b c h w -> 1 c 1 1', 'mean')
y.shape
(torch.Tensor, torch.Size([10, 32, 100, 200]))

拼接张量

concatenate over the first dimension：

tensors = rearrange(list_of_tensors, 'b c h w -> (b h) w c')
tensors.shape
(torch.Tensor, torch.Size([1000, 200, 32]))

打乱

channel shuffle：

y = rearrange(x, 'b (g1 g2 c) h w-> b (g2 g1 c) h w', g1=4, g2=4)
y.shape
(torch.Tensor, torch.Size([10, 32, 100, 200]))

分割维度

Example: when a network predicts several bboxes for each position

Assume we got 8 bboxes, 4 coordinates each. To get coordinated into 4 separate variables, you move corresponding dimension to front and unpack tuple.

bbox_x, bbox_y, bbox_w, bbox_h = \
    rearrange(x, 'b (coord bbox) h w -> coord b bbox h w', coord=4, bbox=8)
# now you can operate on individual variables
max_bbox_area = reduce(bbox_w * bbox_h, 'b bbox h w -> b h w', 'max')
guess(bbox_x.shape)
guess(max_bbox_area.shape)
(torch.Tensor, torch.Size([10, 8, 100, 200]))
(torch.Tensor, torch.Size([10, 100, 200]))

rearrange实现卷积

Finally, how to convert any operation into a strided operation?
(like convolution with strides, aka dilated/atrous convolution)

# each image is split into subgrids, each is now a separate "image"
y = rearrange(x, 'b c (h hs) (w ws) -> (hs ws b) c h w', hs=2, ws=2)
y = convolve_2d(y)
# pack subgrids back to an image
y = rearrange(y, '(hs ws b) c h w -> b c (h hs) (w ws)', hs=2, ws=2)

assert y.shape == x.shape

reduce实现池化

Simple global average pooling：

y = reduce(x, 'b c h w -> b c', reduction='mean')
y.shape
#(torch.Tensor, torch.Size([10, 32]))

max-poolingwith a kernel 2x2：

y = reduce(x, 'b c (h h1) (w w1) -> b c h w', reduction='max', h1=2, w1=2)
y.shape
#(torch.Tensor, torch.Size([10, 32, 50, 100]))

1d, 2d and 3d pooling are defined in a similar way
for sequential 1-d models, you’ll probably want pooling over time

reduce(x, '(t 2) b c -> t b c', reduction='max')

reduce(x, 'b c (x 2) (y 2) (z 2) -> b c x y z', reduction='max')

定义网络结构Layers

For frameworks that prefer operating with layers, layers are available.

You’ll need to import a proper one depending on your backend：

from einops.layers.torch import Rearrange, Reduce

Einops layers are identical to operations, and have same parameters.

(for the exception of first argument, which should be passed during call)：

# example given for pytorch, but code in other frameworks is almost identical
from torch.nn import Sequential, Conv2d, MaxPool2d, Linear, ReLU
from einops.layers.torch import Reduce

model = Sequential(
    Conv2d(3, 6, kernel_size=5),
    MaxPool2d(kernel_size=2),
    Conv2d(6, 16, kernel_size=5),
    # combined pooling and flattening in a single step
    Reduce('b c (h 2) (w 2) -> b (c h w)', 'max'), 
    Linear(16*5*5, 120), 
    ReLU(),
    Linear(120, 10), 
)

经典网络的变化

Simple ConvNet：

老代码：

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

conv_net_old = Net()

新代码：

conv_net_new = nn.Sequential(
    nn.Conv2d(1, 10, kernel_size=5),
    nn.MaxPool2d(kernel_size=2),
    nn.ReLU(),
    nn.Conv2d(10, 20, kernel_size=5),
    nn.MaxPool2d(kernel_size=2),
    nn.ReLU(),
    nn.Dropout2d(),
    Rearrange('b c h w -> b (c h w)'),
    nn.Linear(320, 50),
    nn.ReLU(),
    nn.Dropout(),
    nn.Linear(50, 10),
    nn.LogSoftmax(dim=1)
)

Channel shuffle (from shufflenet)：

老代码：

def channel_shuffle_old(x, groups):
    batchsize, num_channels, height, width = x.data.size()

    channels_per_group = num_channels // groups
    
    # reshape
    x = x.view(batchsize, groups, 
        channels_per_group, height, width)

    # transpose
    # - contiguous() required if transpose() is used before view().
    #   See https://github.com/pytorch/pytorch/issues/764
    x = torch.transpose(x, 1, 2).contiguous()

    # flatten
    x = x.view(batchsize, -1, height, width)

    return x

新代码：

def channel_shuffle_new(x, groups):
    return rearrange(x, 'b (c1 c2) h w -> b (c2 c1) h w', c1=groups)

Simple attention：

老代码：

class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()
    
    def forward(self, K, V, Q):
        A = torch.bmm(K.transpose(1,2), Q) / np.sqrt(Q.shape[1])
        A = F.softmax(A, 1)
        R = torch.bmm(V, A)
        return torch.cat((R, Q), dim=1)

新代码：

def attention(K, V, Q):
    _, n_channels, _ = K.shape
    A = torch.einsum('bct,bcl->btl', [K, Q])
    A = F.softmax(A * n_channels ** (-0.5), 1)
    R = torch.einsum('bct,btl->bcl', [V, A])
    return torch.cat((R, Q), dim=1)

Transformer’s attention needs more attention:

老代码：

class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, mask=None):

        attn = torch.bmm(q, k.transpose(1, 2))
        attn = attn / self.temperature

        if mask is not None:
            attn = attn.masked_fill(mask, -np.inf)

        attn = self.softmax(attn)
        attn = self.dropout(attn)
        output = torch.bmm(attn, v)

        return output, attn



class MultiHeadAttentionOld(nn.Module):
    ''' Multi-Head Attention module '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

        self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
        self.layer_norm = nn.LayerNorm(d_model)

        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout)


    def forward(self, q, k, v, mask=None):
        
        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
        
        sz_b, len_q, _ = q.size()
        sz_b, len_k, _ = k.size()
        sz_b, len_v, _ = v.size()
        
        residual = q
        
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
        
        q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk
        k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk
        v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv
        
        mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x ..
        output, attn = self.attention(q, k, v, mask=mask)
        
        output = output.view(n_head, sz_b, len_q, d_v)
        output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv)
        
        output = self.dropout(self.fc(output))
        output = self.layer_norm(output + residual)
        
        return output, attn

新代码：

class MultiHeadAttentionNew(nn.Module):
    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()
        self.n_head = n_head
        
        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        
        nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
        
        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)
        self.dropout = nn.Dropout(p=dropout)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, q, k, v, mask=None):
        residual = q
        q = rearrange(self.w_qs(q), 'b l (head k) -> head b l k', head=self.n_head)
        k = rearrange(self.w_ks(k), 'b t (head k) -> head b t k', head=self.n_head)
        v = rearrange(self.w_vs(v), 'b t (head v) -> head b t v', head=self.n_head)
        attn = torch.einsum('hblk,hbtk->hblt', [q, k]) / np.sqrt(q.shape[-1])
        if mask is not None:
            attn = attn.masked_fill(mask[None], -np.inf)
        attn = torch.softmax(attn, dim=3)
        output = torch.einsum('hblt,hbtv->hblv', [attn, v])
        output = rearrange(output, 'head b l v -> b l (head v)')
        output = self.dropout(self.fc(output))
        output = self.layer_norm(output + residual)
        return output, attn

einops 优雅地操作张量维度(基础操作+pytorch操作+网络设计)

einops 三大操作

rearrange：重新安排维度

reduce

repeat

einops操作PyTorch张量

生成张量

反向传播

转化成numpy

张量增加与删除维度

拼接张量

打乱

分割维度

rearrange实现卷积

reduce实现池化

定义网络结构Layers

经典网络的变化

Simple ConvNet：

Channel shuffle (from shufflenet)：

Simple attention：

Transformer’s attention needs more attention:

猜你喜欢