PyTorch from Beginner to Expert

GitHub - mint-lab/dl_tutorial: Deep Learning Tutorial with PyTorch (slides)

Simplify first: concrete examples that walk you through the basic patterns of using torch (small classification and time-series examples)

What is the difference between Tensor and Variable in PyTorch? PyTorch internals

Which functions does a Dataset need to implement?
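
For reference, a minimal sketch of the answer: a map-style Dataset only needs __len__ and __getitem__ (the tensors below are made-up placeholders):

import torch
from torch.utils.data import Dataset

class ToyDataset(Dataset):
    # Map-style dataset: __len__ reports the size, __getitem__ returns one sample
    def __init__(self, n=100):
        self.x = torch.randn(n, 3)          # placeholder features
        self.y = torch.randint(0, 2, (n,))  # placeholder labels
    def __len__(self):
        return len(self.x)
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]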

Why can't MSE be used as the loss function for binary classification?
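
A quick numeric illustration of the usual answer: with a sigmoid output p = sigmoid(z), the gradient of MSE with respect to z carries a factor p(1-p) that vanishes when the prediction is confidently wrong, while the cross-entropy gradient is simply p - y:

import math
z, y = -10.0, 1.0                      # confidently wrong prediction
p = 1 / (1 + math.exp(-z))             # sigmoid output, ~4.5e-05
mse_grad = 2 * (p - y) * p * (1 - p)   # d/dz of (p-y)^2: ~-9e-05, learning stalls
ce_grad = p - y                        # d/dz of cross-entropy: ~-1.0, strong signal
print(mse_grad, ce_grad)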

How do you measure a network's computational cost and model size? How many ways are there to load and save models? pytorch-summary, flops-counter.pytorch

What tricks can speed up PyTorch training? Preprocessing acceleration: albumentations

Deep Learning with PyTorch: A 60 Minute Blitz; ImageNet training in PyTorch; training an image classification model

PyTorch source code walkthroughs: torch.utils.data.DataLoader, torchvision.transforms, torchvision.models

A practical PyTorch guide; a collection of tricks; learn PyTorch hooks in half an hour; deep learning model conversion and deployment (with a detailed analysis of the ONNX format)


A detailed look at network construction in PyTorch; resnet50; PyTorch Autograd

A concise tutorial on PyTorch distributed training; horovod

Deploying PyTorch models on devices; demonet

Taking MNIST, the simplest task, as an example, the complete training script is:

import os
import numpy as np
import cv2

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

from tqdm import tqdm
from torchsummary import summary

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5,), (0.5,)),  # MNIST images have a single channel
    ])

trainset = torchvision.datasets.MNIST(root='./data/MNIST', train=True,download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=256,shuffle=True, num_workers=8)

testset = torchvision.datasets.MNIST(root='./data/MNIST', train=False,download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=8,shuffle=False, num_workers=8)

def readtest():
    for images, _ in tqdm(trainloader):
        img = torchvision.utils.make_grid(images,4)
        img = img.numpy()*0.5+0.5  # undo Normalize((0.5,), (0.5,))
        img = np.transpose(img, (1, 2, 0))
        cv2.imshow("img",img)
        cv2.waitKey()

class Net(nn.Module):
    """A small LeNet-style CNN for single-channel 28x28 MNIST images."""
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

def val(net):
    net.eval()
    correct = 0
    total = 0
    pbar = tqdm(testloader)
    with torch.no_grad():  # no gradients needed during evaluation
        for images, labels in pbar:
            images = images.to(device)
            labels = labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            acc = correct * 100.0 / total
            pbar.set_description("acc: {acc:.2f}".format(acc=acc))
    acc = correct * 100.0 / total
    print("val acc={acc:.3f}".format(acc=acc))
    return acc

def train(net):
    bestacc = 0
    if os.path.exists("best.pth"):
        net.load_state_dict(torch.load("best.pth"))  # resume from the best checkpoint
        bestacc = val(net)
        print("Resuming from acc = {acc:.3f}".format(acc=bestacc))
    optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    criterion = nn.CrossEntropyLoss()
    epochs = 100000
    for epoch in range(epochs):
        print("Epoch: "+str(epoch))
        net.train()
        pbar = tqdm(trainloader)
        for images, labels in pbar:
            images = images.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            acc = (predicted == labels).sum().item() * 100.0 / labels.size(0)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            pbar.set_description("loss: {loss:.3f},acc: {acc:.2f}".format(loss=loss.item(), acc=acc))
        acc = val(net)
        if acc > bestacc:
            torch.save(net.state_dict(),"best.pth")
            bestacc = acc
            print("best improve to {acc:.3f}".format(acc=acc))
        torch.save(net.state_dict(),"last.pth")

def main():
    net = Net()
    net.to(device)
    #summary(net,(1,28,28))
    #readtest()
    train(net)

if __name__=="__main__":
    main()
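
After training, loading the best checkpoint for a quick prediction could look like the following (a minimal sketch reusing the definitions above):

# Minimal inference sketch; assumes "best.pth" was saved by train() above
net = Net()
net.load_state_dict(torch.load("best.pth", map_location=device))
net.to(device)
net.eval()
with torch.no_grad():
    images, labels = next(iter(testloader))
    outputs = net(images.to(device))
    _, predicted = torch.max(outputs, 1)
    print("predicted:", predicted.cpu().numpy(), "ground truth:", labels.numpy())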

pytorch2caffe; ShuffleNet_V2_pytorch_caffe

The top recommendation is PytorchToCaffe, which supports conversion for PyTorch 0.3-1.*; note that 1.1 still has bugs and is not supported yet. It also does not support bilinear-interpolation upsampling layers, which are common in segmentation models. In addition, torchvision must be version 0.2, or converting the bundled alexnet model fails. See "Converting a PyTorch model to caffe" for usage instructions.

The principle is actually simple. PyTorch's model.state_dict().items() holds the information of every layer, so the most naive approach is to extract them one by one and convert each to the corresponding caffe layer; that is how the step-by-step pytorch-to-caffe guides do it, and the workload is obviously large. Is there a cleverer way? Of course: pyTorch-To-Caffe exploits Python's trace mechanism, capturing the atomic operations the network invokes inside a callback and then mapping each operation through caffe's Python interface. Through frame.f_code.co_name and frame.f_locals you can obtain the function names and arguments as the network executes. Unfortunately its completeness is low, and the author has not released the source code.

f_code: The code object being executed in this frame
    co_name: Function name
    co_varnames: A tuple containing the names of the local variables
f_locals: The dictionary used to look up local variables
f_back: The previous stack frame
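
A toy sketch of that trace idea (conv_like is a made-up stand-in; note that sys.settrace only sees Python-level calls, which is one limitation of this approach):

import sys

def tracer(frame, event, arg):
    if event == "call":
        # frame.f_code.co_name: name of the called function
        # frame.f_locals: its arguments, looked up by name
        print(frame.f_code.co_name, dict(frame.f_locals))
    return tracer

def conv_like(x, weight, stride=1):
    return x  # stand-in for a real operator

sys.settrace(tracer)
conv_like("blob", "w", stride=2)  # prints: conv_like {'x': 'blob', 'weight': 'w', 'stride': 2}
sys.settrace(None)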

PyTorch also stores weights in the [out_channels, in_channels, h, w] layout, the same as caffe, so you can take the data and assign it directly.
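
For example (the caffe layer name 'conv1' and the caffe_net handle are illustrative):

import torch.nn as nn

conv = nn.Conv2d(3, 16, kernel_size=3)
print(conv.weight.shape)  # torch.Size([16, 3, 3, 3]) = [out_channels, in_channels, h, w]
# With pycaffe the direct assignment would look like:
# caffe_net.params['conv1'][0].data[...] = conv.weight.detach().numpy()
# caffe_net.params['conv1'][1].data[...] = conv.bias.detach().numpy()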

Is there really no workable approach, then? As the saying goes, just when hills and streams seem to leave no path, a village appears amid willow shade and bright blossoms. PytorchToCaffe, currently the most complete codebase, provides an excellent guide; it is just that the code looks baffling at first, and it is not obvious how it pulls this off.

When the module is initialized, it creates objects of the Rp class and overrides PyTorch's layer implementations with them, e.g. the convolution implementation: F.conv2d = Rp(F.conv2d, _conv2d)

When the tool runs, the PyTorch network's forward() method is called; whenever F.conv2d is reached, the __call__ method of the overriding Rp(F.conv2d, _conv2d) object fires, and inside it _conv2d is invoked

_conv2d is a function defined inside the tool. It performs PyTorch's conv computation, records the layer's name and the computed blob in the previously created Translog, creates the corresponding conv implementation in caffe, and writes the relevant PyTorch weights into the caffe layer

Then it clicks: the trick is simply to replace PyTorch's built-in computations with your own functions, saving the parameters along the way. Quite ingenious.
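
A bare-bones sketch of that replacement pattern (simplified; the real tool's _conv2d also builds the caffe layer and records blobs in the Translog):

import torch
import torch.nn.functional as F

class Rp:
    # Wrap a torch function: every call is routed through a replacement first
    def __init__(self, raw, replace):
        self.raw = raw
        self.replace = replace
    def __call__(self, *args, **kwargs):
        return self.replace(self.raw, *args, **kwargs)

def _conv2d(raw, input, weight, *args, **kwargs):
    print("captured conv2d, weight shape:", tuple(weight.shape))  # record layer info here
    return raw(input, weight, *args, **kwargs)                    # then run the real conv

F.conv2d = Rp(F.conv2d, _conv2d)  # override: every F.conv2d call is now intercepted
x = torch.randn(1, 1, 8, 8)
w = torch.randn(4, 1, 3, 3)
y = F.conv2d(x, w)  # prints: captured conv2d, weight shape: (4, 1, 3, 3)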

For the reverse direction, see "Converting a Caffe model into a PyTorch model".

ResNet modules

The left diagram below corresponds to the basic block used by resnet-18/34; the right one is used by resnet-50/101/152. Since the latter are much deeper, the right block uses 1x1 convolutions to reduce dimensionality, unlike the left one.

[Figure: the BasicBlock used by resnet-18/34 (left) and the Bottleneck used by resnet-50/101/152 (right)]

  • (a) conv3x3: nothing much to explain; it re-wraps the existing PyTorch conv with the kernel size fixed to 3;
  • (b) BasicBlock: builds the left-hand module in the figure above.

    (1) Every convolution is followed by a BN layer for normalization;

    (2) The 3x3 conv before the residual addition is followed only by BN, with no ReLU, so that the features after the addition are not all positive and feature diversity is preserved;

    (3) Skip connection: two cases. When the module input and the residual branch (3x3->3x3) have the same number of channels, they are added directly; when the channel counts differ (usually right after the resolution drops; at the same resolution the channels usually match), a 1x1 conv raises/lowers the dimensionality of the module input (with stride 2, since as noted the resolution drops), again followed by BN and no ReLU.

  • (c) Bottleneck: builds the right-hand module.

    (1) A 1x1 conv first reduces the dimensionality, a 3x3 conv then extracts features, and a final 1x1 conv restores the dimensionality;

    (2) Every convolution is followed by a BN layer for normalization;

    (3) The 1x1 conv before the residual addition is followed only by BN, with no ReLU, so that the features after the addition are not all positive and feature diversity is preserved;

    (4) Skip connection: two cases. When the module input and the residual branch (1x1->3x3->1x1) have the same number of channels, they are added directly; when the channel counts differ (usually right after the resolution drops; at the same resolution the channels usually match), a 1x1 conv raises/lowers the dimensionality of the module input (with stride 2, since as noted the resolution drops), again followed by BN and no ReLU.

import torch.nn as nn

BN_MOMENTUM = 0.1  # used below but undefined in the original snippet; 0.1 is PyTorch's default

def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1,
                               bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion,
                                  momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out
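
Neither block constructs downsample itself; in a full ResNet the _make_layer method builds it whenever channels or resolution change. A minimal sketch of that construction (make_downsample is a helper name introduced here for illustration):

def make_downsample(inplanes, planes, expansion, stride):
    # 1x1 conv matches the channels (stride matches the resolution), then BN, no ReLU
    return nn.Sequential(
        nn.Conv2d(inplanes, planes * expansion, kernel_size=1,
                  stride=stride, bias=False),
        nn.BatchNorm2d(planes * expansion, momentum=BN_MOMENTUM),
    )

# e.g. the first Bottleneck of a stage that halves resolution and changes channels:
block = Bottleneck(256, 128, stride=2,
                   downsample=make_downsample(256, 128, Bottleneck.expansion, 2))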

Reposted from blog.csdn.net/minstyrain/article/details/105468221