PyTorch: from getting started to proficiency

GitHub - mint-lab/dl_tutorial: Deep Learning Tutorial with PyTorch:slides

 

Start by cutting things down: concrete examples (small classification and time-series examples) walk you through the basic routines used by torch

What is the difference between Tensor and Variable in pytorch ? PyTorch internal mechanism

Which functions should DataSet implement?

Why can't binary classification use MSE as a loss function?

How to obtain the computation cost (FLOPs) and the model size of a network? What are the ways to load and save models? pytorch-summary, flops-counter.pytorch

What are the tricks to improve the speed of PyTorch training? Accelerating preprocessing with albumentations

PyTorch deep learning: 60-minute quick start; ImageNet training in PyTorch: train an image classification model

PyTorch source code interpretation of torch.utils.data.DataLoader , torchvision.transforms , torchvision.models ,

PyTorch practical guide , trick collection , half an hour to learn PyTorch Hook , deep learning model conversion and deployment (including detailed analysis of ONNX format)

Detailed explanation of the network structure in PyTorch, resnet50, PyTorch Autograd

PyTorch Distributed Training Concise Tutorial, horovod

Deploy the PyTorch model to the terminaldemonet

Take training the simplest mnist as an example, the complete example is:

import os
import numpy as np
import cv2

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

from tqdm import tqdm
from torchsummary import summary

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# MNIST images have a single channel, so Normalize must receive exactly one
# mean and one std value. Passing three values (as for RGB images) cannot be
# broadcast against a 1-channel tensor in current torchvision releases.
# With mean=0.5 and std=0.5 each pixel becomes (x - 0.5) / 0.5.
transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5,), (0.5,)),
])

# Training split: downloaded on first use, shuffled every epoch.
trainset = torchvision.datasets.MNIST(
    root='./data/MNIST', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=256, shuffle=True, num_workers=8)

# Test split: same preprocessing, fixed order so validation is repeatable.
testset = torchvision.datasets.MNIST(
    root='./data/MNIST', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=8, shuffle=False, num_workers=8)

def readtest():
    """Preview batches from ``trainloader`` in an OpenCV window.

    Each batch is tiled into a grid with 4 images per row, de-normalised and
    shown with ``cv2.imshow``; ``cv2.waitKey()`` blocks until a key is
    pressed before the next batch is displayed.
    """
    for images, _ in tqdm(trainloader):
        grid = torchvision.utils.make_grid(images, nrow=4)
        # Invert Normalize(mean=0.5, std=0.5):
        #   x_norm = (x - 0.5) / 0.5   =>   x = x_norm * 0.5 + 0.5
        # (multiplying by 2 instead would push the values outside [0, 1]).
        img = grid.numpy() * 0.5 + 0.5
        # make_grid returns channels-first (C, H, W); OpenCV wants (H, W, C).
        img = np.transpose(img, (1, 2, 0))
        cv2.imshow("img", img)
        cv2.waitKey()
class Net(nn.Module):
    """Small LeNet-style classifier with 10 output logits.

    Two ``conv(5x5) -> ReLU -> max-pool(2)`` stages are followed by three
    fully connected layers. ``fc1`` takes ``16 * 4 * 4`` inputs, which is
    what a 1x28x28 image is reduced to by the two convolution stages.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor.
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Classifier head.
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the raw (un-softmaxed) class scores for a batch ``x``."""
        for conv in (self.conv1, self.conv2):
            x = F.max_pool2d(F.relu(conv(x)), 2)
        # Collapse everything except the batch dimension.
        x = x.view(-1, self.num_flat_features(x))
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)

    def num_flat_features(self, x):
        """Return the number of elements per sample (product of dims 1..)."""
        per_sample_shape = x.size()[1:]
        return per_sample_shape.numel()

def val(net):
    """Evaluate ``net`` on ``testloader`` and return top-1 accuracy in percent.

    The model is switched to eval mode and the running accuracy is shown on
    the tqdm progress bar while iterating.

    Args:
        net: Model to evaluate; inputs are moved to ``device`` before the
            forward pass, so the model is expected to live there as well.

    Returns:
        float: Accuracy in the range [0, 100]. ``0.0`` if the loader yields
        no samples.
    """
    net.eval()
    correct = 0
    total = 0
    pbar = tqdm(testloader)
    # Pure inference: disabling autograd avoids building a graph for every
    # batch, which saves memory and time.
    with torch.no_grad():
        for images, labels in pbar:
            images = images.to(device)
            labels = labels.to(device)
            outputs = net(images)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            # .item() keeps the counters as plain Python ints so the accuracy
            # is an ordinary float rather than a (possibly CUDA) tensor.
            correct += (predicted == labels).sum().item()
            acc = correct * 100.0 / total
            pbar.set_description("acc: {acc:.2f}".format(acc=acc))
    # Guard against an empty loader instead of dividing by zero.
    acc = correct * 100.0 / total if total else 0.0
    print("val acc={acc:.3f}".format(acc=acc))
    return acc

def train(net, epochs=100000):
    """Train ``net`` with SGD and cross-entropy, checkpointing every epoch.

    If ``best.pth`` exists, its weights are loaded first and the resulting
    validation accuracy becomes the score to beat. After every epoch the
    model is validated with ``val``; ``best.pth`` is overwritten whenever the
    accuracy improves and ``last.pth`` is always overwritten.

    Args:
        net: Model to train; batches are moved to ``device`` before the
            forward pass, so the model is expected to live there as well.
        epochs: Number of passes over ``trainloader``. Defaults to the
            previously hard-coded 100000.
    """
    bestacc = 0
    if os.path.exists("best.pth"):
        # map_location lets a checkpoint written on a GPU machine be resumed
        # on a CPU-only machine (and vice versa).
        # NOTE: torch.load unpickles the file - only resume from checkpoints
        # you trust.
        state = torch.load("best.pth", map_location=device)
        net.load_state_dict(state)
        bestacc = val(net)
        print("Resuming from acc = {acc:.3f}".format(acc = bestacc))
    optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    criterion = nn.CrossEntropyLoss()
    for epoch in range(epochs):
        print("Epoch: "+str(epoch))
        # val() leaves the model in eval mode, so re-enable train mode here.
        net.train()
        pbar = tqdm(trainloader)
        for images, labels in pbar:
            images = images.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = net(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Batch accuracy is for display only; detach so it never touches
            # the autograd graph, and convert to Python floats for formatting.
            predicted = outputs.detach().argmax(dim=1)
            acc = (predicted == labels).sum().item() * 100.0 / labels.size(0)
            pbar.set_description(
                "loss: {loss:.3f},acc: {acc:.2f}".format(loss=loss.item(), acc=acc))
        acc = val(net)
        if acc > bestacc:
            torch.save(net.state_dict(),"best.pth")
            bestacc = acc
            print("best improve to {acc:.3f}".format(acc=acc))
        torch.save(net.state_dict(),"last.pth")

def main():
    """Create the network on the selected device and start training."""
    net = Net().to(device)
    # Optional debugging helpers, disabled by default:
    #   summary(net, (1, 28, 28))
    #   readtest()
    train(net)


if __name__ == "__main__":
    main()

pytorch2caffe ShuffleNet_V2_pytorch_caffe

PytorchToCaffe is the most recommended tool; it supports converting models from PyTorch versions 0.3 through 1.*. Note that some bugs under version 1.1 are not handled for the time being; the upsampling layer does not support bilinear interpolation, which is used a lot in segmentation models; and the torchvision version must be 0.2, otherwise converting the built-in alexnet model reports an error. For instructions, see "pytorch model to caffe".

The principle is actually very simple. PyTorch's model.state_dict().items() stores all the information of the layers. The simplest method is to extract the layers one by one and convert each of them into the corresponding Caffe layer (see "The steps of converting pytorch to caffe"). Done this way, the workload is obviously very heavy, so is there a smarter way? Of course: pyTorch-To-Caffe uses Python's trace mechanism to capture, in a callback function, the atomic operations called by the network, and then maps them to the corresponding operations through Caffe's Python interface. The function names and parameters used during the network's forward pass can be obtained through frame.f_code.co_name and frame.f_locals. Unfortunately, its degree of completion is not high, and the author did not release the source code.

f_code: The code object being executed in this frame
    co_name: Function name
    co_varnames: A tuple containing the names of the local variables
f_locals: The dictionary used to look up local variables
f_back: The previous stack frame

The weight storage in pytorch is also in the form of [out_channels, in_channel, h, w], which is consistent with that of caffe, and it can be directly assigned to the data.

Is there really no usable method? As the saying goes, just when the mountains and rivers seem to leave no way forward, another village appears. PytorchToCaffe, the most complete code at present, gives us a good guide, but I was confused when I first got this code: I did not understand how it worked.

When the module is initialized, an object of the Rp class is created for each supported layer function, and this object replaces the layer implementation in PyTorch; for example, the convolutional layer is replaced with F.conv2d=Rp(F.conv2d,_conv2d)

While the tool is running, the forward() method of the PyTorch network is called. When F.conv2d is invoked at that point, the __call__ method of the Rp(F.conv2d,_conv2d) object that replaced it is called instead, and that method calls _conv2d

_conv2d is a method defined inside the tool. It computes the convolution with PyTorch, adds the layer name and the computed blob to the previously created Translog, creates the corresponding conv layer in Caffe, and writes the relevant PyTorch weights into that Caffe layer

Then it suddenly dawned on me: the tool simply replaces PyTorch's built-in functions with its own functions, which perform the original computation and save the parameters along the way. I have to say it is really clever.

For the reverse operation, see Converting the Caffe model to the Pytorch model

ResNet module

In the figure below, the left picture is the basic block used by resnet-18/34, and the right picture is the block used by resnet-50/101/152. Since the latter networks are relatively deep, the right block uses 1x1 convolutions to reduce the dimensionality compared with the left one.

image description

  • (a)  conv3x3: There is not much to explain; it simply wraps PyTorch's nn.Conv2d with the kernel size fixed to 3 (and padding 1);
  • (b)  BasicBlock: Build the module on the left of the picture above.

    (1) Each convolutional block is connected to the BN layer for normalization;

    (2) The last 3x3 convolution before the residual connection is followed only by BN, without ReLU; this prevents the branch output that enters the sum from being all non-negative and keeps the features diverse;

    (3) Skip connection: there are two cases. When the module input and the residual branch (3x3->3x3) have the same number of channels, they are added directly. When the channel counts differ (this usually happens right after the resolution is reduced; within one resolution the channel count generally stays the same), a 1x1 convolution is applied to the module input to increase/decrease its dimensionality (with stride 2 when the resolution is reduced, as mentioned above), followed by BN but no ReLU.

  • (c)   Bottleneck: Build the module on the right side of the picture above.

    (1) Use 1x1 convolution to reduce the dimension first, then use 3x3 convolution for feature extraction, and finally use 1x1 convolution to increase the dimension back;

    (2) Each convolutional block is connected to the BN layer for normalization;

    (3) The last 1x1 convolution before the residual connection is followed only by BN, without ReLU; this prevents the branch output that enters the sum from being all non-negative and keeps the features diverse.

    (4) Skip connection: there are two cases. When the module input and the residual branch (1x1->3x3->1x1) have the same number of channels, they are added directly. When the channel counts differ (this usually happens right after the resolution is reduced; within one resolution the channel count generally stays the same), a 1x1 convolution is applied to the module input to increase/decrease its dimensionality (with stride 2 when the resolution is reduced, as mentioned above), followed by BN but no ReLU.

def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 ``nn.Conv2d`` with padding 1.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Convolution stride (default 1).
    """
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions, each followed by BN.

    ``stride`` is applied by the first convolution. ``downsample``, when
    given, is applied to the block input to produce the shortcut; otherwise
    the input itself is used. ``BN_MOMENTUM`` is read from module scope.
    """

    # Output channels = planes * expansion.
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(branch(x) + shortcut(x))``."""
        # Main branch: conv-BN-ReLU, then conv-BN (no ReLU before the sum).
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Shortcut: identity unless a projection module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """Residual block with a 1x1 -> 3x3 -> 1x1 convolution stack.

    The first 1x1 convolution maps ``inplanes`` to ``planes`` channels, the
    3x3 convolution (which carries ``stride``) keeps ``planes`` channels, and
    the last 1x1 convolution expands to ``planes * expansion`` channels.
    ``downsample``, when given, is applied to the block input to produce the
    shortcut. ``BN_MOMENTUM`` is read from module scope.
    """

    # Output channels = planes * expansion.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        out_planes = planes * self.expansion

        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(branch(x) + shortcut(x))``."""
        # Main branch: two conv-BN-ReLU stages, then conv-BN (no ReLU before
        # the sum).
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # Shortcut: identity unless a projection module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

Guess you like

Origin blog.csdn.net/minstyrain/article/details/105468221
Recommended