Calling a PyTorch model from C++ and running inference on the GPU

PyTorch is great to work with, but real-world deployment is mostly done in C++, so we need to call a trained PyTorch model from C++. Follow the official tutorial at https://github.com/apachecn/pytorch-doc-zh/blob/master/docs/1.0/cpp_export.md to install libtorch. However, the official tutorial does not walk through a real example, and it only covers the CPU version.

Below is a real example that runs on the GPU. All code for this example is at https://github.com/zhangming8/pytorch-cpp-model

My environment (I have not tried others): Ubuntu 16.04, CUDA 9, OpenCV 3.4 built from source (https://blog.csdn.net/u010397980/article/details/89439515), libtorch (https://download.pytorch.org/libtorch/cu90/libtorch-shared-with-deps-latest.zip), Python 3, torch 1.0.0.

1. First, convert the model:

Create a create_model.py file (vim create_model.py) with the content below, then run it with python3 create_model.py:

#coding:utf-8
import numpy as np
import os
import glob
import cv2
import shutil
import time
import torch
import torch.nn as nn

from mobilenetv2 import MobileNetV2


# Create a model instance
model = MobileNetV2()
model.classifier = nn.Sequential(nn.Linear(1280, 8), nn.Sigmoid())
#model.load_state_dict(torch.load("latest.pt"))
model.eval()  # put the model in inference mode so Dropout/BatchNorm are traced correctly

img_size = 224
# Create a dummy input for the network's forward() pass
example = torch.rand(1, 3, img_size, img_size)

# Use torch.jit.trace to trace the model and produce a torch.jit.ScriptModule
traced_script_module = torch.jit.trace(model, example)

img_list = ["test.jpg"]

s = time.time()
for i in img_list:
    img_org = cv2.imread(i)
    org_shape = img_org.shape[:-1]
    org_shape = org_shape[::-1]
    # process data
    img = cv2.resize(img_org, (img_size, img_size))
    img = img[:, :, ::-1].transpose(2, 0, 1)  # 1. BGR to RGB; 2. change hxwx3 to 3xhxw
    img = np.ascontiguousarray(img, dtype=np.float32)  # uint8 to float32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0

    inputs = torch.from_numpy(img)
    inputs = inputs.unsqueeze(0)
    output = traced_script_module(inputs)
    print("output:", output)

traced_script_module.save("model_cpp.pt")
print("create c++ model done...")

The mobilenetv2.py file contains the following:

import torch
import torch.nn as nn
import math

# A PyTorch implementation of MobileNet V2 architecture and pretrained model.
# from https://github.com/tonylins/pytorch-mobilenet-v2
# pretrained model can be downloaded from https://drive.google.com/file/d/1jlto6HRVD3ipNkAl1lNhDbkBp7HylaqR/view
# env: python3

def conv_bn(inp, oup, stride):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU6(inplace=True)
    )


def conv_1x1_bn(inp, oup):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU6(inplace=True)
    )


class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = round(inp * expand_ratio)
        self.use_res_connect = self.stride == 1 and inp == oup

        if expand_ratio == 1:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)


class MobileNetV2(nn.Module):
    def __init__(self, n_class=1000, input_size=224, width_mult=1.):
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        input_channel = 32
        last_channel = 1280
        interverted_residual_setting = [
            # t, c, n, s
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        # building first layer
        assert input_size % 32 == 0
        input_channel = int(input_channel * width_mult)
        self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
        self.features = [conv_bn(3, input_channel, 2)]
        # building inverted residual blocks
        for t, c, n, s in interverted_residual_setting:
            output_channel = int(c * width_mult)
            for i in range(n):
                if i == 0:
                    self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
                else:
                    self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
                input_channel = output_channel
        # building last several layers
        self.features.append(conv_1x1_bn(input_channel, self.last_channel))
        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, n_class),
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        #print("1 x.size()", x.size())  # bs,1280,7,7
        x = x.mean(3).mean(2)
        #print("2 x.size()", x.size())  # bs,1280
        x = self.classifier(x)
        #print("3 x.size()", x.size())  # bs,num_classes
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()


if __name__ == "__main__":
    img = torch.rand((2, 3, 224, 224))
    net = MobileNetV2(n_class=1000)
    print(net)
    state_dict = torch.load('finetune_weight/mobilenet_v2.pth.tar') # add map_location='cpu' if no gpu
    net.load_state_dict(state_dict)
    print("***"*50)
    pred = net(img)
    print(pred.size())

    '''
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

    input_size = 224
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(input_size, scale=(0.2, 1.0)), 
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        num_workers=n_worker, pin_memory=True)
    
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(int(input_size/0.875)),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=batch_size, shuffle=False,
        num_workers=n_worker, pin_memory=True)
    '''

As the code shows, the model outputs 8 confidence values in the range 0-1 (I put a Sigmoid at the end of the network); you can of course change the model structure to produce other outputs. For ease of demonstration the model here is randomly initialized, so in practice you need to load your trained weights before converting, e.g. with model.load_state_dict(torch.load("latest.pt")).

Running the script produces a MobileNetV2 model file named model_cpp.pt, which is the model the C++ code will load. The script also prints the 8 output values; since the weights are randomly initialized, your values will differ. This is the result of calling the PyTorch model from Python.
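
In practice the conversion therefore looks roughly like the sketch below (a minimal sketch, assuming "latest.pt" holds a state_dict saved from this MobileNetV2 variant; the last few lines simply reload the traced file to confirm it reproduces the original model's output):

#coding:utf-8
# Sketch: trace a trained checkpoint instead of random weights.
import torch
import torch.nn as nn

from mobilenetv2 import MobileNetV2

model = MobileNetV2()
model.classifier = nn.Sequential(nn.Linear(1280, 8), nn.Sigmoid())
model.load_state_dict(torch.load("latest.pt", map_location="cpu"))  # assumed checkpoint file
model.eval()  # inference mode, so Dropout/BatchNorm are frozen in the trace

example = torch.rand(1, 3, 224, 224)
traced = torch.jit.trace(model, example)
traced.save("model_cpp.pt")

# Sanity check: the reloaded ScriptModule should match the original model.
reloaded = torch.jit.load("model_cpp.pt")
with torch.no_grad():
    print(model(example))
    print(reloaded(example))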

2. Create a predict-app.cpp file

This file does the actual work of calling the PyTorch model from C++: vim predict-app.cpp

Remember to change model_path to the path of your model_cpp.pt and test_path to your test-image directory, and put a few .jpg images into test_path.

The cpp file first loads and initializes the model specified by model_path and moves it to the GPU. It then iterates over every ".jpg" image under test_path, preprocesses each image, feeds it through the network, and prints the index of the maximum probability. See the comments for details.

#include <torch/torch.h>
#include <ATen/ATen.h>
#include <torch/script.h>

#include <iostream>
#include <memory>
#include <string>
#include <chrono>

#include "opencv2/core.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/opencv.hpp"

using namespace cv;
using namespace std;
using namespace chrono;

#include <experimental/filesystem>
namespace fs = std::experimental::filesystem;

int main(int argc, const char* argv[]) {
    string model_path;
    if (argc == 2){
        model_path = argv[1];
    }
    else {
        model_path = "/home/xxxxxxxxx/model_cpp.pt";
    }
    cout << "using model:" << model_path << endl;
    string test_path = "/home/xxxxxxxxx/data/";
    
    // init model
    int img_size = 224;  // resize images to 224x224
    vector<torch::jit::IValue> inputs;  // define an input
    shared_ptr<torch::jit::script::Module> module = torch::jit::load(model_path);  // load the model
    assert(module != nullptr);  // check the model loaded before using it
    module->to(at::kCUDA);  // move the model to the GPU
    cout << "[INFO] init model done...\n";

    int i = 0;
    double t_start, t_end, t_cost;
    t_start = getTickCount();  // start timing
    
    Mat src, image, float_image;
    for (const auto &p : fs::directory_iterator(test_path)){  // iterate over every file in the folder
        string s = p.path();  //get one file path
        string suffix = s.substr(s.find_last_of(".")+1);  // get the file extension
        if (suffix != "jpg"){
            continue;
        }
        cout << i << "-------------------------" << endl;
        cout << p.path() << '\n';

        src = imread(s);  // read the image
        // preprocess the image; this must match the preprocessing used during training in Python
        resize(src, image, Size(img_size, img_size));  // resize the image
        cvtColor(image, image, COLOR_BGR2RGB);  // bgr -> rgb
        image.convertTo(float_image, CV_32F, 1.0 / 255);  // scale to [0, 1]
        //cout << float_image.at<Vec3f>(100,100)[1] << endl;  // print one pixel value
        auto img_tensor = torch::CPU(torch::kFloat32).tensorFromBlob(float_image.data, {1, img_size, img_size, 3});  // wrap the cv::Mat as a 1x224x224x3 tensor (libtorch 1.0 API; newer releases use torch::from_blob)
        img_tensor = img_tensor.permute({0, 3, 1, 2});  // reorder to NCHW, the layout torch expects: 1x3x224x224
        //img_tensor[0][0] = img_tensor[0][0].sub_(0.485).div_(0.229);  // subtract mean, divide by std (enable if training normalized this way)
        //img_tensor[0][1] = img_tensor[0][1].sub_(0.456).div_(0.224);
        //img_tensor[0][2] = img_tensor[0][2].sub_(0.406).div_(0.225);
        
        auto img_var = torch::autograd::make_variable(img_tensor, false);  // no gradient needed
        inputs.emplace_back(img_var.to(at::kCUDA));  // move the preprocessed image to the GPU
        torch::Tensor result = module->forward(inputs).toTensor();  // forward pass to get the output
        inputs.pop_back();
        cout << "result:" << result << endl;

        auto pred = result.argmax(1);
        cout << "max index:" << pred << endl;

        /*std::tuple<torch::Tensor,torch::Tensor> res_sort = result.sort(-1, true);
        torch::Tensor top_scores = get<0>(res_sort)[0];
        torch::Tensor top_idxs = get<1>(res_sort)[0].toType(torch::kInt32);
        auto top_scores_a = top_scores.accessor<float,1>();
        auto top_idxs_a = top_idxs.accessor<int,1>();
        for (int j = 0; j < 3; ++j) {
            int idx = top_idxs_a[j];
            cout << "top-" << j+1 << " index: " << idx << ", score: " << top_scores_a[j] << endl;
        }*/

        i++;
        if (i > 1000){
            break;
        }
    }
    t_end = getTickCount();
    t_cost = (t_end - t_start) / getTickFrequency();  // convert ticks to seconds
    printf("time cost: %.4f ms\n", t_cost * 1000.0);
    return 0;
}


3. Compile

This assumes OpenCV and libtorch are both already installed.

First create a CMakeLists.txt file (vim CMakeLists.txt) with the following content:

cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
project(custom_ops)

find_package(Torch REQUIRED)
find_package( OpenCV REQUIRED )
include_directories( ${OpenCV_INCLUDE_DIRS} )

add_executable(predict-app predict-app.cpp)
target_link_libraries(predict-app "${TORCH_LIBRARIES}" ${OpenCV_LIBS} stdc++fs)
set_property(TARGET predict-app PROPERTY CXX_STANDARD 11)

Then run the following commands in order:

mkdir build
cd build
cmake -DCMAKE_PREFIX_PATH=/media/DATA2/libtorch ..

(where /media/DATA2/libtorch is the path to your libtorch)
make

If all goes well, both cmake and make will finish without errors, and the build folder will contain the predict-app executable along with the other build artifacts.

Finally, run ./predict-app to execute the program.

You will find that the C++ model produces the same output as the Python model (as long as the preprocessing is consistent).
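
If they ever disagree, a quick way to isolate preprocessing differences is to run the traced model in Python on one of the test images, mirroring the C++ preprocessing step by step, and compare the printed tensor with the "result:" line from ./predict-app (a minimal sketch; "test.jpg" stands in for one of the images in test_path):

# Sketch: replicate the preprocessing from predict-app.cpp and run model_cpp.pt in Python.
import cv2
import numpy as np
import torch

model = torch.jit.load("model_cpp.pt")
img = cv2.imread("test.jpg")
img = cv2.resize(img, (224, 224))  # same resize as the C++ code
img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR -> RGB, HWC -> CHW, same as cvtColor + permute
img = np.ascontiguousarray(img, dtype=np.float32) / 255.0  # scale to [0, 1]
inputs = torch.from_numpy(img).unsqueeze(0)  # add the batch dimension: 1x3x224x224
with torch.no_grad():
    print(model(inputs))  # should match what predict-app prints for the same image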

Reposted from blog.csdn.net/u010397980/article/details/89437628