Implementing caffe's Softmax Layer as a TensorRT IPlugin

TensorRT's built-in softmax only operates across the channel dimension; it does not support other axes. The softmax in SSD, however, does not classify over the channel axis, so accelerating SSD with TensorRT requires hand-writing an IPlugin implementation of the softmax layer.
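This is exactly the situation in the standard SSD deploy.prototxt: the confidence softmax runs over axis 2 (the class dimension) of the reshaped predictions rather than over the channel axis. The layer definition below is quoted from the public SSD model definition; names may differ in your own prototxt:

layer {
  name: "mbox_conf_softmax"
  type: "Softmax"
  bottom: "mbox_conf_reshape"
  top: "mbox_conf_softmax"
  softmax_param {
    axis: 2
  }
}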

#include <cassert>
#include <cudnn.h>
#include "NvInfer.h"
#include "caffe/util/cudnn.hpp"  // caffe's CUDNN_CHECK and cudnn::*Tensor4dDesc helpers

using namespace nvinfer1;
using namespace caffe;  // brings caffe's cudnn:: helper namespace into scope

// Softmax layer. TensorRT's built-in softmax only supports cross-channel softmax.
class SoftmaxPlugin : public IPlugin
{
public:
    SoftmaxPlugin(int softmax_axis) {// Build-time constructor: receives the SoftMax layer's axis parameter from the prototxt (the dimension index the softmax runs over)
      softmax_axis_ = softmax_axis;
      handles_setup_ = false;  // bug fix: give terminate() a well-defined flag even if initialize() never runs
    }
    // Deserialization constructor: restores the fields that serialize() wrote into
    // the engine file. softmax_axis_ itself is not persisted, because the derived
    // outer_num_/shape_softmax_axis_/inner_num_ split already encodes it.
    SoftmaxPlugin(const void* buffer, size_t size) {
        assert(size == 3 * sizeof(int));
        const int* d = reinterpret_cast<const int*>(buffer);
        outer_num_ = d[0];
        shape_softmax_axis_ = d[1];
        inner_num_ = d[2];
        handles_setup_ = false;
    }

    inline int getNbOutputs() const override { return 1; }// step 1: softmax has a single output
    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override {// step 2
        assert(1 == nbInputDims);
        assert(0 == index);
        assert(3 == inputs[index].nbDims);
        // Softmax does not change shape: the output dimensions equal the input dimensions
        return DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
    }

    void configure(const Dims* inputs, int nbInputs, const Dims* outputs, int nbOutputs, int maxBatchSize) override {// step 3
      assert(softmax_axis_ < 4);
      // Split the blob dimensions around axis softmax_axis_, following caffe's
      // softmax layer source. Axis 0 of the caffe blob is the batch
      // (maxBatchSize); axis i > 0 maps to inputs[0].d[i-1], since TensorRT
      // dimensions exclude the batch.
      int count = 1;
      for (int i = 0; i < softmax_axis_; ++i) {
        if(0 == i)
          count *= maxBatchSize;
        else
          count *= inputs[0].d[i-1];
      }
      outer_num_ = count;
      if(0 == softmax_axis_)
        shape_softmax_axis_ = maxBatchSize;
      else
        shape_softmax_axis_ = inputs[0].d[softmax_axis_-1];
      count = 1;
      // i >= softmax_axis_ + 1 >= 1 here, so the batch dimension can never
      // contribute to the inner product.
      for (int i = softmax_axis_+1; i < 4; ++i) {
        count *= inputs[0].d[i-1];
      }
      inner_num_ = count;
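      // Worked example, assuming the standard SSD confidence branch: the caffe
      // blob entering mbox_conf_softmax is (N, num_priors, num_classes, 1) and
      // the prototxt sets axis: 2, so
      //   outer_num_          = N * num_priors  (product of dims before the axis)
      //   shape_softmax_axis_ = num_classes     (the softmax axis itself)
      //   inner_num_          = 1               (product of dims after the axis)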
    }

    int initialize() override {// step 4
        // Initialize CUDNN.
        CUDNN_CHECK(cudnnCreate(&handle_));
        cudnn::createTensor4dDesc<float>(&bottom_desc_);
        cudnn::createTensor4dDesc<float>(&top_desc_);

        int N = outer_num_;
        int K = shape_softmax_axis_;
        int H = inner_num_;
        int W = 1;
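        // Viewed as a 4-D NCHW tensor of shape (outer, axis, inner, 1),
        // CUDNN_SOFTMAX_MODE_CHANNEL normalizes over C, i.e. exactly over the
        // softmax axis requested in the prototxt.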
        cudnn::setTensor4dDesc<float>(&bottom_desc_, N, K, H, W);
        cudnn::setTensor4dDesc<float>(&top_desc_, N, K, H, W);

        handles_setup_ = true;
        return 0;
    }

    size_t getSerializationSize() override {
        return 3 * sizeof(int);
    }

    void serialize(void* buffer) override {// step 5
        int* d = reinterpret_cast<int*>(buffer);
        d[0] = outer_num_; d[1] = shape_softmax_axis_; d[2] = inner_num_;
    }

    inline void terminate() override {// step 6
        // Check that handles have been setup before destroying.
        if (!handles_setup_) { return; }

        cudnnDestroyTensorDescriptor(bottom_desc_);
        cudnnDestroyTensorDescriptor(top_desc_);
        cudnnDestroy(handle_);
    }

    // cuDNN softmax needs no scratch memory, so no extra workspace is requested.
    inline size_t getWorkspaceSize(int) const override { return 0; }
    int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override {
        // Mirrors caffe's cuDNN softmax forward pass. Note that the descriptors
        // were sized with maxBatchSize in initialize(), so this assumes the
        // engine is always run at full batch.
        const float* bottom_data = reinterpret_cast<const float*>(inputs[0]);
        float* top_data = reinterpret_cast<float*>(outputs[0]);
        // Bug fix: bind cuDNN to the stream TensorRT hands in; otherwise the
        // kernel is launched on the default stream.
        CUDNN_CHECK(cudnnSetStream(handle_, stream));
        CUDNN_CHECK(cudnnSoftmaxForward(handle_, CUDNN_SOFTMAX_ACCURATE,
                                        CUDNN_SOFTMAX_MODE_CHANNEL,
                                        cudnn::dataType<float>::one,
                                        bottom_desc_, bottom_data,
                                        cudnn::dataType<float>::zero,
                                        top_desc_, top_data));
        return 0;
    }


protected:
    int outer_num_;
    int inner_num_;
    int softmax_axis_;
    int shape_softmax_axis_;

    bool handles_setup_;
    cudnnHandle_t             handle_;
    cudnnTensorDescriptor_t bottom_desc_;
    cudnnTensorDescriptor_t top_desc_;
};
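The original post stops at the plugin class itself. As a rough sketch of how it could be wired up — assuming the IPlugin-era factory API (TensorRT 3/4) and the standard SSD layer name mbox_conf_softmax with axis 2; treat the names and the hard-coded axis as illustrative, not as part of the original code — a plugin factory might look like this:

#include <cstring>
#include <memory>
#include "NvCaffeParser.h"

// Creates SoftmaxPlugin both when the caffe parser builds the network and
// when the runtime deserializes an engine file.
class PluginFactory : public nvinfer1::IPluginFactory,
                      public nvcaffeparser1::IPluginFactory
{
public:
    // Build time: the caffe parser calls this for every layer isPlugin() claims.
    nvinfer1::IPlugin* createPlugin(const char* layerName,
                                    const nvinfer1::Weights* weights,
                                    int nbWeights) override
    {
        assert(isPlugin(layerName));
        // The parser does not hand over softmax_param, so the axis from the
        // prototxt (2 for SSD's mbox_conf_softmax) is supplied by hand here.
        mPlugin.reset(new SoftmaxPlugin(2));
        return mPlugin.get();
    }

    // Runtime: called while deserializing the engine file; forwards the bytes
    // written by SoftmaxPlugin::serialize().
    nvinfer1::IPlugin* createPlugin(const char* layerName,
                                    const void* serialData,
                                    size_t serialLength) override
    {
        assert(isPlugin(layerName));
        mPlugin.reset(new SoftmaxPlugin(serialData, serialLength));
        return mPlugin.get();
    }

    bool isPlugin(const char* name) override
    {
        return strcmp(name, "mbox_conf_softmax") == 0;
    }

private:
    std::unique_ptr<SoftmaxPlugin> mPlugin;
};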

Reposted from blog.csdn.net/maidabu/article/details/82724955