Keras版本的mask-rcnn，如何迁移到windows,c++，tensorflow上？

参考：
https://github.com/matterport/Mask_RCNN/issues/1115
https://github.com/matterport/Mask_RCNN/issues/222#issuecomment-373130661

首先要把 Keras 中的模型保存下来。最初想先保存成 h5，再转换成 pb，但是把模型结构和参数一起保存时遇到了很多问题，于是改为在代码中直接保存成 pb 格式。

下面只是其中的一步，即在 TensorFlow C++ 中加载模型并进行测试的代码：

// given inputMat of type RGB (not BGR) / CV_8UC3 (possibly from an imread + cvtColor)
    // also given dest of type cv::Mat(inputMat.size(), CV_8UC1)
    // we trained on 256x256 , so TF_MASKRCNN_IMG_WIDTHHEIGHT = 256
    // we copied MEAN_PIXEL configs, so cv::Scalar TF_MASKRCNN_MEAN_PIXEL(123.7, 116.8, 103.9);
    // we statically defined float TF_MASKRCNN_IMAGE_METADATA[10] = {  0 ,TF_MASKRCNN_IMG_WIDTHHEIGHT ,TF_MASKRCNN_IMG_WIDTHHEIGHT , 3 , 0 , 0 ,TF_MASKRCNN_IMG_WIDTHHEIGHT ,TF_MASKRCNN_IMG_WIDTHHEIGHT , 0 , 0 }; 

    // Resize to square with max dim, so we can resize it to 512x512
    int largestDim = inputMat.size().height > inputMat.size().width ? inputMat.size().height : inputMat.size().width;
    cv::Mat squareInputMat(cv::Size(largestDim, largestDim), CV_8UC3);
    int leftBorder = (largestDim - inputMat.size().width) / 2;
    int topBorder = (largestDim - inputMat.size().height) / 2;
    cv::copyMakeBorder(inputMat, squareInputMat, topBorder, largestDim - (inputMat.size().height + topBorder), leftBorder, largestDim - (inputMat.size().width + leftBorder), cv::BORDER_CONSTANT, cv::Scalar(0));
    cv::Mat resizedInputMat(cv::Size(TF_MASKRCNN_IMG_WIDTHHEIGHT, TF_MASKRCNN_IMG_WIDTHHEIGHT), CV_8UC3);
    cv::resize(squareInputMat, resizedInputMat, resizedInputMat.size(), 0, 0);
    
    // Need to "mold_image" like in mask rcnn
    cv::Mat moldedInput(resizedInputMat.size(), CV_32FC3);
    resizedInputMat.convertTo(moldedInput, CV_32FC3);
    cv::subtract(moldedInput, TF_MASKRCNN_MEAN_PIXEL, moldedInput);
    
    // Move the data into the input tensor
    // remove memory copies by using code at https://github.com/tensorflow/tensorflow/issues/8033#issuecomment-332029092
    // allocate a Tensor and get pointer to memory for that Tensor, allocate a "fake" cv::Mat from it to use as a  basis to convert
    tensorflow::Tensor inputTensor(tensorflow::DT_FLOAT, {1, moldedInput.size().height, moldedInput.size().width, 3}); // single image instance with 3 channels
    float_t *p = inputTensor.flat<float_t>().data();
    cv::Mat inputTensorMat(moldedInput.size(), CV_32FC3, p);
    moldedInput.convertTo(inputTensorMat, CV_32FC3);
    
    // Copy the TF_MASKRCNN_IMAGE_METADATA data into a tensor
    tensorflow::Tensor inputMetadataTensor(tensorflow::DT_FLOAT, {1, TF_MASKRCNN_IMAGE_METADATA_LENGTH});
    auto inputMetadataTensorMap = inputMetadataTensor.tensor<float, 2>();
    for (int i = 0; i < TF_MASKRCNN_IMAGE_METADATA_LENGTH; ++i) {
        inputMetadataTensorMap(0, i) = TF_MASKRCNN_IMAGE_METADATA[i];
    }
    
    // Run tensorflow
    cv::TickMeter tm;
    tm.start();
    std::vector<tensorflow::Tensor> outputs;
    tensorflow::Status run_status = tfSession->Run({{"input_image", inputTensor}, {"input_image_meta", inputMetadataTensor}},
                                                      {"output_detections", "output_mrcnn_class", "output_mrcnn_bbox", "output_mrcnn_mask",
                                                          "output_rois", "output_rpn_class", "output_rpn_bbox"},
                                                       {},
                                                       &outputs);
    if (!run_status.ok()) {
        std::cerr << "tfSession->Run failed: " << run_status << std::endl;
    }
    tm.stop();
    std::cout << "Inference time, ms: " << tm.getTimeMilli()  << std::endl;
    
    if (outputs[3].shape().dims() != 5 || outputs[3].shape().dim_size(4) != 2) {
        throw std::runtime_error("Expected mask dimensions to be [1,100,28,28,2] but got: " + outputs[3].shape().DebugString());
    }
    
    auto detectionsMap = outputs[0].tensor<float, 3>();

    for (int i = 0; i < outputs[3].shape().dim_size(1); ++i) {
        auto scoreAtI = detectionsMap(0, i, 5);
        auto detectedClass = detectionsMap(0, i, 4);
        auto y1 = detectionsMap(0, i, 0), x1 = detectionsMap(0, i, 1), y2 = detectionsMap(0, i, 2), x2 = detectionsMap(0, i, 3);
        auto maskHeight = y2 - y1, maskWidth = x2 - x1;

        if (maskHeight != 0 && maskWidth != 0) {
            // Pointer arithmetic
            const int i0 = 0, /* size0 = (int)outputs[3].shape().dim_size(1), */ i1 = i, size1 = (int)outputs[3].shape().dim_size(1), size2 = (int)outputs[3].shape().dim_size(2), size3 = (int)outputs[3].shape().dim_size(3), i4 = (int)detectedClass /*, size4 = 2 */;
            int pointerLocationOfI = (i0*size1 + i1)*size2;
            float_t *maskPointer = outputs[3].flat<float_t>().data();
        
            // The shape of the detection is [28,28,2], where the last index is the class of interest.
            // We'll extract index 1 because it's the toilet seat.
            cv::Mat initialMask(cv::Size(size2, size3), CV_32FC2, &maskPointer[pointerLocationOfI]); // CV_32FC2 because I know size4 is 2
            cv::Mat detectedMask(initialMask.size(), CV_32FC1);
            cv::extractChannel(initialMask, detectedMask, i4);
        
            // Convert to B&W
            cv::Mat binaryMask(detectedMask.size(), CV_8UC1);
            cv::threshold(detectedMask, binaryMask, 0.5, 255, cv::THRESH_BINARY);
        
            // First scale and offset in relation to TF_MASKRCNN_IMG_WIDTHHEIGHT
            cv::Mat scaledDetectionMat(maskHeight, maskWidth, CV_8UC1);
            cv::resize(binaryMask, scaledDetectionMat, scaledDetectionMat.size(), 0, 0);
            cv::Mat scaledOffsetMat(moldedInput.size(), CV_8UC1, cv::Scalar(0));
            scaledDetectionMat.copyTo(scaledOffsetMat(cv::Rect(x1, y1, maskWidth, maskHeight)));
        
            // Second, scale and offset in relation to our original inputMat
            cv::Mat detectionScaledToSquare(squareInputMat.size(), CV_8UC1);
            cv::resize(scaledOffsetMat, detectionScaledToSquare, detectionScaledToSquare.size(), 0, 0);
           detectionScaledToSquare(cv::Rect(leftBorder, topBorder, inputMat.size().width, inputMat.size().height)).copyTo(dest);
        }
    }

大佬提供了核心代码，但是有很多地方不匹配：

1. 我导出的模型一共有三个输入：{ "input_image", inputTensor }、{ "input_image_meta", inputMetadataTensor }、{ "input_anchors", input_anchors }，但是此代码只传了两个，不懂为什么。（推测是 Mask_RCNN 版本不同：较新的版本在推理时才增加了 input_anchors 这个输入。）

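2. 并且第二个输入（input_image_meta）在我的模型中有 14 个值，但是这里只有 10 个？（推测同样是版本差异：新版的 image_meta 由 image_id(1) + original_image_shape(3) + image_shape(3) + window(4) + scale(1) + active_class_ids(类别数) 组成，两个类别时正好是 14；旧版没有 original_image_shape 和 scale，两个类别时是 10。）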

anchors.txt：anchors 是在 Python 代码中计算并保存下来的，下面的 C++ 程序在运行时从该文件读取。

#define COMPILER_MSVC
#define NOMINMAX
#define _SCL_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS

#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>


#include <tensorflow/cc/ops/array_ops.h>
#include "tensorflow/cc/ops/const_op.h"
#include "tensorflow/cc/ops/image_ops.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/default_device.h"
#include "tensorflow/core/graph/graph_def_builder.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/init_main.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/util/command_line_flags.h"
#include <opencv2/opencv.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
// These are all common classes it's handy to reference with no namespace.
using tensorflow::Flag;
using tensorflow::Tensor;
using tensorflow::Status;
using tensorflow::string;
using tensorflow::int32;

using namespace std;

// ensure TensorFlow C++ build OK
//int main() {
//	printf("Hello World from Tensorflow C libnrary version %s\n", TF_Version());
//	tensorflow::Session* session = tensorflow::NewSession(tensorflow::SessionOptions());
//	return 0;
//}


int main(int argc, char* argv[])
{
	// given inputMat of type RGB (not BGR) / CV_8UC3 (possibly from an imread + cvtColor)
	// also given dest of type cv::Mat(inputMat.size(), CV_8UC1)
	// we trained on 256x256 , so TF_MASKRCNN_IMG_WIDTHHEIGHT = 256
	// we copied MEAN_PIXEL configs, so cv::Scalar TF_MASKRCNN_MEAN_PIXEL((69.3405, 137.1447, 75.6487);
	// we statically defined float TF_MASKRCNN_IMAGE_METADATA[10] = {  0 ,TF_MASKRCNN_IMG_WIDTHHEIGHT ,TF_MASKRCNN_IMG_WIDTHHEIGHT , 3 , 0 , 0 ,TF_MASKRCNN_IMG_WIDTHHEIGHT ,TF_MASKRCNN_IMG_WIDTHHEIGHT , 0 , 0 };

	// Resize to square with max dim, so we can resize it to 512x512

	cv::Mat inputMat;
	inputMat = cv::imread("C:\\Qiuhao_workspace\\aaaa\\10.bmp", CV_LOAD_IMAGE_COLOR);
	int TF_MASKRCNN_IMG_WIDTHHEIGHT = 256;
	cv::Scalar TF_MASKRCNN_MEAN_PIXEL(69.3405, 137.1447, 75.6487);
	float TF_MASKRCNN_IMAGE_METADATA[14] = { 0, TF_MASKRCNN_IMG_WIDTHHEIGHT, TF_MASKRCNN_IMG_WIDTHHEIGHT, 3, TF_MASKRCNN_IMG_WIDTHHEIGHT, TF_MASKRCNN_IMG_WIDTHHEIGHT, 0, 0, TF_MASKRCNN_IMG_WIDTHHEIGHT, TF_MASKRCNN_IMG_WIDTHHEIGHT,1, 0, 0 };
	cv::Mat dest = cv::Mat(inputMat.size(), CV_8UC1);
	dest = inputMat.clone();

	//Resizr to square with max dim, so we can resize it to 256x256
	int largestDim = inputMat.size().height > inputMat.size().width ? inputMat.size().height : inputMat.size().width;
	cv::Mat squareInputMat(cv::Size(largestDim, largestDim), CV_8UC3);
	int leftBorder = (largestDim - inputMat.size().width) / 2;
	int topBorder = (largestDim - inputMat.size().height) / 2;
	cv::copyMakeBorder(inputMat, squareInputMat, topBorder, largestDim - (inputMat.size().height + topBorder), leftBorder, largestDim - (inputMat.size().width + leftBorder), cv::BORDER_CONSTANT, cv::Scalar(0));
	cv::Mat resizedInputMat(cv::Size(TF_MASKRCNN_IMG_WIDTHHEIGHT, TF_MASKRCNN_IMG_WIDTHHEIGHT), CV_8UC3);
	cv::resize(squareInputMat, resizedInputMat, resizedInputMat.size(), 0, 0);

	// Need to "mold_image" like in mask rcnn
	cv::Mat moldedInput(resizedInputMat.size(), CV_32FC3);
	resizedInputMat.convertTo(moldedInput, CV_32FC3);
	cv::subtract(moldedInput, TF_MASKRCNN_MEAN_PIXEL, moldedInput);


	//moldedInput = cv::imread("C:\\Qiuhao_workspace\\aaaa\\test_python.jpg", CV_LOAD_IMAGE_COLOR);
	//display the molded image
	//cv::imshow("molded image",moldedInput);
	//cv::imwrite("C:\\Qiuhao_workspace\\aaaa\\test.jpg", moldedInput);

	// Move the data into the input tensor
	// remove memory copies by using code at tensorflow/tensorflow#8033 (comment)
	// allocate a Tensor and get pointer to memory for that Tensor, allocate a "fake" cv::Mat from it to use as a  basis to convert
	// tensorflow::Tensor inputTensor(tensorflow::DT_FLOAT, tensorflow::TensorShape(3)); // single image instance with 3 channels  { 1, moldedInput.size().height, moldedInput.size().width, 3 }
	tensorflow::Tensor inputTensor(tensorflow::DT_FLOAT, { 1, moldedInput.size().height, moldedInput.size().width, 3 }); // single image instance with 3 channels
	float_t *p = inputTensor.flat<float_t>().data();
	cv::Mat inputTensorMat(moldedInput.size(), CV_32FC3, p);
	moldedInput.convertTo(inputTensorMat, CV_32FC3);

	int TF_MASKRCNN_IMAGE_METADATA_LENGTH = 14;

	// Copy the TF_MASKRCNN_IMAGE_METADATA data into a tensor
	tensorflow::Tensor inputMetadataTensor(tensorflow::DT_FLOAT, { 1, TF_MASKRCNN_IMAGE_METADATA_LENGTH });
	auto inputMetadataTensorMap = inputMetadataTensor.tensor<float, 2>();
	for (int i = 0; i < TF_MASKRCNN_IMAGE_METADATA_LENGTH; ++i) {
		inputMetadataTensorMap(0, i) = TF_MASKRCNN_IMAGE_METADATA[i];
	}

	// for specific 1920x1280 images
	auto input_anchors = tensorflow::Tensor(tensorflow::DT_FLOAT, tensorflow::TensorShape({ 1,16368,4 }));
	auto anchors_API = input_anchors.tensor<float, 3>();
	//input_anchors.flat<float_t>()(0, 0, 0) = 1.111111;
	string fileName = "C:\\qiuhao_workspace\\aaaa\\anchors.txt";
	fstream in;
	in.open(fileName.c_str(), ios::in);
	if (!in.is_open()) {
		cout << "Can not find " << fileName << endl;
		system("pause");
	}
	string buff;
	int i = 0; //line i
	while (getline(in, buff)) {
		vector<float> nums;
		// string->char *
		char *s_input = (char *)buff.c_str();
		const char * split = ",";
		char *p2 = strtok(s_input, split);
		double a;
		while (p2 != NULL) {
			// char * -> int
			a = atof(p2);
			//cout << a << endl;
			nums.push_back(a);
			p2 = strtok(NULL, split);
		}//end while
		for (int b = 0; b < nums.size(); b++) {
			anchors_API(0, i, b ) = nums[b];
		}//end for
		i++;
	}//end while
	in.close();


	string root_dir = "";
	string graph = "C:\\Qiuhao_workspace\\aaaa\\mask_rcnn_6.pb";
	// First we load and initialize the model.

	string graph_path = tensorflow::io::JoinPath(root_dir, graph);
	tensorflow::GraphDef graph_def;
	tensorflow::SessionOptions options;
	std::unique_ptr<tensorflow::Session> session(tensorflow::NewSession(options));

	Status load_graph_status =
		ReadBinaryProto(tensorflow::Env::Default(), graph_path, &graph_def);
		//for (int n = 0; n < graph_def.node_size(); ++n) {
		//	graph_def.mutable_node(n)->clear_device();
		//}

		//tfSession.reset(tensorflow::NewSession(tensorflow::SessionOptions()));
	TF_CHECK_OK(session->Create(graph_def));
		//Status session_create_status = session->Create(graph_def);

		//Status load_graph_status = LoadGraph(graph_path, &session);
	if (!load_graph_status.ok()) {
		LOG(ERROR) << "LoadGraph ERROR!!!!" << load_graph_status;
		cout << load_graph_status << endl;
		return -1;
	}

	// Actually run the image through the model.
	std::vector<Tensor> outputs;
	tensorflow::Status run_status = session->Run({ { "input_image", inputTensor },{ "input_image_meta", inputMetadataTensor },{"input_anchors",input_anchors } },
		{ "output_detections", "output_mrcnn_class", "output_mrcnn_bbox", "output_mrcnn_mask",
				"output_rois", "output_rpn_class", "output_rpn_bbox" },
		{},
		&outputs);
	if (!run_status.ok()) {
		LOG(ERROR) << "Running model failed: " << run_status;
		return -1;
	}

	if (outputs[3].shape().dims() != 5 || outputs[3].shape().dim_size(4) != 2)
	{
		throw std::runtime_error("Expected mask dimensions to be [1,100,28,28,2] but got: " + outputs[3].shape().DebugString());
	}

	auto detectionsMap = outputs[0].tensor<float, 3>();
	auto mask = outputs[3].tensor<float, 5>();
	for (int i = 0; i < outputs[3].shape().dim_size(1); ++i)
	{
		auto y1 = detectionsMap(0, i, 0); float x1 = detectionsMap(0, i, 1); auto y2 = detectionsMap(0, i, 2); float x2 = detectionsMap(0, i, 3) ; auto scoreAtI = detectionsMap(0, i, 5); // detectionsMap(0, i, 1) 0.8862123; detectionsMap(0, i, 3) 0.91774625

		auto detectedClass = detectionsMap(0, i, 4);
		auto walala = detectionsMap(0, i, 6);
		auto maskHeight = y2 - y1, maskWidth = x2 - x1;

		if (maskHeight != 0 && maskWidth != 0) {
			// Pointer arithmetic
			const int i0 = 0, /* size0 = (int)outputs[3].shape().dim_size(1), */ i1 = i, size1 = (int)outputs[3].shape().dim_size(1), size2 = (int)outputs[3].shape().dim_size(2), size3 = (int)outputs[3].shape().dim_size(3), i4 = (int)detectedClass /*, size4 = 2 */;
			int pointerLocationOfI = (i0*size1 + i1)*size2;
			float_t *maskPointer = outputs[3].flat<float_t>().data();

			// The shape of the detection is [28,28,2], where the last index is the class of interest.
			// We'll extract index 1 because it's the toilet seat.
			cv::Mat initialMask(cv::Size(size2, size3), CV_32FC2, &maskPointer[pointerLocationOfI]); // CV_32FC2 because I know size4 is 2
			cv::Mat detectedMask(initialMask.size(), CV_32FC1);
			cv::extractChannel(initialMask, detectedMask, i4);

			// Convert to B&W
			cv::Mat binaryMask(detectedMask.size(), CV_8UC1);
			cv::threshold(detectedMask, binaryMask, 0.5, 255, cv::THRESH_BINARY);

			// First scale and offset in relation to TF_MASKRCNN_IMG_WIDTHHEIGHT
			cv::Mat scaledDetectionMat(maskHeight, maskWidth, CV_8UC1);
			cv::resize(binaryMask, scaledDetectionMat, scaledDetectionMat.size(), 0, 0);

			cv::Mat scaledOffsetMat(moldedInput.size(), CV_8UC1, cv::Scalar(0));
			scaledDetectionMat.copyTo(scaledOffsetMat(cv::Rect(x1, y1, maskWidth, maskHeight)));

			// Second, scale and offset in relation to our original inputMat
			cv::Mat detectionScaledToSquare(squareInputMat.size(), CV_8UC1);
			cv::resize(scaledOffsetMat, detectionScaledToSquare, detectionScaledToSquare.size(), 0, 0);
			detectionScaledToSquare(cv::Rect(leftBorder, topBorder, inputMat.size().width, inputMat.size().height)).copyTo(dest);
		}
		/**/

	}
	cv::imshow("Detection Result", dest);
	cv::waitKey();
	cv::imwrite("C:\\Qiuhao_workspace\\aaaa\\test.jpg", dest);
	return 0;
}

前天刚解决完，今天就看到有人发了代码。要是早点看到就不用折腾这么久了，不过自己从中也学到了不少。这个代码和我的思路一样：第二个参数也是 14 维（这个我是根据 Python 中的输入修改的），anchors.txt 同样是先保存下来再使用，应该和我一样是用 Python 保存的。唯一的区别是：我是先读取并用数组保存下来，然后再转换成 tensor；他是边读取边写入 tensor。

感觉自己能力还是差很多，而且出现了消极情绪之后无法静下来去好好解决问题。

Keras版本的mask-rcnn，如何迁移到windows,c++，tensorflow上？

猜你喜欢