Building TFLite locally and calling it

References

What is the TensorFlow Lite GPU delegate, and how does it accelerate model inference - Zhihu

https://www.tensorflow.org/lite/android/lite_build

TensorFlow Lite + ArmNN for neural network inference - CSDN blog

Setting up the build environment on WSL2 Ubuntu

Set up build environment without Docker

Install Bazel and Android Prerequisites

Bazel is the primary build system for TensorFlow. To build with it, you must have it and the Android NDK and SDK installed on your system.

  1. Install the latest version of the Bazel build system.
  2. The Android NDK is required to build the native (C/C++) TensorFlow Lite code. The current recommended version is 19c, which may be found here.
  3. The Android SDK and build tools may be obtained here, or alternatively as part of Android Studio. Build tools API >= 23 is the recommended version for building TensorFlow Lite.

Downloading the NDK and SDK:

NDK version: 20.1.5948944. The NDK can be downloaded on its own, or with the sdkmanager tool described below.

You can also download it with Android Studio's SDK Manager. For example, to use adb inside WSL2, you need to download adb through the SDK Manager of the Android Studio installation on Windows.

Downloading the sdkmanager command-line tool:

https://developer.android.com/studio/command-line/sdkmanager

List the packages that can be installed

Install Java first:

sudo apt install -y default-jre default-jdk
./bin/sdkmanager --list --channel=0 --sdk_root=./sdk

For example:

build-tools;33.0.0
cmake;3.18.1
cmdline-tools;latest
emulator
ndk;20.1.5948944
platform-tools
platforms;android-30
sources;android-32

Install the SDK, NDK, and related packages

./bin/sdkmanager \
  "platform-tools" \
  "platforms;android-30" \
  "sources;android-30" \
  "cmake;3.18.1" \
  "build-tools;33.0.0" \
  "cmdline-tools;latest" \
  "ndk;24.0.8215888" \
  --channel=0 --sdk_root=./sdk

SDK: Java development; NDK: C/C++ development. An app typically calls C++ from Java through JNI: declare a native method in Java, generate the corresponding C++ header from it, and implement the function in C++. A minimal sketch of that flow is shown below.
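A minimal sketch of the JNI flow (the package, class, and method names here are hypothetical examples, not from this post): the Java side declares a native method, and the C++ side implements the generated symbol.

// Java side (hypothetical): class com.example.demo.NativeRunner declares
//   public native float runModel(String modelPath);
// javac -h (or the Android Studio build) generates the matching header; the C++ implementation:
#include <jni.h>
#include <string>

extern "C" JNIEXPORT jfloat JNICALL
Java_com_example_demo_NativeRunner_runModel(JNIEnv* env, jobject /*thiz*/, jstring model_path) {
  // Copy the Java string into a std::string we can use on the native side.
  const char* chars = env->GetStringUTFChars(model_path, nullptr);
  std::string path(chars);
  env->ReleaseStringUTFChars(model_path, chars);
  // ... load the .tflite model at `path` and run inference here, as in main.cpp below ...
  return 0.0f;  // e.g. the measured latency in ms
}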

Installing and configuring Bazel

TensorFlow Bazel build issues - Luchang-Li's CSDN blog

Downloading the source and building

git clone -b v2.9.1 https://github.com/tensorflow/tensorflow.git
./configure

Configuration options: answer y for the Android build, set the ndk-bundle path to your NDK directory (e.g. path/sdk/ndk/20.1.5948944/), and set the Android SDK path (e.g. path/sdk/); the remaining options can be left at their defaults.

Build with the following Bazel command

bazel build -c opt --config=android_arm64 --cpu=arm64-v8a tensorflow/lite:libtensorflowlite.so

Build output: bazel-bin/tensorflow/lite/libtensorflowlite.so

Using XNNPACK:

XNNPACK backend on Windows, Linux, and Mac is enabled via a build-time opt-in mechanism. When building TensorFlow Lite with Bazel, simply add --define tflite_with_xnnpack=true, and the TensorFlow Lite interpreter will use the XNNPACK backend by default.

Alternatively, enable it from code (see the XNNPACK delegate section below).

bazel build -c opt --config=android_arm64 --cpu=arm64-v8a --define tflite_with_xnnpack=true tensorflow/lite:libtensorflowlite.so

If a GPU program links only against libtensorflowlite.so, it fails with "undefined reference to TfLiteGpuDelegateOptionsV2Default"; the GPU delegate library must be built and linked as well.

Building the GPU delegate

https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/gpu_advanced.md

bazel build -c opt --config android_arm64 --cpu=arm64-v8a tensorflow/lite/delegates/gpu:libtensorflowlite_gpu_delegate.so  # for dynamic library

Build output: bazel-bin/tensorflow/lite/delegates/gpu/libtensorflowlite_gpu_delegate.so

TFLite GPU delegate usage example

First build libtensorflowlite.so and libtensorflowlite_gpu_delegate.so as described above.

For a reference build.sh script, see:

Developing an Android C++ command-line program with the NDK and CMake - Luchang-Li's CSDN blog

Run that build.sh together with the source code and CMakeLists.txt below to build the command-line executable.

main.cpp:

Note the priority configuration here: when performance is prioritized, the TFLite GPU delegate by default runs FP32 models with FP16 inference, which gives better performance.

#include <stdio.h>
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/tools/gen_op_registration.h"
#include "tensorflow/lite/delegates/gpu/delegate.h"
#include <chrono>
#include <iostream>
#include <string>
#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"

using namespace std;

int main(int argc, char* argv[]) {
  // create net and session
  if (argc <= 1) {
    cout << "please set input model path" << endl;
    return -1;
  }

  int warmup_num = 20;
  int eval_num = 20;
  std::string model_path = argv[1];
  cout << "model_path: " << model_path << endl;

  std::unique_ptr<tflite::FlatBufferModel> model = tflite::FlatBufferModel::BuildFromFile(model_path.c_str());
  if (!model) {
    printf("Failed to load model\n");
    return -1;
  }

  tflite::ops::builtin::BuiltinOpResolver resolver;
  std::unique_ptr<tflite::Interpreter> interpreter;
  tflite::InterpreterBuilder(*model.get(), resolver)(&interpreter);

  TfLiteGpuDelegateOptionsV2 gpu_options = TfLiteGpuDelegateOptionsV2Default();
  gpu_options.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY;
  gpu_options.inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE;
  gpu_options.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION;
  gpu_options.inference_preference = TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED;

  tflite::Interpreter::TfLiteDelegatePtr gpu_delegate(
      TfLiteGpuDelegateV2Create(&gpu_options), [](TfLiteDelegate* delegate) { TfLiteGpuDelegateV2Delete(delegate); });

  if (interpreter->ModifyGraphWithDelegate(gpu_delegate.get()) != kTfLiteOk) {
    std::cout << "modify graph by gpu delegate failed" << std::endl;
    return 1;
  }

  /*
    // use xnnpack delegate
    int cpu_num_threads = 4;
    TfLiteXNNPackDelegateOptions xnnpack_opts = TfLiteXNNPackDelegateOptionsDefault();
    // xnnpack_opts.num_threads = cpu_num_threads;

    tflite::Interpreter::TfLiteDelegatePtr xnnpack_delegate(
        TfLiteXNNPackDelegateCreate(&xnnpack_opts),
        [](TfLiteDelegate* delegate) { TfLiteXNNPackDelegateDelete(delegate); });

    interpreter->ModifyGraphWithDelegate(xnnpack_delegate.get());
  */

  // Resize input tensors, if desired.
  interpreter->AllocateTensors();

  // float* input = interpreter->typed_input_tensor<float>(0);
  // Dummy input for testing
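  // For a pure latency benchmark the input buffer can be left as-is; to feed real
  // data, copy values into the buffer returned by typed_input_tensor<float>(0)
  // after AllocateTensors(), as the ArmNN example further below does for its
  // quantized uint8 input.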

  for (int i = 0; i < warmup_num; i++) {
    interpreter->Invoke();
  }
  auto t_eval1 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < eval_num; i++) {
    interpreter->Invoke();
  }

  auto t_eval2 = std::chrono::high_resolution_clock::now();
  auto duration_eval = std::chrono::duration_cast<std::chrono::microseconds>(t_eval2 - t_eval1).count();

  float eval_time = duration_eval / 1000.0f / eval_num;

  cout << "model_path:" << model_path << endl;
  cout << "eval time ms:" << eval_time << endl;

  // float* output = interpreter->typed_output_tensor<float>(0);
  // printf("Result is: %f\n", *output);

  return 0;
}
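If full FP32 precision is required instead of the faster FP16 path, the same options struct can be configured with precision as the top priority; a minimal sketch reordering the priorities used above (a different configuration of the same options, not a separate API):

  // Variant: keep full FP32 precision; latency and memory come after.
  TfLiteGpuDelegateOptionsV2 gpu_options = TfLiteGpuDelegateOptionsV2Default();
  gpu_options.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION;
  gpu_options.inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY;
  gpu_options.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE;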

CMakeLists.txt:

cmake_minimum_required(VERSION 3.10)
 
project(cmake_study LANGUAGES CXX)
# set(CMAKE_CXX_STANDARD 11)
 
# without these flags, the cmake generated binary file is much bigger than ndk-build
# you can also pass -DCMAKE_C_FLAGS="-s" to the CMake call.
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -s")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
# add_compile_options(-fPIC)
 
add_executable(
    tflite_eval_gpu
    main.cpp
)
 
target_include_directories(
    tflite_eval_gpu
    PUBLIC
    tensorflow/
    flatbuffers-2.0.6/include/
)
 
target_link_libraries(
    tflite_eval_gpu
    PUBLIC 
    tflite_release_2.11/libtensorflowlite.so
    tflite_release_2.11/libtensorflowlite_gpu_delegate.so
    log # liblog.so not found
    z
    EGL
    GLESv2
)

For the FlatBuffers headers, download the FlatBuffers source matching the version configured in the TensorFlow tree you built (e.g. flatbuffers-2.0.6 in the CMakeLists.txt above) and reference its include directory.

Using the XNNPACK delegate

#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"

  // use xnnpack delegate
  int cpu_num_threads = 4;
  TfLiteXNNPackDelegateOptions xnnpack_opts =
      TfLiteXNNPackDelegateOptionsDefault();
  // xnnpack_opts.num_threads = cpu_num_threads;

  tflite::Interpreter::TfLiteDelegatePtr xnnpack_delegate(
      TfLiteXNNPackDelegateCreate(&xnnpack_opts),
      [](TfLiteDelegate* delegate) { TfLiteXNNPackDelegateDelete(delegate); });

  interpreter->ModifyGraphWithDelegate(xnnpack_delegate.get());

Calling the ArmNN delegate

Here we directly use the prebuilt .so files from https://github.com/ARM-software/armnn/releases. The current release is built against TF 2.5.0, so download TF 2.5.0, build TFLite from it, and then compile and test against the .so files and headers of both.

main.cpp

#include <chrono>
#include <iostream>
#include <string>
#include <vector>

#include "armnn_delegate.hpp"
#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/tools/gen_op_registration.h"

using uint8 = unsigned char;

size_t GetElemNum(const std::vector<int>& shape) {
  int elem_num = 1;
  for (auto elem : shape) {
    elem_num *= elem;
  }
  return elem_num;
}

void RandInit(uint8* data, const std::vector<int>& shape) {
  size_t elem_num = GetElemNum(shape);
  for (size_t i = 0; i < elem_num; i++) {
    data[i] = uint8(rand());
  }
}

int main(int argc, char* argv[]) {
  std::string model_path = "models/inception_v3_quant.tflite";

  std::unique_ptr<tflite::FlatBufferModel> model =
      tflite::FlatBufferModel::BuildFromFile(model_path.c_str());

  if (!model) {
    printf("Failed to mmap model\n");
    return 1;
  }

  tflite::ops::builtin::BuiltinOpResolver resolver;
  std::unique_ptr<tflite::Interpreter> interpreter;
  tflite::InterpreterBuilder(*model.get(), resolver)(&interpreter);

  std::vector<armnn::BackendOptions> backendOptions;
  armnn::BackendOptions gpuAcc("GpuAcc",
                               {
                                   {"FastMathEnabled", true},
                                   // {"SaveCachedNetwork", m_SaveCachedNetwork},
                                   // {"CachedNetworkFilePath", m_CachedNetworkFilePath},
                                   // {"MLGOTuningFilePath", m_MLGOTuningFilePath}
                               });
  unsigned int numberOfThreads = 4;
  armnn::BackendOptions cpuAcc("CpuAcc",
                               {{"FastMathEnabled", true},
                                {"NumberOfThreads", numberOfThreads}});
 
  // Create the ArmNN delegate, using either the GpuAcc or CpuAcc backend
  std::string device_type = "gpu";  // set to "cpu" to use the CpuAcc backend
  std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
  if (device_type == "cpu") {
    backends = {armnn::Compute::CpuAcc};
    std::cout << "create CPU delegate" << std::endl;

    backendOptions.push_back(cpuAcc);
  } else {
    std::cout << "create GPU delegate" << std::endl;
    backendOptions.push_back(gpuAcc);
  }
  armnnDelegate::DelegateOptions delegateOptions(backends, backendOptions);

  // armnn::OptimizerOptions optimizerOptions;
  // optimizerOptions.m_ReduceFp32ToFp16 = true;
  // armnn::BackendOptions modelOptionGpu("GpuAcc", {{"FastMathEnabled", true}, {"TuningLevel", 3}});
  // optimizerOptions.m_ModelOptions.push_back(modelOptionGpu);
  // armnnDelegate::DelegateOptions delegateOptions(armnn::Compute::GpuAcc, optimizerOptions);

  std::unique_ptr<TfLiteDelegate,
                  decltype(&armnnDelegate::TfLiteArmnnDelegateDelete)>
      theArmnnDelegate(
          armnnDelegate::TfLiteArmnnDelegateCreate(delegateOptions),
          armnnDelegate::TfLiteArmnnDelegateDelete);
  // Modify armnnDelegateInterpreter to use armnnDelegate
  interpreter->ModifyGraphWithDelegate(theArmnnDelegate.get());

  // Resize input tensors, if desired.
  interpreter->AllocateTensors();
  // uint8[1,299,299,3]
  uint8* input = interpreter->typed_input_tensor<uint8>(0);
  // TfLiteTensor* input_tensor = interpreter->input_tensor(0);
  // void* input = input_tensor->data.data;

  RandInit(input, {1, 299, 299, 3});

  int warm_up_num = 50;
  int eval_num = 50;

  if (argc >= 3) {
    warm_up_num = atoi(argv[1]);
    eval_num = atoi(argv[2]);
  }

  std::cout << "model_path: "<<model_path<<std::endl;
  std::cout << "warm_up_num: " << warm_up_num << std::endl;
  std::cout << "eval_num: " << eval_num << std::endl;

  for (int i = 0; i < warm_up_num; i++) {
    interpreter->Invoke();
  }
  auto t1 = std::chrono::high_resolution_clock::now();

  for (int i = 0; i < eval_num; i++) {
    interpreter->Invoke();
  }

  auto t2 = std::chrono::high_resolution_clock::now();
  auto duration =
      std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();

  float mean_time = duration / 1000.0f / eval_num;
  std::cout << "mean time per eval (ms): " << mean_time << std::endl;

  // uint8[1,1001]
  uint8* output = interpreter->typed_output_tensor<uint8>(0);
  // TfLiteTensor* output_tensor = interpreter->output_tensor(0);
  // void* output = output_tensor->data.data;

  printf("Result is: %d\n", output[0]);

  return 0;
}

CMakeLists.txt

cmake_minimum_required(VERSION 3.10)

project(cmake_study LANGUAGES CXX)
# set(CMAKE_CXX_STANDARD 11)

# without these flags, the cmake generated binary file is much bigger than ndk-build
# you can also pass -DCMAKE_C_FLAGS="-s" to the CMake call.
# set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -s")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
# add_compile_options(-fPIC)

set(TF_VER "tensorflow-v2.5.0")

add_executable(
    main
    src/main.cpp
)

target_include_directories(
    main
    PUBLIC
    /root/codes/tflite_test/${TF_VER}/
    ${CMAKE_CURRENT_SOURCE_DIR}/flatbuffers-1.12.0/include/
    /root/codes/armnn-22.05.01/armnn-22.05.01/delegate/include/
    /root/codes/armnn-22.05.01/armnn-22.05.01/include/
    /root/codes/armnn-22.05.01/armnn-22.05.01/profiling/
)

target_link_libraries(
    main
    PUBLIC
    /root/codes/tflite_test/${TF_VER}/tflite_release/libtensorflowlite.so
    /root/codes/armnn-22.05.01/ArmNN-android-29-arm64-v8.2-a/libarmnn.so
    /root/codes/armnn-22.05.01/ArmNN-android-29-arm64-v8.2-a/libarmnnDelegate.so
    log # liblog.so not found
    z
)


Reposted from blog.csdn.net/u013701860/article/details/125009412