1. Overview
Document layout analysis is the process of locating and classifying areas of interest on a picture or page scan. Its main goal is to enable machines to understand document structure, that is, divide document images into regions with different types of content and analyze the relationships between these regions. This is a key step before content identification, which can usually be divided into two aspects: page segmentation and logical structure analysis.
Page segmentation methods focus on appearance features by using visual cues to divide the page into different areas. These areas usually include different elements such as text, graphics, images, and tables. Page segmentation methods identify and separate these elements for subsequent processing and analysis of different types of content.
Logical structure analysis focuses more on the semantic understanding and classification of regional content. It strives to provide more fine-grained semantic labels for each region, such as identifying paragraph text regions and distinguishing them from content such as headings or document titles. Logical structure analysis aims to achieve a deeper semantic understanding of the content in the document so that subsequent document processing and understanding can be more precise.
2. NDK development
Android NDK allows developers to write code in native languages such as C and C++ and mix it with Java code. The NDK is designed to build performance-optimized applications and can also be used to integrate existing C/C++ libraries into Android applications.
Under certain circumstances, using the Android NDK can improve the performance and responsiveness of your application. Especially in applications that need to process large amounts of data or perform computationally intensive tasks, using native languages such as C/C++ can bring higher performance and a better user experience. In addition, NDK can avoid duplication of code and enable developers to easily integrate existing C/C++ libraries into Android applications.
1. Create project
Create a Native C++ project:
2. Install NDK:
Select the NDK you want to use. You can install the NDK through the SDK Manager in Android Studio, or you can download and install it manually from the Android Developer website.
3. Configure project Gradle:
In the project's build.gradle file, add the appropriate configuration as follows:
// Module-level Gradle build script (Groovy DSL) for the layout-analysis app.
plugins {
id 'com.android.application'
}
android {
// SDK version the app is compiled against.
compileSdk 33
defaultConfig {
applicationId "com.example.layout_analysis"
minSdk 30
targetSdk 33
versionCode 1
versionName "1.0"
testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
// Per-variant flags forwarded to the external CMake build of the native code.
externalNativeBuild {
cmake {
cppFlags ''
}
}
}
buildTypes {
release {
// Code shrinking/obfuscation disabled; ProGuard files still declared for release.
minifyEnabled false
proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
}
}
compileOptions {
sourceCompatibility JavaVersion.VERSION_1_8
targetCompatibility JavaVersion.VERSION_1_8
}
// Hooks the JNI library's CMake project into the Gradle build.
externalNativeBuild {
cmake {
path file('src/main/cpp/CMakeLists.txt')
version '3.10.2'
}
}
buildFeatures {
// Generates ActivityMainBinding, which MainActivity inflates.
viewBinding true
}
}
dependencies {
implementation 'androidx.appcompat:appcompat:1.2.0'
implementation 'com.google.android.material:material:1.3.0'
implementation 'androidx.constraintlayout:constraintlayout:2.0.4'
testImplementation 'junit:junit:4.+'
androidTestImplementation 'androidx.test.ext:junit:1.1.2'
androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0'
}
4. Create C/C++ code:
In Android Studio, you can right-click the app directory, select New->C/C++ Source File in the Project view, and select the corresponding C or C++ file type. Write the code in the file and save it to the app/src/main/cpp directory.
5. Import the model
Create an assets directory in the same directory as CPP, and put the used model into this directory:
The C++ implementation interface is as follows:
#include "LayoutSeg.h"
#include <numeric>
#include <android/log.h>
#include <android/asset_manager.h>
#include <android/asset_manager_jni.h>
namespace LT
{
/**
 * Read a model file out of the APK assets into a heap buffer.
 *
 * @param mgr       Android asset manager handle (may be NULL).
 * @param modelName Asset file name to open.
 * @param size      Out-parameter: number of bytes read, 0 on any failure.
 * @return malloc'd buffer owned by the caller (release with free()),
 *         or NULL on failure.
 */
void *getModelDataFromAssets(AAssetManager *mgr, const char *modelName, int &size) {
    size = 0; // make the out-parameter well-defined on every early return
    if (mgr == NULL || modelName == NULL) {
        return NULL;
    }
    AAsset *asset = AAssetManager_open(mgr, modelName, AASSET_MODE_UNKNOWN);
    if (asset == NULL) {
        return NULL;
    }
    off_t bufferSize = AAsset_getLength(asset);
    // +1 leaves room for a terminator in case a caller treats the data as text.
    void *modelData = malloc(bufferSize + 1);
    if (modelData == NULL) {
        AAsset_close(asset);
        return NULL;
    }
    int bytesRead = AAsset_read(asset, modelData, bufferSize);
    AAsset_close(asset);
    if (bytesRead < 0 || bytesRead != static_cast<int>(bufferSize)) {
        // Failed or short read: do not hand back a partially-filled buffer.
        free(modelData);
        return NULL;
    }
    size = bytesRead;
    return modelData;
}
// Load an ONNX model from a filesystem path and cache the model's I/O
// metadata (node names, element types, tensor shapes) for later inference.
// Returns false if session creation or model introspection throws.
bool LayoutSeg::read_model(const std::string& modelPath)
{
if (_batchSize < 1) _batchSize = 1;
try
{
// ONNX Runtime takes a wide-character path on Windows.
// NOTE(review): this byte-wise widening only works for ASCII paths.
#ifdef _WIN32
std::wstring model_path(modelPath.begin(), modelPath.end());
_OrtSession = new Ort::Session(_OrtEnv, model_path.c_str(), _OrtSessionOptions);
#else
_OrtSession = new Ort::Session(_OrtEnv, modelPath.c_str(), _OrtSessionOptions);
#endif
Ort::AllocatorWithDefaultOptions allocator;
//init input -- only input node 0 is queried (single-input model assumed; TODO confirm)
_inputNodesNum = _OrtSession->GetInputCount();
// Older ORT APIs return a raw char*; newer ones return an owning handle,
// hence the two code paths and the .get() below.
#if ORT_API_VERSION < ORT_OLD_VISON
_inputName = _OrtSession->GetInputName(0, allocator);
_inputNodeNames.push_back(_inputName);
#else
_inputName = std::move(_OrtSession->GetInputNameAllocated(0, allocator));
_inputNodeNames.push_back(_inputName.get());
#endif
//cout << _inputNodeNames[0] << endl;
Ort::TypeInfo inputTypeInfo = _OrtSession->GetInputTypeInfo(0);
auto input_tensor_info = inputTypeInfo.GetTensorTypeAndShapeInfo();
_inputNodeDataType = input_tensor_info.GetElementType();
_inputTensorShape = input_tensor_info.GetShape();
// A -1 dimension marks a dynamic axis; pin batch (dim 0) and the spatial
// dims (2 = height, 3 = width, NCHW assumed) to the configured values.
if (_inputTensorShape[0] == -1)
{
_isDynamicShape = true;
_inputTensorShape[0] = _batchSize;
}
if (_inputTensorShape[2] == -1 || _inputTensorShape[3] == -1) {
_isDynamicShape = true;
_inputTensorShape[2] = _netHeight;
_inputTensorShape[3] = _netWidth;
}
//init output -- only output node 0 is queried
_outputNodesNum = _OrtSession->GetOutputCount();
#if ORT_API_VERSION < ORT_OLD_VISON
_output_name0 = _OrtSession->GetOutputName(0, allocator);
_outputNodeNames.push_back(_output_name0);
#else
_output_name0 = std::move(_OrtSession->GetOutputNameAllocated(0, allocator));
_outputNodeNames.push_back(_output_name0.get());
#endif
Ort::TypeInfo type_info_output0(nullptr);
type_info_output0 = _OrtSession->GetOutputTypeInfo(0); //output0
auto tensor_info_output0 = type_info_output0.GetTensorTypeAndShapeInfo();
_outputNodeDataType = tensor_info_output0.GetElementType();
_outputTensorShape = tensor_info_output0.GetShape();
}
catch (const std::exception&) {
return false;
}
return true;
}
/**
 * Load the ONNX model from the APK assets (via the asset manager) and cache
 * the model's I/O metadata, mirroring read_model(const std::string&).
 *
 * @param mgr       Android asset manager obtained from Java.
 * @param modelPath Asset name of the .onnx file.
 * @return false when the asset cannot be read or session setup throws.
 */
bool LayoutSeg::read_model(AAssetManager *mgr, const std::string modelPath)
{
    if (_batchSize < 1) _batchSize = 1;
    int dbModelDataLength = 0;
    void *dbModelData = getModelDataFromAssets(mgr, modelPath.c_str(), dbModelDataLength);
    if (dbModelData == NULL || dbModelDataLength <= 0) {
        // Asset missing or unreadable; free(NULL) is a harmless no-op.
        free(dbModelData);
        return false;
    }
    try
    {
        _OrtSession = new Ort::Session(_OrtEnv, dbModelData, dbModelDataLength, _OrtSessionOptions);
    }
    catch (const std::exception&)
    {
        free(dbModelData); // don't leak the model buffer when session creation throws
        return false;
    }
    // ONNX Runtime copies the model bytes while building the session, so the
    // buffer can be released right away.
    free(dbModelData);
    try
    {
        Ort::AllocatorWithDefaultOptions allocator;
        //init input -- only input node 0 is queried (single-input model assumed)
        _inputNodesNum = _OrtSession->GetInputCount();
        // Older ORT APIs return a raw char*; newer ones return an owning handle.
#if ORT_API_VERSION < ORT_OLD_VISON
        _inputName = _OrtSession->GetInputName(0, allocator);
        _inputNodeNames.push_back(_inputName);
#else
        _inputName = std::move(_OrtSession->GetInputNameAllocated(0, allocator));
        _inputNodeNames.push_back(_inputName.get());
#endif
        Ort::TypeInfo inputTypeInfo = _OrtSession->GetInputTypeInfo(0);
        auto input_tensor_info = inputTypeInfo.GetTensorTypeAndShapeInfo();
        _inputNodeDataType = input_tensor_info.GetElementType();
        _inputTensorShape = input_tensor_info.GetShape();
        // A -1 dimension marks a dynamic axis; pin batch (dim 0) and the
        // spatial dims (2 = height, 3 = width, NCHW assumed).
        if (_inputTensorShape[0] == -1)
        {
            _isDynamicShape = true;
            _inputTensorShape[0] = _batchSize;
        }
        if (_inputTensorShape[2] == -1 || _inputTensorShape[3] == -1)
        {
            _isDynamicShape = true;
            _inputTensorShape[2] = _netHeight;
            _inputTensorShape[3] = _netWidth;
        }
        //init output -- only output node 0 is queried
        _outputNodesNum = _OrtSession->GetOutputCount();
#if ORT_API_VERSION < ORT_OLD_VISON
        _output_name0 = _OrtSession->GetOutputName(0, allocator);
        _outputNodeNames.push_back(_output_name0);
#else
        _output_name0 = std::move(_OrtSession->GetOutputNameAllocated(0, allocator));
        _outputNodeNames.push_back(_output_name0.get());
#endif
        Ort::TypeInfo type_info_output0 = _OrtSession->GetOutputTypeInfo(0); //output0
        auto tensor_info_output0 = type_info_output0.GetTensorTypeAndShapeInfo();
        _outputNodeDataType = tensor_info_output0.GetElementType();
        _outputTensorShape = tensor_info_output0.GetShape();
    }
    catch (const std::exception&)
    {
        return false;
    }
    return true;
}
int LayoutSeg::Preprocessing(const std::vector<cv::Mat>&srcImgs, std::vector<cv::Mat>&outSrcImgs, std::vector<cv::Vec4d>¶ms)
{
outSrcImgs.clear();
cv::Size input_size = cv::Size(_netWidth, _netHeight);
for (int i = 0; i < srcImgs.size(); ++i)
{
cv::Mat temp_img = srcImgs[i];
cv::Vec4d temp_param = {
1,1,0,0 };
if (temp_img.size() != input_size)
{
cv::Mat borderImg;
LetterBox(temp_img, borderImg, temp_param, input_size, false, false, true, 32);
//cout << borderImg.size() << endl;
outSrcImgs.push_back(borderImg);
params.push_back(temp_param);
}
else
{
outSrcImgs.push_back(temp_img);
params.push_back(temp_param);
}
}
int lack_num = _batchSize - srcImgs.size();
if (lack_num > 0)
{
for (int i = 0; i < lack_num; ++i)
{
cv::Mat temp_img = cv::Mat::zeros(input_size, CV_8UC3);
cv::Vec4d temp_param = {
1,1,0,0 };
outSrcImgs.push_back(temp_img);
params.push_back(temp_param);
}
}
return 0;
}
/**
 * Run layout detection on a single image.
 * Wraps the image into a batch of one, delegates to OnnxBatchDetect, and
 * unwraps the first (and only) per-image result.
 *
 * @param srcImg BGR input image.
 * @param output Out: detections for srcImg.
 * @return true when batch detection succeeded.
 */
bool LayoutSeg::detect(cv::Mat & srcImg, std::vector<OutputSeg>&output)
{
    std::vector<cv::Mat> batch_input{ srcImg };
    std::vector<std::vector<OutputSeg>> batch_output;
    if (!OnnxBatchDetect(batch_input, batch_output))
        return false;
    output = batch_output[0];
    return true;
}
// Run the ONNX model on a batch of images and decode YOLO-style detections.
// Pipeline: letterbox preprocess -> blob -> ORT Run -> per-image decode of
// [bs, C, N] output -> confidence filter -> NMS -> boxes mapped back to the
// original image coordinates via the letterbox params.
// Returns true when at least one per-image result vector was produced.
bool LayoutSeg::OnnxBatchDetect(std::vector<cv::Mat>&srcImgs, std::vector<std::vector<OutputSeg>>&output)
{
std::vector<cv::Vec4d> params;
std::vector<cv::Mat> input_images;
cv::Size input_size(_netWidth, _netHeight);
//preprocessing: letterbox to network size; params holds per-image scale/offset
Preprocessing(srcImgs, input_images, params);
// 1/255 scaling, BGR->RGB swap (swapRB=true), no crop.
cv::Mat blob = cv::dnn::blobFromImages(input_images, 1 / 255.0, input_size, cv::Scalar(0, 0, 0), true, false);
int64_t input_tensor_length = VectorProduct(_inputTensorShape);
std::vector<Ort::Value> input_tensors;
std::vector<Ort::Value> output_tensors;
// The tensor wraps blob's memory; blob must stay alive through Run().
input_tensors.push_back(Ort::Value::CreateTensor<float>(_OrtMemoryInfo, (float*)blob.data, input_tensor_length, _inputTensorShape.data(), _inputTensorShape.size()));
output_tensors = _OrtSession->Run(Ort::RunOptions{
nullptr },
_inputNodeNames.data(),
input_tensors.data(),
_inputNodeNames.size(),
_outputNodeNames.data(),
_outputNodeNames.size()
);
//post-process: walk the output buffer one image at a time
float* all_data = output_tensors[0].GetTensorMutableData<float>();
_outputTensorShape = output_tensors[0].GetTensorTypeAndShapeInfo().GetShape();
int net_width = _outputTensorShape[1];
// Number of floats per image in the flat output buffer.
int64_t one_output_length = VectorProduct(_outputTensorShape) / _outputTensorShape[0];
for (int img_index = 0; img_index < srcImgs.size(); ++img_index)
{
// Transpose so each row is one candidate box: [bs,116,8400]=>[bs,8400,116]
cv::Mat output0 = cv::Mat(cv::Size((int)_outputTensorShape[2], (int)_outputTensorShape[1]), CV_32F, all_data).t(); //[bs,116,8400]=>[bs,8400,116]
all_data += one_output_length;
float* pdata = (float*)output0.data;
int rows = output0.rows;
std::vector<int> class_ids;//class id of each kept candidate
std::vector<float> confidences;//confidence score of each kept candidate
std::vector<cv::Rect> boxes;//bounding rectangle of each kept candidate
for (int r = 0; r < rows; ++r) {
//stride
// Row layout: [cx, cy, w, h, class_score_0, class_score_1, ...]
cv::Mat scores(1, _className.size(), CV_32F, pdata + 4);
cv::Point classIdPoint;
double max_class_socre;
minMaxLoc(scores, 0, &max_class_socre, 0, &classIdPoint);
max_class_socre = (float)max_class_socre;
if (max_class_socre >= _classThreshold)
{
//rect [x,y,w,h] -- undo the letterbox transform: params = {scale_x, scale_y, offset_x, offset_y}
float x = (pdata[0] - params[img_index][2]) / params[img_index][0]; //x
float y = (pdata[1] - params[img_index][3]) / params[img_index][1]; //y
float w = pdata[2] / params[img_index][0]; //w
float h = pdata[3] / params[img_index][1]; //h
// Convert center+size to a top-left corner, rounding and clamping to >= 0.
int left = MAX(int(x - 0.5 * w + 0.5), 0);
int top = MAX(int(y - 0.5 * h + 0.5), 0);
class_ids.push_back(classIdPoint.x);
confidences.push_back(max_class_socre);
boxes.push_back(cv::Rect(left, top, int(w + 0.5), int(h + 0.5)));
}
pdata += net_width;//advance to the next candidate row
}
// Non-maximum suppression over the kept candidates.
std::vector<int> nms_result;
cv::dnn::NMSBoxes(boxes, confidences, _classThreshold, _nmsThreshold, nms_result);
std::vector<std::vector<float>> temp_mask_proposals;
// Clip every surviving box to the original image bounds.
cv::Rect holeImgRect(0, 0, srcImgs[img_index].cols, srcImgs[img_index].rows);
std::vector<OutputSeg> temp_output;
for (int i = 0; i < nms_result.size(); ++i) {
int idx = nms_result[i];
OutputSeg result;
result.id = class_ids[idx];
result.confidence = confidences[idx];
result.box = boxes[idx] & holeImgRect;
temp_output.push_back(result);
}
output.push_back(temp_output);
}
if (output.size())
return true;
else
return false;
}
}
6. Import dependent libraries
Processing images depends on the two libraries onnxruntime and opencv. You can find the corresponding Android NDK library on their official website. After downloading, place it in the same directory as the C++ code.
7. UI layout
The controls to be implemented in the UI are the view for displaying images, opening image buttons, and processing image buttons:
<?xml version="1.0" encoding="utf-8"?>
<!-- Main screen: an image preview stacked above a row with two action buttons.
     Uses match_parent instead of the long-deprecated alias fill_parent
     (identical behavior; fill_parent deprecated since API 8). -->
<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
    android:orientation="vertical"
    android:layout_width="match_parent"
    android:layout_height="match_parent">

    <!-- Preview area; layout_weight=1 gives it all height left over
         by the button row below. -->
    <ImageView
        android:id="@+id/imageView"
        android:layout_width="match_parent"
        android:layout_height="match_parent"
        android:layout_weight="1" />

    <!-- Button row pinned to the bottom of the screen. -->
    <LinearLayout
        android:orientation="horizontal"
        android:layout_width="match_parent"
        android:layout_height="wrap_content">

        <Button
            android:id="@+id/btn_openImage"
            android:layout_width="120dp"
            android:layout_height="match_parent"
            android:text="打开图像" />

        <Button
            android:id="@+id/btn_layout"
            android:layout_width="120dp"
            android:layout_height="match_parent"
            android:text="版面分析" />
    </LinearLayout>
</LinearLayout>
8. Define interface in Java
Add a Java interface class, define only the interface to be used, and then implement the interface in JNI:
package com.example.layout_analysis;
import android.content.res.AssetManager;
import android.graphics.Bitmap;
/**
 * Thin Java facade over the native layout-analysis library; the
 * implementations live in liblayout_analysis.so (see the JNI cpp sources).
 */
public class LayoutAnalysis {
// Initialization: loads the ONNX model from the APK assets on the native side.
public native boolean init(AssetManager mgr);
// Runs layout analysis on the bitmap; the native side draws the detected
// regions onto the bitmap and returns it.
public native Bitmap layout(Bitmap bitmap);
static {
// Loads the native library built from the cpp directory.
System.loadLibrary("layout_analysis");
}
}
9. JNI implements Java interface
Add the Native cpp file in the CPP directory and use extern "C", which means that the function is compiled in C language to interact with Java.
The function prototype is:
extern "C" JNIEXPORT jobject JNICALL Java_com_example_layout_1analysis_LayoutAnalysis_xxx(JNIEnv *env, jobject, jobject image) {
}
Parameters include:
- JNIEnv *env: the JNI environment pointer, used to interact with Java from within JNI functions.
- jobject: the implicit `this` reference of the calling Java object; it is not used inside the function.
- jobject image: a reference to a Java object representing an image, which the JNI function may use to process image data.
Inside this JNI function, there should be code to process the incoming image object, possibly performing image processing or layout analysis. This function may use the facilities provided by JNI to obtain the image data and process it in a C/C++ environment.
Code needs to be added to this function according to specific requirements to process the incoming image object, perform the related image layout analysis or processing, and return relevant data to the Java layer as needed.
The overall code is as follows:
#include <jni.h>
#include <string>
#include "LayoutSeg.h"
#include "BitmapUtils.h"
// Singleton engine instance shared by all JNI entry points in this library.
static LT::LayoutSeg *layout_seg;

// Called by the JVM when System.loadLibrary("layout_analysis") runs.
// FIX: JNI_OnLoad must be exported with C linkage (like the other entry
// points in this file); without extern "C" the symbol is C++ name-mangled,
// the JVM never finds or calls it, and layout_seg stays null.
extern "C" JNIEXPORT jint JNI_OnLoad(JavaVM* vm, void* reserved)
{
    layout_seg = new LT::LayoutSeg();
    return JNI_VERSION_1_4;
}
// Called when the JVM unloads the library; release the engine allocated in
// JNI_OnLoad instead of leaking it (the original body was empty).
extern "C" JNIEXPORT void JNI_OnUnload(JavaVM* vm, void* reserved)
{
    delete layout_seg;
    layout_seg = nullptr;
}
// JNI binding for LayoutAnalysis.init(AssetManager): loads the ONNX model
// from the APK assets.
// FIX: the original discarded read_model's return value and always returned
// JNI_TRUE, so MainActivity's failure check could never trigger.
extern "C" JNIEXPORT jboolean JNICALL
Java_com_example_layout_1analysis_LayoutAnalysis_init(JNIEnv* env, jobject thiz, jobject assetManager)
{
    AAssetManager *mgr = AAssetManager_fromJava(env, assetManager);
    if (mgr == nullptr || layout_seg == nullptr) {
        return JNI_FALSE;
    }
    // read_model presumably supplies a default asset name for the model
    // (single-argument call as in the original) -- confirm against LayoutSeg.h.
    bool ok = layout_seg->read_model(mgr);
    return ok ? JNI_TRUE : JNI_FALSE;
}
// JNI binding for LayoutAnalysis.layout(Bitmap): runs layout detection on the
// bitmap, draws the predicted regions onto it in place, and returns the same
// bitmap object to Java.
extern "C" JNIEXPORT jobject JNICALL
Java_com_example_layout_1analysis_LayoutAnalysis_layout(JNIEnv *env,jobject, jobject image)
{
cv::Mat cv_RGBA;
// Copy the Android bitmap pixels into an RGBA Mat (helper from BitmapUtils.h).
bitmapToMat(env,image,cv_RGBA);
cv::Mat cv_BGR;
// The detector works on BGR (OpenCV convention).
cv::cvtColor(cv_RGBA, cv_BGR, cv::COLOR_RGBA2BGR);
std::vector<LT::OutputSeg> result;
layout_seg->detect(cv_BGR,result);
// Build 10 pseudo-random display colors, one per class id.
// NOTE(review): reseeding with time(0) on every call changes colors between
// calls; assumes at most 10 classes -- confirm against _className.
std::vector<cv::Scalar> color;
srand(time(0));
for (int i = 0; i < 10; i++)
{
int b = (rand() + 50) % 255;
int g = (rand() + 10) % 255;
int r = (rand() + 100) % 255;
color.push_back(cv::Scalar(b, g, r));
}
// Draw labeled boxes onto the BGR image, then copy back into the Java bitmap.
LT::DrawPred(cv_BGR, result, layout_seg->_className, color);
cv::cvtColor(cv_BGR, cv_RGBA, cv::COLOR_BGR2RGBA);
matToBitmap(env,cv_RGBA,image);
return image;
}
10. Configure CMake:
Create a CMakeLists.txt file and configure the C/C++ source files and library files that need to be built, as well as other build options. A sample CMakeLists.txt file is as follows:
cmake_minimum_required(VERSION 3.10.2)
project("layout_analysis")
## Add the OpenCV library (prebuilt Android SDK placed next to the cpp sources)
set(OpenCV_DIR "${CMAKE_SOURCE_DIR}/sdk/native/jni")
find_package(OpenCV REQUIRED)
if (OpenCV_FOUND)
message(STATUS "OpenCV_LIBS: ${OpenCV_LIBS}")
message(STATUS "OpenCV_INCLUDE_DIRS: ${OpenCV_INCLUDE_DIRS}")
else ()
message(FATAL_ERROR "opencv Not Found!")
endif (OpenCV_FOUND)
## Add the onnxruntime library via its shared-library CMake wrapper
include(${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime-shared/OnnxRuntimeWrapper.cmake)
find_package(OnnxRuntime REQUIRED)
if (OnnxRuntime_FOUND)
message(STATUS "OnnxRuntime_LIBS: ${OnnxRuntime_LIBS}")
message(STATUS "OnnxRuntime_INCLUDE_DIRS: ${OnnxRuntime_INCLUDE_DIRS}")
else ()
message(FATAL_ERROR "onnxruntime Not Found!")
endif (OnnxRuntime_FOUND)
# The JNI shared library loaded by System.loadLibrary("layout_analysis").
add_library(layout_analysis SHARED layout_analysis.cpp LayoutUtils.cpp LayoutSeg.cpp BitmapUtils.cpp)
# Android log library for __android_log_print; jnigraphics for Bitmap access.
find_library(log-lib log)
target_link_libraries(layout_analysis ${log-lib} ${OnnxRuntime_LIBS} ${OpenCV_LIBS} jnigraphics)
11. Calling the interface in Java
Call the JNI implementation in Java's MainActivity to implement the interface:
package com.example.layout_analysis;
import androidx.appcompat.app.AppCompatActivity;
import android.content.Intent;
import android.graphics.Bitmap;
import android.graphics.BitmapFactory;
import android.graphics.Matrix;
import android.media.ExifInterface;
import android.net.Uri;
import android.os.Bundle;
import android.util.Log;
import android.view.View;
import android.widget.Button;
import android.widget.ImageView;
import com.example.layout_analysis.databinding.ActivityMainBinding;
import java.io.FileNotFoundException;
import java.io.IOException;
/**
 * Demo activity: lets the user pick an image from the gallery, then runs the
 * native layout-analysis engine on it and shows the annotated result.
 */
public class MainActivity extends AppCompatActivity {
private ActivityMainBinding binding;
// Java facade over the native layout-analysis library.
private LayoutAnalysis layout = new LayoutAnalysis();
private static final int SELECT_IMAGE = 1;
private ImageView imageView;
// Bitmap as decoded from the picker.
private Bitmap bitmap = null;
// Mutable ARGB_8888 copy handed to the native layer.
private Bitmap showImage = null;
// Spare copy; not otherwise used in this listing.
private Bitmap bitmapCopy = null;
@Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
binding = ActivityMainBinding.inflate(getLayoutInflater());
setContentView(binding.getRoot());
imageView = (ImageView) findViewById(R.id.imageView);
// Initialize the native engine (loads the ONNX model from assets).
boolean ret_init = layout.init(getAssets());
if (!ret_init)
{
Log.e("MainActivity", "LayoutAnalysis Init failed");
}
// "Open image" button: launch the system gallery picker.
Button openFile = (Button) findViewById(R.id.btn_openImage);
openFile.setOnClickListener(new View.OnClickListener()
{
@Override
public void onClick(View arg0)
{
Intent i = new Intent(Intent.ACTION_PICK);
i.setType("image/*");
startActivityForResult(i, SELECT_IMAGE);
}
});
// "Layout analysis" button: run detection on the picked image and display it.
Button btn_print = (Button) findViewById(R.id.btn_layout);
btn_print.setOnClickListener(new View.OnClickListener()
{
@Override
public void onClick(View arg0)
{
if (showImage == null) {
return;
}
// The native side draws detections onto showImage and returns it.
Bitmap bitmap = layout.layout(showImage);
imageView.setImageBitmap(bitmap);
}
});
}
/**
 * Receives the gallery picker result, decodes the chosen image (downscaled
 * and EXIF-rotated by decodeUri) and shows it.
 */
@Override
protected void onActivityResult(int requestCode, int resultCode, Intent data)
{
super.onActivityResult(requestCode, resultCode, data);
if (resultCode == RESULT_OK && null != data) {
Uri selectedImage = data.getData();
try
{
if (requestCode == SELECT_IMAGE) {
bitmap = decodeUri(selectedImage);
// Mutable ARGB_8888 copies: the native layer draws into showImage.
showImage = bitmap.copy(Bitmap.Config.ARGB_8888, true);
bitmapCopy = bitmap.copy(Bitmap.Config.ARGB_8888, true);
imageView.setImageBitmap(bitmap);
}
}
catch (FileNotFoundException e)
{
Log.e("MainActivity", "FileNotFoundException");
return;
}
}
}
/**
 * Decode an image Uri, downsampling by powers of two so the shorter side
 * stays at least REQUIRED_SIZE, and rotating per its EXIF orientation.
 *
 * @param selectedImage content Uri from the gallery picker
 * @return decoded (possibly rotated) bitmap
 * @throws FileNotFoundException when the Uri cannot be opened
 */
private Bitmap decodeUri(Uri selectedImage) throws FileNotFoundException
{
// Decode image size
BitmapFactory.Options o = new BitmapFactory.Options();
o.inJustDecodeBounds = true;
BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o);
// The new size we want to scale to
final int REQUIRED_SIZE = 640;
// Find the correct scale value. It should be the power of 2.
int width_tmp = o.outWidth, height_tmp = o.outHeight;
int scale = 1;
while (true) {
if (width_tmp / 2 < REQUIRED_SIZE
|| height_tmp / 2 < REQUIRED_SIZE) {
break;
}
width_tmp /= 2;
height_tmp /= 2;
scale *= 2;
}
// Decode with inSampleSize
BitmapFactory.Options o2 = new BitmapFactory.Options();
o2.inSampleSize = scale;
Bitmap bitmap = BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o2);
// Rotate according to EXIF
int rotate = 0;
try
{
ExifInterface exif = new ExifInterface(getContentResolver().openInputStream(selectedImage));
int orientation = exif.getAttributeInt(ExifInterface.TAG_ORIENTATION, ExifInterface.ORIENTATION_NORMAL);
switch (orientation) {
case ExifInterface.ORIENTATION_ROTATE_270:
rotate = 270;
break;
case ExifInterface.ORIENTATION_ROTATE_180:
rotate = 180;
break;
case ExifInterface.ORIENTATION_ROTATE_90:
rotate = 90;
break;
}
}
catch (IOException e)
{
// Best-effort: fall back to no rotation if EXIF cannot be read.
Log.e("MainActivity", "ExifInterface IOException");
}
Matrix matrix = new Matrix();
matrix.postRotate(rotate);
return Bitmap.createBitmap(bitmap, 0, 0, bitmap.getWidth(), bitmap.getHeight(), matrix, true);
}
}
Realization effect: