Ne10编译和介绍

1.介绍

        ARM® NEON™ 技术是适用于 ARM Cortex™-A 系列处理器的 SIMD(单指令多数据)架构扩展。 它可以使多媒体和信号处理算法提速,例如视频编码/解码、2D/3D 图形、游戏、音频和语音处理以及图像处理等。 自 NEON 问世以来,已出现许多使用 NEON 并显著改善用户体验的多媒体应用程序。 有些应用程序开发人员可能不熟悉 NEON 汇编代码,因此 Ne10 库的创建可使开发人员从 ARMv7/NEON 中获得最大效益,而不必使用繁琐的汇编代码。

       Ne10 库提供一组最为常用并且极为优化的函数。 这组函数最初于 2012 年 3 月发布。 库中的初始功能集着重于矩阵/矢量代数以及信号处理。 Ne10 将持续改进,以包含图像处理等多领域内的更多高计算量任务。

2.源码获取

     Ne10的源码公开在github上面,其网站地址:https://github.com/projectNe10/Ne10 。

3.环境

  3.1硬件环境

        您需要准备 ARM Cortex-A/R 系列开发平台。如果没有硬件开发平台,也可使用仿真环境,如 Google 的 Android Emulator。我目前使用的硬件开发板是 ARM Cortex-A53 平台,交叉编译主机为 Ubuntu 16.04。

 3.2软件环境

  • 工具链:aarch64-linux-gnu-

4.编译和使用Ne10库

  4.1编译Ne10

    通过第2部分,获取源码后,进入源码目录,进行如下操作:

  1. 修改CMakeLists.txt。共有两处修改,如下:
1. option(NE10_BUILD_UNIT_TEST "Build NE10 unit test" ON)  //原先为OFF
2. option(NE10_PERFORMANCE_TEST "Run performance test" ON)//原先为OFF

       此处打开了源码中的单元测试程序,并选择运行 performance test。关于 smoke testing、regression testing、performance testing 的区别如下:

  • Conformance testing (also called smoke testing), to check if the library works correctly.
  • Regression testing, which is similar to conformance testing but is aimed more specifically at testing whether the library still operates correctly after a change.
  • Performance testing, which gives an indication of how quickly the library performs certain tasks.

     2.修改GNUlinux_config.cmake

if(NOT DEFINED ENV{NE10_LINUX_TARGET_ARCH})
   set(NE10_LINUX_TARGET_ARCH "aarch64")
else()
//直接将此处设置为,aarch64

     3.编译

mkdir build
cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../GNUlinux_config.cmake ../
make -j8

   此处是静态库的编译方式,可以看到在 build/modules/ 下面生成了 libNE10.a。

ccion@ubuntu:~/Ne10/build/modules$ ls
CMakeFiles  cmake_install.cmake  libNE10.a  Makefile

4.2使用和结果分析

       通过上面的步骤可以看到,在 build 目录下面生成了 test 目录,其中有两个应用程序。这里在我的开发板平台上执行 FFT 的测试程序 NE10_dsp_unit_test_static_performance。其重要部分源码如下:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

#include "NE10_dsp.h"
#include "NE10_macros.h"
#include "seatest.h"
#include "unit_test_common.h"
/*
 * Performance comparison of the C and NEON complex-to-complex 1-D float32 FFT.
 *
 * For every FFT length from MIN_LENGTH_SAMPLES_CPX up to and including
 * TEST_LENGTH_SAMPLES (doubling each step), the forward transform and then
 * the inverse transform are each executed TEST_COUNT / length times inside
 * GET_TIME, once per implementation.  One ne10_log line is emitted per
 * direction with both timings, the percentage saved and the speed-up ratio.
 *
 * The function operates on objects declared outside of it: in_c, in_neon,
 * out_c, out_neon, cfg_c, cfg_neon, testInput_f32, time_c, time_neon,
 * time_savings and time_speedup.  It returns immediately if test_c2c_alloc
 * reports NE10_ERR for a length.
 */
void test_fft_c2c_1d_float32_performance()
{
    ne10_int32_t iter = 0;
    ne10_int32_t length = 0;
    ne10_int32_t repeat = 0;

    fprintf (stdout, "----------%30s start\n", __FUNCTION__);
    fprintf (stdout, "%25s%20s%20s%20s%20s\n", "FFT Length", "C Time (micro-s)", "NEON Time (micro-s)", "Time Savings", "Performance Ratio");

    length = MIN_LENGTH_SAMPLES_CPX;
    while (length <= TEST_LENGTH_SAMPLES)
    {
        /* Number of bytes copied per buffer: 2 * length float32 values. */
        const size_t nbytes = 2 * length * sizeof (ne10_float32_t);

        fprintf (stdout, "FFT size %d\n", length);

        /* ---- forward transform (last argument 0) ---- */
        memcpy (in_c, testInput_f32, nbytes);
        memcpy (in_neon, testInput_f32, nbytes);

        if (test_c2c_alloc (length) == NE10_ERR)
        {
            return;
        }

        /* Larger transforms are repeated fewer times. */
        repeat = TEST_COUNT / length;

        GET_TIME
        (
            time_c,
        {
            for (iter = 0; iter < repeat; iter++)
            {
                ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 0);
            }
        }
        );
        GET_TIME
        (
            time_neon,
        {
            for (iter = 0; iter < repeat; iter++)
            {
                ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 0);
            }
        }
        );

        time_speedup = (ne10_float32_t) time_c / time_neon;
        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
        ne10_log (__FUNCTION__, "Float FFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", length, time_c, time_neon, time_savings, time_speedup);

        /* ---- inverse transform (last argument 1) ---- */
        /* Both implementations are fed the C forward output as input. */
        memcpy (in_c, out_c, nbytes);
        memcpy (in_neon, out_c, nbytes);

        GET_TIME
        (
            time_c,
        {
            for (iter = 0; iter < repeat; iter++)
            {
                ne10_fft_c2c_1d_float32_c ( (ne10_fft_cpx_float32_t*) out_c, (ne10_fft_cpx_float32_t*) in_c, cfg_c, 1);
            }
        }
        );
        GET_TIME
        (
            time_neon,
        {
            for (iter = 0; iter < repeat; iter++)
            {
                ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 1);
            }
        }
        );

        time_speedup = (ne10_float32_t) time_c / time_neon;
        time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
        ne10_log (__FUNCTION__, "Float FFT%21d%20lld%20lld%19.2f%%%18.2f:1\n", length, time_c, time_neon, time_savings, time_speedup);

        /* Release the per-length configurations obtained via test_c2c_alloc. */
        NE10_FREE (cfg_c);
        NE10_FREE (cfg_neon);

        length *= 2;
    }
}

执行结果:

      可以看到,在 FFT 长度大于 8 之后,Ne10 的 NEON 版本比纯 C 版本效率高很多;但在处理长度为 2、4、8 的 FFT 时,NEON 版本的效率反而不如 C 版本。

再来看看处理图像的效率问题:执行 NE10_imgproc_unit_test_static_performance。其重要源码如下:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

#include "NE10_imgproc.h"
#include "seatest.h"
#include "unit_test_common.h"

/*
 * Performance comparison of the C and NEON bilinear RGBA image resize.
 *
 * Four buffers of MEM_SIZE * MEM_SIZE * 4 bytes are allocated (in_c, in_neon,
 * out_c, out_neon -- all declared outside this function) and both inputs are
 * filled with the same pseudo-random bytes.  For every source edge h and
 * destination edge w in [16, MEM_SIZE) with step 4, a square h x h image is
 * resized to w x w TEST_COUNT times by each implementation under GET_TIME,
 * and the two timings are printed and logged.
 *
 * If any allocation fails, a message is written to stderr, the buffers that
 * were obtained are released and the function returns without benchmarking.
 */
void test_resize_performance_case()
{
    ne10_int32_t srcw;
    ne10_int32_t srch;
    ne10_int32_t dstw;
    ne10_int32_t dsth;
    ne10_int32_t i;
    ne10_int32_t w, h;
    ne10_int32_t channels = 4;
    ne10_int32_t pic_size = MEM_SIZE * MEM_SIZE * channels * sizeof (ne10_uint8_t);
    ne10_int64_t time_c = 0;
    ne10_int64_t time_neon = 0;

    /* init input memory */
    in_c = NE10_MALLOC (pic_size);
    in_neon = NE10_MALLOC (pic_size);

    /* init dst memory */
    out_c = NE10_MALLOC (pic_size);
    out_neon = NE10_MALLOC (pic_size);

    /* The fill loop and the resize calls dereference all four buffers. */
    if (in_c == NULL || in_neon == NULL || out_c == NULL || out_neon == NULL)
    {
        fprintf (stderr, "%s: failed to allocate %d bytes per buffer\n", __FUNCTION__, (int) pic_size);

        /* NE10_FREE is a project macro; its NULL handling is not visible
         * here, so only hand it pointers that were actually obtained. */
        if (in_c != NULL)
        {
            NE10_FREE (in_c);
        }
        if (in_neon != NULL)
        {
            NE10_FREE (in_neon);
        }
        if (out_c != NULL)
        {
            NE10_FREE (out_c);
        }
        if (out_neon != NULL)
        {
            NE10_FREE (out_neon);
        }
        return;
    }

    /* Identical pseudo-random input for both implementations. */
    for (i = 0; i < pic_size; i++)
    {
        in_c[i] = in_neon[i] = (rand() & 0xff);
    }

    for (h = 16; h < MEM_SIZE; h += 4)
    {
        for (w = 16; w < MEM_SIZE; w += 4)
        {
            /* Square images: h is the source edge, w the destination edge. */
            srcw = h;
            srch = h;
            dstw = w;
            dsth = w;

            printf ("srcw X srch = %d X %d \n", srcw, srch);
            printf ("dstw X dsth = %d X %d \n", dstw, dsth);

            GET_TIME
            (
                time_c,
            {
                for (i = 0; i < TEST_COUNT; i++)
                {
                    ne10_img_resize_bilinear_rgba_c (out_c, dstw, dsth, in_c, srcw, srch, srcw);
                }
            }
            );

            GET_TIME
            (
                time_neon,
            {
                for (i = 0; i < TEST_COUNT; i++)
                {
                    ne10_img_resize_bilinear_rgba_neon (out_neon, dstw, dsth, in_neon, srcw, srch, srcw);
                }
            }
            );

            /* %lld requires long long; ne10_int64_t need not be that type. */
            printf ("time c %lldus \n", (long long) time_c);
            printf ("time neon %lldus \n", (long long) time_neon);

            /* Savings and ratio are not computed here; pass floating-point
             * zeros so the arguments match the two %f conversions. */
            ne10_log (__FUNCTION__, "IMAGERESIZE%20d%20lld%20lld%19.2f%%%18.2f:1\n", (h * MEM_SIZE + w), time_c, time_neon, 0.0, 0.0);

        }
    }

    NE10_FREE (in_c);
    NE10_FREE (in_neon);
    NE10_FREE (out_c);
    NE10_FREE (out_neon);
}

执行结果:

很明显,做图像 resize 时,NEON 版本要比 C 版本的效率高很多。

猜你喜欢

转载自blog.csdn.net/weixin_41965270/article/details/89241649
今日推荐