NEON implementation of an image scaling algorithm (bilinear interpolation).

C:

/*
 * Scales an 8-bit grayscale image with bilinear interpolation (plain C).
 *
 * src        - source image, one byte per pixel
 * w, h       - source dimensions in pixels
 * src_stride - bytes per source row (may exceed w)
 * w2, h2     - destination dimensions in pixels
 * dst        - destination buffer, w2*h2 bytes, written row by row
 *
 * For every destination pixel the four nearest source neighbours are
 * blended: Y = A(1-x)(1-y) + Bx(1-y) + Cy(1-x) + Dxy, where x/y are the
 * fractional parts of the back-mapped coordinate.
 */
void resizeBilinearGray(unsigned char* src, int w, int h, int src_stride, int w2, int h2,unsigned char* dst)
{
    const float sx = ((float)(w - 1)) / w2;  /* horizontal back-mapping step */
    const float sy = ((float)(h - 1)) / h2;  /* vertical back-mapping step   */
    unsigned char* out = dst;                /* sequential write cursor      */

    for (int row = 0; row < h2; row++) {
        /* The vertical coordinate is constant for a whole row: hoist it. */
        const int   yi = (int)(sy * row);
        const float yd = (sy * row) - yi;
        const unsigned char* top = src + yi * src_stride;
        const unsigned char* bot = top + src_stride;

        for (int col = 0; col < w2; col++) {
            const int   xi = (int)(sx * col);
            const float xd = (sx * col) - xi;
            /* Four neighbours; & 0xff keeps the 0..255 byte range. */
            const int p00 = top[xi] & 0xff;       /* A: top-left     */
            const int p01 = top[xi + 1] & 0xff;   /* B: top-right    */
            const int p10 = bot[xi] & 0xff;       /* C: bottom-left  */
            const int p11 = bot[xi + 1] & 0xff;   /* D: bottom-right */
            /* Y = A(1-x)(1-y) + Bx(1-y) + Cy(1-x) + Dxy */
            *out++ = (int)(
                p00*(1 - xd)*(1 - yd) + p01*(xd)*(1 - yd) +
                p10*(yd)*(1 - xd) + p11*(xd*yd)
                );
        }
    }
}

NEON optimization: NEON works on batches of contiguous data, but bilinear interpolation gathers its inputs from positions computed per pixel, so they cannot be loaded in batches directly. My current idea is to first store all the data required for the calculation in temporary buffers in memory, and then let NEON bulk-load those buffers and vectorize the computation.

/*
 * NEON-accelerated bilinear grayscale resize.
 *
 * Strategy: a scalar "gather" pass first collects, for every destination
 * pixel, its four source neighbours (A/B/C/D) and the fractional weights
 * (x_diff/y_diff) into flat temporary arrays; a second pass then evaluates
 * the interpolation 8 pixels at a time with NEON intrinsics, and a scalar
 * tail handles the last (w2*h2 % 8) pixels.
 *
 * src        - source 8-bit grayscale image
 * w, h       - source dimensions
 * src_stride - bytes per source row
 * w2, h2     - destination dimensions
 * dst        - destination buffer, w2*h2 bytes
 */
void resizeBilinearGray_neon(unsigned char* src, int w, int h, int src_stride, int w2, int h2,unsigned char* dst)
{
    int st = src_stride;
    int x, y, index;
    int total = w2 * h2;
    int offset;
    int i, j;

    uint16_t* dst_11 = (uint16_t*)malloc(total * sizeof(uint16_t));
    uint16_t* dst_12 = (uint16_t*)malloc(total * sizeof(uint16_t));
    uint16_t* dst_21 = (uint16_t*)malloc(total * sizeof(uint16_t));
    uint16_t* dst_22 = (uint16_t*)malloc(total * sizeof(uint16_t));
    float* x_diff = (float*)malloc(total * sizeof(float));
    float* y_diff = (float*)malloc(total * sizeof(float));

    float x_ratio = ((float)(w - 1)) / w2;
    float y_ratio = ((float)(h - 1)) / h2;

    /* The original never checked these allocations; bail out cleanly
     * (free(NULL) is a no-op). */
    if (!dst_11 || !dst_12 || !dst_21 || !dst_22 || !x_diff || !y_diff) {
        free(dst_11); free(dst_12); free(dst_21); free(dst_22);
        free(x_diff); free(y_diff);
        return;
    }

    /* Gather pass: flatten per-pixel neighbours and weights into arrays
     * that the vector pass can load contiguously. */
    for (i = 0; i < h2; i++)
    {
        for (j = 0; j < w2; j++)
        {
            x = (int)(x_ratio * j);
            y = (int)(y_ratio * i);
            x_diff[i*w2+j] = (x_ratio * j) - x;
            y_diff[i*w2+j] = (y_ratio * i) - y;
            index = y*st + x;
            /* range is 0 to 255 thus bitwise AND with 0xff */
            dst_11[i*w2+j] = src[index] & 0xff;          /* A: top-left     */
            dst_12[i*w2+j] = src[index + 1] & 0xff;      /* B: top-right    */
            dst_21[i*w2+j] = src[index + st] & 0xff;     /* C: bottom-left  */
            dst_22[i*w2+j] = src[index + st + 1] & 0xff; /* D: bottom-right */
        }
    }

    /* Vector pass: Y = A(1-x)(1-y) + Bx(1-y) + Cy(1-x) + Dxy, 8 pixels per
     * iteration. Stop before the tail so the 8-wide loads/stores never run
     * past the buffers (the original overran when total % 8 != 0). */
    int vec_end = total - (total % 8);
    for (offset = 0; offset < vec_end; offset += 8)
    {
        uint16x8_t A = vld1q_u16(dst_11 + offset);   /* load 8 values per register */
        uint16x8_t B = vld1q_u16(dst_12 + offset);
        uint16x8_t C = vld1q_u16(dst_21 + offset);
        uint16x8_t D = vld1q_u16(dst_22 + offset);

        float32x4_t x_1 = vld1q_f32(x_diff + offset);     /* weights, 4+4 lanes */
        float32x4_t x_2 = vld1q_f32(x_diff + offset + 4);
        float32x4_t y_1 = vld1q_f32(y_diff + offset);
        float32x4_t y_2 = vld1q_f32(y_diff + offset + 4);

        float32x4_t one = vdupq_n_f32(1.0f);
        float32x4_t one_x_1 = vsubq_f32(one, x_1);   /* 1 - x_diff */
        float32x4_t one_x_2 = vsubq_f32(one, x_2);
        float32x4_t one_y_1 = vsubq_f32(one, y_1);   /* 1 - y_diff */
        float32x4_t one_y_2 = vsubq_f32(one, y_2);

        /* Widen each neighbour u16 -> u32 -> f32, low and high halves. */
        float32x4_t A_32f_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(A)));
        float32x4_t A_32f_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(A)));
        float32x4_t B_32f_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(B)));
        float32x4_t B_32f_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(B)));
        float32x4_t C_32f_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(C)));
        float32x4_t C_32f_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(C)));
        /* BUG FIX: the original widened C a second time here instead of D,
         * so the bottom-right neighbour never entered the interpolation. */
        float32x4_t D_32f_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(D)));
        float32x4_t D_32f_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(D)));

        float32x4_t temp1, temp2;
        uint32x4_t result_32;
        uint16x4_t result_16_low, result_16_high;

        /* Low 4 lanes. */
        temp1 = vmulq_f32(A_32f_low, one_x_1);       /* A*(1-x)            */
        temp1 = vmulq_f32(temp1, one_y_1);           /* A*(1-x)*(1-y)      */
        temp2 = vmulq_f32(B_32f_low, x_1);           /* B*x                */
        temp2 = vmulq_f32(temp2, one_y_1);           /* B*x*(1-y)          */
        temp1 = vaddq_f32(temp1, temp2);
        temp2 = vmulq_f32(C_32f_low, y_1);           /* C*y                */
        temp2 = vmulq_f32(temp2, one_x_1);           /* C*y*(1-x)          */
        temp1 = vaddq_f32(temp1, temp2);
        temp2 = vmulq_f32(D_32f_low, x_1);           /* D*x                */
        temp2 = vmulq_f32(temp2, y_1);               /* D*x*y              */
        temp1 = vaddq_f32(temp1, temp2);
        result_32 = vcvtq_u32_f32(temp1);            /* f32 -> u32         */
        result_16_low = vqmovn_u32(result_32);       /* saturating narrow 32 -> 16 */

        /* High 4 lanes, same formula. */
        temp1 = vmulq_f32(A_32f_high, one_x_2);
        temp1 = vmulq_f32(temp1, one_y_2);
        temp2 = vmulq_f32(B_32f_high, x_2);
        temp2 = vmulq_f32(temp2, one_y_2);
        temp1 = vaddq_f32(temp1, temp2);
        temp2 = vmulq_f32(C_32f_high, y_2);
        temp2 = vmulq_f32(temp2, one_x_2);
        temp1 = vaddq_f32(temp1, temp2);
        temp2 = vmulq_f32(D_32f_high, x_2);
        temp2 = vmulq_f32(temp2, y_2);
        temp1 = vaddq_f32(temp1, temp2);
        result_32 = vcvtq_u32_f32(temp1);
        result_16_high = vqmovn_u32(result_32);

        uint16x8_t result_16 = vcombine_u16(result_16_low, result_16_high);
        uint8x8_t result = vqmovn_u16(result_16);    /* saturating narrow 16 -> 8 */

        vst1_u8(dst + offset, result);
    }

    /* Scalar tail for the last total % 8 pixels. */
    for (; offset < total; offset++) {
        float fx = x_diff[offset];
        float fy = y_diff[offset];
        dst[offset] = (unsigned char)(
            dst_11[offset]*(1 - fx)*(1 - fy) + dst_12[offset]*fx*(1 - fy) +
            dst_21[offset]*fy*(1 - fx) + dst_22[offset]*(fx*fy));
    }

    free(dst_11);
    free(dst_12);
    free(dst_21);
    free(dst_22);
    free(x_diff);
    free(y_diff);
}

Test results: in practice the NEON implementation turned out to be much slower than the plain C one. My guess is that staging the data through temporary buffers causes too much extra memory traffic — every value is written to memory and then read back.

Inline assembly: optimize with inline assembly instead — move each value directly from a core (r) register into a NEON register lane, one element at a time, without staging it in temporary memory first.

/*
 * Inline-assembly variant of the NEON bilinear grayscale resize.
 *
 * Idea: instead of staging gathered pixels in temporary arrays, each
 * scalar-gathered value is moved straight from a core register into one
 * NEON register lane (vmov.32 dN[lane]). `offset` cycles 0..3 to select
 * the lane; after every 4 pixels the interpolation is evaluated for the
 * whole 4-wide vector, and after every 8 pixels (two 4-wide results,
 * tracked by `flag`) the narrowed bytes are stored to dst.
 *
 * Register map built up by the switch below:
 *   q0 = A, q1 = B, q2 = C, q15 = D          (u32 lanes)
 *   q4 = x_diff, q5 = 1-x_diff, q6 = y_diff, q7 = 1-y_diff  (f32 lanes)
 *   q8/q9 = scratch, q10 = {d20,d21} packed u16 results, d22 = u8 results
 *
 * NOTE(review): this code assumes q0-q7/q15 keep their contents across
 * *separate* asm statements and the surrounding C code — the compiler
 * gives no such guarantee, and d8-d15 (q4-q7) are callee-saved under the
 * AAPCS, so this only works if the compiler happens not to touch those
 * registers in between. Verify on the exact toolchain/flags used.
 * NOTE(review): floats are passed via "r" constraints, i.e. as raw bit
 * patterns in core registers — presumably relies on a softfp-style ABI;
 * confirm against the target's float ABI.
 * NOTE(review): the compute asm blocks clobber q8/q9 (and q10's halves)
 * without always declaring them; confirm the clobber lists are complete.
 */
void resizeBilinearGray_neon_Optimized(unsigned char* src, int w, int h, int src_stride, int w2, int h2,unsigned char* dst)
{
    int st = src_stride;
    int x,y,index;
    uint32_t dst_11,dst_12,dst_22,dst_21;        /* the 4 neighbours of the current pixel */
    float x_diff,y_diff,x_diff_1,y_diff_1;       /* weights and their complements */

    //uint32x4_t  A,B,C,D;
    //float32x4_t x_32,y_32;

    float x_ratio = ((float)(w - 1)) / w2;
    float y_ratio = ((float)(h - 1)) / h2;
    /*
    uint32_t* test0=(uint32_t*)malloc(4*sizeof(uint32_t));
    uint32_t* test1=(uint32_t*)malloc(4*sizeof(uint32_t));
    uint32_t* test2=(uint32_t*)malloc(4*sizeof(uint32_t));
    uint32_t* test3=(uint32_t*)malloc(4*sizeof(uint32_t));
    float*    ftest0=(float*)malloc(4*sizeof(float));
    float*    ftest1=(float*)malloc(4*sizeof(float));
    float*    ftest2=(float*)malloc(4*sizeof(float));
    float*    ftest3=(float*)malloc(4*sizeof(float));
    uint8_t* test8=(uint8_t*)malloc(8*sizeof(uint8_t));
    */
    int offset = 0;     /* lane selector, cycles 0..3 */
    int i = 0;
    int j = 0;
    int flag=0;         /* 0: next 4-wide result goes to d20; 1: to d21, then store 8 bytes */

    for (i = 0; i<h2; i++)
    {
        for (j = 0; j<w2; j++)
        {
            x = (int)(x_ratio * j);
            y = (int)(y_ratio * i);
            x_diff = (x_ratio * j) - x;
            y_diff = (y_ratio * i) - y;
            x_diff_1=1-x_diff;
            y_diff_1=1-y_diff;
            index = y*st + x;
            //int t=0;
            // range is 0 to 255 thus bitwise AND with 0xff
            dst_11 = src[index] & 0xff;          /* A: top-left     */
            dst_12 = src[index + 1] & 0xff;      /* B: top-right    */
            dst_21 = src[index + st] & 0xff;     /* C: bottom-left  */
            dst_22 = src[index + st + 1] & 0xff; /* D: bottom-right */

            switch(offset)
            {
                /* lane 0 of the low d-register of each q-register */
                case 0:
                 asm volatile
                (
                    "vmov.32 d0[0], %0\t\n"
                    "vmov.32 d2[0], %1\t\n"
                    "vmov.32 d4[0], %2\t\n"
                    "vmov.32 d30[0], %3\t\n"

                    "vmov.32 d8[0],  %4\t\n"
                    "vmov.32 d10[0], %5\t\n"
                    "vmov.32 d12[0], %6\t\n"
                    "vmov.32 d14[0], %7\t\n"
                    :
                    :"r"(dst_11),"r"(dst_12),"r"(dst_21),"r"(dst_22),"r"(x_diff),"r"(x_diff_1),"r"(y_diff),"r"(y_diff_1)
                    :"q0","q1","q2","q15","q4","q5","q6","q7"
                );
                offset++;
                break;    

                /* lane 1 of the low d-register of each q-register */
                case 1:
                asm volatile
                (
                    "vmov.32 d0[1], %0\t\n"
                    "vmov.32 d2[1], %1\t\n"
                    "vmov.32 d4[1], %2\t\n"
                    "vmov.32 d30[1], %3\t\n"

                    "vmov.32 d8[1],  %4\t\n"
                    "vmov.32 d10[1], %5\t\n"
                    "vmov.32 d12[1], %6\t\n"
                    "vmov.32 d14[1], %7\t\n"
                    :
                    :"r"(dst_11),"r"(dst_12),"r"(dst_21),"r"(dst_22),"r"(x_diff),"r"(x_diff_1),"r"(y_diff),"r"(y_diff_1)
                    :"q0","q1","q2","q15","q4","q5","q6","q7"                    
                );
                offset++;
                break;

                /* lane 0 of the high d-register of each q-register */
                case 2:
                asm volatile
                (
                    "vmov.32 d1[0], %0\t\n"
                    "vmov.32 d3[0], %1\t\n"
                    "vmov.32 d5[0], %2\t\n"
                    "vmov.32 d31[0], %3\t\n"

                    "vmov.32 d9[0],  %4\t\n"
                    "vmov.32 d11[0], %5\t\n"
                    "vmov.32 d13[0], %6\t\n"
                    "vmov.32 d15[0], %7\t\n"
                    :
                    :"r"(dst_11),"r"(dst_12),"r"(dst_21),"r"(dst_22),"r"(x_diff),"r"(x_diff_1),"r"(y_diff),"r"(y_diff_1)
                    :"q0","q1","q2","q15","q4","q5","q6","q7"
                );
                /*
                asm(
                    "vst1.32 {q0}, [%0]\t\n"
                    "vst1.32 {q1}, [%1]\t\n"
                    "vst1.32 {q2}, [%2]\t\n"
                    "vst1.32 {q15}, [%3]\t\n"
                    "vst1.32 {q4}, [%4]\t\n"
                    "vst1.32 {q5}, [%5]\t\n"
                    "vst1.32 {q6}, [%6]\t\n"
                    "vst1.32 {q7}, [%7]\t\n"
                    :"+r"(test0),"+r"(test1),"+r"(test2),"+r"(test3),"+r"(ftest0),"+r"(ftest1),"+r"(ftest2),"+r"(ftest3)
                    :
                    :"memory"
                );

                for(t=0;t<3;t++)
                {
                    printf("%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\n",test0[t],test1[t],test2[t],test3[t],ftest0[t],ftest1[t],ftest2[t],ftest3[t]);
                    //printf("%d",dst[i*w2+j+t]);
                }
                printf("\n");*/
                offset++;
                break;

                /* lane 1 of the high d-register: vectors are now full, so
                 * evaluate the interpolation for these 4 pixels */
                case 3:
                asm volatile
                (
                    "vmov.32 d1[1], %0\t\n"
                    "vmov.32 d3[1], %1\t\n"
                    "vmov.32 d5[1], %2\t\n"
                    "vmov.32 d31[1], %3\t\n"

                    "vmov.32 d9[1],  %4\t\n"
                    "vmov.32 d11[1], %5\t\n"
                    "vmov.32 d13[1], %6\t\n"
                    "vmov.32 d15[1], %7\t\n"
                    :
                    :"r"(dst_11),"r"(dst_12),"r"(dst_21),"r"(dst_22),"r"(x_diff),"r"(x_diff_1),"r"(y_diff),"r"(y_diff_1)
                    :"memory","q0","q1","q2","q15","q4","q5","q6","q7"
                );
                /*
                asm volatile
                (
                    "vst1.32 {q0}, [%0]\t\n"
                    "vst1.32 {q1}, [%1]\t\n"
                    "vst1.32 {q2}, [%2]\t\n"
                    "vst1.32 {q15}, [%3]\t\n"
                    "vst1.32 {q4}, [%4]\t\n"
                    "vst1.32 {q5}, [%5]\t\n"
                    "vst1.32 {q6}, [%6]\t\n"
                    "vst1.32 {q7}, [%7]\t\n"
                    :"+r"(test0),"+r"(test1),"+r"(test2),"+r"(test3),"+r"(ftest0),"+r"(ftest1),"+r"(ftest2),"+r"(ftest3)
                    :
                    :"memory"
                );
                for(t=0;t<4;t++)
                {
                    printf("%d\t%d\t%d\t%d\t%f\t%f\t%f\t%f\n",test0[t],test1[t],test2[t],test3[t],ftest0[t],ftest1[t],ftest2[t],ftest3[t]);
                    //printf("%d",dst[i*w2+j+t]);
                }
                printf("\n");*/
                /* flag set: this is the second group of 4 — narrow into d21,
                 * combine with the d20 saved last time, and store 8 bytes */
                if(flag)
                {

                    //unsigned char* dst_offset=dst[i*w2+j];
                /* NOTE(review): storing a pointer in an int — truncates on
                 * 64-bit targets and is a constraint violation; should be
                 * unsigned char*.
                 * NOTE(review): j here is the LAST of the 8 pixels just
                 * computed, so the store writes [j..j+7] — the results look
                 * shifted by 7 (expected &dst[i*w2+j-7]); confirm with a
                 * known-answer test. */
                int dst_offset=&dst[i*w2+j];
                asm volatile
                (
                    "vcvt.f32.u32 q0, q0\t\n"        // convert uint32x4_t -> float32x4_t
                    "vcvt.f32.u32 q1, q1\t\n"
                    "vcvt.f32.u32 q2, q2\t\n"
                    "vcvt.f32.u32 q15, q15\t\n"
                    //A*(1 - x_diff)*(1 - y_diff) + B*(x_diff)*(1 - y_diff) +C*(y_diff)*(1 - x_diff) + D*(x_diff*y_diff)
                    "vmul.f32 q8,q0,q5\t\n"    //A*(1 - x_diff)*(1 - y_diff) ->q8
                    "vmul.f32 q8,q8,q7\t\n"
                    "vmul.f32 q9,q1,q4\t\n"    //B*(x_diff)*(1 - y_diff) ->q9
                    "vmul.f32 q9,q9,q7\t\n"
                    "vadd.f32 q8,q8,q9\t\n"       //q8+q9 -> q8
                    "vmul.f32 q9,q2,q6\t\n"    //C*(y_diff)*(1 - x_diff) -> q9
                    "vmul.f32 q9,q9,q5\t\n"
                    "vadd.f32 q8,q8,q9\t\n"    //q8+q9 -> q8
                    "vmul.f32 q9,q15,q4\t\n"    //D*(x_diff*y_diff) -> q9
                    "vmul.f32 q9,q9,q6\t\n"
                    "vadd.f32 q8,q8,q9\t\n"    //q8+q9 -> q8

                    "vcvt.u32.f32 q8,q8\t\n"        // convert float32x4_t -> uint32x4_t
                    "vqmovn.u32 d21,q8\t\n"        // saturating narrow: uint32x4_t -> uint16x4_t (high half of q10)

                    "vqmovn.u16 d22,q10\t\n"        // saturating narrow: uint16x8_t -> uint8x8_t
                    //"vst1.8 {d22}, [%1]\t\n"         // store
                    "vst1.8 {d22}, [%0]!\t\n"         // store 8 result bytes, post-increment
                    :"+r"(dst_offset)//,"+r"(test)
                    :
                    :"memory", "q10","d22"
                );
                /*
                asm(
                    "vst1.32 {q0}, [%0]\t\n"
                    "vst1.32 {q1}, [%1]\t\n"
                    "vst1.32 {q2}, [%2]\t\n"
                    "vst1.32 {q15}, [%3]\t\n"
                    "vst1.32 {q4}, [%4]\t\n"
                    "vst1.32 {q5}, [%5]\t\n"
                    "vst1.32 {q6}, [%6]\t\n"
                    "vst1.32 {q7}, [%7]\t\n"
                    "vst1.8  {d22}, [%8]\t\n"
                    :"+r"(test0),"+r"(test1),"+r"(test2),"+r"(test3),"+r"(ftest0),"+r"(ftest1),"+r"(ftest2),"+r"(ftest3),"+r"(test8)
                    :
                    :"memory"
                );
                for(t=0;t<4;t++)
                {
                    printf("%f\t%f\t%f\t%f\t%f\t%f\t%f\t%f\n",test0[t],test1[t],test2[t],test3[t],ftest0[t],ftest1[t],ftest2[t],ftest3[t]);
                    //printf("%d",dst[i*w2+j+t]);
                }
                printf("\n");
                for(t=0;t<8;t++)
                {
                    printf("%d\t",test8[t]);
                }
                printf("\n");*/
                flag=0;
                }
                /* flag clear: first group of 4 — narrow into d20 and wait
                 * for the next 4 before storing */
                else
                {
                    asm volatile
                (
                    "vcvt.f32.u32 q0, q0\t\n"        // convert uint32x4_t -> float32x4_t
                    "vcvt.f32.u32 q1, q1\t\n"
                    "vcvt.f32.u32 q2, q2\t\n"
                    "vcvt.f32.u32 q15, q15\t\n"
                    //A*(1 - x_diff)*(1 - y_diff) + B*(x_diff)*(1 - y_diff) +C*(y_diff)*(1 - x_diff) + D*(x_diff*y_diff)
                    "vmul.f32 q8,q0,q5\t\n"    //A*(1 - x_diff)*(1 - y_diff) ->q8
                    "vmul.f32 q8,q8,q7\t\n"
                    "vmul.f32 q9,q1,q4\t\n"    //B*(x_diff)*(1 - y_diff) ->q9
                    "vmul.f32 q9,q9,q7\t\n"
                    "vadd.f32 q8,q8,q9\t\n"       //q8+q9 -> q8
                    "vmul.f32 q9,q2,q6\t\n"    //C*(y_diff)*(1 - x_diff) -> q9
                    "vmul.f32 q9,q9,q5\t\n"
                    "vadd.f32 q8,q8,q9\t\n"    //q8+q9 -> q8
                    "vmul.f32 q9,q15,q4\t\n"    //D*(x_diff*y_diff) -> q9
                    "vmul.f32 q9,q9,q6\t\n"
                    "vadd.f32 q8,q8,q9\t\n"    //q8+q9 -> q8

                    "vcvt.u32.f32 q8,q8\t\n"        // convert float32x4_t -> uint32x4_t
                    "vqmovn.u32 d20,q8\t\n"        // saturating narrow: uint32x4_t -> uint16x4_t (low half of q10)
                    :
                    :
                    :"d20"
                );
                    flag=1;
                }
                    offset=0;
                    break;                
                default:
                    printf("offset error!\n");
                break;
            }
        }
    }
    /* NOTE(review): pixels in the final partial group of 8 (when w2*h2 is
     * not a multiple of 8) are never stored to dst. */
    return  ;
}

Test: performance is similar to the unoptimized C version. After these experiments I believe bilinear-interpolation scaling of grayscale images is a poor fit for NEON. The code could still be improved — better instruction scheduling, loading data from memory straight into NEON registers, and so on — but the expected gain does not seem to justify the effort. The image scaling routine in the Ne10 library is also bilinear interpolation, but for RGBA images, and even there the speedup is only about 1.5x over its own C version; RGBA fares better because each pixel's four channels can be loaded into registers as a batch instead of one element at a time.

There is also a C implementation optimized with fixed-point shifts, which tested as the fastest of all. That version has not been given a NEON port, so the effect of vectorizing it has not been measured.

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325560764&siteId=291194637