neon实现图像缩放算法（双线性插值法）

C：

/**
 * Resize an 8-bit grayscale image using bilinear interpolation.
 *
 * @param src        Source pixels, one byte per pixel, row-major.
 * @param w          Source width in pixels.
 * @param h          Source height in pixels.
 * @param src_stride Distance in bytes between the starts of two consecutive
 *                   source rows.
 * @param w2         Destination width in pixels.
 * @param h2         Destination height in pixels.
 * @param dst        Destination buffer; exactly w2*h2 bytes are written,
 *                   tightly packed (destination stride is w2).
 *
 * Output pixel (j, i) samples the source at (j*(w-1)/w2, i*(h-1)/h2). The
 * interpolated value is truncated toward zero, not rounded. The function
 * returns without writing anything if a pointer is null or any dimension is
 * not positive.
 */
void resizeBilinearGray(unsigned char* src, int w, int h, int src_stride, int w2, int h2,unsigned char* dst)
{
    if (!src || !dst || w <= 0 || h <= 0 || w2 <= 0 || h2 <= 0) {
        return;
    }

    const float x_ratio = ((float)(w - 1)) / w2;
    const float y_ratio = ((float)(h - 1)) / h2;
    unsigned char *out = dst;

    for (int i = 0; i < h2; i++) {
        /* Everything that depends only on the output row is computed once. */
        const int y = (int)(y_ratio * i);
        const float y_diff = (y_ratio * i) - y;
        const unsigned char *top = src + (long)y * src_stride;
        /* On the last source row there is no row below; reuse the same row
         * instead of reading past the image (its weight is zero there). */
        const unsigned char *bottom = (y + 1 < h) ? top + src_stride : top;

        for (int j = 0; j < w2; j++) {
            const int x = (int)(x_ratio * j);
            const float x_diff = (x_ratio * j) - x;
            /* Same clamping for the right-hand neighbour on the last column. */
            const int x_right = (x + 1 < w) ? x + 1 : x;

            const int A = top[x];
            const int B = top[x_right];
            const int C = bottom[x];
            const int D = bottom[x_right];

            /* Y = A(1-w)(1-h) + B(w)(1-h) + C(h)(1-w) + Dwh */
            const int gray = (int)(
                A*(1 - x_diff)*(1 - y_diff) + B*(x_diff)*(1 - y_diff) +
                C*(y_diff)*(1 - x_diff) + D*(x_diff*y_diff)
                );
            *out++ = (unsigned char)gray;
        }
    }
}

NEON 优化：NEON 依赖批量（连续）加载数据，但双线性插值中每个输出像素所需的源数据地址都要根据像素位置单独计算，无法直接批量加载。我目前的思路是先把计算所需的全部数据按顺序存入临时内存，再用 NEON 批量加载并计算。

/**
 * Resize an 8-bit grayscale image using bilinear interpolation, processing
 * eight output pixels per iteration with NEON intrinsics.
 *
 * @param src        Source pixels, one byte per pixel, row-major.
 * @param w          Source width in pixels.
 * @param h          Source height in pixels.
 * @param src_stride Distance in bytes between the starts of two consecutive
 *                   source rows.
 * @param w2         Destination width in pixels.
 * @param h2         Destination height in pixels.
 * @param dst        Destination buffer; exactly w2*h2 bytes are written,
 *                   tightly packed (destination stride is w2).
 *
 * The four neighbours and the two fractional offsets of each output pixel are
 * gathered into small fixed-size stack buffers, one batch of eight at a time,
 * and then combined with vector arithmetic. No heap memory is used. Pixels
 * left over when w2*h2 is not a multiple of eight are computed with scalar
 * code that applies the operations in the same order as the vector path, so
 * nothing is read or written beyond the w2*h2 output pixels. When the compiler
 * does not target NEON, every pixel takes the scalar path.
 *
 * The function returns without writing anything if a pointer is null or any
 * dimension is not positive.
 */
void resizeBilinearGray_neon(unsigned char* src, int w, int h, int src_stride, int w2, int h2,unsigned char* dst)
{
    enum { BATCH = 8 };   /* pixels per vector iteration (one uint16x8_t) */

    if (!src || !dst || w <= 0 || h <= 0 || w2 <= 0 || h2 <= 0) {
        return;
    }

    const float x_ratio = ((float)(w - 1)) / w2;
    const float y_ratio = ((float)(h - 1)) / h2;
    const long long total = (long long)w2 * h2;

    /* Per-batch staging buffers: A/B/C/D neighbours and fractional offsets. */
    uint16_t a[BATCH], b[BATCH], c[BATCH], d[BATCH];
    float fx[BATCH], fy[BATCH];

    int row = 0;
    int col = 0;
    long long done = 0;

    while (done < total) {
        const int count = (total - done < BATCH) ? (int)(total - done) : BATCH;

        /* Gather: the source addresses are data dependent, so this part is
         * inherently scalar. */
        for (int k = 0; k < count; k++) {
            const int x = (int)(x_ratio * col);
            const int y = (int)(y_ratio * row);
            /* Clamp the right/bottom neighbour on the last column/row instead
             * of reading outside the image (its weight is zero there). */
            const int x_right = (x + 1 < w) ? x + 1 : x;
            const unsigned char *top = src + (long)y * src_stride;
            const unsigned char *bottom = (y + 1 < h) ? top + src_stride : top;

            fx[k] = (x_ratio * col) - x;
            fy[k] = (y_ratio * row) - y;
            a[k] = top[x];
            b[k] = top[x_right];
            c[k] = bottom[x];
            d[k] = bottom[x_right];

            if (++col == w2) {
                col = 0;
                row++;
            }
        }

        int handled = 0;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        if (count == BATCH) {
            const float32x4_t one = vdupq_n_f32(1.0f);
            uint16x4_t narrowed[2];

            /* Two halves of four lanes each, since float math is 4-wide. */
            for (int half = 0; half < 2; half++) {
                const int base = half * 4;
                const float32x4_t vx = vld1q_f32(fx + base);
                const float32x4_t vy = vld1q_f32(fy + base);
                const float32x4_t inv_x = vsubq_f32(one, vx);   /* 1 - x_diff */
                const float32x4_t inv_y = vsubq_f32(one, vy);   /* 1 - y_diff */

                /* Widen u16 -> u32, then convert to float. */
                const float32x4_t va = vcvtq_f32_u32(vmovl_u16(vld1_u16(a + base)));
                const float32x4_t vb = vcvtq_f32_u32(vmovl_u16(vld1_u16(b + base)));
                const float32x4_t vc = vcvtq_f32_u32(vmovl_u16(vld1_u16(c + base)));
                const float32x4_t vd = vcvtq_f32_u32(vmovl_u16(vld1_u16(d + base)));

                /* Y = A(1-w)(1-h) + B(w)(1-h) + C(h)(1-w) + Dwh */
                float32x4_t acc = vmulq_f32(vmulq_f32(va, inv_x), inv_y);
                acc = vaddq_f32(acc, vmulq_f32(vmulq_f32(vb, vx), inv_y));
                acc = vaddq_f32(acc, vmulq_f32(vmulq_f32(vc, vy), inv_x));
                acc = vaddq_f32(acc, vmulq_f32(vmulq_f32(vd, vx), vy));

                /* float -> u32 (truncating), then saturating narrow to u16. */
                narrowed[half] = vqmovn_u32(vcvtq_u32_f32(acc));
            }

            /* Saturating narrow u16x8 -> u8x8 and store eight output pixels. */
            vst1_u8(dst + done, vqmovn_u16(vcombine_u16(narrowed[0], narrowed[1])));
            handled = BATCH;
        }
#endif
        /* Scalar path: the tail (fewer than eight pixels) or everything when
         * NEON is unavailable. Same operation order as the vector path. */
        for (int k = handled; k < count; k++) {
            const float inv_x = 1.0f - fx[k];
            const float inv_y = 1.0f - fy[k];
            float acc = ((float)a[k] * inv_x) * inv_y;
            acc += ((float)b[k] * fx[k]) * inv_y;
            acc += ((float)c[k] * fy[k]) * inv_x;
            acc += ((float)d[k] * fx[k]) * fy[k];
            /* Mirror the saturating narrow of the vector path. */
            dst[done + k] = (acc >= 255.0f) ? (unsigned char)255 : (unsigned char)acc;
        }

        done += count;
    }
}

测试结果：实际测试发现，NEON 实现反而比纯 C 实现慢不少，猜测原因是需要先把数据重新写入临时内存，导致内存读写过多。

内嵌汇编：采用内嵌汇编进行优化，直接将数据从r寄存器逐个加载进neon寄存器，不先放入临时内存

/**
 * Resize an 8-bit grayscale image using bilinear interpolation, computing
 * eight output pixels at a time with ARMv7 (AArch32) NEON inline assembly.
 *
 * @param src        Source pixels, one byte per pixel, row-major.
 * @param w          Source width in pixels.
 * @param h          Source height in pixels.
 * @param src_stride Distance in bytes between the starts of two consecutive
 *                   source rows.
 * @param w2         Destination width in pixels.
 * @param h2         Destination height in pixels.
 * @param dst        Destination buffer; exactly w2*h2 bytes are written,
 *                   tightly packed (destination stride is w2).
 *
 * Design notes:
 *  - The whole NEON computation for a batch lives in ONE asm statement. The
 *    compiler is free to use NEON/VFP registers between two asm statements
 *    (scalar float math uses s0..s31, which overlap q0..q7), so values must
 *    not be left in q registers from one asm statement to the next.
 *  - Inputs for a batch are staged in small stack arrays and loaded with
 *    vld1; every register the asm touches is listed as a clobber. Only
 *    caller-saved q registers (q0-q3, q8-q15) are used.
 *  - The eight result bytes are stored at the address of the FIRST pixel of
 *    the batch.
 *  - Pixels left over when w2*h2 is not a multiple of eight are computed with
 *    scalar code that applies the operations in the same order. On targets
 *    other than AArch32 with NEON, every pixel takes the scalar path.
 *
 * The function returns without writing anything if a pointer is null or any
 * dimension is not positive.
 */
void resizeBilinearGray_neon_Optimized(unsigned char* src, int w, int h, int src_stride, int w2, int h2,unsigned char* dst)
{
    enum { BATCH = 8 };   /* pixels per asm invocation (one 8-byte store) */

    if (!src || !dst || w <= 0 || h <= 0 || w2 <= 0 || h2 <= 0) {
        return;
    }

    const float x_ratio = ((float)(w - 1)) / w2;
    const float y_ratio = ((float)(h - 1)) / h2;
    const long long total = (long long)w2 * h2;

    /* Per-batch staging buffers: A/B/C/D neighbours and fractional offsets. */
    uint32_t a[BATCH], b[BATCH], c[BATCH], d[BATCH];
    float fx[BATCH], fy[BATCH];

    int row = 0;
    int col = 0;
    long long done = 0;

    while (done < total) {
        const int count = (total - done < BATCH) ? (int)(total - done) : BATCH;

        /* Gather the inputs of up to eight consecutive output pixels. */
        for (int k = 0; k < count; k++) {
            const int x = (int)(x_ratio * col);
            const int y = (int)(y_ratio * row);
            /* Clamp the right/bottom neighbour on the last column/row instead
             * of reading outside the image (its weight is zero there). */
            const int x_right = (x + 1 < w) ? x + 1 : x;
            const unsigned char *top = src + (long)y * src_stride;
            const unsigned char *bottom = (y + 1 < h) ? top + src_stride : top;

            fx[k] = (x_ratio * col) - x;
            fy[k] = (y_ratio * row) - y;
            a[k] = top[x];
            b[k] = top[x_right];
            c[k] = bottom[x];
            d[k] = bottom[x_right];

            if (++col == w2) {
                col = 0;
                row++;
            }
        }

        int handled = 0;
#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
        if (count == BATCH) {
            unsigned char *out = dst + done;

            /* Register use: q8/q9 = x_diff, q10/q11 = y_diff,
             * q12/q13 = 1-x_diff, q14/q15 = 1-y_diff,
             * q0/q1 = accumulators, q2/q3 = current neighbour / narrowing. */
            __asm__ __volatile__ (
                "vld1.32 {d16-d19}, [%[fx]]\n\t"   /* q8,q9   = x_diff            */
                "vld1.32 {d20-d23}, [%[fy]]\n\t"   /* q10,q11 = y_diff            */
                "vmov.f32 q14, #1.0\n\t"
                "vsub.f32 q12, q14, q8\n\t"        /* 1 - x_diff (low)            */
                "vsub.f32 q13, q14, q9\n\t"        /* 1 - x_diff (high)           */
                "vsub.f32 q15, q14, q11\n\t"       /* 1 - y_diff (high)           */
                "vsub.f32 q14, q14, q10\n\t"       /* 1 - y_diff (low)            */

                "vld1.32 {d0-d3}, [%[a]]\n\t"      /* q0,q1 = A                   */
                "vcvt.f32.u32 q0, q0\n\t"          /* uint32x4 -> float32x4       */
                "vcvt.f32.u32 q1, q1\n\t"
                "vmul.f32 q0, q0, q12\n\t"         /* A*(1-x_diff)                */
                "vmul.f32 q1, q1, q13\n\t"
                "vmul.f32 q0, q0, q14\n\t"         /* A*(1-x_diff)*(1-y_diff)     */
                "vmul.f32 q1, q1, q15\n\t"

                "vld1.32 {d4-d7}, [%[b]]\n\t"      /* q2,q3 = B                   */
                "vcvt.f32.u32 q2, q2\n\t"
                "vcvt.f32.u32 q3, q3\n\t"
                "vmul.f32 q2, q2, q8\n\t"          /* B*x_diff                    */
                "vmul.f32 q3, q3, q9\n\t"
                "vmul.f32 q2, q2, q14\n\t"         /* B*x_diff*(1-y_diff)         */
                "vmul.f32 q3, q3, q15\n\t"
                "vadd.f32 q0, q0, q2\n\t"
                "vadd.f32 q1, q1, q3\n\t"

                "vld1.32 {d4-d7}, [%[c]]\n\t"      /* q2,q3 = C                   */
                "vcvt.f32.u32 q2, q2\n\t"
                "vcvt.f32.u32 q3, q3\n\t"
                "vmul.f32 q2, q2, q10\n\t"         /* C*y_diff                    */
                "vmul.f32 q3, q3, q11\n\t"
                "vmul.f32 q2, q2, q12\n\t"         /* C*y_diff*(1-x_diff)         */
                "vmul.f32 q3, q3, q13\n\t"
                "vadd.f32 q0, q0, q2\n\t"
                "vadd.f32 q1, q1, q3\n\t"

                "vld1.32 {d4-d7}, [%[d]]\n\t"      /* q2,q3 = D                   */
                "vcvt.f32.u32 q2, q2\n\t"
                "vcvt.f32.u32 q3, q3\n\t"
                "vmul.f32 q2, q2, q8\n\t"          /* D*x_diff                    */
                "vmul.f32 q3, q3, q9\n\t"
                "vmul.f32 q2, q2, q10\n\t"         /* D*x_diff*y_diff             */
                "vmul.f32 q3, q3, q11\n\t"
                "vadd.f32 q0, q0, q2\n\t"
                "vadd.f32 q1, q1, q3\n\t"

                "vcvt.u32.f32 q0, q0\n\t"          /* float32x4 -> uint32x4       */
                "vcvt.u32.f32 q1, q1\n\t"
                "vqmovn.u32 d4, q0\n\t"            /* narrow: uint32x4 -> uint16x4 */
                "vqmovn.u32 d5, q1\n\t"            /* q2 now holds uint16x8       */
                "vqmovn.u16 d6, q2\n\t"            /* narrow: uint16x8 -> uint8x8 */
                "vst1.8 {d6}, [%[out]]\n\t"        /* store eight output pixels   */
                :
                : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d),
                  [fx] "r"(fx), [fy] "r"(fy), [out] "r"(out)
                : "memory", "q0", "q1", "q2", "q3",
                  "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
            );
            handled = BATCH;
        }
#endif
        /* Scalar path: the tail (fewer than eight pixels) or everything when
         * the asm path is not compiled in. Same operation order as the asm. */
        for (int k = handled; k < count; k++) {
            const float inv_x = 1.0f - fx[k];
            const float inv_y = 1.0f - fy[k];
            float acc = ((float)a[k] * inv_x) * inv_y;
            acc += ((float)b[k] * fx[k]) * inv_y;
            acc += ((float)c[k] * fy[k]) * inv_x;
            acc += ((float)d[k] * fx[k]) * fy[k];
            /* Mirror the saturating narrow (vqmovn) of the asm path. */
            dst[done + k] = (acc >= 255.0f) ? (unsigned char)255 : (unsigned char)acc;
        }

        done += count;
    }
}

测试结果：速度与未优化的 C 实现差不多。就测试情况来看，灰度图的双线性插值缩放并不太适合用 NEON 优化。虽然代码中还可以做指令调度、直接从内存加载数据到 NEON 寄存器等优化，但感觉优化效果有限、性价比不高。Ne10 库中的图像缩放算法同样基于双线性插值（针对 RGBA 图像），其加速比也才 1.5（相对于它自己的 C 实现）。如果处理的是 RGBA 格式的图像数据，感觉会有加速，因为计算所需的像素数据可以批量加载进寄存器，不需要逐个单独加载。

另外还有一个用移位优化的 C 实现，测试结果最快；该版本尚未尝试用 NEON 优化，因此没有测过优化效果。

neon实现图像缩放算法（双线性插值法）

猜你喜欢