NEON accelerated matrix multiplication optimization (arbitrary order)

NEON Acceleration Series Articles



Foreword


1. NEON matrix multiplication optimization (any order)

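This article implements matrix multiplication using ARM NEON intrinsics and compares its running time with a plain scalar implementation.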

2. Usage steps

1. Code listing

The code is as follows (example):

#include <stdio.h>
#include <sys/time.h>
#include <stdint.h>
#include <string.h>
#include <arm_neon.h>

/*
 * Elapsed time from t0 to t1, expressed in milliseconds.
 *
 * The seconds and microseconds fields are differenced independently and
 * each difference is scaled to milliseconds before the two are summed, so
 * a negative microsecond difference is simply offset by the seconds term.
 */
double sub_time(struct timeval t1, struct timeval t0)
{
    const double sec_delta = t1.tv_sec - t0.tv_sec;
    const double usec_delta = t1.tv_usec - t0.tv_usec;

    const double ms_from_sec = sec_delta * 1000;
    const double ms_from_usec = usec_delta / 1000;

    return ms_from_sec + ms_from_usec;
}

/* Order of the square matrices used by the benchmark. */
#define N 16

/* How many times each multiplication is repeated between the two timestamps. */
enum { BENCH_ITERATIONS = 10000 };

/*
 * Scalar reference: out = a * b for N x N matrices.
 *
 * Each output element is summed in a local and then assigned, so calling
 * this function repeatedly always leaves the same product in `out` instead
 * of accumulating across calls.
 *
 * The inputs are not const-qualified because, before C23, a float[N][N]
 * argument does not convert implicitly to a pointer to const rows.
 */
static void matmul_scalar(float out[N][N], float a[N][N], float b[N][N])
{
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += a[row][k] * b[k][col];
            }
            out[row][col] = sum;
        }
    }
}

/*
 * NEON version: out = a * b for N x N matrices.
 *
 * Four adjacent columns of one output row are produced at a time:
 *   out[row][col..col+3] = sum over k of a[row][k] * b[k][col..col+3]
 * The scalar a[row][k] is broadcast to all four lanes and multiplied with
 * four consecutive elements of row k of b. Columns left over when N is not
 * a multiple of 4 are handled by a scalar tail loop, so any N works.
 */
static void matmul_neon(float out[N][N], float a[N][N], float b[N][N])
{
    for (int row = 0; row < N; row++) {
        int col = 0;

        for (; col + 4 <= N; col += 4) {
            float32x4_t acc = vdupq_n_f32(0.0f);
            for (int k = 0; k < N; k++) {
                acc = vmlaq_f32(acc, vdupq_n_f32(a[row][k]), vld1q_f32(&b[k][col]));
            }
            vst1q_f32(&out[row][col], acc);
        }

        /* Remaining columns when N % 4 != 0. */
        for (; col < N; col++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += a[row][k] * b[k][col];
            }
            out[row][col] = sum;
        }
    }
}

/* Print an N x N matrix, one row per line, elements separated by tabs. */
static void print_matrix(float m[N][N])
{
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            printf("%f\t", m[row][col]);
        }
        printf("\n");
    }
}

/*
 * Benchmark driver.
 *
 * Fills two pairs of N x N matrices with constants (1 and 2), multiplies
 * each pair BENCH_ITERATIONS times with the scalar and the NEON routine
 * respectively, and prints the elapsed time in milliseconds followed by
 * the resulting matrix for each routine.
 */
int main(void)
{
    float a1[N][N], c1[N][N], a2[N][N], c2[N][N];

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            a1[i][j] = 1;
            c1[i][j] = 2;
            a2[i][j] = 1;
            c2[i][j] = 2;
        }
    }

    float d[N][N] = {{0}};
    float e[N][N] = {{0}};

    struct timeval t1, t0;

    /* Scalar baseline. */
    gettimeofday(&t0, NULL);
    for (int i = 0; i < BENCH_ITERATIONS; i++) {
        matmul_scalar(d, a1, c1);
    }
    gettimeofday(&t1, NULL);
    printf("basic time used: %0.3f.\n", sub_time(t1, t0));
    print_matrix(d);

    /* NEON implementation. */
    gettimeofday(&t0, NULL);
    for (int i = 0; i < BENCH_ITERATIONS; i++) {
        matmul_neon(e, a2, c2);
    }
    gettimeofday(&t1, NULL);
    printf("neon time used: %0.3f.\n", sub_time(t1, t0));
    print_matrix(e);

    return 0;
}

3. Other NEON-accelerated implementations will be covered in follow-up updates

Summary

Criticism and corrections are welcome!

Guess you like

Origin blog.csdn.net/weixin_45206081/article/details/128253348