Array indexing vs. pointer indexing in C loops: effect on compiler optimization and runtime

When accessing an array in C, you can use either the subscript form a[i] or the pointer form *(a+i); in theory the two are completely equivalent. In practice, however, when the compiler optimizes a loop it may not analyze the pointer form as thoroughly, so the pointer version can end up slower than the array-index version.
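
As a minimal illustration (a toy array here, not the benchmark used below), the two forms compute exactly the same thing:

#include <stdio.h>

int main() {
    int a[4] = {1, 2, 3, 4};
    int sum1 = 0, sum2 = 0;
    for (int i = 0; i < 4; i++) sum1 += a[i];      /* subscript form */
    for (int i = 0; i < 4; i++) sum2 += *(a + i);  /* pointer form, same semantics */
    printf("%d %d\n", sum1, sum2);                 /* prints: 10 10 */
    return 0;
}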

Timing the array-index version

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

/* Returns a monotonic timestamp in milliseconds. */
unsigned long get_start_ms() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
}

int main() {
    unsigned long t = get_start_ms();
    /* 128M 64-bit elements (1 GiB); contents are whatever malloc hands back. */
    uint64_t* mem = malloc(1024*1024*128*sizeof(uint64_t));
    register uint64_t sum = 0;
    /* Array-index (subscript) form of the summation loop. */
    for(int i = 0; i < 1024*1024*128; i++) sum += mem[i];
    printf("[%lums]0x%016llx\n", get_start_ms()-t, sum);
}
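
One caveat: the malloc'd buffer is never written before the timed loop, so the measurement also includes the page faults taken on first touch, and the printed checksum is simply whatever the fresh pages contain (typically zero). If you want to time only the summation itself, one option, sketched here as an assumption rather than part of the original benchmark, is to touch the buffer before starting the clock:

    uint64_t* mem = malloc(1024*1024*128*sizeof(uint64_t));
    memset(mem, 0x5a, 1024*1024*128*sizeof(uint64_t)); /* needs <string.h>; commits every page */
    unsigned long t = get_start_ms();                  /* start timing only after the warm-up */

Because the first-touch cost is the same for both versions, it does not change the comparison below, only the absolute numbers.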

Compiling and running the program at each optimization level gives the results below (shown as a screenshot in the original post).
(screenshot: timing output of the array-index version at each optimization level)
As the optimization level increases, the runtime decreases accordingly.

Timing the pointer-index version

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

/* Returns a monotonic timestamp in milliseconds. */
unsigned long get_start_ms() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
}

int main() {
    unsigned long t = get_start_ms();
    /* Same 1 GiB buffer as before. */
    uint64_t* mem = malloc(1024*1024*128*sizeof(uint64_t));
    uint64_t* end = mem + 1024*1024*128;
    register uint64_t sum = 0;
    /* Pointer-index form: walk a pointer from mem to end. */
    while(mem<end) sum += *mem++;
    printf("[%lums]0x%016llx\n", get_start_ms()-t, sum);
}

Compiling and running this version at each optimization level gives the results below (again a screenshot in the original post).
(screenshot: timing output of the pointer-index version at each optimization level)
Again, the runtime decreases as the optimization level rises. Notice, however, that at -O1 the pointer and array versions take about the same time, while at the two higher levels the array version is roughly 10 ms faster than the pointer version, which is not a small gap.

Compiler optimization analysis

To understand why, we need to look at the generated assembly.

1. Optimization level -O1

At this level, the difference between the two is not obvious; a rough C-level sketch of what both versions reduce to follows the two listings.

  • array index
    (assembly listing shown as a screenshot in the original post)
  • pointer index
    (assembly listing shown as a screenshot in the original post)
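
Since vectorization only appears at -O2 (next section), at -O1 both versions come down to a plain scalar loop: one element per iteration, a single accumulator, no unrolling and no SIMD. The sketch below is illustrative (it reuses mem from the benchmark), not the compiler's literal output:

    /* Conceptual -O1 shape of both versions: scalar, one element per iteration. */
    uint64_t sum = 0;
    for (uint64_t* p = mem; p < mem + 1024*1024*128; p++)
        sum += *p;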

2. Optimization level -O2

At this level the compiler introduces vectorization, but the array-index version is vectorized more thoroughly and uses fewer instructions; a C-level sketch of the pointer version's structure follows its listing.

  • array index
    (assembly listing shown as a screenshot in the original post)
  • pointer index (listing below)
	cmpq	$4, %r9
	jae	LBB1_2
## %bb.1:
                                        ## implicit-def: $rbx
	jmp	LBB1_11
LBB1_2:
	movq	%r9, %r8
	andq	$-4, %r8
	leaq	-4(%r8), %rdi
	movq	%rdi, %rsi
	shrq	$2, %rsi
	incq	%rsi
	movl	%esi, %ebx
	andl	$3, %ebx
	cmpq	$12, %rdi
	jae	LBB1_4
## %bb.3:
	pxor	%xmm0, %xmm0
	xorl	%edi, %edi
	pxor	%xmm1, %xmm1
	testq	%rbx, %rbx
	jne	LBB1_7
	jmp	LBB1_9
LBB1_4:
	movl	$1, %edi
	subq	%rsi, %rdi
	leaq	-1(%rbx,%rdi), %rsi
	pxor	%xmm0, %xmm0
	xorl	%edi, %edi
	pxor	%xmm1, %xmm1
	.p2align	4, 0x90
LBB1_5:                                 ## =>This Inner Loop Header: Depth=1
	movdqu	8(%rax,%rdi,8), %xmm2
	paddq	%xmm0, %xmm2
	movdqu	24(%rax,%rdi,8), %xmm0
	paddq	%xmm1, %xmm0
	movdqu	40(%rax,%rdi,8), %xmm1
	movdqu	56(%rax,%rdi,8), %xmm3
	movdqu	72(%rax,%rdi,8), %xmm4
	paddq	%xmm1, %xmm4
	paddq	%xmm2, %xmm4
	movdqu	88(%rax,%rdi,8), %xmm2
	paddq	%xmm3, %xmm2
	paddq	%xmm0, %xmm2
	movdqu	104(%rax,%rdi,8), %xmm0
	paddq	%xmm4, %xmm0
	movdqu	120(%rax,%rdi,8), %xmm1
	paddq	%xmm2, %xmm1
	addq	$16, %rdi
	addq	$4, %rsi
	jne	LBB1_5
## %bb.6:
	testq	%rbx, %rbx
	je	LBB1_9
LBB1_7:
	leaq	24(%rax,%rdi,8), %rax
	negq	%rbx
	.p2align	4, 0x90
LBB1_8:                                 ## =>This Inner Loop Header: Depth=1
	movdqu	-16(%rax), %xmm2
	paddq	%xmm2, %xmm0
	movdqu	(%rax), %xmm2
	paddq	%xmm2, %xmm1
	addq	$32, %rax
	incq	%rbx
	jne	LBB1_8
LBB1_9:
	paddq	%xmm1, %xmm0
	pshufd	$78, %xmm0, %xmm1       ## xmm1 = xmm0[2,3,0,1]
	paddq	%xmm0, %xmm1
	movq	%xmm1, %rbx
	cmpq	%r8, %r9
	je	LBB1_12
## %bb.10:
	leaq	(%rdx,%r8,8), %rdx
	.p2align	4, 0x90
LBB1_11:                                ## =>This Inner Loop Header: Depth=1
	addq	(%rdx), %rbx
	addq	$8, %rdx
	cmpq	%rcx, %rdx
	jb	LBB1_11
LBB1_12:
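
Reading the pointer-version listing above: the main loop (LBB1_5) loads 16 uint64_t per iteration with movdqu and accumulates them with paddq into 128-bit xmm registers (two 64-bit lanes each), a shorter vector loop (LBB1_8) handles groups of four, the lanes are folded together in LBB1_9, and any leftover elements go through the scalar tail at LBB1_11. Ignoring the extra unrolling, the structure corresponds roughly to this C sketch (illustrative names, reusing mem from the benchmark):

    /* Rough C equivalent of the -O2 pointer loop: independent partial sums
       (the xmm lanes), then a horizontal reduction, then a scalar tail. */
    size_t n = 1024*1024*128, i = 0;
    uint64_t lane0 = 0, lane1 = 0, lane2 = 0, lane3 = 0;
    for (; i + 4 <= n; i += 4) {
        lane0 += mem[i + 0];
        lane1 += mem[i + 1];
        lane2 += mem[i + 2];
        lane3 += mem[i + 3];
    }
    uint64_t sum = lane0 + lane1 + lane2 + lane3;   /* horizontal reduction (LBB1_9) */
    for (; i < n; i++) sum += mem[i];               /* scalar tail (LBB1_11) */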

3. Optimization level -O2 -march=native

At this level, -march=native makes AVX-512 available; the array-index version is still vectorized to a higher degree and uses fewer instructions (the pointer version's final reduction is sketched in C after its listing).

  • array index
    (assembly listing shown as a screenshot in the original post)
  • pointer index (listing below)
	cmpq	$16, %r9
	jae	LBB1_2
## %bb.1:
                                        ## implicit-def: $rbx
	jmp	LBB1_11
LBB1_2:
	movq	%r9, %r8
	andq	$-16, %r8
	leaq	-16(%r8), %rdi
	movq	%rdi, %rsi
	shrq	$4, %rsi
	addq	$1, %rsi
	movl	%esi, %ebx
	andl	$3, %ebx
	cmpq	$48, %rdi
	jae	LBB1_4
## %bb.3:
	vpxor	%xmm0, %xmm0, %xmm0
	xorl	%edi, %edi
	vpxor	%xmm1, %xmm1, %xmm1
	vpxor	%xmm2, %xmm2, %xmm2
	vpxor	%xmm3, %xmm3, %xmm3
	testq	%rbx, %rbx
	jne	LBB1_7
	jmp	LBB1_9
LBB1_4:
	movl	$1, %edi
	subq	%rsi, %rdi
	leaq	(%rbx,%rdi), %rsi
	addq	$-1, %rsi
	vpxor	%xmm0, %xmm0, %xmm0
	xorl	%edi, %edi
	vpxor	%xmm1, %xmm1, %xmm1
	vpxor	%xmm2, %xmm2, %xmm2
	vpxor	%xmm3, %xmm3, %xmm3
	.p2align	4, 0x90
LBB1_5:                                 ## =>This Inner Loop Header: Depth=1
	vpaddq	8(%rax,%rdi,8), %ymm0, %ymm0
	vpaddq	40(%rax,%rdi,8), %ymm1, %ymm1
	vpaddq	72(%rax,%rdi,8), %ymm2, %ymm2
	vpaddq	104(%rax,%rdi,8), %ymm3, %ymm3
	vpaddq	136(%rax,%rdi,8), %ymm0, %ymm0
	vpaddq	168(%rax,%rdi,8), %ymm1, %ymm1
	vpaddq	200(%rax,%rdi,8), %ymm2, %ymm2
	vpaddq	232(%rax,%rdi,8), %ymm3, %ymm3
	vpaddq	264(%rax,%rdi,8), %ymm0, %ymm0
	vpaddq	296(%rax,%rdi,8), %ymm1, %ymm1
	vpaddq	328(%rax,%rdi,8), %ymm2, %ymm2
	vpaddq	360(%rax,%rdi,8), %ymm3, %ymm3
	vpaddq	392(%rax,%rdi,8), %ymm0, %ymm0
	vpaddq	424(%rax,%rdi,8), %ymm1, %ymm1
	vpaddq	456(%rax,%rdi,8), %ymm2, %ymm2
	vpaddq	488(%rax,%rdi,8), %ymm3, %ymm3
	addq	$64, %rdi
	addq	$4, %rsi
	jne	LBB1_5
## %bb.6:
	testq	%rbx, %rbx
	je	LBB1_9
LBB1_7:
	leaq	(%rax,%rdi,8), %rax
	addq	$104, %rax
	negq	%rbx
	.p2align	4, 0x90
LBB1_8:                                 ## =>This Inner Loop Header: Depth=1
	vpaddq	-96(%rax), %ymm0, %ymm0
	vpaddq	-64(%rax), %ymm1, %ymm1
	vpaddq	-32(%rax), %ymm2, %ymm2
	vpaddq	(%rax), %ymm3, %ymm3
	subq	$-128, %rax
	incq	%rbx
	jne	LBB1_8
LBB1_9:
	vpaddq	%ymm3, %ymm1, %ymm1
	vpaddq	%ymm2, %ymm0, %ymm0
	vpaddq	%ymm1, %ymm0, %ymm0
	vextracti128	$1, %ymm0, %xmm1
	vpaddq	%ymm1, %ymm0, %ymm0
	vpshufd	$78, %xmm0, %xmm1       ## xmm1 = xmm0[2,3,0,1]
	vpaddq	%xmm1, %xmm0, %xmm0
	vmovq	%xmm0, %rbx
	cmpq	%r8, %r9
	je	LBB1_12
## %bb.10:
	leaq	(%rdx,%r8,8), %rdx
	.p2align	4, 0x90
LBB1_11:                                ## =>This Inner Loop Header: Depth=1
	addq	(%rdx), %rbx
	addq	$8, %rdx
	cmpq	%rcx, %rdx
	jb	LBB1_11
LBB1_12:
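
In this listing the main loop (LBB1_5) keeps four independent 256-bit accumulators (ymm0 through ymm3), each holding four uint64_t lanes, and consumes 64 elements per iteration. The block at LBB1_9 then folds everything back into a single scalar. Roughly, in C (acc0..acc3 are illustrative stand-ins for the four ymm accumulators):

    /* Sketch of the final reduction in LBB1_9: add the four vector
       accumulators together lane-wise, then sum the four lanes. */
    uint64_t folded[4];
    for (int k = 0; k < 4; k++)
        folded[k] = acc0[k] + acc1[k] + acc2[k] + acc3[k];     /* vpaddq ymm,ymm,ymm */
    uint64_t sum = folded[0] + folded[1] + folded[2] + folded[3];
    /* vextracti128 + vpshufd + vpaddq + vmovq in the listing */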

It can be seen that the compiler has a mature set of vectorization strategies for the array-index form of a for loop. Switching to pointer indexing on a whim can actually slow the code down, so do so with care.

Origin: blog.csdn.net/u011570312/article/details/121265729