When accessing an array, C can use either the subscript form `a[i]` or the pointer form `*(a+i)`; in theory the two are completely equivalent. In practice, however, when the compiler optimizes a loop it may not analyze the pointer form as thoroughly, so the pointer version can end up slower than the subscript version.
Time taken by array indexing
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>  /* uint64_t — missing in the original, which therefore fails to compile */
#include <time.h>

/* Number of 64-bit elements traversed: 128 Mi elements = 1 GiB of memory. */
#define N_ELEMS (1024UL * 1024UL * 128UL)

/* Return the current CLOCK_MONOTONIC time in milliseconds.
 * Monotonic clock, so differences are immune to wall-clock adjustments. */
unsigned long get_start_ms() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
}

/* Benchmark: sum 1 GiB of memory using array (subscript) indexing,
 * printing the elapsed milliseconds and the (meaningless) sum. */
int main() {
    unsigned long t = get_start_ms();
    uint64_t* mem = malloc(N_ELEMS * sizeof(uint64_t));
    if (mem == NULL) {               /* original dereferenced an unchecked malloc */
        perror("malloc");
        return 1;
    }
    register uint64_t sum = 0;
    /* NOTE: the buffer is never written, so the summed values are
     * indeterminate — acceptable here because only the traversal time
     * matters, not the sum itself. */
    for (size_t i = 0; i < N_ELEMS; i++) sum += mem[i];
    /* Cast for %llx: uint64_t is not guaranteed to be unsigned long long. */
    printf("[%lums]0x%016llx\n", get_start_ms() - t, (unsigned long long)sum);
    free(mem);
    return 0;
}
The results after compiling and running separately are as follows.
It can be seen that as the optimization level increases, the time spent decreases in turn.
Time taken by pointer indexing
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>  /* uint64_t — missing in the original, which therefore fails to compile */
#include <time.h>

/* Number of 64-bit elements traversed: 128 Mi elements = 1 GiB of memory. */
#define N_ELEMS (1024UL * 1024UL * 128UL)

/* Return the current CLOCK_MONOTONIC time in milliseconds.
 * Monotonic clock, so differences are immune to wall-clock adjustments. */
unsigned long get_start_ms() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
}

/* Benchmark: sum the same 1 GiB buffer, but via a walking pointer
 * instead of subscripting, for comparison with the array version. */
int main() {
    unsigned long t = get_start_ms();
    uint64_t* base = malloc(N_ELEMS * sizeof(uint64_t));
    if (base == NULL) {              /* original dereferenced an unchecked malloc */
        perror("malloc");
        return 1;
    }
    uint64_t* mem = base;            /* walking pointer; keep base for free() */
    uint64_t* end = base + N_ELEMS;
    register uint64_t sum = 0;
    /* Pointer-walk traversal; values are indeterminate (buffer never
     * written) — only the traversal time is of interest. */
    while (mem < end) sum += *mem++;
    /* Cast for %llx: uint64_t is not guaranteed to be unsigned long long. */
    printf("[%lums]0x%016llx\n", get_start_ms() - t, (unsigned long long)sum);
    free(base);                      /* mem == end here; base is the allocation start */
    return 0;
}
The results after compiling and running separately are as follows.
Similarly, as the optimization level increases, the time spent decreases. Note, however, that at -O1 the pointer and array versions take about the same time, while at the two higher optimization levels the array version is roughly 10 ms faster than the pointer version — not a small difference.
Compiler optimization analysis
For the reasons for this phenomenon, we need to analyze from the assembly code.
1. Optimization level -O1
At this level, the difference between the two is not obvious.
- array index
- pointer index
2. Optimization level - O2
At this level, vectorization is introduced, but array indexing is more vectorized and has fewer instructions.
- array index
- pointer index
## ----------------------------------------------------------------------
## clang -O2 output (AT&T syntax, SSE2) for the pointer-index summation.
## This is a fragment: it begins mid-function and ends at a label, so the
## setup of rax/rdx/rcx/r9 happens before this excerpt.
## Apparent register roles (inferred from this fragment — confirm against
## the full disassembly):
##   r9  = total element count
##   rax = base pointer used by the vector loops (biased by 8 below)
##   rdx = running pointer for the scalar tail, rcx = end pointer
##   rbx = remainder-group counter, then the final 64-bit scalar sum
## ----------------------------------------------------------------------
## Fewer than 4 elements: skip the vector path entirely, go scalar.
cmpq $4, %r9
jae LBB1_2
## %bb.1:
## implicit-def: $rbx
jmp LBB1_11
LBB1_2:
## r8 = count rounded down to a multiple of 4 (the vectorizable portion).
movq %r9, %r8
andq $-4, %r8
## rsi = number of 4-element groups; rbx = rsi % 4, i.e. groups left over
## after the 4x-unrolled main loop below.
leaq -4(%r8), %rdi
movq %rdi, %rsi
shrq $2, %rsi
incq %rsi
movl %esi, %ebx
andl $3, %ebx
## Fewer than 16 vectorizable elements (r8 < 16): skip the unrolled loop.
cmpq $12, %rdi
jae LBB1_4
## %bb.3:
## Zero both 128-bit accumulators and the element index (rdi).
pxor %xmm0, %xmm0
xorl %edi, %edi
pxor %xmm1, %xmm1
testq %rbx, %rbx
jne LBB1_7
jmp LBB1_9
LBB1_4:
## rsi = -(number of unrolled iterations) so the loop can count up to 0.
movl $1, %edi
subq %rsi, %rdi
leaq -1(%rbx,%rdi), %rsi
pxor %xmm0, %xmm0
xorl %edi, %edi
pxor %xmm1, %xmm1
.p2align 4, 0x90
## Main loop, unrolled 4x: eight unaligned 16-byte loads = 16 qwords
## accumulated per iteration through interleaved paddq chains.
LBB1_5: ## =>This Inner Loop Header: Depth=1
movdqu 8(%rax,%rdi,8), %xmm2
paddq %xmm0, %xmm2
movdqu 24(%rax,%rdi,8), %xmm0
paddq %xmm1, %xmm0
movdqu 40(%rax,%rdi,8), %xmm1
movdqu 56(%rax,%rdi,8), %xmm3
movdqu 72(%rax,%rdi,8), %xmm4
paddq %xmm1, %xmm4
paddq %xmm2, %xmm4
movdqu 88(%rax,%rdi,8), %xmm2
paddq %xmm3, %xmm2
paddq %xmm0, %xmm2
movdqu 104(%rax,%rdi,8), %xmm0
paddq %xmm4, %xmm0
movdqu 120(%rax,%rdi,8), %xmm1
paddq %xmm2, %xmm1
## Advance 16 elements; rsi counts unrolled iterations up toward zero.
addq $16, %rdi
addq $4, %rsi
jne LBB1_5
## %bb.6:
## No leftover 4-element groups? Jump straight to the reduction.
testq %rbx, %rbx
je LBB1_9
LBB1_7:
## Remainder vector loop: one 4-qword group (two 16-byte loads) per pass;
## rbx counts up from -(groups remaining) to zero.
leaq 24(%rax,%rdi,8), %rax
negq %rbx
.p2align 4, 0x90
LBB1_8: ## =>This Inner Loop Header: Depth=1
movdqu -16(%rax), %xmm2
paddq %xmm2, %xmm0
movdqu (%rax), %xmm2
paddq %xmm2, %xmm1
addq $32, %rax
incq %rbx
jne LBB1_8
LBB1_9:
## Horizontal reduction: fold the two accumulators, then add the high
## qword of xmm0 into the low qword; the scalar result lands in rbx.
paddq %xmm1, %xmm0
pshufd $78, %xmm0, %xmm1 ## xmm1 = xmm0[2,3,0,1]
paddq %xmm0, %xmm1
movq %xmm1, %rbx
## Count already a multiple of 4 -> no scalar tail needed.
cmpq %r8, %r9
je LBB1_12
## %bb.10:
## Scalar tail: add the last (count % 4) qwords one at a time.
leaq (%rdx,%r8,8), %rdx
.p2align 4, 0x90
LBB1_11: ## =>This Inner Loop Header: Depth=1
addq (%rdx), %rbx
addq $8, %rdx
cmpq %rcx, %rdx
jb LBB1_11
LBB1_12:
3. Optimization level -O2 -march=native
This level enables the CPU's wider vector extensions via -march=native (the listing below uses 256-bit ymm registers). The array-index version is still more thoroughly vectorized and uses fewer instructions.
- array index
- pointer index
## ----------------------------------------------------------------------
## clang -O2 -march=native output (AT&T syntax) for the same pointer-index
## summation, now using 256-bit ymm registers (AVX2 vpaddq).
## Fragment only — the setup of rax/rdx/rcx/r9 happens before this excerpt.
## Apparent register roles (inferred — confirm against full disassembly):
##   r9  = total element count
##   rax = base pointer used by the vector loops
##   rdx = running pointer for the scalar tail, rcx = end pointer
##   rbx = remainder-group counter, then the final 64-bit scalar sum
## ----------------------------------------------------------------------
## Fewer than 16 elements: skip the vector path entirely, go scalar.
cmpq $16, %r9
jae LBB1_2
## %bb.1:
## implicit-def: $rbx
jmp LBB1_11
LBB1_2:
## r8 = count rounded down to a multiple of 16 (the vectorizable portion).
movq %r9, %r8
andq $-16, %r8
## rsi = number of 16-element groups; rbx = rsi % 4, i.e. groups left over
## after the 4x-unrolled main loop below.
leaq -16(%r8), %rdi
movq %rdi, %rsi
shrq $4, %rsi
addq $1, %rsi
movl %esi, %ebx
andl $3, %ebx
## Fewer than 64 vectorizable elements (r8 < 64): skip the unrolled loop.
cmpq $48, %rdi
jae LBB1_4
## %bb.3:
## Zero the four 256-bit accumulators (vpxor on xmmN clears all of ymmN)
## and the element index (rdi).
vpxor %xmm0, %xmm0, %xmm0
xorl %edi, %edi
vpxor %xmm1, %xmm1, %xmm1
vpxor %xmm2, %xmm2, %xmm2
vpxor %xmm3, %xmm3, %xmm3
testq %rbx, %rbx
jne LBB1_7
jmp LBB1_9
LBB1_4:
## rsi = -(number of unrolled iterations) so the loop can count up to 0.
movl $1, %edi
subq %rsi, %rdi
leaq (%rbx,%rdi), %rsi
addq $-1, %rsi
vpxor %xmm0, %xmm0, %xmm0
xorl %edi, %edi
vpxor %xmm1, %xmm1, %xmm1
vpxor %xmm2, %xmm2, %xmm2
vpxor %xmm3, %xmm3, %xmm3
.p2align 4, 0x90
## Main loop, unrolled 4x: sixteen fused load+add vpaddq ops = 64 qwords
## accumulated per iteration across four independent ymm accumulators.
LBB1_5: ## =>This Inner Loop Header: Depth=1
vpaddq 8(%rax,%rdi,8), %ymm0, %ymm0
vpaddq 40(%rax,%rdi,8), %ymm1, %ymm1
vpaddq 72(%rax,%rdi,8), %ymm2, %ymm2
vpaddq 104(%rax,%rdi,8), %ymm3, %ymm3
vpaddq 136(%rax,%rdi,8), %ymm0, %ymm0
vpaddq 168(%rax,%rdi,8), %ymm1, %ymm1
vpaddq 200(%rax,%rdi,8), %ymm2, %ymm2
vpaddq 232(%rax,%rdi,8), %ymm3, %ymm3
vpaddq 264(%rax,%rdi,8), %ymm0, %ymm0
vpaddq 296(%rax,%rdi,8), %ymm1, %ymm1
vpaddq 328(%rax,%rdi,8), %ymm2, %ymm2
vpaddq 360(%rax,%rdi,8), %ymm3, %ymm3
vpaddq 392(%rax,%rdi,8), %ymm0, %ymm0
vpaddq 424(%rax,%rdi,8), %ymm1, %ymm1
vpaddq 456(%rax,%rdi,8), %ymm2, %ymm2
vpaddq 488(%rax,%rdi,8), %ymm3, %ymm3
## Advance 64 elements; rsi counts unrolled iterations up toward zero.
addq $64, %rdi
addq $4, %rsi
jne LBB1_5
## %bb.6:
## No leftover 16-element groups? Jump straight to the reduction.
testq %rbx, %rbx
je LBB1_9
LBB1_7:
## Remainder vector loop: one 16-qword group (four 32-byte vpaddq) per
## pass; rbx counts up from -(groups remaining) to zero.
leaq (%rax,%rdi,8), %rax
addq $104, %rax
negq %rbx
.p2align 4, 0x90
LBB1_8: ## =>This Inner Loop Header: Depth=1
vpaddq -96(%rax), %ymm0, %ymm0
vpaddq -64(%rax), %ymm1, %ymm1
vpaddq -32(%rax), %ymm2, %ymm2
vpaddq (%rax), %ymm3, %ymm3
## subq $-128 (add 128) has a shorter encoding than addq $128.
subq $-128, %rax
incq %rbx
jne LBB1_8
LBB1_9:
## Horizontal reduction: fold the four ymm accumulators into one, add the
## upper 128-bit lane to the lower, then the high qword to the low qword;
## the scalar result lands in rbx.
vpaddq %ymm3, %ymm1, %ymm1
vpaddq %ymm2, %ymm0, %ymm0
vpaddq %ymm1, %ymm0, %ymm0
vextracti128 $1, %ymm0, %xmm1
vpaddq %ymm1, %ymm0, %ymm0
vpshufd $78, %xmm0, %xmm1 ## xmm1 = xmm0[2,3,0,1]
vpaddq %xmm1, %xmm0, %xmm0
vmovq %xmm0, %rbx
## Count already a multiple of 16 -> no scalar tail needed.
cmpq %r8, %r9
je LBB1_12
## %bb.10:
## Scalar tail: add the last (count % 16) qwords one at a time.
leaq (%rdx,%r8,8), %rdx
.p2align 4, 0x90
LBB1_11: ## =>This Inner Loop Header: Depth=1
addq (%rdx), %rbx
addq $8, %rdx
cmpq %rcx, %rdx
jb LBB1_11
LBB1_12:
As the listings show, the compiler has a mature set of optimizations for vectorizing array-index loops under a for loop. Rashly switching to pointer indexing can make the code slower, so such a change should be made with caution.