Open
Description
https://godbolt.org/z/oEzY8G6PK
The code:
#include<stddef.h>
__attribute__((noreturn))
void unexpected(size_t i);
void external(int,int,int,int,int);
/*
 * Reproducer: walks the n ints beginning at `start` and passes each value to
 * external() together with arg1..arg4, which are never modified inside the
 * loop. A negative element is flagged as the unlikely case and reported via
 * unexpected() (declared noreturn above) with that element's index.
 */
void foo(int* start, size_t n, int arg1, int arg2, int arg3, int arg4) {
int* curr = start;
int* end = start + n;
while (curr < end) {
int val = *curr;
/* __builtin_expect(..., 0) marks the negative-value branch as unlikely. */
if (__builtin_expect(val < 0, 0)) {
/* curr - start is the index of the offending element; this is the only use of `start` inside the loop. */
unexpected(curr - start);
}
/* Five arguments per call: the loaded value plus the four loop-invariant ones. */
external(val, arg1, arg2, arg3, arg4);
curr++;
}
}
When compiled with -O3, LLVM turns this into the following core loop:
.LBB0_2:
mov edi, dword ptr [r12 + r13]
test edi, edi
js .LBB0_5
mov esi, r15d
mov edx, r14d
mov ecx, ebp
mov r8d, dword ptr [rsp + 4] ; avoidable!
call external
lea rax, [r12 + r13]
add rax, 4
add r13, 4
cmp rax, rbx
jb .LBB0_2
This loop contains a stack reload, as a consequence of LLVM replacing the pointer bump with a (scaled) index, thus taking two registers to store `curr`. GCC, by contrast, produces:
.L3:
mov r8d, r13d
mov ecx, r12d
mov edx, ebp
mov esi, ebx
call "external"
add r15, 4
cmp r15, r14
jnb .L1
.L4: mov edi, DWORD PTR [r15]
test edi, edi
jns .L3
This version doesn't contain the reload, and it also saves two instructions on computing `curr` for the comparison by not having to compute it at all. (LLVM additionally does that computation in an inefficient way: the `lea; add; add` sequence could trivially be `add; lea`, but this is unrelated to the main issue.) As a result, the GCC loop is two instructions shorter.