Description
Here is a simple loop to find the index of the first different bit in two arrays:
unsigned long crit_bit(unsigned long *a, unsigned long *b, unsigned long size) {
unsigned long ret = 0;
#pragma nounroll
for (unsigned long i = 0; i < size; ++i) {
unsigned long diff = a[i] ^ b[i];
int bits = 8 * sizeof(unsigned long);
ret += __builtin_ctzg(diff, bits);
if (diff) {
break;
}
}
return ret;
}
On x86-64 with -O3 -mbmi
, the loop generates
.LBB0_4:
mov r9, rax
mov r10, qword ptr [rdi + 8*r8]
mov r11, qword ptr [rsi + 8*r8]
mov rax, r11
xor rax, r10
tzcnt rax, rax
mov rbx, r11
xor rbx, r10
cmove rax, rcx
add rax, r9
xor r11, r10
jne .LBB0_6
lea r9, [r8 + 1]
cmp rdx, r8
mov r8, r9
jne .LBB0_4
First, it's using cmove
to get 64 in case the input was zero, but tzcnt
already returns 64 in that case so that's redundant.
Second, it could use the flags after tzcnt
to handle the if (diff) break
, but instead it re-does the comparison.
Better code would look something like this:
xor rax, r10
tzcnt rax, rax
lea rax, [rax + r9]
jc .LBB0_6