Description
Statement
After #84628, I found a missed optimization for a C++ code pattern that is widely used in the C++ code that Verilator generates from CIRCT-generated Verilog for RTL circuit simulation.
Without vectorization, the generated code is longer. Even worse, when simulating large RTL designs, the branch predictors in current CPUs usually cope poorly because of their limited capacity. This pattern should once again compile to vector operations, as it did originally, to avoid so many hard-to-predict branches.
Reduced reproducer
TL;DR: https://godbolt.org/z/3qK3jeo1E
Look at the following C++ code:
/*
 * State for the reduced reproducer: one output word, a mode flag, and two
 * groups of four candidate inputs (selector A is index-driven, selector B is
 * flag-driven). Field order is significant: the quoted assembly addresses
 * these members by fixed offsets from the struct pointer.
 */
struct a_struct {
/* Output word; overwritten by some_func on every call. */
unsigned int value;
/* Non-zero selects the selector B inputs, zero selects the selector A inputs. */
unsigned int some_cond;
/* selector A */
/* Compared against 0U..3U to decide which of value_0..value_3 contributes. */
unsigned int index;
unsigned int value_0;
unsigned int value_1;
unsigned int value_2;
unsigned int value_3;
/* selector B */
/* value_N_b contributes only when the matching use_N flag below is true. */
unsigned int value_0_b;
unsigned int value_1_b;
unsigned int value_2_b;
unsigned int value_3_b;
/* Independent enable flags for value_0_b..value_3_b. */
bool use_0;
bool use_1;
bool use_2;
bool use_3;
};
/*
 * Stores into a.value the bitwise OR of four masked inputs.
 *
 * When a.some_cond is non-zero, value_N_b is included iff use_N is true.
 * Otherwise value_N is included iff a.index == N, so an index outside 0..3
 * produces 0.
 *
 * The four-way `cond ? x : 0` OR-chain in each branch is the pattern this
 * report is about; keep its shape intact when reproducing the issue.
 */
void some_func(a_struct &a) {
if (a.some_cond) {
/* selector B: each input is gated by its own boolean flag. */
a.value = ( a.use_0 ? a.value_0_b : 0) |
( a.use_1 ? a.value_1_b : 0) |
( a.use_2 ? a.value_2_b : 0) |
( a.use_3 ? a.value_3_b : 0);
}
else {
/* selector A: at most one comparison against a.index can be true. */
a.value = ( 0U == a.index ? a.value_0 : 0) |
( 1U == a.index ? a.value_1 : 0) |
( 2U == a.index ? a.value_2 : 0) |
( 3U == a.index ? a.value_3 : 0);
}
}
Compile (on x86-64 target): clang++ -O3 -S -c -mllvm --jump-is-expensive test.cpp
Before commit 56b3222, we get assembly like this, using vector operations throughout:
_Z9some_funcR8a_struct: # @_Z9some_funcR8a_struct
.cfi_startproc
# %bb.0:
cmpl $0, 4(%rdi)
je .LBB0_3
# %bb.1:
movd 44(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
punpcklbw %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
punpcklwd %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3]
pxor %xmm1, %xmm1
pcmpeqb %xmm0, %xmm1
movdqu 28(%rdi), %xmm0
pandn %xmm0, %xmm1
pshufd $238, %xmm1, %xmm0 # xmm0 = xmm1[2,3,2,3]
por %xmm1, %xmm0
pshufd $85, %xmm0, %xmm1 # xmm1 = xmm0[1,1,1,1]
por %xmm0, %xmm1
movd %xmm1, %eax
movl %eax, (%rdi)
retq
.LBB0_3:
movdqu 8(%rdi), %xmm0
movdqu 12(%rdi), %xmm1
pshufd $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0]
pcmpeqd .LCPI0_0(%rip), %xmm0
pand %xmm1, %xmm0
pshufd $238, %xmm0, %xmm1 # xmm1 = xmm0[2,3,2,3]
por %xmm0, %xmm1
pshufd $85, %xmm1, %xmm0 # xmm0 = xmm1[1,1,1,1]
por %xmm1, %xmm0
movd %xmm0, %eax
movl %eax, (%rdi)
retq
However, after commit 56b3222, we get poor, branch-heavy code like this:
_Z9some_funcR8a_struct: # @_Z9some_funcR8a_struct
.cfi_startproc
# %bb.0:
cmpl $0, 4(%rdi)
je .LBB0_8
# %bb.1:
xorl %ecx, %ecx
cmpb $0, 44(%rdi)
movl $0, %edx
jne .LBB0_2
# %bb.3:
cmpb $0, 45(%rdi)
movl $0, %eax
jne .LBB0_4
.LBB0_5:
orl %edx, %eax
cmpb $0, 46(%rdi)
je .LBB0_7
.LBB0_6:
movl 36(%rdi), %ecx
.LBB0_7:
orl %ecx, %eax
movzbl 47(%rdi), %edx
movl $40, %ecx
xorl %esi, %esi
testb %dl, %dl
je .LBB0_17
.LBB0_16:
movl (%rdi,%rcx), %esi
.LBB0_17:
orl %esi, %eax
movl %eax, (%rdi)
retq
.LBB0_8:
movl 8(%rdi), %ecx
xorl %edx, %edx
movl $0, %esi
testl %ecx, %ecx
je .LBB0_9
# %bb.10:
movl $0, %eax
cmpl $1, %ecx
je .LBB0_11
.LBB0_12:
orl %esi, %eax
cmpl $2, %ecx
jne .LBB0_14
.LBB0_13:
movl 20(%rdi), %edx
.LBB0_14:
orl %edx, %eax
cmpl $3, %ecx
sete %dl
movl $24, %ecx
xorl %esi, %esi
testb %dl, %dl
jne .LBB0_16
jmp .LBB0_17
.LBB0_2:
movl 28(%rdi), %edx
cmpb $0, 45(%rdi)
movl $0, %eax
je .LBB0_5
.LBB0_4:
movl 32(%rdi), %eax
orl %edx, %eax
cmpb $0, 46(%rdi)
jne .LBB0_6
jmp .LBB0_7
.LBB0_9:
movl 12(%rdi), %esi
movl $0, %eax
cmpl $1, %ecx
jne .LBB0_12
.LBB0_11:
movl 16(%rdi), %eax
orl %esi, %eax
cmpl $2, %ecx
je .LBB0_13
jmp .LBB0_14
I tried reverting commit 56b3222 on top of the recent main-branch commit a51d263, and that fixed the problem.