Closed
Description
Hi all. Rust 1.47 generates efficient code for the test cases below. However, in later releases the generated code has progressively degraded into something quite awkward. This trend occurs on both x64 and ARM targets. All examples are compiled with `-C opt-level=3 -C lto=fat -C codegen-units=1`.
Code
/// Element-wise sum of two 4-element `f32` arrays, written out lane by lane.
///
/// Returns `[a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]]`.
///
/// The four additions are spelled out explicitly (no loop); compare with
/// `case_2`, which computes the same result with an indexed loop. The exact
/// source form matters here because this function is a codegen test case.
pub fn case_1(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
[
a[0] + b[0],
a[1] + b[1],
a[2] + b[2],
a[3] + b[3],
]
}
/// Element-wise sum of two 4-element `f32` arrays, computed with a loop.
///
/// Starts from a zero-filled result array and overwrites every element with
/// `a[i] + b[i]` for `i` in `0..4`, then returns it. The result is the same
/// as `case_1`; only the way the source expresses it differs, which is the
/// point of this codegen test case.
pub fn case_2(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
// Zero-initialized accumulator; all four slots are assigned in the loop below.
let mut c = [0.0; 4];
for i in 0..4 {
c[i] = a[i] + b[i];
}
c
}
1.47.0 - Ok
https://rust.godbolt.org/z/33WhfM3n8
example::case_1:
mov rax, rdi
vmovups xmm0, xmmword ptr [rsi]
vaddps xmm0, xmm0, xmmword ptr [rdx]
vmovups xmmword ptr [rdi], xmm0
ret
example::case_2:
mov rax, rdi
vmovups xmm0, xmmword ptr [rsi]
vaddps xmm0, xmm0, xmmword ptr [rdx]
vmovups xmmword ptr [rdi], xmm0
ret
1.48.0 - Regression case_1
https://rust.godbolt.org/z/bqEre5dx5
example::case_1:
vmovss xmm0, dword ptr [rdi]
vaddss xmm0, xmm0, dword ptr [rsi]
vmovss xmm1, dword ptr [rdi + 4]
vaddss xmm1, xmm1, dword ptr [rsi + 4]
vmovsd xmm2, qword ptr [rdi + 8]
vmovsd xmm3, qword ptr [rsi + 8]
vaddps xmm2, xmm2, xmm3
vmovd eax, xmm0
vmovd ecx, xmm1
vextractps esi, xmm2, 0
vextractps edx, xmm2, 1
shl rdx, 32
or rdx, rsi
shl rcx, 32
or rax, rcx
ret
example::case_2:
vmovups xmm0, xmmword ptr [rdi]
vaddps xmm0, xmm0, xmmword ptr [rsi]
vmovq rax, xmm0
vpextrq rdx, xmm0, 1
ret
1.50.0 - Regression case_1 and case_2
https://rust.godbolt.org/z/Pj399ezPq
example::case_1:
vmovd xmm0, esi
shr rsi, 32
vmovd xmm1, edi
shr rdi, 32
vpinsrd xmm1, xmm1, edi, 1
vmovd xmm2, esi
vmovd xmm3, edx
shr rdx, 32
vpinsrd xmm3, xmm3, edx, 1
vaddps xmm1, xmm1, xmm3
vmovd xmm3, ecx
shr rcx, 32
vaddss xmm0, xmm0, xmm3
vmovd xmm3, ecx
vaddss xmm2, xmm2, xmm3
vmovd edx, xmm0
vmovd eax, xmm2
shl rax, 32
or rdx, rax
vxorps xmm0, xmm0, xmm0
vblendps xmm0, xmm1, xmm0, 2
vmovq rcx, xmm0
vmovshdup xmm0, xmm1
vmovq rax, xmm0
shl rax, 32
or rax, rcx
ret
example::case_2:
vmovq xmm0, rcx
vmovq xmm1, rdx
vpunpcklqdq xmm0, xmm1, xmm0
vmovq xmm1, rsi
vmovq xmm2, rdi
vpunpcklqdq xmm1, xmm2, xmm1
vaddps xmm0, xmm1, xmm0
vmovq rax, xmm0
vpextrq rdx, xmm0, 1
ret
1.52.0 - Further regression case_2
https://rust.godbolt.org/z/KsvbW5vhW
example::case_1:
vmovd xmm0, esi
shr rsi, 32
vmovd xmm1, edi
shr rdi, 32
vpinsrd xmm1, xmm1, edi, 1
vmovd xmm2, esi
vmovd xmm3, edx
shr rdx, 32
vpinsrd xmm3, xmm3, edx, 1
vaddps xmm1, xmm1, xmm3
vmovd xmm3, ecx
shr rcx, 32
vaddss xmm0, xmm0, xmm3
vmovd xmm3, ecx
vaddss xmm2, xmm2, xmm3
vmovd edx, xmm0
vmovd eax, xmm2
shl rax, 32
or rdx, rax
vxorps xmm0, xmm0, xmm0
vblendps xmm0, xmm0, xmm1, 1
vmovq rcx, xmm0
vmovshdup xmm0, xmm1
vmovq rax, xmm0
shl rax, 32
or rax, rcx
ret
example::case_2:
mov rax, rsi
shld rax, rdi, 32
vmovd xmm0, esi
shr rsi, 32
vmovq xmm1, rax
vmovq xmm2, rsi
vpunpckldq xmm1, xmm2, xmm1
vmovd xmm2, ecx
vaddss xmm0, xmm0, xmm2
vmovd xmm2, edx
shrd rdx, rcx, 32
shr rcx, 32
vmovq xmm3, rdx
vmovq xmm4, rcx
vpunpckldq xmm3, xmm4, xmm3
vmovd xmm4, edi
vaddps xmm1, xmm1, xmm3
vaddps xmm2, xmm2, xmm4
vextractps eax, xmm2, 0
vmovd ecx, xmm0
vextractps edx, xmm1, 0
vextractps esi, xmm1, 1
shl rsi, 32
shl rdx, 32
or rdx, rcx
or rax, rsi
ret