Skip to content

Basic vectorization performance regression from 1.48.0 onwards #85265

Closed
@shampoofactory

Description

@shampoofactory

Hi all. The test cases below compile to efficient, vectorized code on 1.47.0. From 1.48.0 onwards, however, the generated code has become progressively worse (see the listings for 1.48.0, 1.50.0 and 1.52.0 below). The same trend occurs on both x64 and ARM targets. All examples are compiled with `-C opt-level=3 -C lto=fat -C codegen-units=1`.

Code

/// Adds `a` and `b` element-wise, with each of the four lanes written out by hand.
///
/// Returns `[a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]]`.
///
/// This is the manually unrolled reproducer. Keep the expression shape exactly
/// as written: the report compares the code generated for this specific form
/// across compiler versions.
pub fn case_1(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    // One independent scalar addition per output element; no loop is involved.
    [
        a[0] + b[0],
        a[1] + b[1],
        a[2] + b[2],
        a[3] + b[3],
    ]
}

/// Adds `a` and `b` element-wise using an indexed loop over a temporary array.
///
/// Computes the same result as the hand-unrolled variant (`c[i] = a[i] + b[i]`
/// for `i` in `0..4`), but expressed as a loop so the two source forms can be
/// compared in the generated code.
pub fn case_2(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    // Zero-initialised scratch array; every element is overwritten by the loop below.
    let mut c = [0.0; 4];
    // Fixed trip count of 4, matching the array length, so all indices 0..=3 are written.
    for i in 0..4 {
        c[i] = a[i] + b[i];
    }
    c
}

1.47.0 - Ok

https://rust.godbolt.org/z/33WhfM3n8

example::case_1:
        mov     rax, rdi
        vmovups xmm0, xmmword ptr [rsi]
        vaddps  xmm0, xmm0, xmmword ptr [rdx]
        vmovups xmmword ptr [rdi], xmm0
        ret

example::case_2:
        mov     rax, rdi
        vmovups xmm0, xmmword ptr [rsi]
        vaddps  xmm0, xmm0, xmmword ptr [rdx]
        vmovups xmmword ptr [rdi], xmm0
        ret

1.48.0 - Regression case_1

https://rust.godbolt.org/z/bqEre5dx5

example::case_1:
        vmovss  xmm0, dword ptr [rdi]
        vaddss  xmm0, xmm0, dword ptr [rsi]
        vmovss  xmm1, dword ptr [rdi + 4]
        vaddss  xmm1, xmm1, dword ptr [rsi + 4]
        vmovsd  xmm2, qword ptr [rdi + 8]
        vmovsd  xmm3, qword ptr [rsi + 8]
        vaddps  xmm2, xmm2, xmm3
        vmovd   eax, xmm0
        vmovd   ecx, xmm1
        vextractps      esi, xmm2, 0
        vextractps      edx, xmm2, 1
        shl     rdx, 32
        or      rdx, rsi
        shl     rcx, 32
        or      rax, rcx
        ret

example::case_2:
        vmovups xmm0, xmmword ptr [rdi]
        vaddps  xmm0, xmm0, xmmword ptr [rsi]
        vmovq   rax, xmm0
        vpextrq rdx, xmm0, 1
        ret

1.50.0 - Regression case_1 and case_2

https://rust.godbolt.org/z/Pj399ezPq

example::case_1:
        vmovd   xmm0, esi
        shr     rsi, 32
        vmovd   xmm1, edi
        shr     rdi, 32
        vpinsrd xmm1, xmm1, edi, 1
        vmovd   xmm2, esi
        vmovd   xmm3, edx
        shr     rdx, 32
        vpinsrd xmm3, xmm3, edx, 1
        vaddps  xmm1, xmm1, xmm3
        vmovd   xmm3, ecx
        shr     rcx, 32
        vaddss  xmm0, xmm0, xmm3
        vmovd   xmm3, ecx
        vaddss  xmm2, xmm2, xmm3
        vmovd   edx, xmm0
        vmovd   eax, xmm2
        shl     rax, 32
        or      rdx, rax
        vxorps  xmm0, xmm0, xmm0
        vblendps        xmm0, xmm1, xmm0, 2
        vmovq   rcx, xmm0
        vmovshdup       xmm0, xmm1
        vmovq   rax, xmm0
        shl     rax, 32
        or      rax, rcx
        ret

example::case_2:
        vmovq   xmm0, rcx
        vmovq   xmm1, rdx
        vpunpcklqdq     xmm0, xmm1, xmm0
        vmovq   xmm1, rsi
        vmovq   xmm2, rdi
        vpunpcklqdq     xmm1, xmm2, xmm1
        vaddps  xmm0, xmm1, xmm0
        vmovq   rax, xmm0
        vpextrq rdx, xmm0, 1
        ret

1.52.0 - Further regression case_2

https://rust.godbolt.org/z/KsvbW5vhW

example::case_1:
        vmovd   xmm0, esi
        shr     rsi, 32
        vmovd   xmm1, edi
        shr     rdi, 32
        vpinsrd xmm1, xmm1, edi, 1
        vmovd   xmm2, esi
        vmovd   xmm3, edx
        shr     rdx, 32
        vpinsrd xmm3, xmm3, edx, 1
        vaddps  xmm1, xmm1, xmm3
        vmovd   xmm3, ecx
        shr     rcx, 32
        vaddss  xmm0, xmm0, xmm3
        vmovd   xmm3, ecx
        vaddss  xmm2, xmm2, xmm3
        vmovd   edx, xmm0
        vmovd   eax, xmm2
        shl     rax, 32
        or      rdx, rax
        vxorps  xmm0, xmm0, xmm0
        vblendps        xmm0, xmm0, xmm1, 1
        vmovq   rcx, xmm0
        vmovshdup       xmm0, xmm1
        vmovq   rax, xmm0
        shl     rax, 32
        or      rax, rcx
        ret

example::case_2:
        mov     rax, rsi
        shld    rax, rdi, 32
        vmovd   xmm0, esi
        shr     rsi, 32
        vmovq   xmm1, rax
        vmovq   xmm2, rsi
        vpunpckldq      xmm1, xmm2, xmm1
        vmovd   xmm2, ecx
        vaddss  xmm0, xmm0, xmm2
        vmovd   xmm2, edx
        shrd    rdx, rcx, 32
        shr     rcx, 32
        vmovq   xmm3, rdx
        vmovq   xmm4, rcx
        vpunpckldq      xmm3, xmm4, xmm3
        vmovd   xmm4, edi
        vaddps  xmm1, xmm1, xmm3
        vaddps  xmm2, xmm2, xmm4
        vextractps      eax, xmm2, 0
        vmovd   ecx, xmm0
        vextractps      edx, xmm1, 0
        vextractps      esi, xmm1, 1
        shl     rsi, 32
        shl     rdx, 32
        or      rdx, rcx
        or      rax, rsi
        ret

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-codegen — Area: Code generation
    C-bug — Category: This is a bug.
    I-slow — Issue: Problems and improvements with respect to performance of generated code.
    P-high — High priority
    T-compiler — Relevant to the compiler team, which will review and decide on the PR/issue.
    regression-from-stable-to-stable — Performance or correctness regression from one stable version to another.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions