Skip to content

Weird AVX 512 code generated with std::simd when using -Zbuild-std  #129293

Open
@cvijdea-bd

Description

@cvijdea-bd

Discussed on Zulip: https://rust-lang.zulipchat.com/#narrow/stream/257879-project-portable-simd/topic/simd.3A.3AMask.20codegen.20on.20avx512

I tried this code (Godbolt link):

#![feature(portable_simd)]
use std::simd::u8x32;
use std::simd::Simd;
use std::simd::cmp::SimdPartialOrd;

#[inline(never)]
pub fn test_lt_select(idxs: u8x32) -> u8x32 {
    idxs.simd_lt(Simd::splat(32u8))
        .select(idxs, Simd::splat(u8::MAX))
}

I expected to see this happen: simd_lt + simd_select is compiled to a clean 3-instruction sequence (AVX512: vpcmpltub + vpcmpeqd + vmovdqu8-with-mask, AVX2: vpmaxub + vpcmpeqb + vpor). This is the case with the pre-built std; see Godbolt.

Instead, this happened: with -Zbuild-std, vpcmpltub is followed by lots of redundant shuffling of mask registers

RUSTFLAGS=-Ctarget-cpu=sapphirerapids cargo build --release -Z build-std --target x86_64-unknown-linux-gnu

Messy assembly
00000000000596b0 <_ZN9test_simd14test_lt_select17h4bc8c118b05e9d0dE>:
   596b0:       62 f3 7d 28 3e 05 25    vpcmpltub k0,ymm0,YMMWORD PTR [rip+0xfffffffffffab725]        # 4de0 <anon.b59fd370342578772160e03262ba3ae9.2.llvm.10957916908726049888+0x1a0>
   596b7:       b7 fa ff 01
        unsafe { Self(core::intrinsics::simd::simd_bitmask(value), PhantomData) }
   596bb:       c4 e3 79 31 c8 08       kshiftrd k1,k0,0x8
   596c1:       c4 e3 79 31 d0 18       kshiftrd k2,k0,0x18
   596c7:       c5 fb 93 c2             kmovd  eax,k2
   596cb:       c4 e3 79 31 d0 10       kshiftrd k2,k0,0x10
            core::intrinsics::simd::simd_select_bitmask(
   596d1:       c5 f9 93 ca             kmovb  ecx,k2
   596d5:       c5 f9 6e c8             vmovd  xmm1,eax
   596d9:       c5 fb 93 c1             kmovd  eax,k1
   596dd:       c4 e3 71 20 c8 01       vpinsrb xmm1,xmm1,eax,0x1
   596e3:       c4 e2 79 31 c9          vpmovzxbd xmm1,xmm1
   596e8:       c4 e2 71 47 0d 7f bb    vpsllvd xmm1,xmm1,XMMWORD PTR [rip+0xfffffffffffabb7f]        # 5270 <anon.0fb72383af8265b0a0fc52c4e835cc4a.60.llvm.6827016723924283095+0x40>
   596ef:       fa ff
   596f1:       c1 e1 10                shl    ecx,0x10
   596f4:       c5 f9 7e c8             vmovd  eax,xmm1
   596f8:       09 c8                   or     eax,ecx
   596fa:       c4 e3 79 16 c9 01       vpextrd ecx,xmm1,0x1
   59700:       09 c1                   or     ecx,eax
   59702:       c5 f9 93 c0             kmovb  eax,k0
   59706:       09 c8                   or     eax,ecx
   59708:       c5 fb 92 c8             kmovd  k1,eax
   5970c:       c5 f5 76 c9             vpcmpeqd ymm1,ymm1,ymm1
   59710:       62 f1 7f 29 6f c8       vmovdqu8 ymm1{k1},ymm0
   59716:       c5 fd 7f 0f             vmovdqa YMMWORD PTR [rdi],ymm1

#[inline(never)]
pub fn test_lt_select(idxs: u8x32) -> u8x32 {
    idxs.simd_lt(Simd::splat(32u8))
        .select(idxs, Simd::splat(u8::MAX))
}
   5971a:       c5 f8 77                vzeroupper
   5971d:       c3                      ret

Without -Zbuild-std, the generated LLVM IR is a beautiful icmp ult followed by select.

; test_simd::test_lt_select
; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind nonlazybind willreturn memory(argmem: write) uwtable
define internal fastcc void @_ZN9test_simd14test_lt_select17h0358d9f4e1cf8f67E(ptr dead_on_unwind noalias nocapture noundef writable writeonly align 32 dereferenceable(32) %_0, <32 x i8> %idxs.0.val) unnamed_addr #3 !dbg !2029 {
start:
    #dbg_declare(ptr undef, !2031, !DIExpression(), !2032)
    #dbg_declare(ptr undef, !2033, !DIExpression(), !2037)
    #dbg_declare(ptr undef, !2039, !DIExpression(), !2044)
    #dbg_value(<32 x i8> <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>, !2036, !DIExpression(), !2046)
  %0 = icmp ult <32 x i8> %idxs.0.val, <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>, !dbg !2047
    #dbg_value(<32 x i8> poison, !2042, !DIExpression(), !2048)
    #dbg_value(<32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, !2043, !DIExpression(), !2048)
  %1 = select <32 x i1> %0, <32 x i8> %idxs.0.val, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, !dbg !2049
  store <32 x i8> %1, ptr %_0, align 32, !dbg !2049
  ret void, !dbg !2050
}

With -Zbuild-std, the LLVM IR is as much of a mess as the generated assembly:

Messy IR
; test_simd::test_lt_select
; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind nonlazybind willreturn memory(argmem: write) uwtable
define internal fastcc void @_ZN9test_simd14test_lt_select17h4bc8c118b05e9d0dE(ptr dead_on_unwind noalias nocapture noundef writable writeonly align 32 dereferenceable(32) %_0, <32 x i8> %idxs.0.val) unnamed_addr #3 !dbg !2383 {
start:
    #dbg_declare(ptr undef, !2385, !DIExpression(), !2386)
    #dbg_declare(ptr undef, !2387, !DIExpression(), !2391)
    #dbg_declare(ptr undef, !2393, !DIExpression(), !2398)
    #dbg_value(<32 x i8> <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>, !2390, !DIExpression(), !2400)
  %0 = icmp ult <32 x i8> %idxs.0.val, <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>, !dbg !2401
    #dbg_value(<32 x i8> poison, !2402, !DIExpression(), !2405)
    #dbg_value(<32 x i8> poison, !2407, !DIExpression(), !2410)
  %bc13 = bitcast <32 x i1> %0 to <4 x i8>, !dbg !2412
  %.sroa.05.0.extract.trunc = extractelement <4 x i8> %bc13, i64 0, !dbg !2412
  %.sroa.3.0.extract.trunc = extractelement <4 x i8> %bc13, i64 2, !dbg !2412
    #dbg_value(i8 %.sroa.05.0.extract.trunc, !2396, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !2413)
    #dbg_value(i8 %.sroa.05.0.extract.trunc, !2414, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !2417)
    #dbg_value(i8 %.sroa.05.0.extract.trunc, !2419, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !2422)
    #dbg_value(i8 undef, !2396, !DIExpression(DW_OP_LLVM_fragment, 8, 8), !2413)
    #dbg_value(i8 undef, !2414, !DIExpression(DW_OP_LLVM_fragment, 8, 8), !2417)
    #dbg_value(i8 undef, !2419, !DIExpression(DW_OP_LLVM_fragment, 8, 8), !2422)
    #dbg_value(i8 %.sroa.3.0.extract.trunc, !2396, !DIExpression(DW_OP_LLVM_fragment, 16, 8), !2413)
    #dbg_value(i8 %.sroa.3.0.extract.trunc, !2414, !DIExpression(DW_OP_LLVM_fragment, 16, 8), !2417)
    #dbg_value(i8 %.sroa.3.0.extract.trunc, !2419, !DIExpression(DW_OP_LLVM_fragment, 16, 8), !2422)
    #dbg_value(i8 undef, !2396, !DIExpression(DW_OP_LLVM_fragment, 24, 8), !2413)
    #dbg_value(i8 undef, !2414, !DIExpression(DW_OP_LLVM_fragment, 24, 8), !2417)
    #dbg_value(i8 undef, !2419, !DIExpression(DW_OP_LLVM_fragment, 24, 8), !2422)
    #dbg_value(<32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, !2397, !DIExpression(), !2413)
  %.sroa.39.0.insert.ext = zext i8 %.sroa.3.0.extract.trunc to i32, !dbg !2424
  %.sroa.39.0.insert.shift = shl nuw nsw i32 %.sroa.39.0.insert.ext, 16, !dbg !2424
  %1 = shufflevector <4 x i8> %bc13, <4 x i8> poison, <2 x i32> <i32 3, i32 1>, !dbg !2424
  %2 = zext <2 x i8> %1 to <2 x i32>, !dbg !2424
  %3 = shl nuw <2 x i32> %2, <i32 24, i32 8>, !dbg !2424
  %4 = extractelement <2 x i32> %3, i64 0, !dbg !2424
  %.sroa.39.0.insert.insert = or disjoint i32 %4, %.sroa.39.0.insert.shift, !dbg !2424
  %5 = extractelement <2 x i32> %3, i64 1, !dbg !2424
  %.sroa.28.0.insert.insert = or disjoint i32 %.sroa.39.0.insert.insert, %5, !dbg !2424
  %.sroa.07.0.insert.ext = zext i8 %.sroa.05.0.extract.trunc to i32, !dbg !2424
  %.sroa.07.0.insert.insert = or disjoint i32 %.sroa.28.0.insert.insert, %.sroa.07.0.insert.ext, !dbg !2424
  %6 = bitcast i32 %.sroa.07.0.insert.insert to <32 x i1>, !dbg !2424
  %7 = select <32 x i1> %6, <32 x i8> %idxs.0.val, <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, !dbg !2425
  store <32 x i8> %7, ptr %_0, align 32, !dbg !2425
  ret void, !dbg !2426
}

With -Zbuild-std, but a target-cpu without avx512 (e.g. x86-64-v3), the IR and assembly are beautiful again.

rustc --version --verbose:

rustc 1.82.0-nightly (636d7ff91 2024-08-19)
binary: rustc
commit-hash: 636d7ff91b9847d6d43c7bbe023568828f6e3246
commit-date: 2024-08-19
host: x86_64-unknown-linux-gnu
release: 1.82.0-nightly
LLVM version: 19.1.0

Reproduced with and without lto = "thin", on Windows as well, and with different target-cpu values (x86-64-v4, skylake-avx512).

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-SIMD - Area: SIMD (Single Instruction Multiple Data)
    A-codegen - Area: Code generation
    C-bug - Category: This is a bug.
    PG-portable-simd - Project group: Portable SIMD (https://github.com/rust-lang/project-portable-simd)
    T-compiler - Relevant to the compiler team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions