Failure to optimize away adding a vector of zeroes #107423

Description

@Validark

I have this code, which computes a partial prefix sum of interleaved vectors (Godbolt link):

export fn foo(v0: @Vector(16, u8), v1: @Vector(16, u8), v2: @Vector(16, u8), v3: @Vector(16, u8), dest: [*]u8) void {
    const condition: @Vector(64, u8) = @bitCast([4]@Vector(16, u8){ v0, v1, v2, v3 });
    var exclusive_prefix_sum = shiftInterleaved4ElementsRight(condition, 1, u16);

    inline for (0..3) |i| {
        exclusive_prefix_sum +%= shiftInterleaved4ElementsRight(exclusive_prefix_sum, 1 << i, u16);
    }

    dest[0..64].* = exclusive_prefix_sum;
}

fn shiftInterleaved4ElementsRight(vecs: @Vector(64, u8), comptime amount: u7, comptime boundary: type) @Vector(64, u8) {
    var new_vecs: [4]@Vector(16, u8) = @bitCast(vecs);

    if ((amount & 1) == 1) {
        const n = shiftElementsRight(new_vecs[3], 1, boundary);
        new_vecs[3] = new_vecs[2];
        new_vecs[2] = new_vecs[1];
        new_vecs[1] = new_vecs[0];
        new_vecs[0] = n;
    }

    if ((amount & 2) == 2) {
        const n1 = shiftElementsRight(new_vecs[3], 1, boundary);
        const n0 = shiftElementsRight(new_vecs[2], 1, boundary);
        new_vecs[3] = new_vecs[1];
        new_vecs[2] = new_vecs[0];
        new_vecs[1] = n1;
        new_vecs[0] = n0;
    }

    const left_amt = amount >> 2;

    if (left_amt > 0) {
        new_vecs = .{
            shiftElementsRight(new_vecs[0], left_amt, boundary),
            shiftElementsRight(new_vecs[1], left_amt, boundary),
            shiftElementsRight(new_vecs[2], left_amt, boundary),
            shiftElementsRight(new_vecs[3], left_amt, boundary)
        };
    }

    return @bitCast(new_vecs);
}

fn shiftElementsRight(vec: @Vector(16, u8), comptime amount: u7, comptime boundary: type) @Vector(16, u8) {
    // A bitwise left shift by 8*amount moves bytes toward higher addresses on
    // little-endian targets, i.e. it shifts elements right within each boundary-sized lane.
    return @bitCast(@as(@Vector(16 / @sizeOf(boundary), boundary), @bitCast(vec)) << @splat(8*amount));
}
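
For context, the inline for loop is a log-time (Hillis-Steele style) scan: shift the data right by one element, then add in copies shifted by 1, 2, and 4 elements, so each lane accumulates the wrapping sum of the eight lanes before it. As I read the code, within each 8-lane group that amounts to a full exclusive prefix sum; ignoring the interleaved layout, the intended per-group semantics match this sequential sketch (exclusivePrefixSum is my own illustrative helper, not part of the reproduction; the interleaving and the u16 boundary only determine which lanes form a group):

fn exclusivePrefixSum(input: [8]u8) [8]u8 {
    var out: [8]u8 = undefined;
    var sum: u8 = 0;
    for (input, 0..) |x, i| {
        out[i] = sum; // sum of all elements before i
        sum +%= x; // wrapping add, matching +%= above
    }
    return out;
}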

Compiled for the Apple M3:

LLVM correctly detects that a would-be shl v3.8h, v4.8h, #8 always produces 0 and emits a movi to zero v3 instead, but it then still adds that all-zero v3 to v4 rather than eliminating the add entirely. The two redundant instructions are marked with a leading - below:

foo:
        shl     v3.8h, v3.8h, #8
        shl     v4.8h, v2.8h, #8
        add     v5.16b, v0.16b, v3.16b
        add     v3.16b, v3.16b, v4.16b
        add     v0.16b, v1.16b, v0.16b
        add     v1.16b, v2.16b, v1.16b
        shl     v2.8h, v1.8h, #8
        shl     v4.8h, v0.8h, #8
        add     v4.16b, v4.16b, v3.16b
        add     v1.16b, v5.16b, v1.16b
        add     v0.16b, v3.16b, v0.16b
        add     v2.16b, v2.16b, v5.16b
-       movi    v3.2d, #0000000000000000
        shl     v5.8h, v2.8h, #8
        shl     v6.8h, v0.8h, #8
        shl     v7.8h, v1.8h, #8
        add     v2.16b, v5.16b, v2.16b
        add     v0.16b, v6.16b, v0.16b
        add     v1.16b, v7.16b, v1.16b
        stp     q0, q1, [x0, #32]
-       add     v0.16b, v3.16b, v4.16b
        stp     q0, q2, [x0]
        ret
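
The missing fold is the basic x + 0 -> x identity. At the IR level, InstCombine handles it; here is a hand-reduced sketch of mine (not taken from the dump below):

define <16 x i8> @add_zero(<16 x i8> %x) {
  %r = add <16 x i8> %x, zeroinitializer ; opt -passes=instcombine folds this to ret %x
  ret <16 x i8> %r
}

So presumably the zero vector in this case is only discovered during backend lowering, after the IR-level cleanup has already run, and nothing later re-applies the identity.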

Compiled for Sandy Bridge (because it also only has 16-byte vectors), the same redundant pair appears: a vpxor zeroing xmm3 followed by a vpaddb that just copies xmm5 through the zero (marked with a leading -):

foo:
        vpsllw  xmm3, xmm3, 8
        vpsllw  xmm4, xmm2, 8
        vpaddb  xmm4, xmm3, xmm4
        vpaddb  xmm3, xmm0, xmm3
        vpaddb  xmm0, xmm1, xmm0
        vpaddb  xmm1, xmm2, xmm1
        vpsllw  xmm2, xmm1, 8
        vpsllw  xmm5, xmm0, 8
        vpaddb  xmm5, xmm5, xmm4
        vpaddb  xmm1, xmm3, xmm1
        vpaddb  xmm0, xmm4, xmm0
        vpaddb  xmm2, xmm2, xmm3
-       vpxor   xmm3, xmm3, xmm3
-       vpaddb  xmm3, xmm3, xmm5
        vpsllw  xmm4, xmm2, 8
        vpsllw  xmm5, xmm0, 8
        vpsllw  xmm6, xmm1, 8
        vpaddb  xmm0, xmm5, xmm0
        vpaddb  xmm1, xmm6, xmm1
        vpaddb  xmm2, xmm4, xmm2
        vmovdqu xmmword ptr [rdi], xmm3
        vmovdqu xmmword ptr [rdi + 16], xmm2
        vmovdqu xmmword ptr [rdi + 48], xmm1
        vmovdqu xmmword ptr [rdi + 32], xmm0
        ret
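
With the fold applied, the marked vpxor/vpaddb pair would presumably disappear and the first store would become vmovdqu xmmword ptr [rdi], xmm5.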

Unoptimized LLVM IR dump (via zig build-obj ./src/llvm_code.zig -O ReleaseFast -target aarch64-linux -mcpu apple_latest --verbose-llvm-ir -fstrip >llvm_code.ll 2>&1):

; ModuleID = 'llvm_code'
source_filename = "llvm_code"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-musl"

%Target.Cpu.Feature.Set = type { [5 x i64] }
%Target.Cpu.Model = type { { ptr, i64 }, { ptr, i64 }, %Target.Cpu.Feature.Set }
%Target.Cpu = type { ptr, %Target.Cpu.Feature.Set, i6, [7 x i8] }

@builtin.zig_backend = internal unnamed_addr constant i64 2, align 8
@Target.Cpu.Feature.Set.empty = internal unnamed_addr constant %Target.Cpu.Feature.Set zeroinitializer, align 8
@Target.aarch64.cpu.apple_latest = internal unnamed_addr constant %Target.Cpu.Model { { ptr, i64 } { ptr getelementptr inbounds (i8, ptr @__anon_227, i64 0), i64 12 }, { ptr, i64 } { ptr getelementptr inbounds (i8, ptr @__anon_230, i64 0), i64 12 }, %Target.Cpu.Feature.Set { [5 x i64] [i64 158329674598400, i64 2251799830972420, i64 1125900041060352, i64 12885032960, i64 0] } }, align 8
@__anon_227 = internal unnamed_addr constant [13 x i8] c"apple_latest\00", align 1
@__anon_230 = internal unnamed_addr constant [13 x i8] c"apple-latest\00", align 1
@builtin.cpu = internal unnamed_addr constant %Target.Cpu { ptr getelementptr inbounds (i8, ptr @Target.aarch64.cpu.apple_latest, i64 0), %Target.Cpu.Feature.Set { [5 x i64] [i64 -6882457295353816576, i64 3831332528523300484, i64 4612917471702155264, i64 47783866528, i64 0] }, i6 6, [7 x i8] undef }, align 8
@start.simplified_logic = internal unnamed_addr constant i1 false, align 1
@builtin.output_mode = internal unnamed_addr constant i2 -2, align 1

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define dso_local void @foo(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, ptr align 1 nonnull %4) #0 {
5:
  %6 = alloca [64 x i8], align 1
  %7 = alloca [8 x i8], align 8
  %8 = alloca [64 x i8], align 64
  %9 = alloca [64 x i8], align 16
  %10 = getelementptr inbounds [4 x <16 x i8>], ptr %9, i64 0, i64 0
  store <16 x i8> %0, ptr %10, align 16
  %11 = getelementptr inbounds [4 x <16 x i8>], ptr %9, i64 0, i64 1
  store <16 x i8> %1, ptr %11, align 16
  %12 = getelementptr inbounds [4 x <16 x i8>], ptr %9, i64 0, i64 2
  store <16 x i8> %2, ptr %12, align 16
  %13 = getelementptr inbounds [4 x <16 x i8>], ptr %9, i64 0, i64 3
  store <16 x i8> %3, ptr %13, align 16
  %14 = load <64 x i8>, ptr %9, align 16
  %15 = call fastcc <64 x i8> @llvm_code.shiftInterleaved4ElementsRight__anon_1458(<64 x i8> %14)
  store <64 x i8> %15, ptr %8, align 64
  %16 = load <64 x i8>, ptr %8, align 64
  %17 = load <64 x i8>, ptr %8, align 64
  %18 = call fastcc <64 x i8> @llvm_code.shiftInterleaved4ElementsRight__anon_1458(<64 x i8> %17)
  %19 = add <64 x i8> %16, %18
  store <64 x i8> %19, ptr %8, align 64
  %20 = load <64 x i8>, ptr %8, align 64
  %21 = load <64 x i8>, ptr %8, align 64
  %22 = call fastcc <64 x i8> @llvm_code.shiftInterleaved4ElementsRight__anon_1462(<64 x i8> %21)
  %23 = add <64 x i8> %20, %22
  store <64 x i8> %23, ptr %8, align 64
  %24 = load <64 x i8>, ptr %8, align 64
  %25 = load <64 x i8>, ptr %8, align 64
  %26 = call fastcc <64 x i8> @llvm_code.shiftInterleaved4ElementsRight__anon_1464(<64 x i8> %25)
  %27 = add <64 x i8> %24, %26
  store <64 x i8> %27, ptr %8, align 64
  store ptr %4, ptr %7, align 8
  %28 = load ptr, ptr %7, align 8
  %29 = getelementptr inbounds i8, ptr %28, i64 0
  %30 = load <64 x i8>, ptr %8, align 64
  store <64 x i8> %30, ptr %6, align 1
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %29, ptr align 1 %6, i64 64, i1 false)
  ret void
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <64 x i8> @llvm_code.shiftInterleaved4ElementsRight__anon_1458(<64 x i8> %0) unnamed_addr #0 {
1:
  %2 = alloca [64 x i8], align 16
  %3 = alloca [64 x i8], align 16
  store <64 x i8> %0, ptr %2, align 16
  call void @llvm.memcpy.p0.p0.i64(ptr align 16 %3, ptr align 16 %2, i64 64, i1 false)
  %4 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 3
  %5 = load <16 x i8>, ptr %4
  %6 = call fastcc <16 x i8> @llvm_code.shiftElementsRight__anon_1475(<16 x i8> %5)
  %7 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 3
  %8 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 2
  %9 = load <16 x i8>, ptr %8
  store <16 x i8> %9, ptr %7, align 16
  %10 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 2
  %11 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 1
  %12 = load <16 x i8>, ptr %11
  store <16 x i8> %12, ptr %10, align 16
  %13 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 1
  %14 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 0
  %15 = load <16 x i8>, ptr %14
  store <16 x i8> %15, ptr %13, align 16
  %16 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 0
  store <16 x i8> %6, ptr %16, align 16
  %17 = load <64 x i8>, ptr %3, align 16
  ret <64 x i8> %17
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <64 x i8> @llvm_code.shiftInterleaved4ElementsRight__anon_1462(<64 x i8> %0) unnamed_addr #0 {
1:
  %2 = alloca [64 x i8], align 16
  %3 = alloca [64 x i8], align 16
  store <64 x i8> %0, ptr %2, align 16
  call void @llvm.memcpy.p0.p0.i64(ptr align 16 %3, ptr align 16 %2, i64 64, i1 false)
  %4 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 3
  %5 = load <16 x i8>, ptr %4
  %6 = call fastcc <16 x i8> @llvm_code.shiftElementsRight__anon_1475(<16 x i8> %5)
  %7 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 2
  %8 = load <16 x i8>, ptr %7
  %9 = call fastcc <16 x i8> @llvm_code.shiftElementsRight__anon_1475(<16 x i8> %8)
  %10 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 3
  %11 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 1
  %12 = load <16 x i8>, ptr %11
  store <16 x i8> %12, ptr %10, align 16
  %13 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 2
  %14 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 0
  %15 = load <16 x i8>, ptr %14
  store <16 x i8> %15, ptr %13, align 16
  %16 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 1
  store <16 x i8> %6, ptr %16, align 16
  %17 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 0
  store <16 x i8> %9, ptr %17, align 16
  %18 = load <64 x i8>, ptr %3, align 16
  ret <64 x i8> %18
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <64 x i8> @llvm_code.shiftInterleaved4ElementsRight__anon_1464(<64 x i8> %0) unnamed_addr #0 {
1:
  %2 = alloca [64 x i8], align 16
  %3 = alloca [64 x i8], align 16
  store <64 x i8> %0, ptr %2, align 16
  call void @llvm.memcpy.p0.p0.i64(ptr align 16 %3, ptr align 16 %2, i64 64, i1 false)
  %4 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 0
  %5 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 0
  %6 = load <16 x i8>, ptr %5
  %7 = call fastcc <16 x i8> @llvm_code.shiftElementsRight__anon_1475(<16 x i8> %6)
  store <16 x i8> %7, ptr %4, align 16
  %8 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 1
  %9 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 1
  %10 = load <16 x i8>, ptr %9
  %11 = call fastcc <16 x i8> @llvm_code.shiftElementsRight__anon_1475(<16 x i8> %10)
  store <16 x i8> %11, ptr %8, align 16
  %12 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 2
  %13 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 2
  %14 = load <16 x i8>, ptr %13
  %15 = call fastcc <16 x i8> @llvm_code.shiftElementsRight__anon_1475(<16 x i8> %14)
  store <16 x i8> %15, ptr %12, align 16
  %16 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 3
  %17 = getelementptr inbounds [4 x <16 x i8>], ptr %3, i64 0, i64 3
  %18 = load <16 x i8>, ptr %17
  %19 = call fastcc <16 x i8> @llvm_code.shiftElementsRight__anon_1475(<16 x i8> %18)
  store <16 x i8> %19, ptr %16, align 16
  %20 = load <64 x i8>, ptr %3, align 16
  ret <64 x i8> %20
}

; Function Attrs: nounwind willreturn nofree nocallback memory(argmem: readwrite)
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly %0, ptr noalias nocapture readonly %1, i64 %2, i1 immarg %3) #1

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <16 x i8> @llvm_code.shiftElementsRight__anon_1475(<16 x i8> %0) unnamed_addr #0 {
1:
  %2 = bitcast <16 x i8> %0 to <8 x i16>
  %3 = zext <8 x i4> <i4 -8, i4 -8, i4 -8, i4 -8, i4 -8, i4 -8, i4 -8, i4 -8> to <8 x i16>
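  ; note: the splat shift amount of 8 is encoded as a zext of <8 x i4> splat(-8), since 0b1000 is -8 as i4 and 8 after zext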
  %4 = shl <8 x i16> %2, %3
  %5 = bitcast <8 x i16> %4 to <16 x i8>
  ret <16 x i8> %5
}

attributes #0 = { nounwind uwtable nosanitize_coverage skipprofile "frame-pointer"="none" "target-cpu"="apple-latest" "target-features"="-a510,-a520,-a65,-a710,-a720,-a76,-a78,-a78c,-addr-lsl-fast,+aes,-aggressive-fma,+alternate-sextload-cvt-f32-pattern,+altnzcv,-alu-lsl-fast,+am,+amvs,+arith-bcc-fusion,+arith-cbz-fusion,-ascend-store-address,-b16b16,-balance-fp-ops,+bf16,-brbe,+bti,-call-saved-x10,-call-saved-x11,-call-saved-x12,-call-saved-x13,-call-saved-x14,-call-saved-x15,-call-saved-x18,-call-saved-x8,-call-saved-x9,+ccdp,+ccidx,+ccpp,-chk,-clrbhb,-cmp-bcc-fusion,+complxnum,+CONTEXTIDREL2,-cortex-r82,-cpa,+crc,+crypto,-cssc,-d128,+disable-latency-sched-heuristic,-disable-ldp,-disable-stp,+dit,+dotprod,+ecv,+el2vmsa,+el3,-enable-select-opt,-ete,-exynos-cheap-as-move,-f32mm,-f64mm,-faminmax,+fgt,-fix-cortex-a53-835769,+flagm,-fmv,-force-32bit-jump-tables,+fp16fml,-fp8,-fp8dot2,-fp8dot4,-fp8fma,+fp-armv8,-fpmr,+fptoint,+fullfp16,+fuse-address,-fuse-addsub-2reg-const1,-fuse-adrp-add,+fuse-aes,+fuse-arith-logic,+fuse-crypto-eor,+fuse-csel,+fuse-literals,-gcs,-harden-sls-blr,-harden-sls-nocomdat,-harden-sls-retbr,-hbc,+hcx,+i8mm,-ite,+jsconv,-ldp-aligned-only,+lor,-ls64,+lse,-lse128,+lse2,-lut,-mec,-mops,+mpam,-mte,+neon,-nmi,-no-bti-at-return-twice,-no-neg-immediates,-no-sve-fp-ld1r,-no-zcz-fp,+nv,-outline-atomics,+pan,+pan-rwv,+pauth,-pauth-lr,+perfmon,-predictable-select-expensive,+predres,-prfm-slc-target,-rand,+ras,-rasv2,+rcpc,-rcpc3,+rcpc-immo,+rdm,-reserve-x1,-reserve-x10,-reserve-x11,-reserve-x12,-reserve-x13,-reserve-x14,-reserve-x15,-reserve-x18,-reserve-x2,-reserve-x20,-reserve-x21,-reserve-x22,-reserve-x23,-reserve-x24,-reserve-x25,-reserve-x26,-reserve-x27,-reserve-x28,-reserve-x3,-reserve-x30,-reserve-x4,-reserve-x5,-reserve-x6,-reserve-x7,-reserve-x9,-rme,+sb,+sel2,+sha2,+sha3,-slow-misaligned-128store,-slow-paired-128,-slow-strqro-store,-sm4,-sme,-sme2,-sme2p1,-sme-f16f16,-sme-f64f64,-sme-f8f16,-sme-f8f32,-sme-fa64,-sme-i16i64,-sme-lutv2,-spe,-spe-eef,-specres2,+specrestrict,+ssbs,-ssve-fp8dot2,-ssve-fp8dot4,-ssve-fp8fma,+store-pair-suppress,-stp-aligned-only,-strict-align,-sve,-sve2,-sve2-aes,-sve2-bitperm,-sve2-sha3,-sve2-sm4,-sve2p1,-tagged-globals,-the,+tlb-rmi,-tlbiw,-tme,-tpidr-el1,-tpidr-el2,-tpidr-el3,-tpidrro-el0,+tracev8.4,-trbe,+uaops,-use-experimental-zeroing-pseudos,-use-postra-scheduler,-use-reciprocal-square-root,-use-scalar-inc-vl,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,-v8.7a,-v8.8a,-v8.9a,+v8a,-v8r,-v9.1a,-v9.2a,-v9.3a,-v9.4a,-v9.5a,-v9a,+vh,-wfxt,-xs,+zcm,+zcz,-zcz-fp-workaround,+zcz-gp" }
attributes #1 = { nounwind willreturn nofree nocallback memory(argmem: readwrite) }

!llvm.module.flags = !{}
