Skip to content

[AVX-512] Concatenating k-masks via shift+or should produce a kunpckdq instruction #111431

@Validark

Description

@Validark
/// Compares `a` lane-wise against `b` and against `c`, bitcasts each 32-lane
/// boolean result to a `u32`, and returns both masks packed into one `u64`:
/// the `a == b` mask in bits 63..32 and the `a == c` mask in bits 31..0.
export fn foo(a: @Vector(32, u8), b: @Vector(32, u8), c: @Vector(32, u8)) u64 {
    // One bit per lane: 32 lanes -> 32-bit mask.
    const b1: u32 = @bitCast(a == b);
    const b2: u32 = @bitCast(a == c);
    // Widen b1 before shifting so the shift by 32 happens in 64 bits.
    return (@as(u64, b1) << 32) | b2;
}
; Packs two 32-lane byte-equality masks into one i64:
; (%0 == %1) ends up in bits 63..32 and (%0 == %2) in bits 31..0.
define dso_local i64 @foo(<32 x i8> %0, <32 x i8> %1, <32 x i8> %2) local_unnamed_addr {
Entry:
  ; First compare: one i1 per lane, reinterpreted as a 32-bit integer mask.
  %3 = icmp eq <32 x i8> %0, %1
  %4 = bitcast <32 x i1> %3 to i32
  ; Second compare, same shape.
  %5 = icmp eq <32 x i8> %0, %2
  %6 = bitcast <32 x i1> %5 to i32
  ; Zero-extend the first mask and move it into the upper 32 bits.
  %7 = zext i32 %4 to i64
  %8 = shl nuw i64 %7, 32
  ; Zero-extend the second mask; it occupies only the lower 32 bits, so the
  ; `or` below combines two values with no overlapping set bits (`disjoint`).
  %9 = zext i32 %6 to i64
  %10 = or disjoint i64 %8, %9
  ret i64 %10
}

This compiles like so: (Godbolt link)

# Current codegen: each compare mask is moved to a general-purpose register
# and the two halves are combined there with a shift and an or.
foo:
        # k0 = byte-wise (ymm0 == ymm1); copy its 32 bits into ecx.
        vpcmpeqb        k0, ymm0, ymm1
        kmovd   ecx, k0
        # k0 = byte-wise (ymm0 == ymm2); copy its 32 bits into eax.
        vpcmpeqb        k0, ymm0, ymm2
        kmovd   eax, k0
        # Move the first mask to bits 63..32 and merge with the second mask.
        shl     rcx, 32
        or      rax, rcx
        vzeroupper
        ret

I would think it should be:

# Proposed codegen: keep both compare results in mask registers, concatenate
# them with a single kunpckdq, and do one 64-bit mask-to-GPR move.
foo:
        # k0 = byte-wise (ymm0 == ymm1); this mask belongs in bits 63..32.
        vpcmpeqb        k0, ymm0, ymm1
        # k1 = byte-wise (ymm0 == ymm2); this mask belongs in bits 31..0.
        vpcmpeqb        k1, ymm0, ymm2
        # kunpckdq dst, src1, src2: dst[63:32] = src1[31:0], dst[31:0] = src2[31:0].
        # The first compare's mask (k0) must therefore be src1.
        kunpckdq        k0, k0, k1
        # The result is a 64-bit mask, so use the 64-bit move (kmovq);
        # kmovd only transfers 32 bits and takes a 32-bit register.
        kmovq   rax, k0
        vzeroupper
        ret

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions