Description
/// Compares `a` with `b` and `a` with `c` lane by lane and packs the two
/// 32-lane equality masks into a single u64: the `a == b` mask occupies the
/// upper 32 bits and the `a == c` mask occupies the lower 32 bits.
export fn foo(a: @Vector(32, u8), b: @Vector(32, u8), c: @Vector(32, u8)) u64 {
    // One bool per lane; the bit casts below turn each into a 32-bit mask.
    const eq_ab: @Vector(32, bool) = a == b;
    const eq_ac: @Vector(32, bool) = a == c;

    // Widen to u64 before shifting so the upper mask is not shifted out.
    const upper_mask: u64 = @as(u32, @bitCast(eq_ab));
    const lower_mask: u64 = @as(u32, @bitCast(eq_ac));

    return lower_mask | (upper_mask << 32);
}
; NOTE(review): this appears to be the LLVM IR emitted for the Zig function
; `foo`; %0, %1 and %2 would then correspond to `a`, `b` and `c`.
; Returns an i64 whose upper 32 bits are the lane mask of %0 == %1 and whose
; lower 32 bits are the lane mask of %0 == %2.
define dso_local i64 @foo(<32 x i8> %0, <32 x i8> %1, <32 x i8> %2) local_unnamed_addr {
Entry:
; Lane-wise equality of %0 and %1, then reinterpret the 32 i1 lanes as an i32.
%3 = icmp eq <32 x i8> %0, %1
%4 = bitcast <32 x i1> %3 to i32
; Same for %0 and %2.
%5 = icmp eq <32 x i8> %0, %2
%6 = bitcast <32 x i1> %5 to i32
; Zero-extend the first mask and move it into bits 63:32.
%7 = zext i32 %4 to i64
%8 = shl nuw i64 %7, 32
; Zero-extend the second mask; it stays in bits 31:0.
%9 = zext i32 %6 to i64
; `disjoint`: the two operands share no set bits, so this OR simply
; concatenates the two 32-bit masks.
%10 = or disjoint i64 %8, %9
ret i64 %10
}
On x86-64 with AVX-512, this currently compiles to the following: (Godbolt link)
# Current codegen: each comparison mask is moved to a general-purpose
# register, and the two halves are then combined with a scalar shift and OR.
foo:
# k0 = byte-wise equality mask of ymm0 and ymm1, copied to ecx.
vpcmpeqb k0, ymm0, ymm1
kmovd ecx, k0
# k0 is reused for the equality mask of ymm0 and ymm2, copied to eax.
vpcmpeqb k0, ymm0, ymm2
kmovd eax, k0
# Move the first mask into bits 63:32 and merge it with the second mask.
shl rcx, 32
or rax, rcx
vzeroupper
ret
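I would expect it to combine the two masks directly in the mask registers with `kunpckdq` instead of using a scalar shift and OR: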
# Proposed codegen: keep both comparison results in mask registers, join them
# with a single kunpckdq, and move the 64-bit result to rax once.
foo:
# k0 = byte-wise equality mask of ymm0 and ymm1 (upper half of the result).
vpcmpeqb k0, ymm0, ymm1
# k1 = byte-wise equality mask of ymm0 and ymm2 (lower half of the result).
vpcmpeqb k1, ymm0, ymm2
# kunpckdq dst, src1, src2: dst[63:32] = src1[31:0], dst[31:0] = src2[31:0].
# The first comparison must therefore be src1 so it lands in the upper half.
kunpckdq k0, k0, k1
# The result is 64 bits wide, so kmovq (not kmovd, which targets a 32-bit
# register) is needed to copy the whole mask into rax.
kmovq rax, k0
vzeroupper
ret