Closed
Description
#include<stdint.h>
#include<stdbool.h>
#include<immintrin.h>
__m256i f(int32_t* b) {
__m256i t1 = _mm256_set1_epi32(*b);
__m256i t2 = _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19);
return _mm256_shuffle_epi8(t1, t2);
}
https://godbolt.org/z/fWEKevPE9
With Clang, this generates:
# Compiler output the report complains about (presumably Clang/LLVM; the text contrasts it with GCC).
# The dword at [rdi] is loaded as a scalar, the low 128 bits are copied into the high 128 bits,
# and only then is the byte shuffle applied.
f: # @f
# Scalar load: element 0 = dword at [rdi], the remaining elements are zeroed.
vmovd xmm0, dword ptr [rdi] # xmm0 = mem[0],zero,zero,zero
# imm8 68 = 0b01000100: qword selectors [0,1,0,1], i.e. duplicate the low 128 bits into the high half.
vpermq ymm0, ymm0, 68 # ymm0 = ymm0[0,1,0,1]
# Byte shuffle using the 32-byte mask stored at .LCPI0_0.
vpshufb ymm0, ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
ret
whereas GCC produces the better (and more obvious):
# GCC output quoted for comparison: the load and the broadcast to all eight
# 32-bit elements are a single instruction, followed directly by the byte shuffle.
vpbroadcastd ymm0, DWORD PTR [rdi]
# Byte shuffle using the 32-byte mask stored at .LC0.
vpshufb ymm0, ymm0, YMMWORD PTR .LC0[rip]
ret
The corresponding LLVM IR:
; IR for the reproducer: load one i32 through %b, place it in 32-bit elements 0 and 4
; of an <8 x i32>, apply a fixed byte shuffle, and return the result as <4 x i64>.
define <4 x i64> @f(ptr nocapture noundef readonly %b) local_unnamed_addr {
entry:
%0 = load i32, ptr %b, align 4
; Insert the loaded scalar at element 0; elements 1-7 remain undef.
%vecinit.i.i = insertelement <8 x i32> undef, i32 %0, i64 0
; Copy element 0 into elements 0 and 4 only. The other six elements are poison,
; so at this point the value is not a full 8-element splat.
%vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison>
%1 = bitcast <8 x i32> %vecinit7.i.i to <32 x i8>
; Byte shuffle: indices 0 and 1 read bytes of i32 element 0; indices 18 and 19 read
; bytes of i32 element 4 (bytes 16-19), i.e. exactly the two elements populated above.
%2 = shufflevector <32 x i8> %1, <32 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19>
%3 = bitcast <32 x i8> %2 to <4 x i64>
ret <4 x i64> %3
}