Skip to content

Suboptimal vpshufb of broadcast #66150

Closed
Closed
@dzaima

Description

@dzaima
#include<stdint.h>
#include<stdbool.h>
#include<immintrin.h>
/*
 * Minimal reproducer: broadcast the 32-bit value at *b across a 256-bit
 * vector, then byte-shuffle the broadcast result with a constant control
 * vector.
 *
 * b: pointer to the 32-bit value to load and broadcast.
 * Returns the shuffled 256-bit vector.
 */
__m256i f(int32_t* b) {
  // Every 32-bit element of t1 holds the value loaded from *b.
  __m256i t1 = _mm256_set1_epi32(*b);
  // Constant shuffle control, one index per output byte. The shuffle selects
  // within each 128-bit half using the low 4 bits of the index, so 18 and 19
  // in the upper half pick bytes 2 and 3 of that half.
  __m256i t2 = _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19);
  // Result: bytes 0, 1, 2 and 3 of the broadcast value, each repeated 8 times.
  return  _mm256_shuffle_epi8(t1, t2);
}

https://godbolt.org/z/fWEKevPE9
Clang generates:

f:                                      # @f
        vmovd   xmm0, dword ptr [rdi]           # xmm0 = mem[0],zero,zero,zero
        vpermq  ymm0, ymm0, 68                  # ymm0 = ymm0[0,1,0,1]
        vpshufb ymm0, ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
        ret

whereas GCC produces the better (and more obvious):

        vpbroadcastd    ymm0, DWORD PTR [rdi]
        vpshufb ymm0, ymm0, YMMWORD PTR .LC0[rip]
        ret

LLVM IR:

define <4 x i64> @f(ptr nocapture noundef readonly %b) local_unnamed_addr {
entry:
  %0 = load i32, ptr %b, align 4
  %vecinit.i.i = insertelement <8 x i32> undef, i32 %0, i64 0
  %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison>
  %1 = bitcast <8 x i32> %vecinit7.i.i to <32 x i8>
  %2 = shufflevector <32 x i8> %1, <32 x i8> poison, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 18, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19, i32 19>
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions