[X86][SelectionDAG] Handle the case for gather where index is SHL #139703


Merged (11 commits) on May 19, 2025
14 changes: 13 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56740,12 +56740,24 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
}
}
}
// If the index is a left shift, recompute the number of sign bits from the
// value being shifted and add back the shift amount. This is intended to let
// the existing fold below shrink indices that are wider than 32 bits.
unsigned ComputeNumSignBits = DAG.ComputeNumSignBits(Index);
if (Index.getOpcode() == ISD::SHL) {
if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
if (DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
ComputeNumSignBits += *MinShAmt;
}
}
}

// Shrink indices if they are larger than 32-bits.
// Only do this before legalize types since v2i64 could become v2i32.
// FIXME: We could check that the type is legal if we're after legalize
// types, but then we would need to construct test cases where that happens.
if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
if (IndexWidth > 32 && ComputeNumSignBits > (IndexWidth - 32)) {
EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);

// FIXME: We could support more than just constant fold, but we need to
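For reference, the narrowing condition the fold above checks can be pictured with plain integers (a standalone sketch, not the SelectionDAG code; numSignBits and the sample values are illustrative): an i64 index may be shrunk to i32 only when it has more than IndexWidth - 32 = 32 sign bits, i.e. when it is already the sign-extension of a 32-bit value.

#include <cstdint>
#include <cstdio>

// For a concrete value, count the quantity ComputeNumSignBits tracks:
// the sign bit itself plus the run of copies of it directly below.
static unsigned numSignBits(int64_t v) {
  unsigned n = 1;
  for (int b = 62; b >= 0 && ((v >> b) & 1) == ((v >> 63) & 1); --b)
    ++n;
  return n;
}

int main() {
  int64_t narrow = int64_t(int32_t(0x12345678)); // sign-extension of an i32
  int64_t wide = int64_t(1) << 40;               // genuinely needs 64 bits
  // narrow has 35 sign bits (> 32), so an index like it can use a 32-bit
  // gather (vgatherdps); wide has only 23, so it must stay 64-bit.
  std::printf("narrow: %u, wide: %u\n", numSignBits(narrow), numSignBits(wide));
  return 0;
}

The SHL handling above tries to recover those sign bits by crediting the shift amount back, which is what the review comments below question.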
211 changes: 40 additions & 171 deletions llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -4806,18 +4806,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm0
; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
; X64-KNL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
; X64-KNL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm0
; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0), %zmm1 {%k1}
Collaborator:

This still seems wrong - afaict you are doing this:

define i64 @src(i32 noundef %x) {
#0:
  %and = and i32 noundef %x, 536870911
  %zext = zext i32 %and to i64
  %hi = shl i64 %zext, 4
  ret i64 %hi
}
=>
define i64 @tgt(i32 noundef %x) {
#0:
  %shl = shl i32 noundef %x, 4
  %ext = sext i32 %shl to i64
  ret i64 %ext
}
Transformation doesn't verify!

Contributor Author:

So the problem is that we can overflow the value on left shift, right?

Collaborator:

Yes, you're losing too many sign bits

Contributor Author:

Got it, I have updated the patch and removed the code.
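To make the failure concrete (a worked sketch with a made-up value, not part of the review): take x = 0x10000000, so bit 28 survives the 0x1FFFFFFF mask. The masked zext-then-shift source yields 0x100000000, while shifting first in 32 bits drops that bit and the sign-extension yields 0.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t x = 0x10000000; // bit 28 set; it survives the 0x1FFFFFFF mask

  // Source pattern: and i32, zext to i64, shl i64 by 4.
  uint64_t src = (uint64_t)(x & 0x1FFFFFFFu) << 4; // 0x100000000

  // Rewritten pattern: shl i32 by 4 first, then sext to i64.
  int64_t tgt = (int64_t)(int32_t)(x << 4); // 0: bit 28 was shifted out of the i32

  std::printf("src = 0x%llx, tgt = 0x%llx\n",
              (unsigned long long)src, (unsigned long long)tgt);
  return 0;
}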

; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index:
@@ -4832,44 +4823,15 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X86-KNL-NEXT: vmovaps %zmm1, %zmm0
; X86-KNL-NEXT: retl
;
; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index:
; X64-SKX-SMALL: # %bb.0:
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: retq
;
; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index:
; X64-SKX-LARGE: # %bb.0:
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: retq
; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index:
; X64-SKX: # %bb.0:
; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm0
; X64-SKX-NEXT: vgatherdps (%rdi,%zmm0), %zmm1 {%k1}
; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-NEXT: retq
;
; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index:
; X86-SKX: # %bb.0:
@@ -4896,18 +4858,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm0
; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
; X64-KNL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm0
; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0), %zmm1 {%k1}
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index_offset:
@@ -4922,44 +4875,15 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X86-KNL-NEXT: vmovaps %zmm1, %zmm0
; X86-KNL-NEXT: retl
;
; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index_offset:
; X64-SKX-SMALL: # %bb.0:
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: retq
;
; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index_offset:
; X64-SKX-LARGE: # %bb.0:
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: retq
; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index_offset:
; X64-SKX: # %bb.0:
; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm0
; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm0), %zmm1 {%k1}
; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-NEXT: retq
;
; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index_offset:
; X86-SKX: # %bb.0:
@@ -4986,25 +4910,11 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm3
; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
; X64-KNL-NEXT: kmovw %k2, %k3
; X64-KNL-NEXT: vmovaps %ymm4, %ymm0
; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
; X64-KNL-NEXT: vmovaps %ymm1, %ymm5
; X64-KNL-NEXT: kmovw %k1, %k3
; X64-KNL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
; X64-KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
; X64-KNL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm2
; X64-KNL-NEXT: kmovw %k1, %k2
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2), %zmm0 {%k2}
; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm2), %zmm1 {%k1}
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index_pair:
@@ -5021,58 +4931,17 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair(
; X86-KNL-NEXT: vgatherdps 4(%eax,%zmm2), %zmm1 {%k1}
; X86-KNL-NEXT: retl
;
; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index_pair:
; X64-SKX-SMALL: # %bb.0:
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm3
; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
; X64-SKX-SMALL-NEXT: kmovw %k2, %k3
; X64-SKX-SMALL-NEXT: vmovaps %ymm4, %ymm0
; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
; X64-SKX-SMALL-NEXT: vmovaps %ymm1, %ymm5
; X64-SKX-SMALL-NEXT: kmovw %k1, %k3
; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
; X64-SKX-SMALL-NEXT: retq
;
; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index_pair:
; X64-SKX-LARGE: # %bb.0:
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm3
; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm4
; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
; X64-SKX-LARGE-NEXT: vmovaps %ymm4, %ymm0
; X64-SKX-LARGE-NEXT: kmovw %k2, %k3
; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
; X64-SKX-LARGE-NEXT: vmovaps %ymm1, %ymm5
; X64-SKX-LARGE-NEXT: kmovw %k1, %k3
; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
; X64-SKX-LARGE-NEXT: retq
; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index_pair:
; X64-SKX: # %bb.0:
; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm2
; X64-SKX-NEXT: kmovw %k1, %k2
; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-NEXT: vgatherdps (%rdi,%zmm2), %zmm0 {%k2}
; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm2), %zmm1 {%k1}
; X64-SKX-NEXT: retq
;
; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index_pair:
; X86-SKX: # %bb.0: