Update the base and index value for masked gather #130920

Closed · wants to merge 25 commits into llvm:main from rohitaggarwal007:gatherBaseIndexfix

Changes from all commits (25 commits):
72b0f4b  [X86] Update the value of base and index of masked gather for better … (Mar 12, 2025)
a231c96  [X86] Update the value of base and index of masked gather for better … (Mar 12, 2025)
1ff621f  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 12, 2025)
85d2e0e  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 12, 2025)
2f0897b  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 12, 2025)
cdb181d  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 12, 2025)
dd8762a  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 13, 2025)
07dd191  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 13, 2025)
7d840ed  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 13, 2025)
1bd64b8  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 13, 2025)
b00f0a9  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 13, 2025)
2268967  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 13, 2025)
8aeeb31  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 13, 2025)
f4e8b0c  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 13, 2025)
49f084e  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 17, 2025)
a7a52cd  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 17, 2025)
6252789  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 17, 2025)
8565941  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 17, 2025)
5eecf46  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 17, 2025)
ad31491  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 17, 2025)
ba2f9e7  Merge branch 'gatherBaseIndexfix' of github.com:rohitaggarwal007/llvm… (Mar 19, 2025)
2e344a1  Merge branch 'llvm:main' into gatherBaseIndexfix (rohitaggarwal007, Apr 15, 2025)
87e2533  Merge branch 'llvm:main' into gatherBaseIndexfix (rohitaggarwal007, Apr 16, 2025)
f516be2  Update the masked_gather_scatter.ll (Apr 16, 2025)
c2848c2  Remove redundant gatherBaseIndexFix.ll (Apr 16, 2025)
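
In outline: when a masked gather still has a null base and its address vector is an add(base, shl(zext/sext(idx32), splat C)) chain, the new combine hoists the scalar base pointer into the gather's base operand and keeps the shift on the narrow 32-bit indices. With 32-bit indices the backend can issue a single vgatherdps across all 16 lanes instead of widening to 64-bit indices and splitting the gather into two vgatherqps halves, which is what the test diffs below show. A minimal LLVM IR sketch of the kind of input that hits this pattern (function and value names are illustrative, not taken from the PR's tests; the splat constant shorthand assumes a recent LLVM):

declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x float>)

; Gather one f32 field out of an array of 16-byte structs: lane i loads from
; %base + 16 * idx[i]. Clamping the loaded indices keeps the zero-extension
; provably non-negative, which the combine verifies via known bits.
define <16 x float> @gather_struct_field(ptr %base, ptr %idxs, <16 x i1> %mask) {
  %idx = load <16 x i32>, ptr %idxs, align 4
  %idx.clamped = and <16 x i32> %idx, splat (i32 268435455)
  %idx.ext = zext <16 x i32> %idx.clamped to <16 x i64>
  ; A GEP over a 16-byte element type scales each index by 16 (a shl by 4
  ; once lowered) and yields <16 x ptr> for the gather.
  %ptrs = getelementptr inbounds [16 x i8], ptr %base, <16 x i64> %idx.ext
  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> poison)
  ret <16 x float> %res
}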
106 changes: 105 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56508,6 +56508,109 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
Scatter->isTruncatingStore());
}

// Targets override this function to decide whether they want to update the
// base and index value of a non-uniform GEP.
static bool updateBaseAndIndex(SDValue &Base, SDValue &Index, const SDLoc &DL,
const SDValue &Gep, SelectionDAG &DAG) {
SDValue Nbase;
SDValue Nindex;
bool Changed = false;
// This lambda checks the opcode of Index and updates the index accordingly.
auto checkAndUpdateIndex = [&](SDValue &Idx) {
if (Idx.getOpcode() == ISD::SHL) { // shl zext, BV
SDValue Op10 = Idx.getOperand(0); // Zext or Sext value
SDValue Op11 = Idx.getOperand(1); // Build vector of constants

unsigned IndexWidth = Op10.getScalarValueSizeInBits();
if ((Op10.getOpcode() == ISD::SIGN_EXTEND ||
Op10.getOpcode() == ISD::ZERO_EXTEND) &&
IndexWidth > 32 &&
Op10.getOperand(0).getScalarValueSizeInBits() <= 32 &&
DAG.ComputeNumSignBits(Op10) > (IndexWidth - 32) &&
DAG.getValidMinimumShiftAmount(Idx)) {

KnownBits ExtKnown = DAG.computeKnownBits(Op10);
bool ExtIsNonNegative = ExtKnown.isNonNegative();
KnownBits ExtOpKnown = DAG.computeKnownBits(Op10.getOperand(0));
bool ExtOpIsNonNegative = ExtOpKnown.isNonNegative();
if (!ExtIsNonNegative || !ExtOpIsNonNegative)
return false;

SDValue NewOp10 =
Op10.getOperand(0); // Get the Operand zero from the ext
EVT VT = NewOp10.getValueType(); // Use the operand's type to determine
// the type of index

auto *ConstEltNo = dyn_cast<ConstantSDNode>(Op11.getOperand(0));
if (!ConstEltNo)
return false;

SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(),
DAG.getConstant(ConstEltNo->getZExtValue(),
DL, VT.getScalarType()));
Nindex = DAG.getNode(ISD::SHL, DL, VT, NewOp10,
DAG.getBuildVector(VT, DL, Ops));
return true;
}
}
return false;
};

// For the GEP instruction, we try to properly assign the base and the index
// value. We walk through the lowered code and iterate backward.
if (isNullConstant(Base) && Gep.getOpcode() == ISD::ADD) {
SDValue Op0 = Gep.getOperand(0); // base or add
SDValue Op1 = Gep.getOperand(1); // build vector or SHL
Nbase = Op0;
SDValue Idx = Op1;
auto Flags = Gep->getFlags();

if (Op0->getOpcode() == ISD::ADD) { // add t15(base), t18(Idx)
SDValue Op00 = Op0.getOperand(0); // Base
Nbase = Op00;
Idx = Op0.getOperand(1);
} else if (!(Op0->getOpcode() == ISD::BUILD_VECTOR &&
Op0.getOperand(0).getOpcode() == ISD::CopyFromReg)) {
return false;
}
if (!checkAndUpdateIndex(Idx)) {
return false;
}
if (Op0 != Nbase) {
auto *ConstEltNo = dyn_cast<ConstantSDNode>(Op1.getOperand(0));
if (!ConstEltNo)
return false;

SmallVector<SDValue, 8> Ops(
Nindex.getValueType().getVectorNumElements(),
DAG.getConstant(ConstEltNo->getZExtValue(), DL,
Nindex.getValueType().getScalarType()));
Nindex = DAG.getNode(ISD::ADD, DL, Nindex.getValueType(), Nindex,
DAG.getBuildVector(Nindex.getValueType(), DL, Ops),
Flags);
}
Base = Nbase.getOperand(0);
Index = Nindex;
Changed = true;
} else if (Base.getOpcode() == ISD::CopyFromReg ||
(Base.getOpcode() == ISD::ADD &&
Base.getOperand(0).getOpcode() == ISD::CopyFromReg &&
isConstOrConstSplat(Base.getOperand(1)))) {
if (checkAndUpdateIndex(Index)) {
Index = Nindex;
Changed = true;
}
}
if (Changed) {
LLVM_DEBUG(dbgs() << "Successfully updated the non-uniform GEP "
"information\n";
dbgs() << "updated base "; Base.dump();
dbgs() << "updated Index "; Index.dump(););
return true;
}
return false;
}

static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
@@ -56520,6 +56623,8 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();

if (DCI.isBeforeLegalize()) {
if (updateBaseAndIndex(Base, Index, DL, Index, DAG))
return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
unsigned IndexWidth = Index.getScalarValueSizeInBits();

// Shrink indices if they are larger than 32-bits.
@@ -56622,7 +56727,6 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
return SDValue(N, 0);
}
}

return SDValue();
}

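The combine is deliberately conservative: checkAndUpdateIndex only fires when the index is a shl of a sign- or zero-extended 32-bit value by a constant splat, and both the extension and its operand must be provably non-negative per computeKnownBits, since otherwise shrinking the index to 32 bits could change which addresses the lanes reach. A hypothetical counterexample sketch (names are illustrative) that the combine declines to rewrite:

declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x float>)

; Nothing bounds the loaded indices here, so the sign-extended values may be
; negative: the isNonNegative() known-bits checks fail, updateBaseAndIndex
; returns false, and the gather keeps its wide 64-bit index form.
define <16 x float> @gather_maybe_negative(ptr %base, ptr %idxs, <16 x i1> %mask) {
  %idx = load <16 x i32>, ptr %idxs, align 4
  %idx.ext = sext <16 x i32> %idx to <16 x i64>
  %ptrs = getelementptr inbounds [16 x i8], ptr %base, <16 x i64> %idx.ext
  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> poison)
  ret <16 x float> %res
}
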
211 changes: 40 additions & 171 deletions llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -4819,18 +4819,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm0
; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
; X64-KNL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
; X64-KNL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm0
; X64-KNL-NEXT: vgatherdps (%rdi,%zmm0), %zmm1 {%k1}
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index:
@@ -4845,44 +4836,15 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16
; X86-KNL-NEXT: vmovaps %zmm1, %zmm0
; X86-KNL-NEXT: retl
;
; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index:
; X64-SKX-SMALL: # %bb.0:
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: retq
;
; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index:
; X64-SKX-LARGE: # %bb.0:
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm3 {%k2}
; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm0), %ymm1 {%k1}
; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: retq
; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index:
; X64-SKX: # %bb.0:
; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm0
; X64-SKX-NEXT: vgatherdps (%rdi,%zmm0), %zmm1 {%k1}
; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-NEXT: retq
;
; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index:
; X86-SKX: # %bb.0:
@@ -4909,18 +4871,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm0
; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
; X64-KNL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm0
; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0), %zmm1 {%k1}
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test_gather_structpt_16f32_mask_index_offset:
@@ -4935,44 +4888,15 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a
; X86-KNL-NEXT: vmovaps %zmm1, %zmm0
; X86-KNL-NEXT: retl
;
; X64-SKX-SMALL-LABEL: test_gather_structpt_16f32_mask_index_offset:
; X64-SKX-SMALL: # %bb.0:
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; X64-SKX-SMALL-NEXT: retq
;
; X64-SKX-LARGE-LABEL: test_gather_structpt_16f32_mask_index_offset:
; X64-SKX-LARGE: # %bb.0:
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm3
; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm3 {%k2}
; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm0), %ymm1 {%k1}
; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm0
; X64-SKX-LARGE-NEXT: retq
; X64-SKX-LABEL: test_gather_structpt_16f32_mask_index_offset:
; X64-SKX: # %bb.0:
; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm0
; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm0), %zmm1 {%k1}
; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-NEXT: retq
;
; X86-SKX-LABEL: test_gather_structpt_16f32_mask_index_offset:
; X86-SKX: # %bb.0:
@@ -4999,25 +4923,11 @@ define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, p
; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-KNL-NEXT: vpsllq $4, %zmm0, %zmm3
; X64-KNL-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
; X64-KNL-NEXT: kshiftrw $8, %k1, %k2
; X64-KNL-NEXT: kmovw %k2, %k3
; X64-KNL-NEXT: vmovaps %ymm4, %ymm0
; X64-KNL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
; X64-KNL-NEXT: vmovaps %ymm1, %ymm5
; X64-KNL-NEXT: kmovw %k1, %k3
; X64-KNL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
; X64-KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
; X64-KNL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
; X64-KNL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
; X64-KNL-NEXT: vpslld $4, (%rsi), %zmm2
; X64-KNL-NEXT: kmovw %k1, %k2
; X64-KNL-NEXT: vmovaps %zmm1, %zmm0
; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2), %zmm0 {%k2}
; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm2), %zmm1 {%k1}
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test_gather_16f32_mask_index_pair:
@@ -5034,58 +4944,17 @@ define {<16 x float>, <16 x float>} @test_gather_16f32_mask_index_pair(ptr %x, p
; X86-KNL-NEXT: vgatherdps 4(%eax,%zmm2), %zmm1 {%k1}
; X86-KNL-NEXT: retl
;
; X64-SKX-SMALL-LABEL: test_gather_16f32_mask_index_pair:
; X64-SKX-SMALL: # %bb.0:
; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-SKX-SMALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm0, %zmm3
; X64-SKX-SMALL-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-SKX-SMALL-NEXT: vextractf64x4 $1, %zmm1, %ymm4
; X64-SKX-SMALL-NEXT: kshiftrw $8, %k1, %k2
; X64-SKX-SMALL-NEXT: kmovw %k2, %k3
; X64-SKX-SMALL-NEXT: vmovaps %ymm4, %ymm0
; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
; X64-SKX-SMALL-NEXT: vmovaps %ymm1, %ymm5
; X64-SKX-SMALL-NEXT: kmovw %k1, %k3
; X64-SKX-SMALL-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
; X64-SKX-SMALL-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
; X64-SKX-SMALL-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
; X64-SKX-SMALL-NEXT: retq
;
; X64-SKX-LARGE-LABEL: test_gather_16f32_mask_index_pair:
; X64-SKX-LARGE: # %bb.0:
; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0
; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
; X64-SKX-LARGE-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm0, %zmm3
; X64-SKX-LARGE-NEXT: vpsllq $4, %zmm2, %zmm2
; X64-SKX-LARGE-NEXT: vextractf64x4 $1, %zmm1, %ymm4
; X64-SKX-LARGE-NEXT: kshiftrw $8, %k1, %k2
; X64-SKX-LARGE-NEXT: vmovaps %ymm4, %ymm0
; X64-SKX-LARGE-NEXT: kmovw %k2, %k3
; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm2), %ymm0 {%k3}
; X64-SKX-LARGE-NEXT: vmovaps %ymm1, %ymm5
; X64-SKX-LARGE-NEXT: kmovw %k1, %k3
; X64-SKX-LARGE-NEXT: vgatherqps (%rdi,%zmm3), %ymm5 {%k3}
; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm0, %zmm5, %zmm0
; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm2), %ymm4 {%k2}
; X64-SKX-LARGE-NEXT: vgatherqps 4(%rdi,%zmm3), %ymm1 {%k1}
; X64-SKX-LARGE-NEXT: vinsertf64x4 $1, %ymm4, %zmm1, %zmm1
; X64-SKX-LARGE-NEXT: retq
; X64-SKX-LABEL: test_gather_16f32_mask_index_pair:
; X64-SKX: # %bb.0:
; X64-SKX-NEXT: vpmovsxbd %xmm0, %zmm0
; X64-SKX-NEXT: vpslld $31, %zmm0, %zmm0
; X64-SKX-NEXT: vpmovd2m %zmm0, %k1
; X64-SKX-NEXT: vpslld $4, (%rsi), %zmm2
; X64-SKX-NEXT: kmovw %k1, %k2
; X64-SKX-NEXT: vmovaps %zmm1, %zmm0
; X64-SKX-NEXT: vgatherdps (%rdi,%zmm2), %zmm0 {%k2}
; X64-SKX-NEXT: vgatherdps 4(%rdi,%zmm2), %zmm1 {%k1}
; X64-SKX-NEXT: retq
;
; X86-SKX-LABEL: test_gather_16f32_mask_index_pair:
; X86-SKX: # %bb.0:
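The CHECK lines above are in the style produced by llvm/utils/update_llc_test_checks.py; assuming the file's existing RUN lines for the KNL and SKX AVX-512 configurations, re-running that script after this change regenerates the single vpslld + vgatherdps sequences in place of the old split vgatherqps pairs, and collapses the previously divergent X64-SKX-SMALL and X64-SKX-LARGE code-model blocks into a shared X64-SKX block.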