
[SLP]Add subvector vectorization for non-load nodes #108430


Merged
Changes from 3 commits
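In short: BoUpSLP::transformNodes() could already split a gather node into smaller vectorizable subvectors when the scalars were loads; this patch extends the transform to non-load nodes, guarded by new profitability checks (splats, constants, operand quality, and an extra cost gate at VF = 2). Below is a minimal hand-written LLVM IR sketch of the shape this produces; the function name, types, and values are illustrative, not taken from the patch:

```llvm
; Two halves of a 4-wide buildvector are each vectorizable as a 2-wide mul
; node, so the gather is rebuilt from subvectors with llvm.vector.insert,
; the same pattern that appears in the updated test output below.
define <4 x i32> @subvector_sketch(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
  %lo = mul <2 x i32> %a, %b   ; first slice, VF = 2
  %hi = mul <2 x i32> %c, %d   ; second slice, VF = 2
  %v0 = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> %lo, i64 0)
  %v1 = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> %v0, <2 x i32> %hi, i64 2)
  ret <4 x i32> %v1
}

declare <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32>, <2 x i32>, i64)
```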
154 changes: 134 additions & 20 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
(Diff notation: lines prefixed with "-" were removed by this patch; all other lines show the updated code.)
@@ -1347,6 +1347,7 @@ class BoUpSLP {
}
MinBWs.clear();
ReductionBitWidth = 0;
BaseGraphSize = 1;
CastMaxMinBWSizes.reset();
ExtraBitWidthNodes.clear();
InstrElementSize.clear();
@@ -1357,6 +1358,9 @@

unsigned getTreeSize() const { return VectorizableTree.size(); }

/// Returns the base graph size, before any transformations.
unsigned getCanonicalGraphSize() const { return BaseGraphSize; }

/// Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();

@@ -4142,6 +4146,9 @@ class BoUpSLP {
/// reduction.
unsigned ReductionBitWidth = 0;

/// Canonical graph size before the transformations.
unsigned BaseGraphSize = 1;

/// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
/// type sizes, used in the tree.
std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
@@ -8447,47 +8454,147 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,

void BoUpSLP::transformNodes() {
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
BaseGraphSize = VectorizableTree.size();
// Operands are profitable if they are:
// 1. At least one constant
// or
// 2. Splats
// or
// 3. Results in good vectorization opportunity, i.e. may generate vector
// nodes and reduce cost of the graph.
auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
const InstructionsState &S) {
SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
for (unsigned Op : seq<unsigned>(S.MainOp->getNumOperands()))
Candidates.emplace_back().emplace_back(I1->getOperand(Op),
I2->getOperand(Op));
return all_of(
Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
return all_of(Cand,
[](const std::pair<Value *, Value *> &P) {
return isa<Constant>(P.first) ||
isa<Constant>(P.second) || P.first == P.second;
}) ||
findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
});
};
  // The tree may grow here, so iterate over the nodes built before.
-  for (unsigned Idx : seq<unsigned>(VectorizableTree.size())) {
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
TreeEntry &E = *VectorizableTree[Idx];
if (E.isGather()) {
ArrayRef<Value *> VL = E.Scalars;
const unsigned Sz = getVectorElementSize(VL.front());
unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same non-load opcode in the same parent block, all-constant
      // nodes, or splats.
      if (VL.size() <= 2 ||
-          (E.getOpcode() &&
-           (E.isAltShuffle() || E.getOpcode() != Instruction::Load)))
          !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
            E.isAltShuffle() || !allSameBlock(VL)) ||
          allConstant(VL) || isSplat(VL))
        continue;
// Try to find vectorizable sequences and transform them into a series of
// insertvector instructions.
unsigned StartIdx = 0;
unsigned End = VL.size();
-      for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
      for (unsigned VF = VL.size() / 2; VF >= MinVF; VF = bit_ceil(VF) / 2) {
SmallVector<unsigned> Slices;
for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
-          // If any instruction is vectorized already - do not try again.
-          if (getTreeEntry(Slice.front()) || getTreeEntry(Slice.back()))
-            continue;
          // Reuse the existing node, if it fully matches the slice.
          if (const TreeEntry *SE = getTreeEntry(Slice.front());
              SE || getTreeEntry(Slice.back())) {
            if (!SE)
              continue;
            if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
              continue;
          }
// Constant already handled effectively - skip.
if (allConstant(Slice))
continue;
-          InstructionsState S = getSameOpcode(Slice, *TLI);
-          if (!S.getOpcode() || S.isAltShuffle() ||
-              (S.getOpcode() != Instruction::Load &&
-               any_of(Slice, [&](Value *V) {
-                 return !areAllUsersVectorized(cast<Instruction>(V),
-                                               UserIgnoreList);
-               })))
-            continue;
          // Do not try to vectorize small splats (smaller than a vector
          // register and with only a single non-undef element).
bool IsSplat = isSplat(Slice);
if (Slices.empty() || !IsSplat ||
(VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
Slice.front()->getType(), VF)),
1U, VF - 1) !=
std::clamp(TTI->getNumberOfParts(getWidenedType(
Slice.front()->getType(), 2 * VF)),
1U, 2 * VF)) ||
count(Slice, Slice.front()) ==
(isa<UndefValue>(Slice.front()) ? VF - 1 : 1)) {
if (IsSplat)
continue;
InstructionsState S = getSameOpcode(Slice, *TLI);
if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice))
continue;
if (VF == 2) {
// Try to vectorize reduced values or if all users are vectorized.
// For expensive instructions extra extracts might be profitable.
if ((!UserIgnoreList || E.Idx != 0) &&
TTI->getInstructionCost(cast<Instruction>(Slice.front()),
CostKind) < TTI::TCC_Expensive &&
!all_of(Slice, [&](Value *V) {
return areAllUsersVectorized(cast<Instruction>(V),
UserIgnoreList);
}))
continue;
if (S.getOpcode() == Instruction::Load) {
OrdersType Order;
SmallVector<Value *> PointerOps;
LoadsState Res =
canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
// Do not vectorize gathers.
if (Res == LoadsState::ScatterVectorize ||
Res == LoadsState::Gather)
continue;
} else if (S.getOpcode() == Instruction::ExtractElement ||
(TTI->getInstructionCost(
cast<Instruction>(Slice.front()), CostKind) <
TTI::TCC_Expensive &&
!CheckOperandsProfitability(
cast<Instruction>(Slice.front()),
cast<Instruction>(Slice.back()), S))) {
                // Do not vectorize extractelements (handled effectively
                // already). Do not vectorize non-profitable instructions (with
                // low cost and non-vectorizable operands).
continue;
}
}
}
Slices.emplace_back(Cnt);
}
auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt) {
E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
if (StartIdx == Cnt)
StartIdx = Cnt + VF;
if (End == Cnt + VF)
End = Cnt;
};
for (unsigned Cnt : Slices) {
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
// If any instruction is vectorized already - do not try again.
if (const TreeEntry *SE = getTreeEntry(Slice.front());
SE || getTreeEntry(Slice.back())) {
if (!SE)
continue;
if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
continue;
AddCombinedNode(SE->Idx, Cnt);
continue;
}
unsigned PrevSize = VectorizableTree.size();
buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
if (PrevSize + 1 == VectorizableTree.size() &&
-              VectorizableTree[PrevSize]->isGather()) {
VectorizableTree[PrevSize]->isGather() &&
VectorizableTree[PrevSize]->getOpcode() !=
Instruction::ExtractElement &&
!isSplat(Slice)) {
VectorizableTree.pop_back();
continue;
}
-          E.CombinedEntriesWithIndices.emplace_back(PrevSize, Cnt);
-          if (StartIdx == Cnt)
-            StartIdx = Cnt + VF;
-          if (End == Cnt + VF)
-            End = Cnt;
AddCombinedNode(PrevSize, Cnt);
}
}
}
@@ -11691,6 +11798,13 @@ BoUpSLP::isGatherShuffledEntry(
"Expected only single user of the gather node.");
assert(VL.size() % NumParts == 0 &&
"Number of scalars must be divisible by NumParts.");
if (TE->UserTreeIndices.front().UserTE->isGather() &&
TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
assert((TE->getOpcode() == Instruction::ExtractElement ||
isSplat(TE->Scalars)) &&
"Expected splat or extractelements only node.");
return {};
}
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
SmallVector<std::optional<TTI::ShuffleKind>> Res;
for (unsigned Part : seq<unsigned>(NumParts)) {
@@ -16467,7 +16581,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
if (R.isGathered(Chain.front()) ||
R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
return std::nullopt;
-    Size = R.getTreeSize();
Size = R.getCanonicalGraphSize();
return false;
}
R.reorderTopToBottom();
@@ -16477,7 +16591,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,

R.computeMinimumValueSizes();

-    Size = R.getTreeSize();
Size = R.getCanonicalGraphSize();
if (S.getOpcode() == Instruction::Load)
Size = 2; // cut off masked gather small trees
InstructionCost Cost = R.getTreeCost();
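For intuition about the CheckOperandsProfitability helper added in transformNodes() above: a pair of non-load instructions is only formed into a 2-wide node when every operand pair is a constant, a splat (the same value twice), or scores well under the look-ahead splat-load heuristic. A hand-written sketch of a scalar pair that passes the constant-or-splat test (illustrative only, not from the patch's tests):

```llvm
; Operand pair 0 is (%x, %x), a splat; operand pair 1 is (1, 2), constants.
; Every operand pair passes the check, so this add pair is a plausible
; 2-wide vectorization candidate.
define <2 x i32> @profitable_pair(i32 %x) {
  %a0 = add i32 %x, 1
  %a1 = add i32 %x, 2
  %v0 = insertelement <2 x i32> poison, i32 %a0, i64 0
  %v1 = insertelement <2 x i32> %v0, i32 %a1, i64 1
  ret <2 x i32> %v1
}
```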
24 changes: 12 additions & 12 deletions llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -685,10 +685,10 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1
; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64
; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]]
-; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL]], 2
-; CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[ADD18]] to i64
-; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM19]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4
; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1
; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64
; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]]
; CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3
; CHECK-NEXT: [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64
; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]]
@@ -700,8 +700,8 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]]
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
-; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM19]]
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX56]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4
; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]]
; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]]
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4
; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
@@ -715,21 +715,21 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 28
-; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4
; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 32
; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4
; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4
; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]]
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 44
; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36
; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4
; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4
; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4
; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4
-; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX84]], align 4
-; CHECK-NEXT: store i32 [[MUL85]], ptr [[ARRAYIDX76]], align 4
; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4
; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4
; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
(Additional changed test file; filename not captured in this view.)
@@ -255,24 +255,19 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) {
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0
; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0)
; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP2]], i64 12)
; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP11]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L_9]], i32 9
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L_10]], i32 10
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[L_11]], i32 11
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP16]], <16 x i8> [[TMP18]]
-; CHECK-NEXT: store <16 x i8> [[TMP19]], ptr [[PTR]], align 2
; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0)
; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP10]], <4 x i8> [[TMP2]], i64 12)
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP11]], <16 x i8> [[TMP13]]
; CHECK-NEXT: store <16 x i8> [[TMP14]], ptr [[PTR]], align 2
; CHECK-NEXT: ret void
;
entry:
(Additional changed test file; filename not captured in this view.)
@@ -12,12 +12,12 @@ define void @test() {
; CHECK-NEXT: ret void
; CHECK: [[BB6]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP8:%.*]], %[[BB6]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> <i32 0, i32 0, i32 poison, i32 poison>, <2 x i32> [[TMP1]], i64 2)
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>, <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 5, i32 4>
; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i32> zeroinitializer, [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> zeroinitializer, [[TMP2]]
; CHECK-NEXT: [[TMP5]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <2 x i32> <i32 2, i32 poison>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
; CHECK-NEXT: [[TMP8]] = mul <2 x i32> zeroinitializer, [[TMP7]]
; CHECK-NEXT: br i1 false, label %[[BB2]], label %[[BB6]]
;
Expand Down