Skip to content

Commit 3469db8

Browse files
[SLP] Add subvector vectorization for non-load nodes
Previously, the SLP vectorizer supported clustered vectorization only for loads. This patch adds support for "clustered" vectorization of other instructions as well. If a buildvector node contains "clusters" that can be vectorized separately and then inserted into the resulting buildvector result, it is better to do so, since this may reduce the cost of the vector graph and produce better vector code. The patch performs analysis to determine whether this kind of extra vectorization is profitable: it checks the scalar instructions and their operands, and vectorizes them only if they result in a better graph. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #108430
1 parent aea0668 commit 3469db8

File tree

8 files changed

+212
-103
lines changed

8 files changed

+212
-103
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 136 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1347,6 +1347,7 @@ class BoUpSLP {
13471347
}
13481348
MinBWs.clear();
13491349
ReductionBitWidth = 0;
1350+
BaseGraphSize = 1;
13501351
CastMaxMinBWSizes.reset();
13511352
ExtraBitWidthNodes.clear();
13521353
InstrElementSize.clear();
@@ -1355,11 +1356,10 @@ class BoUpSLP {
13551356
ValueToGatherNodes.clear();
13561357
}
13571358

1358-
unsigned getTreeSize() const {
1359-
return GatheredLoadsEntriesFirst == NoGatheredLoads
1360-
? VectorizableTree.size()
1361-
: GatheredLoadsEntriesFirst;
1362-
}
1359+
unsigned getTreeSize() const { return VectorizableTree.size(); }
1360+
1361+
/// Returns the base graph size, before any transformations.
1362+
unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
13631363

13641364
/// Perform LICM and CSE on the newly generated gather sequences.
13651365
void optimizeGatherSequence();
@@ -4191,6 +4191,9 @@ class BoUpSLP {
41914191
/// reduction.
41924192
unsigned ReductionBitWidth = 0;
41934193

4194+
/// Canonical graph size before the transformations.
4195+
unsigned BaseGraphSize = 1;
4196+
41944197
/// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
41954198
/// type sizes, used in the tree.
41964199
std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
@@ -9001,47 +9004,147 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
90019004

90029005
void BoUpSLP::transformNodes() {
90039006
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9007+
BaseGraphSize = VectorizableTree.size();
9008+
// Operands are profitable if they are:
9009+
// 1. At least one constant
9010+
// or
9011+
// 2. Splats
9012+
// or
9013+
// 3. Results in good vectorization opportunity, i.e. may generate vector
9014+
// nodes and reduce cost of the graph.
9015+
auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
9016+
const InstructionsState &S) {
9017+
SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
9018+
for (unsigned Op : seq<unsigned>(S.MainOp->getNumOperands()))
9019+
Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9020+
I2->getOperand(Op));
9021+
return all_of(
9022+
Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9023+
return all_of(Cand,
9024+
[](const std::pair<Value *, Value *> &P) {
9025+
return isa<Constant>(P.first) ||
9026+
isa<Constant>(P.second) || P.first == P.second;
9027+
}) ||
9028+
findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
9029+
});
9030+
};
90049031
// The tree may grow here, so iterate over nodes, built before.
9005-
for (unsigned Idx : seq<unsigned>(VectorizableTree.size())) {
9032+
for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
90069033
TreeEntry &E = *VectorizableTree[Idx];
90079034
if (E.isGather()) {
90089035
ArrayRef<Value *> VL = E.Scalars;
90099036
const unsigned Sz = getVectorElementSize(VL.front());
90109037
unsigned MinVF = getMinVF(2 * Sz);
9038+
// Do not try partial vectorization for small nodes (<= 2), nodes with the
9039+
// same opcode and same parent block or all constants.
90119040
if (VL.size() <= 2 ||
9012-
(E.getOpcode() &&
9013-
(E.isAltShuffle() || E.getOpcode() != Instruction::Load)))
9041+
!(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
9042+
E.isAltShuffle() || !allSameBlock(VL)) ||
9043+
allConstant(VL) || isSplat(VL))
90149044
continue;
90159045
// Try to find vectorizable sequences and transform them into a series of
90169046
// insertvector instructions.
90179047
unsigned StartIdx = 0;
90189048
unsigned End = VL.size();
9019-
for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
9049+
for (unsigned VF = VL.size() / 2; VF >= MinVF; VF = bit_ceil(VF) / 2) {
9050+
SmallVector<unsigned> Slices;
90209051
for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
90219052
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
90229053
// If any instruction is vectorized already - do not try again.
9023-
if (getTreeEntry(Slice.front()) || getTreeEntry(Slice.back()))
9054+
// Reuse the existing node, if it fully matches the slice.
9055+
if (const TreeEntry *SE = getTreeEntry(Slice.front());
9056+
SE || getTreeEntry(Slice.back())) {
9057+
if (!SE)
9058+
continue;
9059+
if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9060+
continue;
9061+
}
9062+
// Constant already handled effectively - skip.
9063+
if (allConstant(Slice))
90249064
continue;
9025-
InstructionsState S = getSameOpcode(Slice, *TLI);
9026-
if (!S.getOpcode() || S.isAltShuffle() ||
9027-
(S.getOpcode() != Instruction::Load &&
9028-
any_of(Slice, [&](Value *V) {
9029-
return !areAllUsersVectorized(cast<Instruction>(V),
9030-
UserIgnoreList);
9031-
})))
9065+
// Do not try to vectorize small splats (less than vector register and
9066+
// only with the single non-undef element).
9067+
bool IsSplat = isSplat(Slice);
9068+
if (Slices.empty() || !IsSplat ||
9069+
(VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9070+
Slice.front()->getType(), VF)),
9071+
1U, VF - 1) !=
9072+
std::clamp(TTI->getNumberOfParts(getWidenedType(
9073+
Slice.front()->getType(), 2 * VF)),
9074+
1U, 2 * VF)) ||
9075+
count(Slice, Slice.front()) ==
9076+
(isa<UndefValue>(Slice.front()) ? VF - 1 : 1)) {
9077+
if (IsSplat)
9078+
continue;
9079+
InstructionsState S = getSameOpcode(Slice, *TLI);
9080+
if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice))
9081+
continue;
9082+
if (VF == 2) {
9083+
// Try to vectorize reduced values or if all users are vectorized.
9084+
// For expensive instructions extra extracts might be profitable.
9085+
if ((!UserIgnoreList || E.Idx != 0) &&
9086+
TTI->getInstructionCost(cast<Instruction>(Slice.front()),
9087+
CostKind) < TTI::TCC_Expensive &&
9088+
!all_of(Slice, [&](Value *V) {
9089+
return areAllUsersVectorized(cast<Instruction>(V),
9090+
UserIgnoreList);
9091+
}))
9092+
continue;
9093+
if (S.getOpcode() == Instruction::Load) {
9094+
OrdersType Order;
9095+
SmallVector<Value *> PointerOps;
9096+
LoadsState Res =
9097+
canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9098+
// Do not vectorize gathers.
9099+
if (Res == LoadsState::ScatterVectorize ||
9100+
Res == LoadsState::Gather)
9101+
continue;
9102+
} else if (S.getOpcode() == Instruction::ExtractElement ||
9103+
(TTI->getInstructionCost(
9104+
cast<Instruction>(Slice.front()), CostKind) <
9105+
TTI::TCC_Expensive &&
9106+
!CheckOperandsProfitability(
9107+
cast<Instruction>(Slice.front()),
9108+
cast<Instruction>(Slice.back()), S))) {
9109+
// Do not vectorize extractelements (handled effectively
9110+
// alread). Do not vectorize non-profitable instructions (with
9111+
// low cost and non-vectorizable operands.)
9112+
continue;
9113+
}
9114+
}
9115+
}
9116+
Slices.emplace_back(Cnt);
9117+
}
9118+
auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt) {
9119+
E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9120+
if (StartIdx == Cnt)
9121+
StartIdx = Cnt + VF;
9122+
if (End == Cnt + VF)
9123+
End = Cnt;
9124+
};
9125+
for (unsigned Cnt : Slices) {
9126+
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9127+
// If any instruction is vectorized already - do not try again.
9128+
if (const TreeEntry *SE = getTreeEntry(Slice.front());
9129+
SE || getTreeEntry(Slice.back())) {
9130+
if (!SE)
9131+
continue;
9132+
if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9133+
continue;
9134+
AddCombinedNode(SE->Idx, Cnt);
90329135
continue;
9136+
}
90339137
unsigned PrevSize = VectorizableTree.size();
90349138
buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
90359139
if (PrevSize + 1 == VectorizableTree.size() &&
9036-
VectorizableTree[PrevSize]->isGather()) {
9140+
VectorizableTree[PrevSize]->isGather() &&
9141+
VectorizableTree[PrevSize]->getOpcode() !=
9142+
Instruction::ExtractElement &&
9143+
!isSplat(Slice)) {
90379144
VectorizableTree.pop_back();
90389145
continue;
90399146
}
9040-
E.CombinedEntriesWithIndices.emplace_back(PrevSize, Cnt);
9041-
if (StartIdx == Cnt)
9042-
StartIdx = Cnt + VF;
9043-
if (End == Cnt + VF)
9044-
End = Cnt;
9147+
AddCombinedNode(PrevSize, Cnt);
90459148
}
90469149
}
90479150
}
@@ -12293,6 +12396,14 @@ BoUpSLP::isGatherShuffledEntry(
1229312396
"Expected only single user of the gather node.");
1229412397
assert(VL.size() % NumParts == 0 &&
1229512398
"Number of scalars must be divisible by NumParts.");
12399+
if (!TE->UserTreeIndices.empty() &&
12400+
TE->UserTreeIndices.front().UserTE->isGather() &&
12401+
TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
12402+
assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
12403+
isSplat(TE->Scalars)) &&
12404+
"Expected splat or extractelements only node.");
12405+
return {};
12406+
}
1229612407
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
1229712408
SmallVector<std::optional<TTI::ShuffleKind>> Res;
1229812409
for (unsigned Part : seq<unsigned>(NumParts)) {
@@ -17119,7 +17230,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
1711917230
if (R.isGathered(Chain.front()) ||
1712017231
R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
1712117232
return std::nullopt;
17122-
Size = R.getTreeSize();
17233+
Size = R.getCanonicalGraphSize();
1712317234
return false;
1712417235
}
1712517236
R.reorderTopToBottom();
@@ -17129,7 +17240,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
1712917240

1713017241
R.computeMinimumValueSizes();
1713117242

17132-
Size = R.getTreeSize();
17243+
Size = R.getCanonicalGraphSize();
1713317244
if (S.getOpcode() == Instruction::Load)
1713417245
Size = 2; // cut off masked gather small trees
1713517246
InstructionCost Cost = R.getTreeCost();

llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -685,10 +685,10 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
685685
; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1
686686
; CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64
687687
; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]]
688-
; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL]], 2
689-
; CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[ADD18]] to i64
690-
; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM19]]
691-
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4
688+
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4
689+
; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1
690+
; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64
691+
; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]]
692692
; CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3
693693
; CHECK-NEXT: [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64
694694
; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]]
@@ -700,8 +700,8 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
700700
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
701701
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]]
702702
; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]]
703-
; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM19]]
704-
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX56]], align 4
703+
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4
704+
; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]]
705705
; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]]
706706
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4
707707
; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]]
@@ -715,21 +715,21 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
715715
; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]]
716716
; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]]
717717
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
718-
; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 28
719-
; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4
720-
; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4
718+
; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
719+
; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 32
720+
; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4
721+
; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4
721722
; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]]
722723
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
723-
; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]]
724724
; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]]
725725
; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 44
726726
; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36
727727
; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4
728728
; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4
729729
; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4
730730
; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4
731-
; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX84]], align 4
732-
; CHECK-NEXT: store i32 [[MUL85]], ptr [[ARRAYIDX76]], align 4
731+
; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4
732+
; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4
733733
; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4
734734
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]]
735735
; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>

llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -259,10 +259,12 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) {
259259
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0)
260260
; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP3]], i64 12)
261261
; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
262-
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0
263-
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> poison, <16 x i32> zeroinitializer
264-
; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP8]], <16 x i8> [[TMP11]]
265-
; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[PTR]], align 2
262+
; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP8]], <8 x i8> [[TMP0]], i64 0)
263+
; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP10]], <4 x i8> [[TMP3]], i64 12)
264+
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0
265+
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> poison, <16 x i32> zeroinitializer
266+
; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP11]], <16 x i8> [[TMP13]]
267+
; CHECK-NEXT: store <16 x i8> [[TMP14]], ptr [[PTR]], align 2
266268
; CHECK-NEXT: ret void
267269
;
268270
entry:

llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,12 @@ define void @test() {
1212
; CHECK-NEXT: ret void
1313
; CHECK: [[BB6]]:
1414
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP8:%.*]], %[[BB6]] ]
15-
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> <i32 0, i32 0, i32 poison, i32 poison>, <2 x i32> [[TMP1]], i64 2)
15+
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
16+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>, <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 5, i32 4>
1617
; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i32> zeroinitializer, [[TMP2]]
1718
; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> zeroinitializer, [[TMP2]]
1819
; CHECK-NEXT: [[TMP5]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
19-
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <2 x i32> <i32 2, i32 poison>
20-
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> <i32 poison, i32 0>, <2 x i32> <i32 0, i32 3>
20+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 poison>, <2 x i32> <i32 2, i32 1>
2121
; CHECK-NEXT: [[TMP8]] = mul <2 x i32> zeroinitializer, [[TMP7]]
2222
; CHECK-NEXT: br i1 false, label %[[BB2]], label %[[BB6]]
2323
;

0 commit comments

Comments
 (0)