Skip to content

Commit 30edf1c

Browse files
[SLP]Do not early exit if the number of unique elements is non-power-of-2. (#65476)
We can still try to vectorize the bundle of instructions, even if the number of repeated instructions is non-power-of-2. In this case, we need to adjust the cost (calculate the cost only for the unique scalar instructions) and the cost of the extracts. Also, when scheduling the bundle, we need to schedule only the unique scalars to avoid a compiler crash caused by multiple dependencies. This can be safely applied only if all of the scalars' users are also vectorized and do not require memory accesses (this is a temporary requirement and can be relaxed later). --------- Co-authored-by: Alexey Bataev <[email protected]>
1 parent 2c93e3c commit 30edf1c

File tree

6 files changed

+108
-105
lines changed

6 files changed

+108
-105
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 54 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -2911,7 +2911,8 @@ class BoUpSLP {
29112911
}
29122912
if (Last->State != TreeEntry::NeedToGather) {
29132913
for (Value *V : VL) {
2914-
assert(!getTreeEntry(V) && "Scalar already in tree!");
2914+
[[maybe_unused]] const TreeEntry *TE = getTreeEntry(V);
2915+
assert((!TE || TE == Last) && "Scalar already in tree!");
29152916
ScalarToTreeEntry[V] = Last;
29162917
}
29172918
// Update the scheduler bundle to point to this TreeEntry.
@@ -2924,7 +2925,8 @@ class BoUpSLP {
29242925
for (Value *V : VL) {
29252926
if (doesNotNeedToBeScheduled(V))
29262927
continue;
2927-
assert(BundleMember && "Unexpected end of bundle.");
2928+
if (!BundleMember)
2929+
continue;
29282930
BundleMember->TE = Last;
29292931
BundleMember = BundleMember->NextInBundle;
29302932
}
@@ -5583,9 +5585,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
55835585

55845586
SmallVector<int> ReuseShuffleIndicies;
55855587
SmallVector<Value *> UniqueValues;
5586-
auto &&TryToFindDuplicates = [&VL, &ReuseShuffleIndicies, &UniqueValues,
5587-
&UserTreeIdx,
5588-
this](const InstructionsState &S) {
5588+
SmallVector<Value *> NonUniqueValueVL;
5589+
auto TryToFindDuplicates = [&](const InstructionsState &S,
5590+
bool DoNotFail = false) {
55895591
// Check that every instruction appears once in this bundle.
55905592
DenseMap<Value *, unsigned> UniquePositions(VL.size());
55915593
for (Value *V : VL) {
@@ -5612,6 +5614,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
56125614
!isConstant(V);
56135615
})) ||
56145616
!llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
5617+
if (DoNotFail && UniquePositions.size() > 1 &&
5618+
NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
5619+
all_of(UniqueValues, [=](Value *V) {
5620+
return isa<ExtractElementInst>(V) ||
5621+
areAllUsersVectorized(cast<Instruction>(V),
5622+
UserIgnoreList);
5623+
})) {
5624+
unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
5625+
if (PWSz == VL.size()) {
5626+
ReuseShuffleIndicies.clear();
5627+
} else {
5628+
NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
5629+
NonUniqueValueVL.append(PWSz - UniqueValues.size(),
5630+
UniqueValues.back());
5631+
VL = NonUniqueValueVL;
5632+
}
5633+
return true;
5634+
}
56155635
LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
56165636
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
56175637
return false;
@@ -5857,7 +5877,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
58575877
}
58585878

58595879
// Check that every instruction appears once in this bundle.
5860-
if (!TryToFindDuplicates(S))
5880+
if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
58615881
return;
58625882

58635883
// Perform specific checks for each particular instruction kind.
@@ -5877,7 +5897,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
58775897

58785898
BlockScheduling &BS = *BSRef;
58795899

5880-
std::optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
5900+
std::optional<ScheduleData *> Bundle =
5901+
BS.tryScheduleBundle(UniqueValues, this, S);
58815902
#ifdef EXPENSIVE_CHECKS
58825903
// Make sure we didn't break any internal invariants
58835904
BS.verify();
@@ -7537,7 +7558,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
75377558
Instruction *VL0 = E->getMainOp();
75387559
unsigned ShuffleOrOp =
75397560
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
7540-
const unsigned Sz = VL.size();
7561+
SetVector<Value *> UniqueValues(VL.begin(), VL.end());
7562+
const unsigned Sz = UniqueValues.size();
75417563
auto GetCostDiff =
75427564
[=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
75437565
function_ref<InstructionCost(InstructionCost)> VectorCost) {
@@ -7644,7 +7666,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
76447666
// Count reused scalars.
76457667
InstructionCost ScalarCost = 0;
76467668
SmallPtrSet<const TreeEntry *, 4> CountedOps;
7647-
for (Value *V : VL) {
7669+
for (Value *V : UniqueValues) {
76487670
auto *PHI = dyn_cast<PHINode>(V);
76497671
if (!PHI)
76507672
continue;
@@ -7665,8 +7687,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
76657687
}
76667688
case Instruction::ExtractValue:
76677689
case Instruction::ExtractElement: {
7668-
auto GetScalarCost = [=](unsigned Idx) {
7669-
auto *I = cast<Instruction>(VL[Idx]);
7690+
auto GetScalarCost = [&](unsigned Idx) {
7691+
auto *I = cast<Instruction>(UniqueValues[Idx]);
76707692
VectorType *SrcVecTy;
76717693
if (ShuffleOrOp == Instruction::ExtractElement) {
76727694
auto *EE = cast<ExtractElementInst>(I);
@@ -7844,9 +7866,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
78447866
Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
78457867
}
78467868
}
7847-
auto GetScalarCost = [=](unsigned Idx) {
7848-
auto *VI =
7849-
VL0->getOpcode() == Opcode ? cast<Instruction>(VL[Idx]) : nullptr;
7869+
auto GetScalarCost = [&](unsigned Idx) {
7870+
auto *VI = VL0->getOpcode() == Opcode
7871+
? cast<Instruction>(UniqueValues[Idx])
7872+
: nullptr;
78507873
return TTI->getCastInstrCost(Opcode, ScalarTy, SrcScalarTy,
78517874
TTI::getCastContextHint(VI), CostKind, VI);
78527875
};
@@ -7891,7 +7914,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
78917914
? CmpInst::BAD_FCMP_PREDICATE
78927915
: CmpInst::BAD_ICMP_PREDICATE;
78937916
auto GetScalarCost = [&](unsigned Idx) {
7894-
auto *VI = cast<Instruction>(VL[Idx]);
7917+
auto *VI = cast<Instruction>(UniqueValues[Idx]);
78957918
CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
78967919
? CmpInst::BAD_FCMP_PREDICATE
78977920
: CmpInst::BAD_ICMP_PREDICATE;
@@ -7951,8 +7974,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
79517974
case Instruction::And:
79527975
case Instruction::Or:
79537976
case Instruction::Xor: {
7954-
auto GetScalarCost = [=](unsigned Idx) {
7955-
auto *VI = cast<Instruction>(VL[Idx]);
7977+
auto GetScalarCost = [&](unsigned Idx) {
7978+
auto *VI = cast<Instruction>(UniqueValues[Idx]);
79567979
unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
79577980
TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
79587981
TTI::OperandValueInfo Op2Info =
@@ -7975,14 +7998,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
79757998
return CommonCost + GetGEPCostDiff(VL, VL0);
79767999
}
79778000
case Instruction::Load: {
7978-
auto GetScalarCost = [=](unsigned Idx) {
7979-
auto *VI = cast<LoadInst>(VL[Idx]);
8001+
auto GetScalarCost = [&](unsigned Idx) {
8002+
auto *VI = cast<LoadInst>(UniqueValues[Idx]);
79808003
return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(),
79818004
VI->getPointerAddressSpace(), CostKind,
79828005
TTI::OperandValueInfo(), VI);
79838006
};
79848007
auto *LI0 = cast<LoadInst>(VL0);
7985-
auto GetVectorCost = [=](InstructionCost CommonCost) {
8008+
auto GetVectorCost = [&](InstructionCost CommonCost) {
79868009
InstructionCost VecLdCost;
79878010
if (E->State == TreeEntry::Vectorize) {
79888011
VecLdCost = TTI->getMemoryOpCost(
@@ -7993,7 +8016,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
79938016
E->State == TreeEntry::PossibleStridedVectorize) &&
79948017
"Unknown EntryState");
79958018
Align CommonAlignment = LI0->getAlign();
7996-
for (Value *V : VL)
8019+
for (Value *V : UniqueValues)
79978020
CommonAlignment =
79988021
std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
79998022
VecLdCost = TTI->getGatherScatterOpCost(
@@ -8045,8 +8068,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
80458068
GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
80468069
}
80478070
case Instruction::Call: {
8048-
auto GetScalarCost = [=](unsigned Idx) {
8049-
auto *CI = cast<CallInst>(VL[Idx]);
8071+
auto GetScalarCost = [&](unsigned Idx) {
8072+
auto *CI = cast<CallInst>(UniqueValues[Idx]);
80508073
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
80518074
if (ID != Intrinsic::not_intrinsic) {
80528075
IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
@@ -8087,8 +8110,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
80878110
}
80888111
return false;
80898112
};
8090-
auto GetScalarCost = [=](unsigned Idx) {
8091-
auto *VI = cast<Instruction>(VL[Idx]);
8113+
auto GetScalarCost = [&](unsigned Idx) {
8114+
auto *VI = cast<Instruction>(UniqueValues[Idx]);
80928115
assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
80938116
(void)E;
80948117
return TTI->getInstructionCost(VI, CostKind);
@@ -8607,6 +8630,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
86078630
SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
86088631
SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
86098632
SmallVector<APInt> DemandedElts;
8633+
SmallDenseSet<Value *, 4> UsedInserts;
86108634
for (ExternalUser &EU : ExternalUses) {
86118635
// We only add extract cost once for the same scalar.
86128636
if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
@@ -8627,6 +8651,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
86278651
// to detect it as a final shuffled/identity match.
86288652
if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
86298653
if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
8654+
if (!UsedInserts.insert(VU).second)
8655+
continue;
86308656
std::optional<unsigned> InsertIdx = getInsertIndex(VU);
86318657
if (InsertIdx) {
86328658
const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
@@ -11008,6 +11034,7 @@ Value *BoUpSLP::vectorizeTree(
1100811034
// Maps extract Scalar to the corresponding extractelement instruction in the
1100911035
// basic block. Only one extractelement per block should be emitted.
1101011036
DenseMap<Value *, DenseMap<BasicBlock *, Instruction *>> ScalarToEEs;
11037+
SmallDenseSet<Value *, 4> UsedInserts;
1101111038
// Extract all of the elements with the external uses.
1101211039
for (const auto &ExternalUse : ExternalUses) {
1101311040
Value *Scalar = ExternalUse.Scalar;
@@ -11106,6 +11133,8 @@ Value *BoUpSLP::vectorizeTree(
1110611133
// Skip if the scalar is another vector op or Vec is not an instruction.
1110711134
if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
1110811135
if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
11136+
if (!UsedInserts.insert(VU).second)
11137+
continue;
1110911138
std::optional<unsigned> InsertIdx = getInsertIndex(VU);
1111011139
if (InsertIdx) {
1111111140
// Need to use original vector, if the root is truncated.

llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -123,22 +123,19 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
123123

124124
define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
125125
; CHECK-LABEL: @build_vec_v4i32_reuse_1(
126-
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1
127-
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0
128-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1
129-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
130-
; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
131-
; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
132-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
133-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
134-
; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
135-
; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
136-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
137-
; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
138-
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>
139-
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
140-
; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
141-
; CHECK-NEXT: ret <4 x i32> [[TMP2_31]]
126+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
127+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
128+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
129+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
130+
; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
131+
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
132+
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
133+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
134+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
135+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
136+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
137+
; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
138+
; CHECK-NEXT: ret <4 x i32> [[TMP11]]
142139
;
143140
%v0.0 = extractelement <2 x i32> %v0, i32 0
144141
%v0.1 = extractelement <2 x i32> %v0, i32 1

llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -123,22 +123,19 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
123123

124124
define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
125125
; CHECK-LABEL: @build_vec_v4i32_reuse_1(
126-
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[V1:%.*]], i64 1
127-
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[V1]], i64 0
128-
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0:%.*]], i64 1
129-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0
130-
; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]]
131-
; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]]
132-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0
133-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0
134-
; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]]
135-
; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]]
136-
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
137-
; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]]
138-
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>
139-
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
140-
; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
141-
; CHECK-NEXT: ret <4 x i32> [[TMP2_31]]
126+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
127+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
128+
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[V0]], i64 1
129+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V1]], i64 1
130+
; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP4]]
131+
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
132+
; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]]
133+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
134+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 2>
135+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP0_1]], i64 0
136+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
137+
; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]]
138+
; CHECK-NEXT: ret <4 x i32> [[TMP11]]
142139
;
143140
%v0.0 = extractelement <2 x i32> %v0, i32 0
144141
%v0.1 = extractelement <2 x i32> %v0, i32 1

0 commit comments

Comments
 (0)