@@ -1347,6 +1347,7 @@ class BoUpSLP {
1347
1347
}
1348
1348
MinBWs.clear();
1349
1349
ReductionBitWidth = 0;
1350
+ BaseGraphSize = 1;
1350
1351
CastMaxMinBWSizes.reset();
1351
1352
ExtraBitWidthNodes.clear();
1352
1353
InstrElementSize.clear();
@@ -1355,11 +1356,10 @@ class BoUpSLP {
1355
1356
ValueToGatherNodes.clear();
1356
1357
}
1357
1358
1358
- unsigned getTreeSize() const {
1359
- return GatheredLoadsEntriesFirst == NoGatheredLoads
1360
- ? VectorizableTree.size()
1361
- : GatheredLoadsEntriesFirst;
1362
- }
1359
+ unsigned getTreeSize() const { return VectorizableTree.size(); }
1360
+
1361
+ /// Returns the base graph size, before any transformations.
1362
+ unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1363
1363
1364
1364
/// Perform LICM and CSE on the newly generated gather sequences.
1365
1365
void optimizeGatherSequence();
@@ -4191,6 +4191,9 @@ class BoUpSLP {
4191
4191
/// reduction.
4192
4192
unsigned ReductionBitWidth = 0;
4193
4193
4194
+ /// Canonical graph size before the transformations.
4195
+ unsigned BaseGraphSize = 1;
4196
+
4194
4197
/// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4195
4198
/// type sizes, used in the tree.
4196
4199
std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
@@ -9001,47 +9004,147 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
9001
9004
9002
9005
void BoUpSLP::transformNodes() {
9003
9006
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9007
+ BaseGraphSize = VectorizableTree.size();
9008
+ // A pair of operands is profitable if:
9009
+ // 1. At least one of them is a constant
9010
+ // or
9011
+ // 2. They form a splat (both are the same value)
9012
+ // or
9013
+ // 3. They result in a good vectorization opportunity, i.e. may generate
9014
+ // vector nodes and reduce the cost of the graph.
9015
+ auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
9016
+ const InstructionsState &S) {
9017
+ SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
9018
+ for (unsigned Op : seq<unsigned>(S.MainOp->getNumOperands()))
9019
+ Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9020
+ I2->getOperand(Op));
9021
+ return all_of(
9022
+ Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9023
+ return all_of(Cand,
9024
+ [](const std::pair<Value *, Value *> &P) {
9025
+ return isa<Constant>(P.first) ||
9026
+ isa<Constant>(P.second) || P.first == P.second;
9027
+ }) ||
9028
+ findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
9029
+ });
9030
+ };
9004
9031
// The tree may grow here, so iterate over nodes, built before.
9005
- for (unsigned Idx : seq<unsigned>(VectorizableTree.size() )) {
9032
+ for (unsigned Idx : seq<unsigned>(BaseGraphSize )) {
9006
9033
TreeEntry &E = *VectorizableTree[Idx];
9007
9034
if (E.isGather()) {
9008
9035
ArrayRef<Value *> VL = E.Scalars;
9009
9036
const unsigned Sz = getVectorElementSize(VL.front());
9010
9037
unsigned MinVF = getMinVF(2 * Sz);
9038
+ // Do not try partial vectorization for small nodes (<= 2), non-load nodes
9039
+ // with the same opcode and same parent block, all constants or splats.
9011
9040
if (VL.size() <= 2 ||
9012
- (E.getOpcode() &&
9013
- (E.isAltShuffle() || E.getOpcode() != Instruction::Load)))
9041
+ !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
9042
+ E.isAltShuffle() || !allSameBlock(VL)) ||
9043
+ allConstant(VL) || isSplat(VL))
9014
9044
continue;
9015
9045
// Try to find vectorizable sequences and transform them into a series of
9016
9046
// insertvector instructions.
9017
9047
unsigned StartIdx = 0;
9018
9048
unsigned End = VL.size();
9019
- for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
9049
+ for (unsigned VF = VL.size() / 2; VF >= MinVF; VF = bit_ceil(VF) / 2) {
9050
+ SmallVector<unsigned> Slices;
9020
9051
for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
9021
9052
ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9022
9053
// If any instruction is vectorized already - do not try again.
9023
- if (getTreeEntry(Slice.front()) || getTreeEntry(Slice.back()))
9054
+ // Reuse the existing node, if it fully matches the slice.
9055
+ if (const TreeEntry *SE = getTreeEntry(Slice.front());
9056
+ SE || getTreeEntry(Slice.back())) {
9057
+ if (!SE)
9058
+ continue;
9059
+ if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9060
+ continue;
9061
+ }
9062
+ // Constants are already handled effectively - skip.
9063
+ if (allConstant(Slice))
9024
9064
continue;
9025
- InstructionsState S = getSameOpcode(Slice, *TLI);
9026
- if (!S.getOpcode() || S.isAltShuffle() ||
9027
- (S.getOpcode() != Instruction::Load &&
9028
- any_of(Slice, [&](Value *V) {
9029
- return !areAllUsersVectorized(cast<Instruction>(V),
9030
- UserIgnoreList);
9031
- })))
9065
+ // Do not try to vectorize small splats (less than vector register and
9066
+ // only with the single non-undef element).
9067
+ bool IsSplat = isSplat(Slice);
9068
+ if (Slices.empty() || !IsSplat ||
9069
+ (VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9070
+ Slice.front()->getType(), VF)),
9071
+ 1U, VF - 1) !=
9072
+ std::clamp(TTI->getNumberOfParts(getWidenedType(
9073
+ Slice.front()->getType(), 2 * VF)),
9074
+ 1U, 2 * VF)) ||
9075
+ count(Slice, Slice.front()) ==
9076
+ (isa<UndefValue>(Slice.front()) ? VF - 1 : 1)) {
9077
+ if (IsSplat)
9078
+ continue;
9079
+ InstructionsState S = getSameOpcode(Slice, *TLI);
9080
+ if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice))
9081
+ continue;
9082
+ if (VF == 2) {
9083
+ // Try to vectorize reduced values or if all users are vectorized.
9084
+ // For expensive instructions extra extracts might be profitable.
9085
+ if ((!UserIgnoreList || E.Idx != 0) &&
9086
+ TTI->getInstructionCost(cast<Instruction>(Slice.front()),
9087
+ CostKind) < TTI::TCC_Expensive &&
9088
+ !all_of(Slice, [&](Value *V) {
9089
+ return areAllUsersVectorized(cast<Instruction>(V),
9090
+ UserIgnoreList);
9091
+ }))
9092
+ continue;
9093
+ if (S.getOpcode() == Instruction::Load) {
9094
+ OrdersType Order;
9095
+ SmallVector<Value *> PointerOps;
9096
+ LoadsState Res =
9097
+ canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9098
+ // Do not vectorize gathers.
9099
+ if (Res == LoadsState::ScatterVectorize ||
9100
+ Res == LoadsState::Gather)
9101
+ continue;
9102
+ } else if (S.getOpcode() == Instruction::ExtractElement ||
9103
+ (TTI->getInstructionCost(
9104
+ cast<Instruction>(Slice.front()), CostKind) <
9105
+ TTI::TCC_Expensive &&
9106
+ !CheckOperandsProfitability(
9107
+ cast<Instruction>(Slice.front()),
9108
+ cast<Instruction>(Slice.back()), S))) {
9109
+ // Do not vectorize extractelements (handled effectively
9110
+ // already). Do not vectorize non-profitable instructions (with
9111
+ // low cost and non-vectorizable operands).
9112
+ continue;
9113
+ }
9114
+ }
9115
+ }
9116
+ Slices.emplace_back(Cnt);
9117
+ }
9118
+ auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt) {
9119
+ E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9120
+ if (StartIdx == Cnt)
9121
+ StartIdx = Cnt + VF;
9122
+ if (End == Cnt + VF)
9123
+ End = Cnt;
9124
+ };
9125
+ for (unsigned Cnt : Slices) {
9126
+ ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9127
+ // Reuse an existing node that fully matches the slice; skip partial matches.
9128
+ if (const TreeEntry *SE = getTreeEntry(Slice.front());
9129
+ SE || getTreeEntry(Slice.back())) {
9130
+ if (!SE)
9131
+ continue;
9132
+ if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9133
+ continue;
9134
+ AddCombinedNode(SE->Idx, Cnt);
9032
9135
continue;
9136
+ }
9033
9137
unsigned PrevSize = VectorizableTree.size();
9034
9138
buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
9035
9139
if (PrevSize + 1 == VectorizableTree.size() &&
9036
- VectorizableTree[PrevSize]->isGather()) {
9140
+ VectorizableTree[PrevSize]->isGather() &&
9141
+ VectorizableTree[PrevSize]->getOpcode() !=
9142
+ Instruction::ExtractElement &&
9143
+ !isSplat(Slice)) {
9037
9144
VectorizableTree.pop_back();
9038
9145
continue;
9039
9146
}
9040
- E.CombinedEntriesWithIndices.emplace_back(PrevSize, Cnt);
9041
- if (StartIdx == Cnt)
9042
- StartIdx = Cnt + VF;
9043
- if (End == Cnt + VF)
9044
- End = Cnt;
9147
+ AddCombinedNode(PrevSize, Cnt);
9045
9148
}
9046
9149
}
9047
9150
}
@@ -12293,6 +12396,14 @@ BoUpSLP::isGatherShuffledEntry(
12293
12396
"Expected only single user of the gather node.");
12294
12397
assert(VL.size() % NumParts == 0 &&
12295
12398
"Number of scalars must be divisible by NumParts.");
12399
+ if (!TE->UserTreeIndices.empty() &&
12400
+ TE->UserTreeIndices.front().UserTE->isGather() &&
12401
+ TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
12402
+ assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
12403
+ isSplat(TE->Scalars)) &&
12404
+ "Expected splat or extractelements only node.");
12405
+ return {};
12406
+ }
12296
12407
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
12297
12408
SmallVector<std::optional<TTI::ShuffleKind>> Res;
12298
12409
for (unsigned Part : seq<unsigned>(NumParts)) {
@@ -17119,7 +17230,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
17119
17230
if (R.isGathered(Chain.front()) ||
17120
17231
R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
17121
17232
return std::nullopt;
17122
- Size = R.getTreeSize ();
17233
+ Size = R.getCanonicalGraphSize ();
17123
17234
return false;
17124
17235
}
17125
17236
R.reorderTopToBottom();
@@ -17129,7 +17240,7 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
17129
17240
17130
17241
R.computeMinimumValueSizes();
17131
17242
17132
- Size = R.getTreeSize ();
17243
+ Size = R.getCanonicalGraphSize ();
17133
17244
if (S.getOpcode() == Instruction::Load)
17134
17245
Size = 2; // cut off masked gather small trees
17135
17246
InstructionCost Cost = R.getTreeCost();
0 commit comments