@@ -190,6 +190,10 @@ static cl::opt<bool>
190
190
ViewSLPTree("view-slp-tree", cl::Hidden,
191
191
cl::desc("Display the SLP trees with Graphviz"));
192
192
193
+ static cl::opt<bool> VectorizeNonPowerOf2(
194
+ "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
195
+ cl::desc("Try to vectorize with non-power-of-2 number of elements.")); // Hidden flag, off by default (cl::init(false)); the non-power-of-2 VF candidates in the store vectorization paths are only considered when it is set.
196
+
193
197
// Limit the number of alias checks. The limit is chosen so that
194
198
// it has no negative effect on the llvm benchmarks.
195
199
static const unsigned AliasedCheckLimit = 10;
@@ -2829,6 +2833,14 @@ class BoUpSLP {
2829
2833
SmallVectorImpl<Value *> *OpScalars = nullptr,
2830
2834
SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2831
2835
2836
+ /// Return true if this is a non-power-of-2 node, i.e. Scalars.size() is not a power of 2. Also asserts that such a node has empty ReuseShuffleIndices, because reshuffling is not supported for non-power-of-2 vectors yet.
2837
+ bool isNonPowOf2Vec() const {
2838
+ bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
2839
+ assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
2840
+ "Reshuffling not supported with non-power-of-2 vectors yet.");
2841
+ return IsNonPowerOf2;
2842
+ }
2843
+
2832
2844
#ifndef NDEBUG
2833
2845
/// Debug printer.
2834
2846
LLVM_DUMP_METHOD void dump() const {
@@ -2994,9 +3006,11 @@ class BoUpSLP {
2994
3006
MustGather.insert(VL.begin(), VL.end());
2995
3007
}
2996
3008
2997
- if (UserTreeIdx.UserTE)
3009
+ if (UserTreeIdx.UserTE) {
2998
3010
Last->UserTreeIndices.push_back(UserTreeIdx);
2999
-
3011
+ assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3012
+ "Reordering isn't implemented for non-power-of-2 nodes yet");
3013
+ }
3000
3014
return Last;
3001
3015
}
3002
3016
@@ -4256,6 +4270,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4256
4270
auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
4257
4271
// Check the order of pointer operands or that all pointers are the same.
4258
4272
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4273
+ // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4274
+ if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4275
+ assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4276
+ "supported with VectorizeNonPowerOf2");
4277
+ return LoadsState::Gather;
4278
+ }
4279
+
4259
4280
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4260
4281
if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4261
4282
TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
@@ -4575,6 +4596,10 @@ static bool areTwoInsertFromSameBuildVector(
4575
4596
4576
4597
std::optional<BoUpSLP::OrdersType>
4577
4598
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4599
+ // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4600
+ if (TE.isNonPowOf2Vec())
4601
+ return std::nullopt;
4602
+
4578
4603
// No need to reorder if need to shuffle reuses, still need to shuffle the
4579
4604
// node.
4580
4605
if (!TE.ReuseShuffleIndices.empty()) {
@@ -5145,6 +5170,10 @@ bool BoUpSLP::canReorderOperands(
5145
5170
TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5146
5171
ArrayRef<TreeEntry *> ReorderableGathers,
5147
5172
SmallVectorImpl<TreeEntry *> &GatherOps) {
5173
+ // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5174
+ if (UserTE->isNonPowOf2Vec())
5175
+ return false;
5176
+
5148
5177
for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5149
5178
if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5150
5179
return OpData.first == I &&
@@ -5318,6 +5347,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5318
5347
}
5319
5348
auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5320
5349
const auto AllowsReordering = [&](const TreeEntry *TE) {
5350
+ // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5351
+ if (TE->isNonPowOf2Vec())
5352
+ return false;
5321
5353
if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5322
5354
(TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5323
5355
(IgnoreReorder && TE->Idx == 0))
@@ -5944,6 +5976,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
5944
5976
case Instruction::ExtractValue:
5945
5977
case Instruction::ExtractElement: {
5946
5978
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
5979
+ // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
5980
+ if (!isPowerOf2_32(VL.size()))
5981
+ return TreeEntry::NeedToGather;
5947
5982
if (Reuse || !CurrentOrder.empty())
5948
5983
return TreeEntry::Vectorize;
5949
5984
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -6258,6 +6293,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6258
6293
if (NumUniqueScalarValues == VL.size()) {
6259
6294
ReuseShuffleIndicies.clear();
6260
6295
} else {
6296
+ // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6297
+ if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6298
+ LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6299
+ "for nodes with padding.\n");
6300
+ newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6301
+ return false;
6302
+ }
6261
6303
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6262
6304
if (NumUniqueScalarValues <= 1 ||
6263
6305
(UniquePositions.size() == 1 && all_of(UniqueValues,
@@ -7868,7 +7910,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7868
7910
for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
7869
7911
if (VectorizedLoads.contains(VL[I]))
7870
7912
continue;
7871
- GatherCost += getBuildVectorCost(VL.slice(I, VF), Root);
7913
+ GatherCost +=
7914
+ getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
7872
7915
}
7873
7916
// Exclude potentially vectorized loads from list of gathered
7874
7917
// scalars.
@@ -10678,6 +10721,9 @@ BoUpSLP::isGatherShuffledEntry(
10678
10721
// No need to check for the topmost gather node.
10679
10722
if (TE == VectorizableTree.front().get())
10680
10723
return {};
10724
+ // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
10725
+ if (TE->isNonPowOf2Vec())
10726
+ return {};
10681
10727
Mask.assign(VL.size(), PoisonMaskElem);
10682
10728
assert(TE->UserTreeIndices.size() == 1 &&
10683
10729
"Expected only single user of the gather node.");
@@ -14995,8 +15041,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
14995
15041
const unsigned Sz = R.getVectorElementSize(Chain[0]);
14996
15042
unsigned VF = Chain.size();
14997
15043
14998
- if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
14999
- return false;
15044
+ if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
15045
+ // Check if vectorizing with a non-power-of-2 VF should be considered. At
15046
+ // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15047
+ // all vector lanes are used.
15048
+ if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15049
+ return false;
15050
+ }
15000
15051
15001
15052
LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15002
15053
<< "\n");
@@ -15095,14 +15146,22 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
15095
15146
continue;
15096
15147
}
15097
15148
15149
+ unsigned NonPowerOf2VF = 0;
15150
+ if (VectorizeNonPowerOf2) {
15151
+ // First try vectorizing with a non-power-of-2 VF. At the moment, only
15152
+ // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
15153
+ // lanes are used.
15154
+ unsigned CandVF = Operands.size();
15155
+ if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF)
15156
+ NonPowerOf2VF = CandVF;
15157
+ }
15158
+
15098
15159
unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
15099
- SmallVector<unsigned> CandidateVFs(Sz);
15100
- // FIXME: Is division-by-2 the correct step? Should we assert that the
15101
- // register size is a power-of-2?
15102
- unsigned Size = MaxVF;
15103
- for_each(CandidateVFs, [&](unsigned &VF) {
15104
- VF = Size;
15105
- Size /= 2;
15160
+ SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
15161
+ unsigned Size = MinVF;
15162
+ for_each(reverse(CandidateVFs), [&](unsigned &VF) {
15163
+ VF = Size > MaxVF ? NonPowerOf2VF : Size;
15164
+ Size *= 2;
15106
15165
});
15107
15166
unsigned StartIdx = 0;
15108
15167
for (unsigned Size : CandidateVFs) {
0 commit comments