@@ -179,6 +179,10 @@ static cl::opt<bool>
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
 
+static cl::opt<bool> VectorizeNonPowerOf2(
+    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
+    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
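As a side note (not part of the patch), the new cl::opt above is the kind of flag that can be passed straight to opt, e.g. `opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2`. A minimal C++ sketch of the sort of store group it targets; the function and variable names below are hypothetical:

// Illustrative only: three consecutive stores form a 3-element chain
// (VF = 3, VF + 1 = 4 is a power of two), so with the new option the chain
// can be widened into a 4-lane vector with one padding (poison) lane.
void store3(int *dst, int a, int b, int c) {
  dst[0] = a; // lane 0
  dst[1] = b; // lane 1
  dst[2] = c; // lane 2 -- lane 3 would be padding
}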
@@ -2733,6 +2737,9 @@ class BoUpSLP {
                   SmallVectorImpl<Value *> *OpScalars = nullptr,
                   SmallVectorImpl<Value *> *AltScalars = nullptr) const;
 
+    /// Returns true if this node has padding lanes (containing poison), i.e.
+    /// a non-power-of-2 number of scalars.
+    bool isNonPowOf2Vec() const { return !isPowerOf2_32(Scalars.size()); }
+
 #ifndef NDEBUG
     /// Debug printer.
     LLVM_DUMP_METHOD void dump() const {
@@ -2891,9 +2898,13 @@ class BoUpSLP {
       ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
     }
 
-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
-
+      if (!isPowerOf2_32(Last->Scalars.size())) {
+        assert(Last->ReorderIndices.empty() &&
+               "Reordering isn't implemented for nodes with padding yet");
+      }
+    }
     return Last;
   }
 
@@ -2921,7 +2932,8 @@ class BoUpSLP {
   /// and fills required data before actual scheduling of the instructions.
   TreeEntry::EntryState getScalarsVectorizationState(
       InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
-      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
+      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps,
+      bool HasPadding) const;
 
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
@@ -3881,6 +3893,9 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   Order.clear();
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
+  if (!Order.empty() && !isPowerOf2_32(VL.size()))
+    return LoadsState::Gather;
+
   if (IsSorted || all_of(PointerOps, [&](Value *P) {
         return arePointersCompatible(P, PointerOps.front(), TLI);
       })) {
@@ -4568,6 +4583,10 @@ bool BoUpSLP::canReorderOperands(
     TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
     ArrayRef<TreeEntry *> ReorderableGathers,
     SmallVectorImpl<TreeEntry *> &GatherOps) {
+  // Reordering isn't implemented for nodes with padding yet.
+  if (UserTE->isNonPowOf2Vec())
+    return false;
+
   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
     if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
           return OpData.first == I &&
@@ -4746,6 +4765,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
       const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
                                           const TreeEntry *TE) {
+        // Reordering for nodes with padding is not implemented yet.
+        if (TE->isNonPowOf2Vec())
+          return false;
         if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
             (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
             (IgnoreReorder && TE->Idx == 0))
@@ -5233,7 +5255,8 @@ static bool isAlternateInstruction(const Instruction *I,
 
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
-    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
+    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps,
+    bool HasPadding) const {
   assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
 
   unsigned ShuffleOrOp =
@@ -5256,7 +5279,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   }
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
-    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+    bool Reuse = !HasPadding && canReuseExtract(VL, VL0, CurrentOrder);
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
     LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -5583,6 +5606,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       if (PWSz == VL.size()) {
         ReuseShuffleIndicies.clear();
       } else {
+        if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+          LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
+                               "for nodes with padding.\n");
+          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+          return false;
+        }
         NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
         NonUniqueValueVL.append(PWSz - UniqueValues.size(),
                                 UniqueValues.back());
@@ -5594,6 +5623,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
         return false;
       }
+      if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported for "
+                             "nodes with padding.\n");
+        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
+        return false;
+      }
       VL = UniqueValues;
     }
     return true;
@@ -5859,7 +5894,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
   OrdersType CurrentOrder;
   SmallVector<Value *> PointerOps;
   TreeEntry::EntryState State = getScalarsVectorizationState(
-      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps,
+      UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec());
   if (State == TreeEntry::NeedToGather) {
     newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                  ReuseShuffleIndicies);
@@ -6955,7 +6991,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     return Constant::getAllOnesValue(Ty);
   }
 
-  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
+  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root,
+                                     bool WithPadding = false) {
     if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
       return TTI::TCC_Free;
     auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
@@ -6966,7 +7003,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     InstructionsState S = getSameOpcode(VL, *R.TLI);
     const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
     unsigned MinVF = R.getMinVF(2 * Sz);
-    if (VL.size() > 2 &&
+    if (!WithPadding && VL.size() > 2 &&
         ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
          (InVectors.empty() &&
           any_of(seq<unsigned>(0, VL.size() / MinVF),
@@ -7077,7 +7114,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         }
         GatherCost -= ScalarsCost;
       }
-    } else if (!Root && isSplat(VL)) {
+    } else if (!WithPadding && !Root && isSplat(VL)) {
       // Found the broadcasting of the single scalar, calculate the cost as
       // the broadcast.
       const auto *It =
@@ -7640,8 +7677,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         CommonMask[Idx] = Mask[Idx] + VF;
   }
   Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
-                Value *Root = nullptr) {
-    Cost += getBuildVectorCost(VL, Root);
+                Value *Root = nullptr, bool WithPadding = false) {
+    Cost += getBuildVectorCost(VL, Root, WithPadding);
     if (!Root) {
       // FIXME: Need to find a way to avoid use of getNullValue here.
       SmallVector<Constant *> Vals;
@@ -9710,6 +9747,9 @@ BoUpSLP::isGatherShuffledEntry(
   // No need to check for the topmost gather node.
   if (TE == VectorizableTree.front().get())
     return {};
+  // Gathering for nodes with padding is not implemented yet.
+  if (TE->isNonPowOf2Vec())
+    return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert(TE->UserTreeIndices.size() == 1 &&
          "Expected only single user of the gather node.");
@@ -10422,7 +10462,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
     add(V1, NewMask);
   }
   Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
-                Value *Root = nullptr) {
+                Value *Root = nullptr, bool WithPadding = false) {
     return R.gather(VL, Root);
   }
   Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
@@ -10491,7 +10531,6 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
     SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
     reorderScalars(VL, Mask);
   }
-  const unsigned VF = VL.size();
   InstructionsState S = getSameOpcode(VL, *TLI);
   // Special processing for GEPs bundle, which may include non-gep values.
   if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
@@ -10533,6 +10572,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
       ShuffleBuilder.add(V, Mask);
       return ShuffleBuilder.finalize(std::nullopt);
     };
+    const unsigned VF = VL.size();
     Value *V = vectorizeTree(VE, PostponedPHIs);
     if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
       if (!VE->ReuseShuffleIndices.empty()) {
@@ -10659,6 +10699,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
     return true;
   };
   BVTy ShuffleBuilder(Params...);
+  if (E->isNonPowOf2Vec()) {
+    Value *BV = ShuffleBuilder.gather(E->Scalars, 0, nullptr, true);
+    SmallVector<int> Mask(VF, PoisonMaskElem);
+    std::iota(Mask.begin(), Mask.begin() + E->Scalars.size(), 0);
+    ShuffleBuilder.add(BV, Mask);
+    return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+  }
+
   ResTy Res = ResTy();
   SmallVector<int> Mask;
   SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
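For illustration only, here is a small standalone sketch (assuming VF = 4, three real scalars, and -1 standing in for LLVM's PoisonMaskElem) of the shuffle mask the padding path above builds with std::iota:

#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Mirror of the mask construction above: VF lanes, all poison (-1),
  // then identity indices for the lanes that carry real scalars.
  const unsigned VF = 4, NumScalars = 3;
  std::vector<int> Mask(VF, -1); // -1 plays the role of PoisonMaskElem
  std::iota(Mask.begin(), Mask.begin() + NumScalars, 0);
  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 1 2 -1
  std::printf("\n");
  return 0;
}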
@@ -13422,8 +13470,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   const unsigned Sz = R.getVectorElementSize(Chain[0]);
   unsigned VF = Chain.size();
 
-  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
-    return false;
+  if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
+    // Check if vectorizing with a non-power-of-2 VF should be considered. At
+    // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
+    // all vector lanes are used.
+    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
+      return false;
+  }
 
   LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                     << "\n");
@@ -13519,9 +13572,39 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
                       << "MinVF (" << MinVF << ")\n");
     }
 
+    unsigned StartIdx = 0;
+    if (VectorizeNonPowerOf2) {
+      // Try vectorizing with a non-power-of-2 VF. At the moment, only
+      // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
+      // lanes are used.
+      unsigned CandVF = Operands.size() + 1;
+      if (isPowerOf2_32(CandVF) && CandVF <= MaxVF) {
+        assert(
+            all_of(
+                Operands,
+                [&](Value *V) {
+                  return cast<StoreInst>(V)->getValueOperand()->getType() ==
+                         cast<StoreInst>(Operands.front())
+                             ->getValueOperand()
+                             ->getType();
+                }) &&
+            "Expected all operands of same type.");
+        if (!VectorizedStores.count(Operands.front()) &&
+            !VectorizedStores.count(Operands.back()) &&
+            TriedSequences
+                .insert(std::make_pair(Operands.front(), Operands.back()))
+                .second &&
+            vectorizeStoreChain(Operands, R, Operands.size(), MinVF)) {
+          // Mark the vectorized stores so that we don't vectorize them again.
+          VectorizedStores.insert(Operands.begin(), Operands.end());
+          Changed = true;
+          StartIdx += Operands.size();
+        }
+      }
+    }
+
     // FIXME: Is division-by-2 the correct step? Should we assert that the
     // register size is a power-of-2?
-    unsigned StartIdx = 0;
     for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
       for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
         ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
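A brief sketch of the candidate-VF gate used above (the helper below is hypothetical, not from the patch): a chain of N stores is only tried as a non-power-of-2 group when N + 1 is a power of two and does not exceed MaxVF, so for example chains of 3, 7, or 15 stores qualify while chains of 5 or 6 do not.

#include <cstdio>

// Hypothetical stand-in for llvm::isPowerOf2_32.
static bool isPowerOf2(unsigned X) { return X != 0 && (X & (X - 1)) == 0; }

// Mirrors the gate in vectorizeStores: try a non-power-of-2 chain of
// NumStores stores only if NumStores + 1 is a power of two and fits MaxVF.
static bool tryNonPowerOf2Chain(unsigned NumStores, unsigned MaxVF) {
  unsigned CandVF = NumStores + 1;
  return isPowerOf2(CandVF) && CandVF <= MaxVF;
}

int main() {
  for (unsigned N = 2; N <= 16; ++N)
    std::printf("%u stores -> %s\n", N,
                tryNonPowerOf2Chain(N, 16) ? "try" : "skip");
  return 0;
}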