@@ -179,6 +179,10 @@ static cl::opt<bool>
 ViewSLPTree("view-slp-tree", cl::Hidden,
             cl::desc("Display the SLP trees with Graphviz"));
 
+static cl::opt<bool> VectorizeWithPadding(
+    "slp-vectorize-with-padding", cl::init(false), cl::Hidden,
+    cl::desc("Try to vectorize non-power-of-2 operations using padding."));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
@@ -2733,6 +2737,9 @@ class BoUpSLP {
                       SmallVectorImpl<Value *> *OpScalars = nullptr,
                       SmallVectorImpl<Value *> *AltScalars = nullptr) const;
 
+    /// Return true if the number of scalars is not a power of 2, i.e. this
+    /// node will be widened with poison padding lanes.
+    bool isNonPowOf2Vec() const { return !isPowerOf2_32(Scalars.size()); }
+
 #ifndef NDEBUG
     /// Debug printer.
     LLVM_DUMP_METHOD void dump() const {
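Note on the padding model (an illustration, not part of the patch; getPaddedVF and getNumPadLanes are hypothetical names): a non-power-of-2 bundle is costed and emitted at the next power-of-2 vector width, with the extra lanes holding poison.

#include "llvm/Support/MathExtras.h"

// Hypothetical helpers sketching the padding arithmetic this patch relies
// on: a bundle of N scalars occupies PowerOf2Ceil(N) lanes, and every lane
// past N is poison.
static unsigned getPaddedVF(unsigned NumScalars) {
  return (unsigned)llvm::PowerOf2Ceil(NumScalars); // 3 -> 4, 5 -> 8, 8 -> 8
}
static unsigned getNumPadLanes(unsigned NumScalars) {
  // 0 for power-of-2 bundles, so isNonPowOf2Vec() above is equivalent to
  // getNumPadLanes(Scalars.size()) != 0.
  return getPaddedVF(NumScalars) - NumScalars;
}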
@@ -2891,9 +2898,13 @@ class BoUpSLP {
         ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
     }
 
-    if (UserTreeIdx.UserTE)
+    if (UserTreeIdx.UserTE) {
       Last->UserTreeIndices.push_back(UserTreeIdx);
-
+      if (!isPowerOf2_32(Last->Scalars.size())) {
+        assert(Last->ReorderIndices.empty() &&
+               "Reordering isn't implemented for nodes with padding yet");
+      }
+    }
     return Last;
   }
 
@@ -2921,7 +2932,8 @@ class BoUpSLP {
   /// and fills required data before actual scheduling of the instructions.
   TreeEntry::EntryState getScalarsVectorizationState(
       InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
-      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
+      OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps,
+      bool HasPadding) const;
 
   /// Maps a specific scalar to its tree entry.
   SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
@@ -3881,6 +3893,9 @@ static LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   Order.clear();
   // Check the order of pointer operands or that all pointers are the same.
   bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, DL, SE, Order);
+  if (!Order.empty() && !isPowerOf2_32(VL.size()))
+    return LoadsState::Gather;
+
   if (IsSorted || all_of(PointerOps, [&](Value *P) {
         return arePointersCompatible(P, PointerOps.front(), TLI);
       })) {
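Why this early-out: sortPtrAccesses can report loads as consecutive only up to a reordering (a non-empty Order), and applying such a reorder to a padded bundle is not supported yet. A standalone sketch of the triggering shape, under assumed offsets:

#include <algorithm>
#include <array>

// Assumed example: loads from p[2], p[1], p[0] are consecutive only after
// sorting, so sortPtrAccesses would hand back Order = {2, 1, 0}. A 3-lane
// bundle is non-power-of-2, so the check above gathers instead of reordering.
static bool gathersInsteadOfReordering() {
  std::array<unsigned, 3> Offsets = {2, 1, 0};
  bool NeedsReorder = !std::is_sorted(Offsets.begin(), Offsets.end());
  bool NonPowerOf2 = (Offsets.size() & (Offsets.size() - 1)) != 0;
  return NeedsReorder && NonPowerOf2; // true, i.e. LoadsState::Gather
}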
@@ -4570,6 +4585,10 @@ bool BoUpSLP::canReorderOperands(
     TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
     ArrayRef<TreeEntry *> ReorderableGathers,
     SmallVectorImpl<TreeEntry *> &GatherOps) {
+  // Reordering isn't implemented for nodes with padding yet.
+  if (UserTE->isNonPowOf2Vec())
+    return false;
+
   for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
     if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
           return OpData.first == I &&
@@ -4748,6 +4767,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
     const auto &&AllowsReordering = [IgnoreReorder, &GathersToOrders](
                                         const TreeEntry *TE) {
+      // Reordering isn't implemented for nodes with padding yet.
+      if (TE->isNonPowOf2Vec())
+        return false;
       if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
           (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
           (IgnoreReorder && TE->Idx == 0))
@@ -5235,7 +5257,8 @@ static bool isAlternateInstruction(const Instruction *I,
 
 BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
-    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
+    OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps,
+    bool HasPadding) const {
   assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
 
   unsigned ShuffleOrOp =
@@ -5258,7 +5281,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   }
   case Instruction::ExtractValue:
   case Instruction::ExtractElement: {
-    bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+    bool Reuse = !HasPadding && canReuseExtract(VL, VL0, CurrentOrder);
     if (Reuse || !CurrentOrder.empty())
       return TreeEntry::Vectorize;
     LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -5355,6 +5378,15 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     }
     return TreeEntry::Vectorize;
   }
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+    // The instruction may trigger immediate UB on the poison/undef padding
+    // elements, so force gather to avoid introducing new UB.
+    if (HasPadding)
+      return TreeEntry::NeedToGather;
+    [[fallthrough]];
   case Instruction::Select:
   case Instruction::FNeg:
   case Instruction::Add:
@@ -5363,11 +5395,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   case Instruction::FSub:
   case Instruction::Mul:
   case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
   case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
   case Instruction::FRem:
   case Instruction::Shl:
   case Instruction::LShr:
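The reasoning behind hoisting these four opcodes (a sketch under LLVM's poison semantics; mayTrapOnPaddingLane is a hypothetical name, not from the patch): udiv/sdiv/urem/srem are the only operators moved here whose execution with a poison operand is immediate UB, since a padded divisor lane acts like a possibly-zero divisor, while FDiv and FRem merely produce a poison result and therefore stay in the vectorizable list.

#include "llvm/IR/Instruction.h"

// Hypothetical classifier mirroring the switch above: true for opcodes that
// must not be executed speculatively on poison padding lanes.
static bool mayTrapOnPaddingLane(unsigned Opcode) {
  switch (Opcode) {
  case llvm::Instruction::UDiv:
  case llvm::Instruction::SDiv:
  case llvm::Instruction::URem:
  case llvm::Instruction::SRem:
    return true; // division/remainder by a poison (possibly zero) lane is UB
  default:
    return false; // e.g. FDiv/FRem only yield poison, never UB
  }
}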
@@ -5550,6 +5578,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                                bool DoNotFail = false) {
     // Check that every instruction appears once in this bundle.
     DenseMap<Value *, unsigned> UniquePositions(VL.size());
+    auto OriginalVL = VL;
     for (Value *V : VL) {
       if (isConstant(V)) {
         ReuseShuffleIndicies.emplace_back(
@@ -5562,6 +5591,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         if (Res.second)
           UniqueValues.emplace_back(V);
       }
+
       size_t NumUniqueScalarValues = UniqueValues.size();
       if (NumUniqueScalarValues == VL.size()) {
         ReuseShuffleIndicies.clear();
@@ -5589,6 +5619,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           NonUniqueValueVL.append(PWSz - UniqueValues.size(),
                                   UniqueValues.back());
           VL = NonUniqueValueVL;
+
+          if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+            LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
+                                 "for nodes with padding.\n");
+            newTreeEntry(OriginalVL, std::nullopt /*not vectorized*/, S,
+                         UserTreeIdx);
+            return false;
+          }
         }
         return true;
       }
@@ -5597,6 +5635,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           return false;
         }
         VL = UniqueValues;
+        if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
+          LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported for "
+                               "nodes with padding.\n");
+          newTreeEntry(OriginalVL, std::nullopt /*not vectorized*/, S,
+                       UserTreeIdx);
+          return false;
+        }
       }
       return true;
     };
@@ -5861,7 +5906,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       OrdersType CurrentOrder;
       SmallVector<Value *> PointerOps;
       TreeEntry::EntryState State = getScalarsVectorizationState(
-          S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
+          S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps,
+          UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec());
       if (State == TreeEntry::NeedToGather) {
         newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                      ReuseShuffleIndicies);
@@ -6957,7 +7003,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     return Constant::getAllOnesValue(Ty);
   }
 
-  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
+  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root,
+                                     bool WithPadding = false) {
     if ((!Root && allConstant(VL)) || all_of(VL, UndefValue::classof))
       return TTI::TCC_Free;
     auto *VecTy = FixedVectorType::get(VL.front()->getType(), VL.size());
@@ -6968,7 +7015,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     InstructionsState S = getSameOpcode(VL, *R.TLI);
     const unsigned Sz = R.DL->getTypeSizeInBits(VL.front()->getType());
     unsigned MinVF = R.getMinVF(2 * Sz);
-    if (VL.size() > 2 &&
+    if (!WithPadding && VL.size() > 2 &&
         ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
          (InVectors.empty() &&
           any_of(seq<unsigned>(0, VL.size() / MinVF),
7079
7126
}
7080
7127
GatherCost -= ScalarsCost;
7081
7128
}
7082
- } else if (!Root && isSplat(VL)) {
7129
+ } else if (!WithPadding && ! Root && isSplat(VL)) {
7083
7130
// Found the broadcasting of the single scalar, calculate the cost as
7084
7131
// the broadcast.
7085
7132
const auto *It =
@@ -7642,8 +7689,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       CommonMask[Idx] = Mask[Idx] + VF;
   }
   Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
-                Value *Root = nullptr) {
-    Cost += getBuildVectorCost(VL, Root);
+                Value *Root = nullptr, bool WithPadding = false) {
+    Cost += getBuildVectorCost(VL, Root, WithPadding);
     if (!Root) {
       // FIXME: Need to find a way to avoid use of getNullValue here.
       SmallVector<Constant *> Vals;
@@ -9712,6 +9759,9 @@ BoUpSLP::isGatherShuffledEntry(
   // No need to check for the topmost gather node.
   if (TE == VectorizableTree.front().get())
     return {};
+  // Gathering for nodes with padding is not implemented yet.
+  if (TE->isNonPowOf2Vec())
+    return {};
   Mask.assign(VL.size(), PoisonMaskElem);
   assert(TE->UserTreeIndices.size() == 1 &&
          "Expected only single user of the gather node.");
@@ -10433,7 +10483,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
     add(V1, NewMask);
   }
   Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
-                Value *Root = nullptr) {
+                Value *Root = nullptr, bool WithPadding = false) {
     return R.gather(VL, Root);
   }
   Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
@@ -10502,7 +10552,6 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
     SmallVector<int> Mask(E->ReorderIndices.begin(), E->ReorderIndices.end());
     reorderScalars(VL, Mask);
   }
-  const unsigned VF = VL.size();
   InstructionsState S = getSameOpcode(VL, *TLI);
   // Special processing for GEPs bundle, which may include non-gep values.
   if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
@@ -10544,6 +10593,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
         ShuffleBuilder.add(V, Mask);
         return ShuffleBuilder.finalize(std::nullopt);
       };
+      const unsigned VF = VL.size();
       Value *V = vectorizeTree(VE, PostponedPHIs);
       if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
         if (!VE->ReuseShuffleIndices.empty()) {
@@ -10670,6 +10720,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
     return true;
   };
   BVTy ShuffleBuilder(Params...);
+  if (E->isNonPowOf2Vec()) {
+    Value *BV = ShuffleBuilder.gather(E->Scalars, 0, nullptr, /*WithPadding=*/true);
+    SmallVector<int> Mask(VF, PoisonMaskElem);
+    std::iota(Mask.begin(), Mask.begin() + E->Scalars.size(), 0);
+    ShuffleBuilder.add(BV, Mask);
+    return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
+  }
+
   ResTy Res = ResTy();
   SmallVector<int> Mask;
   SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
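What the new path builds, sketched for a 3-scalar node at VF = 4 (an illustration; makePaddedMask is a hypothetical helper and PoisonMaskElem mirrors LLVM's sentinel of the same name):

#include <numeric>
#include <vector>

constexpr int PoisonMaskElem = -1; // same sentinel value LLVM uses

// Gathering <s0, s1, s2> with padding yields <s0, s1, s2, poison>; the mask
// forwards the real lanes and leaves the padding lane poison.
static std::vector<int> makePaddedMask(unsigned NumScalars, unsigned VF) {
  std::vector<int> Mask(VF, PoisonMaskElem);             // {-1, -1, -1, -1}
  std::iota(Mask.begin(), Mask.begin() + NumScalars, 0); // { 0,  1,  2, -1}
  return Mask;
}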
@@ -13434,7 +13492,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
   unsigned VF = Chain.size();
 
   if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
-    return false;
+    if (!VectorizeWithPadding || (VF < MinVF && VF + 1 != MinVF))
+      return false;
 
   LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                     << "\n");
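The relaxed guard in numbers (assumed examples): with padding enabled, VF = 7 with MinVF = 4 proceeds (7 >= MinVF, padded to 8 later), VF = 3 with MinVF = 4 proceeds via VF + 1 == MinVF (padded straight to 4), and VF = 2 with MinVF = 4 still bails out. A hypothetical restatement of the two nested conditions:

// Returns true when the chain is rejected: non-power-of-2 or undersized
// chains now survive only if padding is enabled and the chain is at most
// one lane short of MinVF.
static bool rejectStoreChain(bool WithPadding, unsigned Sz, unsigned VF,
                             unsigned MinVF) {
  auto IsPow2 = [](unsigned X) { return X != 0 && (X & (X - 1)) == 0; };
  if (IsPow2(Sz) && IsPow2(VF) && VF >= 2 && VF >= MinVF)
    return false; // the pre-existing fast path accepts these chains as before
  return !WithPadding || (VF < MinVF && VF + 1 != MinVF);
}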
@@ -13530,9 +13589,36 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
                         << "MinVF (" << MinVF << ")\n");
     }
 
+    unsigned StartIdx = 0;
+    if (VectorizeWithPadding) {
+      unsigned CandVF = Operands.size() + 1;
+      if (isPowerOf2_32(CandVF) && CandVF <= MaxVF) {
+        assert(
+            all_of(
+                Operands,
+                [&](Value *V) {
+                  return cast<StoreInst>(V)->getValueOperand()->getType() ==
+                         cast<StoreInst>(Operands.front())
+                             ->getValueOperand()
+                             ->getType();
+                }) &&
+            "Expected all operands of same type.");
+        if (!VectorizedStores.count(Operands.front()) &&
+            !VectorizedStores.count(Operands.back()) &&
+            TriedSequences
+                .insert(std::make_pair(Operands.front(), Operands.back()))
+                .second &&
+            vectorizeStoreChain(Operands, R, Operands.size(), MinVF)) {
+          // Mark the vectorized stores so that we don't vectorize them again.
+          VectorizedStores.insert(Operands.begin(), Operands.end());
+          Changed = true;
+          StartIdx += Operands.size();
+        }
+      }
+    }
+
     // FIXME: Is division-by-2 the correct step? Should we assert that the
     // register size is a power-of-2?
-    unsigned StartIdx = 0;
     for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
       for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
         ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
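How the new prologue interacts with the halving loop (assumed example): 7 contiguous stores with MaxVF = 8 give CandVF = 8, so the whole group is tried once as a padded chain; on success, StartIdx advances past all 7 stores and the power-of-2 loop below finds nothing left to slice. A small distillation of the candidate test (tryPaddedStoreChain is a hypothetical name):

// A group of N stores is attempted as a single padded chain iff N + 1 is a
// power of 2 that fits the target's MaxVF, i.e. exactly one poison lane.
static bool tryPaddedStoreChain(unsigned NumStores, unsigned MaxVF) {
  unsigned CandVF = NumStores + 1;
  bool IsPow2 = CandVF != 0 && (CandVF & (CandVF - 1)) == 0;
  return IsPow2 && CandVF <= MaxVF; // 7 stores, MaxVF = 8 -> true
}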