Skip to content

Commit 6d66db3

Browse files
authored
[SLP] Initial vectorization of non-power-of-2 ops. (#77790)
This patch enables vectorization for non-power-of-2 VFs. Initially, only VFs where adding 1 makes the VF a power-of-2 are supported, i.e. we can still make relatively effective use of the vectors. It relies on the existing target cost-models to return accurate costs for non-power-of-2 vectors. I checked mostly AArch64 and X86 and there the costs seem reasonable for the cases I checked, although I expect there will be a need to refine both the cost-models and lowering to make most effective use of non-power-of-2 SLP vectorization. Note that re-ordering and shuffling are not implemented for nodes requiring padding yet to keep the initial implementation simpler. The feature is guarded by a new flag, off by default for now. PR: #77790
1 parent df9c00b commit 6d66db3

11 files changed

+947
-499
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 71 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,10 @@ static cl::opt<bool>
190190
ViewSLPTree("view-slp-tree", cl::Hidden,
191191
cl::desc("Display the SLP trees with Graphviz"));
192192

193+
static cl::opt<bool> VectorizeNonPowerOf2(
194+
"slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
195+
cl::desc("Try to vectorize with non-power-of-2 number of elements."));
196+
193197
// Limit the number of alias checks. The limit is chosen so that
194198
// it has no negative effect on the llvm benchmarks.
195199
static const unsigned AliasedCheckLimit = 10;
@@ -2829,6 +2833,14 @@ class BoUpSLP {
28292833
SmallVectorImpl<Value *> *OpScalars = nullptr,
28302834
SmallVectorImpl<Value *> *AltScalars = nullptr) const;
28312835

2836+
/// Return true if the number of scalars in this node is not a power of 2. Asserts that such a node has no ReuseShuffleIndices, since reshuffling is not supported for non-power-of-2 vectors yet.
2837+
bool isNonPowOf2Vec() const {
2838+
bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
2839+
assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
2840+
"Reshuffling not supported with non-power-of-2 vectors yet.");
2841+
return IsNonPowerOf2;
2842+
}
2843+
28322844
#ifndef NDEBUG
28332845
/// Debug printer.
28342846
LLVM_DUMP_METHOD void dump() const {
@@ -2994,9 +3006,11 @@ class BoUpSLP {
29943006
MustGather.insert(VL.begin(), VL.end());
29953007
}
29963008

2997-
if (UserTreeIdx.UserTE)
3009+
if (UserTreeIdx.UserTE) {
29983010
Last->UserTreeIndices.push_back(UserTreeIdx);
2999-
3011+
assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3012+
"Reordering isn't implemented for non-power-of-2 nodes yet");
3013+
}
30003014
return Last;
30013015
}
30023016

@@ -4256,6 +4270,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
42564270
auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
42574271
// Check the order of pointer operands or that all pointers are the same.
42584272
bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4273+
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4274+
if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4275+
assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4276+
"supported with VectorizeNonPowerOf2");
4277+
return LoadsState::Gather;
4278+
}
4279+
42594280
Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
42604281
if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
42614282
TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
@@ -4575,6 +4596,10 @@ static bool areTwoInsertFromSameBuildVector(
45754596

45764597
std::optional<BoUpSLP::OrdersType>
45774598
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4599+
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4600+
if (TE.isNonPowOf2Vec())
4601+
return std::nullopt;
4602+
45784603
// No need to reorder if need to shuffle reuses, still need to shuffle the
45794604
// node.
45804605
if (!TE.ReuseShuffleIndices.empty()) {
@@ -5145,6 +5170,10 @@ bool BoUpSLP::canReorderOperands(
51455170
TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
51465171
ArrayRef<TreeEntry *> ReorderableGathers,
51475172
SmallVectorImpl<TreeEntry *> &GatherOps) {
5173+
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5174+
if (UserTE->isNonPowOf2Vec())
5175+
return false;
5176+
51485177
for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
51495178
if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
51505179
return OpData.first == I &&
@@ -5318,6 +5347,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
53185347
}
53195348
auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
53205349
const auto AllowsReordering = [&](const TreeEntry *TE) {
5350+
// FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5351+
if (TE->isNonPowOf2Vec())
5352+
return false;
53215353
if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
53225354
(TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
53235355
(IgnoreReorder && TE->Idx == 0))
@@ -5944,6 +5976,9 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
59445976
case Instruction::ExtractValue:
59455977
case Instruction::ExtractElement: {
59465978
bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
5979+
// FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
5980+
if (!isPowerOf2_32(VL.size()))
5981+
return TreeEntry::NeedToGather;
59475982
if (Reuse || !CurrentOrder.empty())
59485983
return TreeEntry::Vectorize;
59495984
LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
@@ -6258,6 +6293,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
62586293
if (NumUniqueScalarValues == VL.size()) {
62596294
ReuseShuffleIndicies.clear();
62606295
} else {
6296+
// FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6297+
if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6298+
LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6299+
"for nodes with padding.\n");
6300+
newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6301+
return false;
6302+
}
62616303
LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
62626304
if (NumUniqueScalarValues <= 1 ||
62636305
(UniquePositions.size() == 1 && all_of(UniqueValues,
@@ -7868,7 +7910,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
78687910
for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
78697911
if (VectorizedLoads.contains(VL[I]))
78707912
continue;
7871-
GatherCost += getBuildVectorCost(VL.slice(I, VF), Root);
7913+
GatherCost +=
7914+
getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
78727915
}
78737916
// Exclude potentially vectorized loads from list of gathered
78747917
// scalars.
@@ -10678,6 +10721,9 @@ BoUpSLP::isGatherShuffledEntry(
1067810721
// No need to check for the topmost gather node.
1067910722
if (TE == VectorizableTree.front().get())
1068010723
return {};
10724+
// FIXME: Gathering for non-power-of-2 nodes not implemented yet.
10725+
if (TE->isNonPowOf2Vec())
10726+
return {};
1068110727
Mask.assign(VL.size(), PoisonMaskElem);
1068210728
assert(TE->UserTreeIndices.size() == 1 &&
1068310729
"Expected only single user of the gather node.");
@@ -14995,8 +15041,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
1499515041
const unsigned Sz = R.getVectorElementSize(Chain[0]);
1499615042
unsigned VF = Chain.size();
1499715043

14998-
if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
14999-
return false;
15044+
if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
15045+
// Check if vectorizing with a non-power-of-2 VF should be considered. At
15046+
// the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15047+
// all vector lanes are used.
15048+
if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15049+
return false;
15050+
}
1500015051

1500115052
LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
1500215053
<< "\n");
@@ -15095,14 +15146,22 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
1509515146
continue;
1509615147
}
1509715148

15149+
unsigned NonPowerOf2VF = 0;
15150+
if (VectorizeNonPowerOf2) {
15151+
// First try vectorizing with a non-power-of-2 VF. At the moment, only
15152+
// consider cases where VF + 1 is a power-of-2, i.e. almost all vector
15153+
// lanes are used.
15154+
unsigned CandVF = Operands.size();
15155+
if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF)
15156+
NonPowerOf2VF = CandVF;
15157+
}
15158+
1509815159
unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
15099-
SmallVector<unsigned> CandidateVFs(Sz);
15100-
// FIXME: Is division-by-2 the correct step? Should we assert that the
15101-
// register size is a power-of-2?
15102-
unsigned Size = MaxVF;
15103-
for_each(CandidateVFs, [&](unsigned &VF) {
15104-
VF = Size;
15105-
Size /= 2;
15160+
SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
15161+
unsigned Size = MinVF;
15162+
for_each(reverse(CandidateVFs), [&](unsigned &VF) {
15163+
VF = Size > MaxVF ? NonPowerOf2VF : Size;
15164+
Size *= 2;
1510615165
});
1510715166
unsigned StartIdx = 0;
1510815167
for (unsigned Size : CandidateVFs) {

llvm/test/Transforms/SLPVectorizer/AArch64/vec15-base.ll

Lines changed: 39 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,45 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2-
; RUN: opt -passes=slp-vectorizer -mtriple=arm64-apple-ios -S %s | FileCheck %s
2+
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=NON-POW2 %s
3+
; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=POW2-ONLY %s
34

45
define void @v15_load_i8_mul_by_constant_store(ptr %src, ptr noalias %dst) {
5-
; CHECK-LABEL: define void @v15_load_i8_mul_by_constant_store(
6-
; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
7-
; CHECK-NEXT: entry:
8-
; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
9-
; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
10-
; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
11-
; CHECK-NEXT: store <8 x i8> [[TMP1]], ptr [[DST]], align 1
12-
; CHECK-NEXT: [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
13-
; CHECK-NEXT: [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
14-
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
15-
; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
16-
; CHECK-NEXT: store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
17-
; CHECK-NEXT: [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
18-
; CHECK-NEXT: [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
19-
; CHECK-NEXT: [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
20-
; CHECK-NEXT: [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
21-
; CHECK-NEXT: store i8 [[MUL_12]], ptr [[DST_12]], align 1
22-
; CHECK-NEXT: [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
23-
; CHECK-NEXT: [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
24-
; CHECK-NEXT: [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
25-
; CHECK-NEXT: [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
26-
; CHECK-NEXT: store i8 [[MUL_13]], ptr [[DST_13]], align 1
27-
; CHECK-NEXT: [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
28-
; CHECK-NEXT: [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
29-
; CHECK-NEXT: [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
30-
; CHECK-NEXT: [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
31-
; CHECK-NEXT: store i8 [[MUL_14]], ptr [[DST_14]], align 1
32-
; CHECK-NEXT: ret void
6+
; NON-POW2-LABEL: define void @v15_load_i8_mul_by_constant_store(
7+
; NON-POW2-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
8+
; NON-POW2-NEXT: entry:
9+
; NON-POW2-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
10+
; NON-POW2-NEXT: [[TMP0:%.*]] = load <15 x i8>, ptr [[GEP_SRC_0]], align 4
11+
; NON-POW2-NEXT: [[TMP1:%.*]] = mul nsw <15 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
12+
; NON-POW2-NEXT: store <15 x i8> [[TMP1]], ptr [[DST]], align 1
13+
; NON-POW2-NEXT: ret void
14+
;
15+
; POW2-ONLY-LABEL: define void @v15_load_i8_mul_by_constant_store(
16+
; POW2-ONLY-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
17+
; POW2-ONLY-NEXT: entry:
18+
; POW2-ONLY-NEXT: [[GEP_SRC_0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 0
19+
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[GEP_SRC_0]], align 4
20+
; POW2-ONLY-NEXT: [[TMP1:%.*]] = mul nsw <8 x i8> [[TMP0]], <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
21+
; POW2-ONLY-NEXT: store <8 x i8> [[TMP1]], ptr [[DST]], align 1
22+
; POW2-ONLY-NEXT: [[GEP_SRC_8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 8
23+
; POW2-ONLY-NEXT: [[DST_8:%.*]] = getelementptr i8, ptr [[DST]], i8 8
24+
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_SRC_8]], align 4
25+
; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <4 x i8> [[TMP2]], <i8 10, i8 10, i8 10, i8 10>
26+
; POW2-ONLY-NEXT: store <4 x i8> [[TMP3]], ptr [[DST_8]], align 1
27+
; POW2-ONLY-NEXT: [[GEP_SRC_12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 12
28+
; POW2-ONLY-NEXT: [[L_SRC_12:%.*]] = load i8, ptr [[GEP_SRC_12]], align 4
29+
; POW2-ONLY-NEXT: [[MUL_12:%.*]] = mul nsw i8 [[L_SRC_12]], 10
30+
; POW2-ONLY-NEXT: [[DST_12:%.*]] = getelementptr i8, ptr [[DST]], i8 12
31+
; POW2-ONLY-NEXT: store i8 [[MUL_12]], ptr [[DST_12]], align 1
32+
; POW2-ONLY-NEXT: [[GEP_SRC_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 13
33+
; POW2-ONLY-NEXT: [[L_SRC_13:%.*]] = load i8, ptr [[GEP_SRC_13]], align 4
34+
; POW2-ONLY-NEXT: [[MUL_13:%.*]] = mul nsw i8 [[L_SRC_13]], 10
35+
; POW2-ONLY-NEXT: [[DST_13:%.*]] = getelementptr i8, ptr [[DST]], i8 13
36+
; POW2-ONLY-NEXT: store i8 [[MUL_13]], ptr [[DST_13]], align 1
37+
; POW2-ONLY-NEXT: [[GEP_SRC_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 14
38+
; POW2-ONLY-NEXT: [[L_SRC_14:%.*]] = load i8, ptr [[GEP_SRC_14]], align 4
39+
; POW2-ONLY-NEXT: [[MUL_14:%.*]] = mul nsw i8 [[L_SRC_14]], 10
40+
; POW2-ONLY-NEXT: [[DST_14:%.*]] = getelementptr i8, ptr [[DST]], i8 14
41+
; POW2-ONLY-NEXT: store i8 [[MUL_14]], ptr [[DST_14]], align 1
42+
; POW2-ONLY-NEXT: ret void
3343
;
3444
entry:
3545
%gep.src.0 = getelementptr inbounds i8, ptr %src, i8 0
@@ -123,5 +133,3 @@ entry:
123133

124134
ret void
125135
}
126-
127-

0 commit comments

Comments
 (0)