Skip to content

Commit 4eecf3c

Browse files
[SLP]Reorder buildvector/reduction vectorization and fuse the loops.
Currently, the SLP vectorizer first tries to find reduction nodes and only then vectorizes buildvector sequences. Instead, it should first try to vectorize wide buildvector sequences (maximum VF only), then try to vectorize reductions, and only after that try the smaller buildvector sequences. Reviewers: RKSimon. Reviewed By: RKSimon. Pull Request: #96943
1 parent d548020 commit 4eecf3c

File tree

4 files changed

+46
-35
lines changed

4 files changed

+46
-35
lines changed

llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,11 +134,11 @@ struct SLPVectorizerPass : public PassInfoMixin<SLPVectorizerPass> {
134134

135135
/// Try to vectorize trees that start at insertvalue instructions.
136136
bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB,
137-
slpvectorizer::BoUpSLP &R);
137+
slpvectorizer::BoUpSLP &R, bool MaxVFOnly);
138138

139139
/// Try to vectorize trees that start at insertelement instructions.
140140
bool vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB,
141-
slpvectorizer::BoUpSLP &R);
141+
slpvectorizer::BoUpSLP &R, bool MaxVFOnly);
142142

143143
/// Tries to vectorize \p CmpInts. \Returns true on success.
144144
template <typename ItT>

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18108,7 +18108,8 @@ bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
1810818108
}
1810918109

1811018110
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18111-
BasicBlock *BB, BoUpSLP &R) {
18111+
BasicBlock *BB, BoUpSLP &R,
18112+
bool MaxVFOnly) {
1811218113
if (!R.canMapToVector(IVI->getType()))
1811318114
return false;
1811418115

@@ -18119,11 +18120,12 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
1811918120

1812018121
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
1812118122
// Aggregate value is unlikely to be processed in vector register.
18122-
return tryToVectorizeList(BuildVectorOpds, R);
18123+
return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
1812318124
}
1812418125

1812518126
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
18126-
BasicBlock *BB, BoUpSLP &R) {
18127+
BasicBlock *BB, BoUpSLP &R,
18128+
bool MaxVFOnly) {
1812718129
SmallVector<Value *, 16> BuildVectorInsts;
1812818130
SmallVector<Value *, 16> BuildVectorOpds;
1812918131
SmallVector<int> Mask;
@@ -18133,7 +18135,7 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
1813318135
return false;
1813418136

1813518137
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18136-
return tryToVectorizeList(BuildVectorInsts, R);
18138+
return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
1813718139
}
1813818140

1813918141
template <typename T>
@@ -18353,20 +18355,30 @@ bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
1835318355
"This function only accepts Insert instructions");
1835418356
bool OpsChanged = false;
1835518357
SmallVector<WeakTrackingVH> PostponedInsts;
18356-
// pass1 - try to vectorize reductions only
1835718358
for (auto *I : reverse(Instructions)) {
18359+
// pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
18360+
if (R.isDeleted(I) || isa<CmpInst>(I))
18361+
continue;
18362+
if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18363+
OpsChanged |=
18364+
vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
18365+
} else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18366+
OpsChanged |=
18367+
vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
18368+
}
18369+
// pass2 - try to vectorize reductions only
1835818370
if (R.isDeleted(I))
1835918371
continue;
1836018372
OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
18361-
}
18362-
// pass2 - try to match and vectorize a buildvector sequence.
18363-
for (auto *I : reverse(Instructions)) {
1836418373
if (R.isDeleted(I) || isa<CmpInst>(I))
1836518374
continue;
18375+
// pass3 - try to match and vectorize a buildvector sequence.
1836618376
if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18367-
OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
18377+
OpsChanged |=
18378+
vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
1836818379
} else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18369-
OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
18380+
OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
18381+
/*MaxVFOnly=*/false);
1837018382
}
1837118383
}
1837218384
// Now try to vectorize postponed instructions.

llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,32 +10,33 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float
1010
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2
1111
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1212
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
13-
; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
14-
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[VAL12]], i32 0
15-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0.000000e+00, i32 1
16-
; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
13+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[ARG3]], i32 0
14+
; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], <float 1.000000e+00, float 0.000000e+00>
15+
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
16+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
17+
; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[TMP7]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
1718
; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]]
1819
; CHECK: bb18:
19-
; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP6]], [[BB:%.*]] ]
20-
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
21-
; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP8]], 2.000000e+00
22-
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 3
23-
; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP9]], 3.000000e+00
20+
; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP8]], [[BB:%.*]] ]
21+
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP8]], i32 2
22+
; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP10]], 2.000000e+00
23+
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
24+
; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP11]], 3.000000e+00
2425
; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]]
2526
; CHECK: bb25:
26-
; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ]
27-
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
27+
; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP9]], [[BB18]] ]
28+
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
2829
; CHECK-NEXT: br label [[BB30:%.*]]
2930
; CHECK: bb30:
3031
; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
31-
; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP11]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
32-
; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
33-
; CHECK-NEXT: [[TMP13:%.*]] = uitofp <4 x i8> [[TMP12]] to <4 x float>
34-
; CHECK-NEXT: [[TMP14:%.*]] = fsub fast <4 x float> [[TMP13]], [[TMP3]]
35-
; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP14]], [[TMP10]]
36-
; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
32+
; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[TMP13]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
33+
; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
34+
; CHECK-NEXT: [[TMP15:%.*]] = uitofp <4 x i8> [[TMP14]] to <4 x float>
35+
; CHECK-NEXT: [[TMP16:%.*]] = fsub fast <4 x float> [[TMP15]], [[TMP3]]
36+
; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <4 x float> [[TMP16]], [[TMP12]]
37+
; CHECK-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP17]])
3738
; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]])
38-
; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP16]])
39+
; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP18]])
3940
; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]])
4041
; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]]
4142
; CHECK: bb57:

llvm/test/Transforms/SLPVectorizer/X86/reused-extractelements.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,11 @@
44

55
; YAML: --- !Missed
66
; YAML-NEXT: Pass: slp-vectorizer
7-
; YAML-NEXT: Name: NotBeneficial
7+
; YAML-NEXT: Name: NotPossible
88
; YAML-NEXT: Function: g
99
; YAML-NEXT: Args:
10-
; YAML-NEXT: - String: 'List vectorization was possible but not beneficial with cost '
11-
; YAML-NEXT: - Cost: '0'
12-
; YAML-NEXT: - String: ' >= '
13-
; YAML-NEXT: - Treshold: '0'
10+
; YAML-NEXT: - String: 'Cannot SLP vectorize list: vectorization was impossible'
11+
; YAML-NEXT: - String: ' with available vectorization factors'
1412

1513
define <2 x i32> @g(<2 x i32> %x, i32 %a, i32 %b) {
1614
; CHECK-LABEL: @g(

0 commit comments

Comments
 (0)