Skip to content

Commit 0bb1b0a

Browse files
committed
[NFC][LoopVectorize] Cache result of requiresScalarEpilogue
Caching the decision returned by requiresScalarEpilogue means that we avoid printing the same debug message many times and avoid repeating the same calculation. This function will get more complex when we start to reason about more early exit loops, such as in PR llvm#88385. The only drawback is that we sometimes have to invalidate the previously cached result when the scalar epilogue status or the interleave groups change.
1 parent 77afd9e commit 0bb1b0a

File tree

2 files changed

+61
-29
lines changed

2 files changed

+61
-29
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 57 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1347,27 +1347,46 @@ class LoopVectorizationCostModel {
13471347
return InterleaveInfo.getInterleaveGroup(Instr);
13481348
}
13491349

1350+
/// Calculate in advance whether a scalar epilogue is required when
1351+
/// vectorizing and not vectorizing. If \p Invalidate is true then
1352+
/// invalidate a previous decision.
1353+
void collectScalarEpilogueRequirements(bool Invalidate) {
1354+
auto NeedsScalarEpilogue = [&](bool IsVectorizing) -> bool {
1355+
if (!isScalarEpilogueAllowed()) {
1356+
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue");
1357+
return false;
1358+
}
1359+
// If we might exit from anywhere but the latch, must run the exiting
1360+
// iteration in scalar form.
1361+
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1362+
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1363+
"from latch block\n");
1364+
return true;
1365+
}
1366+
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1367+
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1368+
"interleaved group requires scalar epilogue");
1369+
return true;
1370+
}
1371+
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue");
1372+
return false;
1373+
};
1374+
1375+
assert((Invalidate || !RequiresScalarEpilogue) &&
1376+
"Already determined scalar epilogue requirements!");
1377+
std::pair<bool, bool> Result;
1378+
Result.first = NeedsScalarEpilogue(true);
1379+
LLVM_DEBUG(dbgs() << ", when vectorizing\n");
1380+
Result.second = NeedsScalarEpilogue(false);
1381+
LLVM_DEBUG(dbgs() << ", when not vectorizing\n");
1382+
RequiresScalarEpilogue = Result;
1383+
}
1384+
13501385
/// Returns true if we're required to use a scalar epilogue for at least
13511386
/// the final iteration of the original loop.
13521387
bool requiresScalarEpilogue(bool IsVectorizing) const {
1353-
if (!isScalarEpilogueAllowed()) {
1354-
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1355-
return false;
1356-
}
1357-
// If we might exit from anywhere but the latch, must run the exiting
1358-
// iteration in scalar form.
1359-
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1360-
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1361-
"from latch block\n");
1362-
return true;
1363-
}
1364-
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1365-
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1366-
"interleaved group requires scalar epilogue\n");
1367-
return true;
1368-
}
1369-
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1370-
return false;
1388+
auto &CachedResult = *RequiresScalarEpilogue;
1389+
return IsVectorizing ? CachedResult.first : CachedResult.second;
13711390
}
13721391

13731392
/// Returns true if we're required to use a scalar epilogue for at least
@@ -1391,6 +1410,15 @@ class LoopVectorizationCostModel {
13911410
return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
13921411
}
13931412

1413+
/// Update the ScalarEpilogueStatus to a new value, potentially triggering a
1414+
/// recalculation of the scalar epilogue requirements.
1415+
void setScalarEpilogueStatus(ScalarEpilogueLowering Status) {
1416+
bool Changed = ScalarEpilogueStatus != Status;
1417+
ScalarEpilogueStatus = Status;
1418+
if (Changed)
1419+
collectScalarEpilogueRequirements(/*Invalidate=*/true);
1420+
}
1421+
13941422
/// Returns the TailFoldingStyle that is best for the current loop.
13951423
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
13961424
if (!ChosenTailFoldingStyle)
@@ -1771,6 +1799,9 @@ class LoopVectorizationCostModel {
17711799

17721800
/// All element types found in the loop.
17731801
SmallPtrSet<Type *, 16> ElementTypesInLoop;
1802+
1803+
/// Keeps track of whether we require a scalar epilogue.
1804+
std::optional<std::pair<bool, bool>> RequiresScalarEpilogue;
17741805
};
17751806
} // end namespace llvm
17761807

@@ -4058,7 +4089,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40584089
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
40594090
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
40604091
"scalar epilogue instead.\n");
4061-
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4092+
setScalarEpilogueStatus(CM_ScalarEpilogueAllowed);
40624093
return computeFeasibleMaxVF(MaxTC, UserVF, false);
40634094
}
40644095
return FixedScalableVFPair::getNone();
@@ -4074,6 +4105,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40744105
// Note: There is no need to invalidate any cost modeling decisions here, as
40754106
// none were taken so far.
40764107
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4108+
collectScalarEpilogueRequirements(/*Invalidate=*/true);
40774109
}
40784110

40794111
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
@@ -4145,7 +4177,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
41454177
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
41464178
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
41474179
"scalar epilogue instead.\n");
4148-
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4180+
setScalarEpilogueStatus(CM_ScalarEpilogueAllowed);
41494181
return MaxFactors;
41504182
}
41514183

@@ -7058,6 +7090,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
70587090
if (!OrigLoop->isInnermost()) {
70597091
// If the user doesn't provide a vectorization factor, determine a
70607092
// reasonable one.
7093+
CM.collectScalarEpilogueRequirements(/*Invalidate=*/false);
70617094
if (UserVF.isZero()) {
70627095
VF = determineVPlanVF(TTI, CM);
70637096
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
@@ -7102,6 +7135,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
71027135

71037136
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
71047137
assert(OrigLoop->isInnermost() && "Inner loop expected.");
7138+
CM.collectScalarEpilogueRequirements(/*Invalidate=*/false);
71057139
CM.collectValuesToIgnore();
71067140
CM.collectElementTypesForWidening();
71077141

@@ -7116,11 +7150,13 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
71167150
dbgs()
71177151
<< "LV: Invalidate all interleaved groups due to fold-tail by masking "
71187152
"which requires masked-interleaved support.\n");
7119-
if (CM.InterleaveInfo.invalidateGroups())
7153+
if (CM.InterleaveInfo.invalidateGroups()) {
71207154
// Invalidating interleave groups also requires invalidating all decisions
71217155
// based on them, which includes widening decisions and uniform and scalar
71227156
// values.
71237157
CM.invalidateCostModelingDecisions();
7158+
CM.collectScalarEpilogueRequirements(/*Invalidate=*/true);
7159+
}
71247160
}
71257161

71267162
if (CM.foldTailByMasking())

llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
1717
; CHECK-NEXT: LV: Found an induction variable.
1818
; CHECK-NEXT: LV: Did not find one integer induction var.
1919
; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
20-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
20+
; CHECK-NEXT: LV: Loop does not require scalar epilogue, when vectorizing
21+
; CHECK-NEXT: LV: Loop does not require scalar epilogue, when not vectorizing
2122
; CHECK-NEXT: LV: Found trip count: 0
2223
; CHECK-NEXT: LV: Found maximum trip count: 4294967295
2324
; CHECK-NEXT: LV: Scalable vectorization is available
@@ -45,7 +46,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
4546
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
4647
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
4748
; CHECK-NEXT: LV: Using user VF vscale x 4.
48-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
4949
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
5050
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
5151
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
@@ -134,7 +134,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
134134
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
135135
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
136136
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
137-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
138137
; CHECK-NEXT: LV: Loop cost is 32
139138
; CHECK-NEXT: LV: IC is 1
140139
; CHECK-NEXT: LV: VF is vscale x 4
@@ -194,7 +193,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
194193
; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
195194
; CHECK-NEXT: No successors
196195
; CHECK-NEXT: }
197-
; CHECK: LV: Loop does not require scalar epilogue
198196
;
199197
entry:
200198
%cmp7 = icmp sgt i32 %n, 0
@@ -231,7 +229,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
231229
; CHECK-NEXT: LV: Found FP op with unsafe algebra.
232230
; CHECK-NEXT: LV: Did not find one integer induction var.
233231
; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
234-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
232+
; CHECK-NEXT: LV: Loop does not require scalar epilogue, when vectorizing
233+
; CHECK-NEXT: LV: Loop does not require scalar epilogue, when not vectorizing
235234
; CHECK-NEXT: LV: Found trip count: 0
236235
; CHECK-NEXT: LV: Found maximum trip count: 4294967295
237236
; CHECK-NEXT: LV: Scalable vectorization is available
@@ -259,7 +258,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
259258
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
260259
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
261260
; CHECK-NEXT: LV: Using user VF vscale x 4.
262-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
263261
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
264262
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
265263
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
@@ -348,7 +346,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
348346
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
349347
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
350348
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
351-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
352349
; CHECK-NEXT: LV: Loop cost is 34
353350
; CHECK-NEXT: LV: IC is 1
354351
; CHECK-NEXT: LV: VF is vscale x 4
@@ -408,7 +405,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
408405
; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
409406
; CHECK-NEXT: No successors
410407
; CHECK-NEXT: }
411-
; CHECK: LV: Loop does not require scalar epilogue
412408
;
413409
entry:
414410
%cmp7 = icmp sgt i32 %n, 0

0 commit comments

Comments
 (0)