Skip to content

Commit 5fae408

Browse files
authored
[VPlan] Dispatch to multiple exit blocks via middle blocks. (#112138)
A more lightweight variant of #109193, which dispatches to multiple exit blocks via the middle blocks. The patch also introduces a bit of required scaffolding to enable early-exit vectorization, including an option. At the moment, early-exit vectorization doesn't come with legality checks, and is only used if the option is provided and the loop has metadata forcing vectorization. This is only intended to be used for testing during bring-up, with @david-arm enabling auto early-exit vectorization plugging in the changes from #88385. PR: #112138
1 parent 19bc282 commit 5fae408

15 files changed

+650
-80
lines changed

llvm/docs/Vectorizers.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,19 @@ small trip counts.
399399

400400
.. image:: epilogue-vectorization-cfg.png
401401

402+
Early Exit Vectorization
403+
^^^^^^^^^^^^^^^^^^^^^^^^
404+
405+
When vectorizing a loop with a single early exit, the loop blocks following the
406+
early exit are predicated and the vector loop will always exit via the latch.
407+
If the early exit has been taken, the vector loop's successor block
408+
(``middle.split`` below) branches to the early exit block. Otherwise
409+
``middle.block`` selects between the exit block from the latch or the scalar
410+
remainder loop.
411+
412+
.. image:: vplan-early-exit.png
413+
414+
402415
Performance
403416
-----------
404417

llvm/docs/vplan-early-exit.dot

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
digraph VPlan {
2+
graph [labelloc=t, fontsize=30; label=""]
3+
node [shape=rect, fontname=Courier, fontsize=30]
4+
edge [fontname=Courier, fontsize=30]
5+
compound=true
6+
N1 [label =
7+
"vector.ph"
8+
]
9+
N1 -> N2 [ label="" lhead=cluster_N3]
10+
subgraph cluster_N3 {
11+
fontname=Courier
12+
label="\<x1\> vector loop"
13+
N2 [label =
14+
"vector.body"
15+
]
16+
}
17+
N2 -> N4 [ label="" ltail=cluster_N3]
18+
N4 [label =
19+
"middle.split"
20+
]
21+
N4 -> N5 [ label=""]
22+
N4 -> N6 [ label=""]
23+
N5 [label =
24+
"early.exit"
25+
]
26+
N6 [label =
27+
"middle.block"
28+
]
29+
N6 -> N9 [ label=""]
30+
N6 -> N7 [ label=""]
31+
N7 [label =
32+
"scalar.ph"
33+
]
34+
N7 -> N8 [ label=""]
35+
N8 [label =
36+
"loop.header"
37+
]
38+
N9 [label =
39+
"latch.exit"
40+
]
41+
}

llvm/docs/vplan-early-exit.png

129 KB
Loading

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,11 @@ class LoopVectorizationLegality {
395395

396396
/// Returns the uncountable early exiting block.
397397
BasicBlock *getUncountableEarlyExitingBlock() const {
398+
if (!HasUncountableEarlyExit) {
399+
assert(getUncountableExitingBlocks().empty() &&
400+
"Expected no uncountable exiting blocks");
401+
return nullptr;
402+
}
398403
assert(getUncountableExitingBlocks().size() == 1 &&
399404
"Expected only a single uncountable exiting block");
400405
return getUncountableExitingBlocks()[0];

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,6 +1375,16 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
13751375
}
13761376

13771377
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
1378+
// When vectorizing early exits, create predicates for the latch block only.
1379+
// The early exiting block must be a direct predecessor of the latch at the
1380+
// moment.
1381+
BasicBlock *Latch = TheLoop->getLoopLatch();
1382+
if (hasUncountableEarlyExit()) {
1383+
assert(
1384+
is_contained(predecessors(Latch), getUncountableEarlyExitingBlock()) &&
1385+
"Uncountable exiting block must be a direct predecessor of latch");
1386+
return BB == Latch;
1387+
}
13781388
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
13791389
}
13801390

@@ -1788,13 +1798,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
17881798

17891799
HasUncountableEarlyExit = false;
17901800
if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
1801+
HasUncountableEarlyExit = true;
17911802
if (!isVectorizableEarlyExitLoop()) {
1803+
UncountableExitingBlocks.clear();
1804+
HasUncountableEarlyExit = false;
17921805
if (DoExtraAnalysis)
17931806
Result = false;
17941807
else
17951808
return false;
1796-
} else
1797-
HasUncountableEarlyExit = true;
1809+
}
17981810
}
17991811

18001812
// Go over each instruction and look at memory deps.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 75 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,11 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
385385
cl::Hidden,
386386
cl::desc("Try wider VFs if they enable the use of vector variants"));
387387

388+
static cl::opt<bool> EnableEarlyExitVectorization(
389+
"enable-early-exit-vectorization", cl::init(false), cl::Hidden,
390+
cl::desc(
391+
"Enable vectorization of early exit loops with uncountable exits."));
392+
388393
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
389394
// variables not overflowing do not hold. See `emitSCEVChecks`.
390395
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -1382,9 +1387,10 @@ class LoopVectorizationCostModel {
13821387
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
13831388
return false;
13841389
}
1385-
// If we might exit from anywhere but the latch, must run the exiting
1386-
// iteration in scalar form.
1387-
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1390+
// If we might exit from anywhere but the latch and early exit vectorization
1391+
// is disabled, we must run the exiting iteration in scalar form.
1392+
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1393+
!(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
13881394
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
13891395
"from latch block\n");
13901396
return true;
@@ -3656,10 +3662,13 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
36563662

36573663
// Start with the conditional branches exiting the loop. If the branch
36583664
// condition is an instruction contained in the loop that is only used by the
3659-
// branch, it is uniform.
3665+
// branch, it is uniform. Note conditions from uncountable early exits are not
3666+
// uniform.
36603667
SmallVector<BasicBlock *> Exiting;
36613668
TheLoop->getExitingBlocks(Exiting);
36623669
for (BasicBlock *E : Exiting) {
3670+
if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3671+
continue;
36633672
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
36643673
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
36653674
AddToWorklistIfAllowed(Cmp);
@@ -8239,8 +8248,11 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
82398248

82408249
// If source is an exiting block, we know the exit edge is dynamically dead
82418250
// in the vector loop, and thus we don't need to restrict the mask. Avoid
8242-
// adding uses of an otherwise potentially dead instruction.
8243-
if (OrigLoop->isLoopExiting(Src))
8251+
// adding uses of an otherwise potentially dead instruction unless we are
8252+
// vectorizing a loop with uncountable exits. In that case, we always
8253+
// materialize the mask.
8254+
if (OrigLoop->isLoopExiting(Src) &&
8255+
Src != Legal->getUncountableEarlyExitingBlock())
82448256
return EdgeMaskCache[Edge] = SrcMask;
82458257

82468258
VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8931,50 +8943,58 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
89318943
static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
89328944
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
89338945
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8946+
auto *MiddleVPBB = Plan.getMiddleBlock();
89348947
SetVector<VPIRInstruction *> ExitUsersToFix;
89358948
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
8936-
BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
8937-
BasicBlock *ExitingBB = find_singleton<BasicBlock>(
8938-
to_vector(predecessors(ExitBB)),
8939-
[OrigLoop](BasicBlock *Pred, bool AllowRepeats) {
8940-
return OrigLoop->contains(Pred) ? Pred : nullptr;
8941-
});
89428949
for (VPRecipeBase &R : *ExitVPBB) {
89438950
auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
89448951
if (!ExitIRI)
89458952
continue;
89468953
auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
89478954
if (!ExitPhi)
89488955
break;
8949-
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
8950-
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
8951-
// Exit values for inductions are computed and updated outside of VPlan
8952-
// and independent of induction recipes.
8953-
// TODO: Compute induction exit values in VPlan.
8954-
if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8955-
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8956-
isa<VPWidenPointerInductionRecipe>(V) ||
8957-
(isa<Instruction>(IncomingValue) &&
8958-
OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
8959-
any_of(IncomingValue->users(), [&Inductions](User *U) {
8960-
auto *P = dyn_cast<PHINode>(U);
8961-
return P && Inductions.contains(P);
8962-
})))
8963-
continue;
8964-
ExitUsersToFix.insert(ExitIRI);
8965-
ExitIRI->addOperand(V);
8956+
for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
8957+
BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
8958+
if (PredVPBB != MiddleVPBB) {
8959+
SmallVector<BasicBlock *> ExitingBlocks;
8960+
OrigLoop->getExitingBlocks(ExitingBlocks);
8961+
assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
8962+
ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
8963+
: ExitingBlocks[0];
8964+
}
8965+
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
8966+
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
8967+
// Exit values for inductions are computed and updated outside of VPlan
8968+
// and independent of induction recipes.
8969+
// TODO: Compute induction exit values in VPlan.
8970+
if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8971+
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8972+
isa<VPWidenPointerInductionRecipe>(V) ||
8973+
(isa<Instruction>(IncomingValue) &&
8974+
OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
8975+
any_of(IncomingValue->users(), [&Inductions](User *U) {
8976+
auto *P = dyn_cast<PHINode>(U);
8977+
return P && Inductions.contains(P);
8978+
}))) {
8979+
if (ExitVPBB->getSinglePredecessor() == MiddleVPBB)
8980+
continue;
8981+
}
8982+
ExitUsersToFix.insert(ExitIRI);
8983+
ExitIRI->addOperand(V);
8984+
}
89668985
}
89678986
}
89688987
return ExitUsersToFix;
89698988
}
89708989

89718990
// Add exit values to \p Plan. Extracts are added for each entry in \p
8972-
// ExitUsersToFix if needed and their operands are updated.
8973-
static void
8991+
// ExitUsersToFix if needed and their operands are updated. Returns true if all
8992+
// exit users can be handled, otherwise returns false.
8993+
static bool
89748994
addUsersInExitBlocks(VPlan &Plan,
89758995
const SetVector<VPIRInstruction *> &ExitUsersToFix) {
89768996
if (ExitUsersToFix.empty())
8977-
return;
8997+
return true;
89788998

89798999
auto *MiddleVPBB = Plan.getMiddleBlock();
89809000
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
@@ -8988,14 +9008,18 @@ addUsersInExitBlocks(VPlan &Plan,
89889008
if (V->isLiveIn())
89899009
continue;
89909010

8991-
assert(ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
8992-
"Exit value not handled yet for this edge.");
9011+
// Currently only live-ins can be used by exit values from blocks not
9012+
// exiting via the vector latch through to the middle block.
9013+
if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
9014+
return false;
9015+
89939016
LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
89949017
VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
89959018
{V, Plan.getOrAddLiveIn(ConstantInt::get(
89969019
IntegerType::get(Ctx, 32), 1))});
89979020
ExitIRI->setOperand(0, Ext);
89989021
}
9022+
return true;
89999023
}
90009024

90019025
/// Handle users in the exit block for first order reductions in the original
@@ -9268,11 +9292,23 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
92689292
"VPBasicBlock");
92699293
RecipeBuilder.fixHeaderPhis();
92709294

9295+
if (auto *UncountableExitingBlock =
9296+
Legal->getUncountableEarlyExitingBlock()) {
9297+
VPlanTransforms::handleUncountableEarlyExit(
9298+
*Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
9299+
}
92719300
addScalarResumePhis(RecipeBuilder, *Plan);
92729301
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
92739302
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
92749303
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9275-
addUsersInExitBlocks(*Plan, ExitUsersToFix);
9304+
if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
9305+
reportVectorizationFailure(
9306+
"Some exit values in loop with uncountable exit not supported yet",
9307+
"Some exit values in loop with uncountable exit not supported yet",
9308+
"UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9309+
return nullptr;
9310+
}
9311+
92769312
// ---------------------------------------------------------------------------
92779313
// Transform initial VPlan: Apply previously taken decisions, in order, to
92789314
// bring the VPlan to its final state.
@@ -10138,12 +10174,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1013810174
return false;
1013910175
}
1014010176

10141-
if (LVL.hasUncountableEarlyExit()) {
10177+
if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
1014210178
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10143-
"early exit is not yet supported",
10179+
"early exit is not enabled",
1014410180
"Auto-vectorization of loops with uncountable "
10145-
"early exit is not yet supported",
10146-
"UncountableEarlyExitLoopsUnsupported", ORE, L);
10181+
"early exit is not enabled",
10182+
"UncountableEarlyExitLoopsDisabled", ORE, L);
1014710183
return false;
1014810184
}
1014910185

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -861,14 +861,10 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
861861
auto Plan = std::make_unique<VPlan>(Entry, VecPreheader, ScalarHeader);
862862

863863
// Create SCEV and VPValue for the trip count.
864-
865-
// Currently only loops with countable exits are vectorized, but calling
866-
// getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
867-
// uncountable exits whilst also ensuring the symbolic maximum and known
868-
// back-edge taken count remain identical for loops with countable exits.
864+
// We use the symbolic max backedge-taken-count, which also works when
865+
// vectorizing loops with uncountable early exits.
869866
const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
870-
assert((!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
871-
BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) &&
867+
assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
872868
"Invalid loop count");
873869
ScalarEvolution &SE = *PSE.getSE();
874870
const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
@@ -903,7 +899,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
903899
// 2) If we require a scalar epilogue, there is no conditional branch as
904900
// we unconditionally branch to the scalar preheader. Do nothing.
905901
// 3) Otherwise, construct a runtime check.
906-
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
902+
BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
907903
auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
908904
// The connection order corresponds to the operands of the conditional branch.
909905
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -621,6 +621,14 @@ class VPBlockBase {
621621
/// Remove all the successors of this block.
622622
void clearSuccessors() { Successors.clear(); }
623623

624+
/// Swap successors of the block. The block must have exactly 2 successors.
625+
// TODO: This should be part of introducing conditional branch recipes rather
626+
// than being independent.
627+
void swapSuccessors() {
628+
assert(Successors.size() == 2 && "must have 2 successors to swap");
629+
std::swap(Successors[0], Successors[1]);
630+
}
631+
624632
/// The method which generates the output IR that correspond to this
625633
/// VPBlockBase, thereby "executing" the VPlan.
626634
virtual void execute(VPTransformState *State) = 0;
@@ -1232,6 +1240,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
12321240
// operand). Only generates scalar values (either for the first lane only or
12331241
// for all lanes, depending on its uses).
12341242
PtrAdd,
1243+
// Returns a scalar boolean value, which is true if any lane of its single
1244+
// operand is true.
1245+
AnyOf,
12351246
};
12361247

12371248
private:
@@ -3884,10 +3895,10 @@ class VPlan {
38843895
/// whether to execute the scalar tail loop or the exit block from the loop
38853896
/// latch.
38863897
const VPBasicBlock *getMiddleBlock() const {
3887-
return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
3898+
return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
38883899
}
38893900
VPBasicBlock *getMiddleBlock() {
3890-
return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
3901+
return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
38913902
}
38923903

38933904
/// Return the VPBasicBlock for the preheader of the scalar loop.

0 commit comments

Comments
 (0)