Skip to content

Commit 0807837

Browse files
committed
[VPlan] Dispatch to multiple exit blocks via middle blocks. #112138
1 parent 323bedd commit 0807837

13 files changed

+609
-80
lines changed

llvm/docs/Vectorizers.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,19 @@ small trip counts.
399399

400400
.. image:: epilogue-vectorization-cfg.png
401401

402+
Early Exit Vectorization
403+
^^^^^^^^^^^^^^^^^^^^^^^^
404+
405+
When vectorizing a loop with a single early exit, the loop blocks following the
406+
early exit are predicated and the vector loop will always exit via the latch.
407+
If the early exit has been taken, the vector loop's successor block
408+
(``middle.split`` below) branches to the early exit block. Otherwise
409+
``middle.block`` selects between the exit block from the latch and the scalar
410+
remainder loop.
411+
412+
.. image:: vplan-early-exit.png
413+
414+
402415
Performance
403416
-----------
404417

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,11 @@ class LoopVectorizationLegality {
395395

396396
/// Returns the uncountable early exiting block.
397397
BasicBlock *getUncountableEarlyExitingBlock() const {
398+
if (!HasUncountableEarlyExit) {
399+
assert(getUncountableExitingBlocks().empty() &&
400+
"Expected no uncountable exiting blocks");
401+
return nullptr;
402+
}
398403
assert(getUncountableExitingBlocks().size() == 1 &&
399404
"Expected only a single uncountable exiting block");
400405
return getUncountableExitingBlocks()[0];

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,6 +1375,16 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
13751375
}
13761376

13771377
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
1378+
// When vectorizing early exits, create predicates for the latch block only.
1379+
// The early exiting block must be a direct predecessor of the latch at the
1380+
// moment.
1381+
BasicBlock *Latch = TheLoop->getLoopLatch();
1382+
if (hasUncountableEarlyExit()) {
1383+
assert(
1384+
is_contained(predecessors(Latch), getUncountableEarlyExitingBlock()) &&
1385+
"Uncountable exiting block must be a direct predecessor of latch");
1386+
return BB == Latch;
1387+
}
13781388
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
13791389
}
13801390

@@ -1788,13 +1798,15 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
17881798

17891799
HasUncountableEarlyExit = false;
17901800
if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
1801+
HasUncountableEarlyExit = true;
17911802
if (!isVectorizableEarlyExitLoop()) {
1803+
UncountableExitingBlocks.clear();
1804+
HasUncountableEarlyExit = false;
17921805
if (DoExtraAnalysis)
17931806
Result = false;
17941807
else
17951808
return false;
1796-
} else
1797-
HasUncountableEarlyExit = true;
1809+
}
17981810
}
17991811

18001812
// Go over each instruction and look at memory deps.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 75 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,11 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
385385
cl::Hidden,
386386
cl::desc("Try wider VFs if they enable the use of vector variants"));
387387

388+
static cl::opt<bool> EnableEarlyExitVectorization(
389+
"enable-early-exit-vectorization", cl::init(false), cl::Hidden,
390+
cl::desc(
391+
"Enable vectorization of early exit loops with uncountable exits."));
392+
388393
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
389394
// variables not overflowing do not hold. See `emitSCEVChecks`.
390395
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -1382,9 +1387,10 @@ class LoopVectorizationCostModel {
13821387
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
13831388
return false;
13841389
}
1385-
// If we might exit from anywhere but the latch, must run the exiting
1386-
// iteration in scalar form.
1387-
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1390+
// If we might exit from anywhere but the latch and early exit vectorization
1391+
// is disabled, we must run the exiting iteration in scalar form.
1392+
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1393+
!(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
13881394
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
13891395
"from latch block\n");
13901396
return true;
@@ -3656,10 +3662,13 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
36563662

36573663
// Start with the conditional branches exiting the loop. If the branch
36583664
// condition is an instruction contained in the loop that is only used by the
3659-
// branch, it is uniform.
3665+
// branch, it is uniform. Note conditions from uncountable early exits are not
3666+
// uniform.
36603667
SmallVector<BasicBlock *> Exiting;
36613668
TheLoop->getExitingBlocks(Exiting);
36623669
for (BasicBlock *E : Exiting) {
3670+
if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3671+
continue;
36633672
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
36643673
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
36653674
AddToWorklistIfAllowed(Cmp);
@@ -8239,8 +8248,11 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
82398248

82408249
// If source is an exiting block, we know the exit edge is dynamically dead
82418250
// in the vector loop, and thus we don't need to restrict the mask. Avoid
8242-
// adding uses of an otherwise potentially dead instruction.
8243-
if (OrigLoop->isLoopExiting(Src))
8251+
// adding uses of an otherwise potentially dead instruction unless we are
8252+
// vectorizing a loop with uncountable exits. In that case, we always
8253+
// materialize the mask.
8254+
if (OrigLoop->isLoopExiting(Src) &&
8255+
Src != Legal->getUncountableEarlyExitingBlock())
82448256
return EdgeMaskCache[Edge] = SrcMask;
82458257

82468258
VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8931,50 +8943,58 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
89318943
static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
89328944
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
89338945
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8946+
auto *MiddleVPBB = Plan.getMiddleBlock();
89348947
SetVector<VPIRInstruction *> ExitUsersToFix;
89358948
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
8936-
BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
8937-
BasicBlock *ExitingBB = find_singleton<BasicBlock>(
8938-
to_vector(predecessors(ExitBB)),
8939-
[OrigLoop](BasicBlock *Pred, bool AllowRepeats) {
8940-
return OrigLoop->contains(Pred) ? Pred : nullptr;
8941-
});
89428949
for (VPRecipeBase &R : *ExitVPBB) {
89438950
auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
89448951
if (!ExitIRI)
89458952
continue;
89468953
auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
89478954
if (!ExitPhi)
89488955
break;
8949-
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
8950-
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
8951-
// Exit values for inductions are computed and updated outside of VPlan
8952-
// and independent of induction recipes.
8953-
// TODO: Compute induction exit values in VPlan.
8954-
if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8955-
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8956-
isa<VPWidenPointerInductionRecipe>(V) ||
8957-
(isa<Instruction>(IncomingValue) &&
8958-
OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
8959-
any_of(IncomingValue->users(), [&Inductions](User *U) {
8960-
auto *P = dyn_cast<PHINode>(U);
8961-
return P && Inductions.contains(P);
8962-
})))
8963-
continue;
8964-
ExitUsersToFix.insert(ExitIRI);
8965-
ExitIRI->addOperand(V);
8956+
for (VPBlockBase *PredVPBB : ExitVPBB->getPredecessors()) {
8957+
BasicBlock *ExitingBB = OrigLoop->getLoopLatch();
8958+
if (PredVPBB != MiddleVPBB) {
8959+
SmallVector<BasicBlock *> ExitingBlocks;
8960+
OrigLoop->getExitingBlocks(ExitingBlocks);
8961+
assert(ExitingBlocks.size() == 2 && "only support 2 exiting blocks");
8962+
ExitingBB = ExitingBB == ExitingBlocks[0] ? ExitingBlocks[1]
8963+
: ExitingBlocks[0];
8964+
}
8965+
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
8966+
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
8967+
// Exit values for inductions are computed and updated outside of VPlan
8968+
// and independent of induction recipes.
8969+
// TODO: Compute induction exit values in VPlan.
8970+
if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8971+
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8972+
isa<VPWidenPointerInductionRecipe>(V) ||
8973+
(isa<Instruction>(IncomingValue) &&
8974+
OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
8975+
any_of(IncomingValue->users(), [&Inductions](User *U) {
8976+
auto *P = dyn_cast<PHINode>(U);
8977+
return P && Inductions.contains(P);
8978+
}))) {
8979+
if (ExitVPBB->getSinglePredecessor() == MiddleVPBB)
8980+
continue;
8981+
}
8982+
ExitUsersToFix.insert(ExitIRI);
8983+
ExitIRI->addOperand(V);
8984+
}
89668985
}
89678986
}
89688987
return ExitUsersToFix;
89698988
}
89708989

89718990
// Add exit values to \p Plan. Extracts are added for each entry in \p
8972-
// ExitUsersToFix if needed and their operands are updated.
8973-
static void
8991+
// ExitUsersToFix if needed and their operands are updated. Returns true if all
8992+
// exit users can be handled, otherwise returns false.
8993+
static bool
89748994
addUsersInExitBlocks(VPlan &Plan,
89758995
const SetVector<VPIRInstruction *> &ExitUsersToFix) {
89768996
if (ExitUsersToFix.empty())
8977-
return;
8997+
return true;
89788998

89798999
auto *MiddleVPBB = Plan.getMiddleBlock();
89809000
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
@@ -8988,14 +9008,18 @@ addUsersInExitBlocks(VPlan &Plan,
89889008
if (V->isLiveIn())
89899009
continue;
89909010

8991-
assert(ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
8992-
"Exit value not handled yet for this edge.");
9011+
// Currently only live-ins can be used by exit values from blocks not
9012+
// exiting via the vector latch through to the middle block.
9013+
if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
9014+
return false;
9015+
89939016
LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
89949017
VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
89959018
{V, Plan.getOrAddLiveIn(ConstantInt::get(
89969019
IntegerType::get(Ctx, 32), 1))});
89979020
ExitIRI->setOperand(0, Ext);
89989021
}
9022+
return true;
89999023
}
90009024

90019025
/// Handle users in the exit block for first order reductions in the original
@@ -9268,11 +9292,23 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
92689292
"VPBasicBlock");
92699293
RecipeBuilder.fixHeaderPhis();
92709294

9295+
if (auto *UncountableExitingBlock =
9296+
Legal->getUncountableEarlyExitingBlock()) {
9297+
VPlanTransforms::handleUncountableEarlyExit(
9298+
*Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
9299+
}
92719300
addScalarResumePhis(RecipeBuilder, *Plan);
92729301
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
92739302
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
92749303
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9275-
addUsersInExitBlocks(*Plan, ExitUsersToFix);
9304+
if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
9305+
reportVectorizationFailure(
9306+
"Some exit values in loop with uncountable exit not supported yet",
9307+
"Some exit values in loop with uncountable exit not supported yet",
9308+
"UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9309+
return nullptr;
9310+
}
9311+
92769312
// ---------------------------------------------------------------------------
92779313
// Transform initial VPlan: Apply previously taken decisions, in order, to
92789314
// bring the VPlan to its final state.
@@ -10138,12 +10174,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1013810174
return false;
1013910175
}
1014010176

10141-
if (LVL.hasUncountableEarlyExit()) {
10177+
if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
1014210178
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10143-
"early exit is not yet supported",
10179+
"early exit is not enabled",
1014410180
"Auto-vectorization of loops with uncountable "
10145-
"early exit is not yet supported",
10146-
"UncountableEarlyExitLoopsUnsupported", ORE, L);
10181+
"early exit is not enabled",
10182+
"UncountableEarlyExitLoopsDisabled", ORE, L);
1014710183
return false;
1014810184
}
1014910185

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -861,14 +861,10 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
861861
auto Plan = std::make_unique<VPlan>(Entry, VecPreheader, ScalarHeader);
862862

863863
// Create SCEV and VPValue for the trip count.
864-
865-
// Currently only loops with countable exits are vectorized, but calling
866-
// getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
867-
// uncountable exits whilst also ensuring the symbolic maximum and known
868-
// back-edge taken count remain identical for loops with countable exits.
864+
// We use the symbolic max backedge-taken-count, which also works when
865+
// vectorizing loops with uncountable early exits.
869866
const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
870-
assert((!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
871-
BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) &&
867+
assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
872868
"Invalid loop count");
873869
ScalarEvolution &SE = *PSE.getSE();
874870
const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
@@ -903,7 +899,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
903899
// 2) If we require a scalar epilogue, there is no conditional branch as
904900
// we unconditionally branch to the scalar preheader. Do nothing.
905901
// 3) Otherwise, construct a runtime check.
906-
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
902+
BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
907903
auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
908904
// The connection order corresponds to the operands of the conditional branch.
909905
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -621,6 +621,14 @@ class VPBlockBase {
621621
/// Remove all the successors of this block.
622622
void clearSuccessors() { Successors.clear(); }
623623

624+
/// Swap successors of the block. The block must have exactly 2 successors.
625+
// TODO: This should be part of introducing conditional branch recipes rather
626+
// than being independent.
627+
void swapSuccessors() {
628+
assert(Successors.size() == 2 && "must have 2 successors to swap");
629+
std::swap(Successors[0], Successors[1]);
630+
}
631+
624632
/// The method which generates the output IR that corresponds to this
625633
/// VPBlockBase, thereby "executing" the VPlan.
626634
virtual void execute(VPTransformState *State) = 0;
@@ -1232,6 +1240,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
12321240
// operand). Only generates scalar values (either for the first lane only or
12331241
// for all lanes, depending on its uses).
12341242
PtrAdd,
1243+
// Returns a scalar boolean value, which is true if any lane of its single
1244+
// operand is true.
1245+
AnyOf,
12351246
};
12361247

12371248
private:
@@ -3884,10 +3895,10 @@ class VPlan {
38843895
/// whether to execute the scalar tail loop or the exit block from the loop
38853896
/// latch.
38863897
const VPBasicBlock *getMiddleBlock() const {
3887-
return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
3898+
return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
38883899
}
38893900
VPBasicBlock *getMiddleBlock() {
3890-
return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
3901+
return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
38913902
}
38923903

38933904
/// Return the VPBasicBlock for the preheader of the scalar loop.

0 commit comments

Comments
 (0)