Skip to content

Commit b1a40e4

Browse files
committed
[VPlan] Dispatch to multiple exit blocks via middle blocks. llvm#112138
1 parent 3097c60 commit b1a40e4

File tree

10 files changed

+417
-97
lines changed

10 files changed

+417
-97
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,6 +1375,10 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
13751375
}
13761376

13771377
bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
1378+
// When vectorizing early exits, create predicates for all blocks, except the
1379+
// header.
1380+
if (hasUncountableEarlyExit() && BB != TheLoop->getHeader())
1381+
return true;
13781382
return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
13791383
}
13801384

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 106 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,11 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
385385
cl::Hidden,
386386
cl::desc("Try wider VFs if they enable the use of vector variants"));
387387

388+
static cl::opt<bool> EnableEarlyExitVectorization(
389+
"enable-early-exit-vectorization", cl::init(false), cl::Hidden,
390+
cl::desc(
391+
"Enable vectorization of early exit loops with uncountable exits."));
392+
388393
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
389394
// variables not overflowing do not hold. See `emitSCEVChecks`.
390395
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
@@ -1350,9 +1355,10 @@ class LoopVectorizationCostModel {
13501355
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
13511356
return false;
13521357
}
1353-
// If we might exit from anywhere but the latch, must run the exiting
1354-
// iteration in scalar form.
1355-
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1358+
// If we might exit from anywhere but the latch and early exit vectorization
1359+
// is disabled, we must run the exiting iteration in scalar form.
1360+
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1361+
!(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
13561362
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
13571363
"from latch block\n");
13581364
return true;
@@ -2568,9 +2574,9 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
25682574
void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
25692575
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
25702576
assert(LoopVectorPreHeader && "Invalid loop structure");
2571-
assert((OrigLoop->getUniqueExitBlock() ||
2577+
assert((OrigLoop->getUniqueLatchExitBlock() ||
25722578
Cost->requiresScalarEpilogue(VF.isVector())) &&
2573-
"multiple exit loop without required epilogue?");
2579+
"loops not exiting via the latch without required epilogue?");
25742580

25752581
LoopMiddleBlock =
25762582
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
@@ -2753,8 +2759,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
27532759
// value (the value that feeds into the phi from the loop latch).
27542760
// We allow both, but they, obviously, have different values.
27552761

2756-
assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
2757-
27582762
DenseMap<Value *, Value *> MissingVals;
27592763

27602764
Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
@@ -2808,6 +2812,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
28082812
}
28092813
}
28102814

2815+
assert((MissingVals.empty() || OrigLoop->getUniqueExitBlock()) &&
2816+
"Expected a single exit block for escaping values");
28112817
for (auto &I : MissingVals) {
28122818
PHINode *PHI = cast<PHINode>(I.first);
28132819
// One corner case we have to handle is two IVs "chasing" each-other,
@@ -3591,7 +3597,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
35913597
TheLoop->getExitingBlocks(Exiting);
35923598
for (BasicBlock *E : Exiting) {
35933599
auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3594-
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
3600+
if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse() &&
3601+
(TheLoop->getLoopLatch() == E || !Legal->hasUncountableEarlyExit()))
35953602
AddToWorklistIfAllowed(Cmp);
35963603
}
35973604

@@ -7775,6 +7782,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
77757782
LoopVectorizeHints Hints(L, true, *ORE);
77767783
Hints.setAlreadyVectorized();
77777784
}
7785+
77787786
TargetTransformInfo::UnrollingPreferences UP;
77797787
TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
77807788
if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
@@ -7787,15 +7795,17 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
77877795
ILV.printDebugTracesAtEnd();
77887796

77897797
// 4. Adjust branch weight of the branch in the middle block.
7790-
auto *MiddleTerm =
7791-
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7792-
if (MiddleTerm->isConditional() &&
7793-
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7794-
// Assume that `Count % VectorTripCount` is equally distributed.
7795-
unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7796-
assert(TripCount > 0 && "trip count should not be zero");
7797-
const uint32_t Weights[] = {1, TripCount - 1};
7798-
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7798+
if (ExitVPBB) {
7799+
auto *MiddleTerm =
7800+
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7801+
if (MiddleTerm->isConditional() &&
7802+
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7803+
// Assume that `Count % VectorTripCount` is equally distributed.
7804+
unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7805+
assert(TripCount > 0 && "trip count should not be zero");
7806+
const uint32_t Weights[] = {1, TripCount - 1};
7807+
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7808+
}
77997809
}
78007810

78017811
return State.ExpandedSCEVs;
@@ -8180,7 +8190,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
81808190
// If source is an exiting block, we know the exit edge is dynamically dead
81818191
// in the vector loop, and thus we don't need to restrict the mask. Avoid
81828192
// adding uses of an otherwise potentially dead instruction.
8183-
if (OrigLoop->isLoopExiting(Src))
8193+
if (!Legal->hasUncountableEarlyExit() && OrigLoop->isLoopExiting(Src))
81848194
return EdgeMaskCache[Edge] = SrcMask;
81858195

81868196
VPValue *EdgeMask = getVPValueOrAddLiveIn(BI->getCondition());
@@ -8863,76 +8873,78 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
88638873
}
88648874
}
88658875

8866-
// Collect VPIRInstructions for phis in the original exit block that are modeled
8876+
// Collect VPIRInstructions for phis in the exit blocks that are modeled
88678877
// in VPlan and add the exiting VPValue as operand. Some exiting values are not
88688878
// modeled explicitly yet and won't be included. Those are un-truncated
88698879
// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
88708880
// increments.
8871-
static SetVector<VPIRInstruction *> collectUsersInExitBlock(
8881+
static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
88728882
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
88738883
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8874-
auto *MiddleVPBB = Plan.getMiddleBlock();
8875-
// No edge from the middle block to the unique exit block has been inserted
8876-
// and there is nothing to fix from vector loop; phis should have incoming
8877-
// from scalar loop only.
8878-
if (MiddleVPBB->getNumSuccessors() != 2)
8879-
return {};
88808884
SetVector<VPIRInstruction *> ExitUsersToFix;
8881-
VPBasicBlock *ExitVPBB = cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]);
8882-
BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8883-
for (VPRecipeBase &R : *ExitVPBB) {
8884-
auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
8885-
if (!ExitIRI)
8886-
continue;
8887-
auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
8888-
if (!ExitPhi)
8889-
break;
8890-
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
8891-
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
8892-
// Exit values for inductions are computed and updated outside of VPlan and
8893-
// independent of induction recipes.
8894-
// TODO: Compute induction exit values in VPlan.
8895-
if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8896-
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8897-
isa<VPWidenPointerInductionRecipe>(V) ||
8898-
(isa<Instruction>(IncomingValue) &&
8899-
OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
8900-
any_of(IncomingValue->users(), [&Inductions](User *U) {
8901-
auto *P = dyn_cast<PHINode>(U);
8902-
return P && Inductions.contains(P);
8903-
})))
8904-
continue;
8905-
ExitUsersToFix.insert(ExitIRI);
8906-
ExitIRI->addOperand(V);
8885+
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
8886+
BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
8887+
for (VPRecipeBase &R : *ExitVPBB) {
8888+
auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
8889+
if (!ExitIRI)
8890+
continue;
8891+
auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
8892+
if (!ExitPhi)
8893+
break;
8894+
for (BasicBlock *ExitingBB : predecessors(ExitBB)) {
8895+
if (!OrigLoop->contains(ExitingBB))
8896+
continue;
8897+
Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
8898+
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
8899+
// Exit values for inductions are computed and updated outside of VPlan
8900+
// and independent of induction recipes.
8901+
// TODO: Compute induction exit values in VPlan.
8902+
if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
8903+
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
8904+
isa<VPWidenPointerInductionRecipe>(V) ||
8905+
(isa<Instruction>(IncomingValue) &&
8906+
OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
8907+
any_of(IncomingValue->users(), [&Inductions](User *U) {
8908+
auto *P = dyn_cast<PHINode>(U);
8909+
return P && Inductions.contains(P);
8910+
})))
8911+
continue;
8912+
ExitUsersToFix.insert(ExitIRI);
8913+
ExitIRI->addOperand(V);
8914+
}
8915+
}
89078916
}
89088917
return ExitUsersToFix;
89098918
}
89108919

89118920
// Add exit values to \p Plan. Extracts are added for each entry in \p
89128921
// ExitUsersToFix if needed and their operands are updated.
89138922
static void
8914-
addUsersInExitBlock(VPlan &Plan,
8915-
const SetVector<VPIRInstruction *> &ExitUsersToFix) {
8923+
addUsersInExitBlocks(VPlan &Plan,
8924+
const SetVector<VPIRInstruction *> &ExitUsersToFix) {
89168925
if (ExitUsersToFix.empty())
89178926
return;
89188927

8919-
auto *MiddleVPBB = Plan.getMiddleBlock();
8920-
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8921-
89228928
// Introduce extract for exiting values and update the VPIRInstructions
89238929
// modeling the corresponding LCSSA phis.
89248930
for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
8931+
89258932
VPValue *V = ExitIRI->getOperand(0);
89268933
// Pass live-in values used by exit phis directly through to their users in
89278934
// the exit block.
89288935
if (V->isLiveIn())
89298936
continue;
89308937

8931-
LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
8932-
VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
8933-
{V, Plan.getOrAddLiveIn(ConstantInt::get(
8934-
IntegerType::get(Ctx, 32), 1))});
8935-
ExitIRI->setOperand(0, Ext);
8938+
for (VPBlockBase *PredVPB : ExitIRI->getParent()->getPredecessors()) {
8939+
auto *PredVPBB = cast<VPBasicBlock>(PredVPB);
8940+
VPBuilder B(PredVPBB, PredVPBB->getFirstNonPhi());
8941+
8942+
LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
8943+
VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
8944+
{V, Plan.getOrAddLiveIn(ConstantInt::get(
8945+
IntegerType::get(Ctx, 32), 1))});
8946+
ExitIRI->setOperand(0, Ext);
8947+
}
89368948
}
89378949
}
89388950

@@ -9204,11 +9216,32 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
92049216
"VPBasicBlock");
92059217
RecipeBuilder.fixHeaderPhis();
92069218

9219+
if (Legal->hasUncountableEarlyExit()) {
9220+
VPlanTransforms::handleUncountableEarlyExit(
9221+
*Plan, *PSE.getSE(), OrigLoop, Legal->getUncountableExitingBlocks(),
9222+
RecipeBuilder);
9223+
}
92079224
addScalarResumePhis(RecipeBuilder, *Plan);
9208-
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
9225+
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
92099226
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
92109227
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9211-
addUsersInExitBlock(*Plan, ExitUsersToFix);
9228+
addUsersInExitBlocks(*Plan, ExitUsersToFix);
9229+
9230+
// Currently only live-ins can be used by exit values. We also bail out if any
9231+
// exit value isn't handled in VPlan yet, i.e. a VPIRInstruction in an exit block
9232+
// without any operands.
9233+
if (Legal->hasUncountableEarlyExit()) {
9234+
if (any_of(Plan->getExitBlocks(), [](VPIRBasicBlock *ExitBB) {
9235+
return any_of(*ExitBB, [](VPRecipeBase &R) {
9236+
auto VPIRI = cast<VPIRInstruction>(&R);
9237+
return VPIRI->getNumOperands() == 0 ||
9238+
any_of(VPIRI->operands(),
9239+
[](VPValue *Op) { return !Op->isLiveIn(); });
9240+
});
9241+
}))
9242+
return nullptr;
9243+
}
9244+
92129245
// ---------------------------------------------------------------------------
92139246
// Transform initial VPlan: Apply previously taken decisions, in order, to
92149247
// bring the VPlan to its final state.
@@ -9968,12 +10001,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
996810001
}
996910002

997010003
if (LVL.hasUncountableEarlyExit()) {
9971-
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
9972-
"early exit is not yet supported",
9973-
"Auto-vectorization of loops with uncountable "
9974-
"early exit is not yet supported",
9975-
"UncountableEarlyExitLoopsUnsupported", ORE, L);
9976-
return false;
10004+
if (!EnableEarlyExitVectorization) {
10005+
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
10006+
"early exit is not yet supported",
10007+
"Auto-vectorization of loops with uncountable "
10008+
"early exit is not yet supported",
10009+
"UncountableEarlyExitLoopsUnsupported", ORE,
10010+
L);
10011+
return false;
10012+
}
997710013
}
997810014

997910015
// Entrance to the VPlan-native vectorization path. Outer loops are processed

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -870,15 +870,9 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
870870
auto Plan = std::make_unique<VPlan>(Entry, VecPreheader, ScalarHeader);
871871

872872
// Create SCEV and VPValue for the trip count.
873-
874-
// Currently only loops with countable exits are vectorized, but calling
875-
// getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
876-
// uncountable exits whilst also ensuring the symbolic maximum and known
877-
// back-edge taken count remain identical for loops with countable exits.
873+
// We use the symbolic max backedge-taken-count, which is used when
874+
// vectorizing loops with uncountable early exits.
878875
const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
879-
assert((!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
880-
BackedgeTakenCountSCEV == PSE.getBackedgeTakenCount()) &&
881-
"Invalid loop count");
882876
ScalarEvolution &SE = *PSE.getSE();
883877
const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
884878
InductionTy, TheLoop);
@@ -913,8 +907,18 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
913907
// we unconditionally branch to the scalar preheader. Do nothing.
914908
// 3) Otherwise, construct a runtime check.
915909
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
910+
if (!IRExitBlock) {
911+
// If there's no unique exit block (i.e. vectorizing with an uncountable
912+
// early exit), use the block exiting from the latch. The other uncountable
913+
// exit blocks will be added later.
914+
auto *Term = cast<BranchInst>(TheLoop->getLoopLatch()->getTerminator());
915+
IRExitBlock = TheLoop->contains(Term->getSuccessor(0))
916+
? Term->getSuccessor(1)
917+
: Term->getSuccessor(0);
918+
}
916919
auto *VPExitBlock = VPIRBasicBlock::fromBasicBlock(IRExitBlock);
917-
// The connection order corresponds to the operands of the conditional branch.
920+
// The connection order corresponds to the operands of the conditional
921+
// branch.
918922
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
919923
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
920924

@@ -1039,7 +1043,10 @@ void VPlan::execute(VPTransformState *State) {
10391043
{{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
10401044

10411045
// Generate code in the loop pre-header and body.
1042-
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
1046+
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
1047+
Entry);
1048+
1049+
for (VPBlockBase *Block : RPOT)
10431050
Block->execute(State);
10441051

10451052
VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
@@ -1071,7 +1078,10 @@ void VPlan::execute(VPTransformState *State) {
10711078
// Move the last step to the end of the latch block. This ensures
10721079
// consistent placement of all induction updates.
10731080
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
1074-
Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
1081+
if (VectorLatchBB->getTerminator() == &*VectorLatchBB->getFirstNonPHI())
1082+
Inc->moveBefore(VectorLatchBB->getTerminator());
1083+
else
1084+
Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
10751085

10761086
// Use the steps for the last part as backedge value for the induction.
10771087
if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1230,6 +1230,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
12301230
// operand). Only generates scalar values (either for the first lane only or
12311231
// for all lanes, depending on its uses).
12321232
PtrAdd,
1233+
AnyOf,
12331234
};
12341235

12351236
private:
@@ -3831,12 +3832,16 @@ class VPlan {
38313832
/// whether to execute the scalar tail loop or the exit block from the loop
38323833
/// latch.
38333834
const VPBasicBlock *getMiddleBlock() const {
3834-
return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
3835+
return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
38353836
}
38363837
VPBasicBlock *getMiddleBlock() {
3837-
return cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
3838+
return cast<VPBasicBlock>(getScalarPreheader()->getSinglePredecessor());
38383839
}
38393840

3841+
/// Return the exit blocks of the VPlan, that is leaf nodes except the scalar
3842+
/// header.
3843+
auto getExitBlocks();
3844+
38403845
/// The trip count of the original loop.
38413846
VPValue *getTripCount() const {
38423847
assert(TripCount && "trip count needs to be set before accessing it");

0 commit comments

Comments
 (0)