Skip to content

Commit 7d354a6

Browse files
committed
[LoopVectorize] Enable vectorisation of early exit loops with live-outs
This work feeds part of PR llvm#88385, and adds support for vectorising loops with uncountable early exits and outside users of loop-defined variables. When calculating the final value from an uncountable early exit we need to calculate the vector lane that triggered the exit, and hence determine the value at the point we exited. All code for calculating the last value when exiting the loop early now lives in a new vector.early.exit block, which sits between the middle.split block and the original exit block. Doing this required two fixes: 1. The vplan verifier incorrectly assumed that the block containing a definition always dominates the block of the user. That's not true if you can arrive at the use block from multiple incoming blocks. This is possible for early exit loops where both the early exit and the latch jump to the same block. 2. The new vector.early.exit block has to be registered with the loop (if any) that contains the original uncountable early exit block, which is not necessarily the parent loop of the vectorised loop. I've added a new ExtractFirstActive VPInstruction that extracts the first active lane of a vector, i.e. the lane of the vector predicate that triggered the exit. NOTE: The IR generated for dealing with live-outs from early exit loops is unoptimised, as opposed to normal loops. This inevitably leads to poor quality code, but this can be fixed up later.
1 parent 0d7c8c0 commit 7d354a6

File tree

15 files changed

+1041
-167
lines changed

15 files changed

+1041
-167
lines changed

llvm/docs/Vectorizers.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -405,9 +405,9 @@ Early Exit Vectorization
405405
When vectorizing a loop with a single early exit, the loop blocks following the
406406
early exit are predicated and the vector loop will always exit via the latch.
407407
If the early exit has been taken, the vector loop's successor block
408-
(``middle.split`` below) branches to the early exit block. Otherwise
409-
``middle.block`` selects between the exit block from the latch or the scalar
410-
remainder loop.
408+
(``middle.split`` below) branches to the early exit block via an intermediate
409+
block (``vector.early.exit`` below). Otherwise ``middle.block`` selects between
410+
the exit block from the latch or the scalar remainder loop.
411411

412412
.. image:: vplan-early-exit.png
413413

llvm/docs/vplan-early-exit.dot

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,23 +19,27 @@ compound=true
1919
"middle.split"
2020
]
2121
N4 -> N5 [ label=""]
22-
N4 -> N6 [ label=""]
22+
N4 -> N7 [ label=""]
2323
N5 [label =
24-
"early.exit"
24+
"vector.early.exit"
2525
]
26+
N5 -> N6 [ label=""]
2627
N6 [label =
27-
"middle.block"
28+
"early.exit"
2829
]
29-
N6 -> N9 [ label=""]
30-
N6 -> N7 [ label=""]
3130
N7 [label =
32-
"scalar.ph"
31+
"middle.block"
3332
]
33+
N7 -> N10 [ label=""]
3434
N7 -> N8 [ label=""]
3535
N8 [label =
36-
"loop.header"
36+
"scalar.ph"
3737
]
38+
N8 -> N9 [ label=""]
3839
N9 [label =
40+
"loop.header"
41+
]
42+
N10 [label =
3943
"latch.exit"
4044
]
4145
}

llvm/docs/vplan-early-exit.png

-83.3 KB
Loading

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,11 @@ class LoopVectorizationLegality {
407407

408408
/// Returns the destination of the uncountable early exiting block, or nullptr
/// if the loop has no uncountable early exit.
409409
BasicBlock *getUncountableEarlyExitBlock() const {
410+
if (!HasUncountableEarlyExit) {
411+
assert(getUncountableExitBlocks().empty() &&
412+
"Expected no uncountable exiting blocks");
413+
return nullptr;
414+
}
410415
assert(getUncountableExitBlocks().size() == 1 &&
411416
"Expected only a single uncountable exit block");
412417
return getUncountableExitBlocks()[0];

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 64 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2777,6 +2777,23 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
27772777
return LoopVectorPreHeader;
27782778
}
27792779

2780+
static bool isValueIncomingFromBlock(BasicBlock *ExitingBB, Value *V,
2781+
Instruction *UI) {
2782+
PHINode *PHI = dyn_cast<PHINode>(UI);
2783+
assert(PHI && "Expected LCSSA form");
2784+
2785+
// If this loop has an uncountable early exit then there could be
2786+
// different users of OrigPhi with either:
2787+
// 1. Multiple users, because each exiting block (countable or
2788+
// uncountable) jumps to the same exit block, or ..
2789+
// 2. A single user with an incoming value from a countable or
2790+
// uncountable exiting block.
2791+
// In both cases there is no guarantee this came from a countable exiting
2792+
// block, i.e. the latch.
2793+
int Index = PHI->getBasicBlockIndex(ExitingBB);
2794+
return Index != -1 && PHI->getIncomingValue(Index) == V;
2795+
}
2796+
27802797
// Fix up external users of the induction variable. At this point, we are
27812798
// in LCSSA form, with all external PHIs that use the IV having one input value,
27822799
// coming from the remainder loop. We need those PHIs to also have a correct
@@ -2799,12 +2816,13 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
27992816

28002817
// An external user of the last iteration's value should see the value that
28012818
// the remainder loop uses to initialize its own IV.
2802-
Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
2819+
BasicBlock *OrigLoopLatch = OrigLoop->getLoopLatch();
2820+
Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoopLatch);
28032821
for (User *U : PostInc->users()) {
28042822
Instruction *UI = cast<Instruction>(U);
28052823
if (!OrigLoop->contains(UI)) {
2806-
assert(isa<PHINode>(UI) && "Expected LCSSA form");
2807-
MissingVals[UI] = EndValue;
2824+
if (isValueIncomingFromBlock(OrigLoopLatch, PostInc, UI))
2825+
MissingVals[cast<PHINode>(UI)] = EndValue;
28082826
}
28092827
}
28102828

@@ -2814,7 +2832,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
28142832
for (User *U : OrigPhi->users()) {
28152833
auto *UI = cast<Instruction>(U);
28162834
if (!OrigLoop->contains(UI)) {
2817-
assert(isa<PHINode>(UI) && "Expected LCSSA form");
2835+
if (!isValueIncomingFromBlock(OrigLoopLatch, OrigPhi, UI))
2836+
continue;
28182837
IRBuilder<> B(MiddleBlock->getTerminator());
28192838

28202839
// Fast-math-flags propagate from the original induction instruction.
@@ -2844,18 +2863,6 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
28442863
}
28452864
}
28462865

2847-
assert((MissingVals.empty() ||
2848-
all_of(MissingVals,
2849-
[MiddleBlock, this](const std::pair<Value *, Value *> &P) {
2850-
return all_of(
2851-
predecessors(cast<Instruction>(P.first)->getParent()),
2852-
[MiddleBlock, this](BasicBlock *Pred) {
2853-
return Pred == MiddleBlock ||
2854-
Pred == OrigLoop->getLoopLatch();
2855-
});
2856-
})) &&
2857-
"Expected escaping values from latch/middle.block only");
2858-
28592866
for (auto &I : MissingVals) {
28602867
PHINode *PHI = cast<PHINode>(I.first);
28612868
// One corner case we have to handle is two IVs "chasing" each-other,
@@ -7811,6 +7818,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
78117818
State.LVer->prepareNoAliasMetadata();
78127819
}
78137820

7821+
// Set the uncountable early exit block in the VPTransformState.
7822+
State.CFG.UncountableEarlyExitBB = ILV.Legal->getUncountableEarlyExitBlock();
7823+
78147824
ILV.printDebugTracesAtStart();
78157825

78167826
//===------------------------------------------------===//
@@ -9237,14 +9247,20 @@ collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
92379247
// Add exit values to \p Plan. Extracts are added for each entry in \p
92389248
// ExitUsersToFix if needed and their operands are updated. Values leaving via
92399249
// the middle block use ExtractFromEnd; values leaving via any other
// predecessor use ExtractFirstActive in the vector early exit block.
9240-
static bool
9250+
static void
92419251
addUsersInExitBlocks(VPlan &Plan,
92429252
const SetVector<VPIRInstruction *> &ExitUsersToFix) {
92439253
if (ExitUsersToFix.empty())
9244-
return true;
9254+
return;
92459255

92469256
auto *MiddleVPBB = Plan.getMiddleBlock();
9247-
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9257+
VPBuilder MiddleB(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9258+
VPBuilder EarlyExitB;
9259+
VPBasicBlock *VectorEarlyExitVPBB = Plan.getEarlyExit();
9260+
VPValue *EarlyExitMask = nullptr;
9261+
if (VectorEarlyExitVPBB)
9262+
EarlyExitB.setInsertPoint(VectorEarlyExitVPBB,
9263+
VectorEarlyExitVPBB->getFirstNonPhi());
92489264

92499265
// Introduce extract for exiting values and update the VPIRInstructions
92509266
// modeling the corresponding LCSSA phis.
@@ -9255,19 +9271,38 @@ addUsersInExitBlocks(VPlan &Plan,
92559271
if (Op->isLiveIn())
92569272
continue;
92579273

9258-
// Currently only live-ins can be used by exit values from blocks not
9259-
// exiting via the vector latch through to the middle block.
9260-
if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
9261-
return false;
9262-
92639274
LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
9264-
VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
9265-
{Op, Plan.getOrAddLiveIn(ConstantInt::get(
9266-
IntegerType::get(Ctx, 32), 1))});
9275+
VPValue *Ext;
9276+
VPBasicBlock *PredVPBB =
9277+
cast<VPBasicBlock>(ExitIRI->getParent()->getPredecessors()[Idx]);
9278+
if (PredVPBB != MiddleVPBB) {
9279+
assert(ExitIRI->getParent()->getNumPredecessors() <= 2);
9280+
9281+
// Cache the early exit mask
9282+
if (!EarlyExitMask) {
9283+
VPBasicBlock *MiddleSplitVPBB =
9284+
cast<VPBasicBlock>(VectorEarlyExitVPBB->getSinglePredecessor());
9285+
VPInstruction *PredTerm =
9286+
cast<VPInstruction>(MiddleSplitVPBB->getTerminator());
9287+
assert(PredTerm->getOpcode() == VPInstruction::BranchOnCond &&
9288+
"Unexpected middle split block terminator");
9289+
VPInstruction *ScalarCond =
9290+
cast<VPInstruction>(PredTerm->getOperand(0));
9291+
assert(
9292+
ScalarCond->getOpcode() == VPInstruction::AnyOf &&
9293+
"Unexpected condition for middle split block terminator branch");
9294+
EarlyExitMask = ScalarCond->getOperand(0);
9295+
}
9296+
Ext = EarlyExitB.createNaryOp(VPInstruction::ExtractFirstActive,
9297+
{Op, EarlyExitMask});
9298+
} else {
9299+
Ext = MiddleB.createNaryOp(VPInstruction::ExtractFromEnd,
9300+
{Op, Plan.getOrAddLiveIn(ConstantInt::get(
9301+
IntegerType::get(Ctx, 32), 1))});
9302+
}
92679303
ExitIRI->setOperand(Idx, Ext);
92689304
}
92699305
}
9270-
return true;
92719306
}
92729307

92739308
/// Handle users in the exit block for first order reductions in the original
@@ -9570,12 +9605,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
95709605
SetVector<VPIRInstruction *> ExitUsersToFix =
95719606
collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
95729607
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9573-
if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
9574-
reportVectorizationFailure(
9575-
"Some exit values in loop with uncountable exit not supported yet",
9576-
"UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9577-
return nullptr;
9578-
}
9608+
addUsersInExitBlocks(*Plan, ExitUsersToFix);
95799609

95809610
// ---------------------------------------------------------------------------
95819611
// Transform initial VPlan: Apply previously taken decisions, in order, to

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,15 @@ void VPBasicBlock::execute(VPTransformState *State) {
501501
UnreachableInst *Terminator = State->Builder.CreateUnreachable();
502502
// Register NewBB in its loop. In innermost loops it's the same for all
503503
// BB's.
504-
if (State->CurrentParentLoop)
504+
if (this == State->Plan->getEarlyExit()) {
505+
// If this is the vector early exit block then it has a single successor,
506+
// which is the uncountable early exit block of the original loop. The
507+
// parent loop for the exit block may not be the same as the parent loop
508+
// of the vectorised loop, so we have to treat this differently.
509+
Loop *EEL = State->LI->getLoopFor(State->CFG.UncountableEarlyExitBB);
510+
if (EEL)
511+
EEL->addBasicBlockToLoop(NewBB, *State->LI);
512+
} else if (State->CurrentParentLoop)
505513
State->CurrentParentLoop->addBasicBlockToLoop(NewBB, *State->LI);
506514
State->Builder.SetInsertPoint(Terminator);
507515

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,9 @@ struct VPTransformState {
347347
/// vector loop.
348348
BasicBlock *ExitBB = nullptr;
349349

350+
/// The uncountable early exit block in the original scalar loop.
351+
BasicBlock *UncountableEarlyExitBB = nullptr;
352+
350353
/// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
351354
/// of replication, maps the BasicBlock of the last replica created.
352355
SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
@@ -1226,6 +1229,9 @@ class VPInstruction : public VPRecipeWithIRFlags,
12261229
// Returns a scalar boolean value, which is true if any lane of its (only
12271230
// boolean) vector operand is true.
12281231
AnyOf,
1232+
// Extracts the first active lane of a vector, where the first operand is
1233+
// the vector to extract from, and the second operand is the predicate (mask).
1234+
ExtractFirstActive,
12291235
};
12301236

12311237
private:
@@ -3930,6 +3936,22 @@ class VPlan {
39303936
VPRegionBlock *getVectorLoopRegion();
39313937
const VPRegionBlock *getVectorLoopRegion() const;
39323938

3939+
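/// Returns the vector early exit block, or nullptr if the plan has no vector
/// loop region or the region's single successor is the middle block.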
3940+
VPBasicBlock *getEarlyExit() {
3941+
auto LoopRegion = getVectorLoopRegion();
3942+
if (!LoopRegion)
3943+
return nullptr;
3944+
3945+
auto *SuccessorVPBB = LoopRegion->getSingleSuccessor();
3946+
auto *MiddleVPBB = getMiddleBlock();
3947+
if (SuccessorVPBB == MiddleVPBB)
3948+
return nullptr;
3949+
3950+
assert(SuccessorVPBB->getSuccessors()[1] == MiddleVPBB &&
3951+
"Expected second successor to be the middle block");
3952+
return cast<VPBasicBlock>(SuccessorVPBB->getSuccessors()[0]);
3953+
}
3954+
39333955
/// Returns the 'middle' block of the plan, that is the block that selects
39343956
/// whether to execute the scalar tail loop or the exit block from the loop
39353957
/// latch.

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -696,14 +696,21 @@ Value *VPInstruction::generate(VPTransformState &State) {
696696
Value *A = State.get(getOperand(0));
697697
return Builder.CreateOrReduce(A);
698698
}
699-
699+
case VPInstruction::ExtractFirstActive: {
700+
Value *Vec = State.get(getOperand(0));
701+
Value *Mask = State.get(getOperand(1));
702+
Value *Ctz =
703+
Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask);
704+
return Builder.CreateExtractElement(Vec, Ctz);
705+
}
700706
default:
701707
llvm_unreachable("Unsupported opcode for instruction");
702708
}
703709
}
704710

705711
bool VPInstruction::isVectorToScalar() const {
706712
return getOpcode() == VPInstruction::ExtractFromEnd ||
713+
getOpcode() == VPInstruction::ExtractFirstActive ||
707714
getOpcode() == VPInstruction::ComputeReductionResult ||
708715
getOpcode() == VPInstruction::AnyOf;
709716
}
@@ -768,6 +775,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
768775
case VPInstruction::CalculateTripCountMinusVF:
769776
case VPInstruction::CanonicalIVIncrementForPart:
770777
case VPInstruction::ExtractFromEnd:
778+
case VPInstruction::ExtractFirstActive:
771779
case VPInstruction::FirstOrderRecurrenceSplice:
772780
case VPInstruction::LogicalAnd:
773781
case VPInstruction::Not:
@@ -887,6 +895,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
887895
case VPInstruction::AnyOf:
888896
O << "any-of";
889897
break;
898+
case VPInstruction::ExtractFirstActive:
899+
O << "extract-first-active";
900+
break;
890901
default:
891902
O << Instruction::getOpcodeName(getOpcode());
892903
}

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1973,10 +1973,13 @@ void VPlanTransforms::handleUncountableEarlyExit(
19731973
Builder.createNaryOp(VPInstruction::AnyOf, {EarlyExitTakenCond});
19741974

19751975
VPBasicBlock *NewMiddle = Plan.createVPBasicBlock("middle.split");
1976+
VPBasicBlock *EarlyExitVPBB = Plan.createVPBasicBlock("vector.early.exit");
19761977
VPBlockUtils::insertOnEdge(LoopRegion, MiddleVPBB, NewMiddle);
1977-
VPBlockUtils::connectBlocks(NewMiddle, VPEarlyExitBlock);
1978+
VPBlockUtils::connectBlocks(NewMiddle, EarlyExitVPBB);
19781979
NewMiddle->swapSuccessors();
19791980

1981+
VPBlockUtils::connectBlocks(EarlyExitVPBB, VPEarlyExitBlock);
1982+
19801983
VPBuilder MiddleBuilder(NewMiddle);
19811984
MiddleBuilder.createNaryOp(VPInstruction::BranchOnCond, {IsEarlyExitTaken});
19821985

llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,11 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) {
222222
continue;
223223
}
224224

225-
if (!VPDT.dominates(VPBB, UI->getParent())) {
225+
// Now that we support vectorising loops with uncountable early exits
226+
// we can end up in situations where VPBB does not dominate the exit
227+
// block. Only do the check if the user is not in a VPIRBasicBlock.
228+
if (!isa<VPIRBasicBlock>(UI->getParent()) &&
229+
!VPDT.dominates(VPBB, UI->getParent())) {
226230
errs() << "Use before def!\n";
227231
return false;
228232
}

0 commit comments

Comments
 (0)