Skip to content

Commit 02d078e

Browse files
committed
[LoopVectorize] Add support for vectorisation of more early exit loops
This patch follows on from PR llvm#107004 by adding support for vectorisation of a simple class of loops that typically involve searching for something, e.g. for (int i = 0; i < n; i++) { if (p[i] == val) return i; } return n; or for (int i = 0; i < n; i++) { if (p1[i] != p2[i]) return i; } return n; In this initial commit we will only consider early exit loops legal to vectorise if they follow these criteria: 1. There are no stores in the loop. 2. The loop must have only one early uncountable exit like those shown in the above examples. 3. The early exit block dominates the latch block. 4. The latch block must have an exact exit count. 6. The loop must not contain reductions or recurrences. 7. We must be able to prove at compile-time that loops will not contain faulting loads. For point 7, once this patch lands, I intend to follow up by supporting some limited cases of faulting loops where we can version the loop based on pointer alignment. For example, it turns out in the SPEC2017 benchmark (xalancbmk) there is a std::find loop that we can vectorise provided we add SCEV checks for the initial pointer being aligned to a multiple of the VF. In practice, the pointer is regularly aligned to at least 32/64 bytes and since the VF is a power of 2, any vector loads <= 32/64 bytes in size will always fault on the first lane, following the same behaviour as the scalar loop. Given we already do such speculative versioning for loops with unknown strides, alignment-based versioning doesn't seem to be any worse, at least for loops with only one load. This patch makes use of the existing experimental_cttz_elems intrinsic that's required in the vectorised early exit block to determine the first lane that triggered the exit. This intrinsic has generic lowering support so it's guaranteed to work for all targets. Tests have been updated here: Transforms/LoopVectorize/simple_early_exit.ll
1 parent f4eeae1 commit 02d078e

File tree

12 files changed

+2000
-372
lines changed

12 files changed

+2000
-372
lines changed

llvm/include/llvm/Support/GenericLoopInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,10 @@ template <class BlockT, class LoopT> class LoopBase {
294294
/// Otherwise return null.
295295
BlockT *getUniqueExitBlock() const;
296296

297+
/// Return the unique exit block for the latch, or null if the latch has no
298+
/// exit block or has multiple different exit blocks.
299+
BlockT *getUniqueLatchExitBlock() const;
300+
297301
/// Return true if this loop does not have any exit blocks.
298302
bool hasNoExitBlocks() const;
299303

llvm/include/llvm/Support/GenericLoopInfoImpl.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,16 @@ BlockT *LoopBase<BlockT, LoopT>::getUniqueExitBlock() const {
159159
return getExitBlockHelper(this, true).first;
160160
}
161161

162+
/// Return the single exit block collected for the loop latch, or nullptr when
/// the number of collected blocks is anything other than one (none, or several
/// distinct ones). Asserts that the loop has a latch.
template <class BlockT, class LoopT>
163+
BlockT *LoopBase<BlockT, LoopT>::getUniqueLatchExitBlock() const {
164+
const BlockT *Latch = getLoopLatch();
165+
assert(Latch && "Latch block must exists");
166+
SmallVector<BlockT *, 4> ExitBlocks;
// The predicate accepts only the latch; presumably the helper uses it to
// restrict collection to exits leaving from that block -- confirm against
// getUniqueExitBlocksHelper.
167+
getUniqueExitBlocksHelper(this, ExitBlocks,
168+
[Latch](const BlockT *BB) { return BB == Latch; });
// Exactly one collected block means the latch exit is unique.
169+
return ExitBlocks.size() == 1 ? ExitBlocks[0] : nullptr;
170+
}
171+
162172
/// getExitEdges - Return all pairs of (_inside_block_,_outside_block_).
163173
template <class BlockT, class LoopT>
164174
void LoopBase<BlockT, LoopT>::getExitEdges(

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,12 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
7979
"Scalable vectorization is available and favored when the "
8080
"cost is inconclusive.")));
8181

82+
// Hidden testing option, off by default. When enabled,
// isVectorizableEarlyExitLoop() no longer rejects a loop for which
// isDereferenceableReadOnlyLoop() fails; it only prints a debug message and
// carries on. The option's own description calls this potentially unsafe.
static cl::opt<bool> AssumeNoMemFault(
83+
"vectorizer-no-mem-fault", cl::init(false), cl::Hidden,
84+
cl::desc("Assume vectorized loops will not have memory faults, which is "
85+
"potentially unsafe but can be useful for testing vectorization "
86+
"of early exit loops."));
87+
8288
/// Maximum vectorization interleave count.
8389
static const unsigned MaxInterleaveFactor = 16;
8490

@@ -1579,11 +1585,15 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
15791585
Predicates.clear();
15801586
if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
15811587
&Predicates)) {
1582-
reportVectorizationFailure(
1583-
"Loop may fault",
1584-
"Cannot vectorize potentially faulting early exit loop",
1585-
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1586-
return false;
1588+
if (!AssumeNoMemFault) {
1589+
reportVectorizationFailure(
1590+
"Loop may fault",
1591+
"Cannot vectorize potentially faulting early exit loop",
1592+
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1593+
return false;
1594+
} else
1595+
LLVM_DEBUG(dbgs() << "LV: Assuming early exit vector loop will not "
1596+
<< "fault\n");
15871597
}
15881598

15891599
[[maybe_unused]] const SCEV *SymbolicMaxBTC =

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 377 additions & 48 deletions
Large diffs are not rendered by default.

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 66 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,14 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
421421

422422
// Hook up the new basic block to its predecessors.
423423
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
424-
VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
424+
auto *VPRB = dyn_cast<VPRegionBlock>(PredVPBlock);
425+
426+
// The exiting block that leads to this block might be an early exit from
427+
// a loop region.
428+
VPBasicBlock *PredVPBB = VPRB && VPRB->getEarlyExit() == this
429+
? cast<VPBasicBlock>(VPRB->getEarlyExiting())
430+
: PredVPBlock->getExitingBasicBlock();
431+
425432
auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
426433
BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
427434

@@ -443,6 +450,11 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
443450
// Set each forward successor here when it is created, excluding
444451
// backedges. A backward successor is set when the branch is created.
445452
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
453+
// getParent() can be null (isExiting() in this file checks for that), and
// dyn_cast_or_null then yields a null PredParentRegion.
VPRegionBlock *PredParentRegion =
454+
dyn_cast_or_null<VPRegionBlock>(PredVPBB->getParent());
455+
// Guard the dereference: only a block that lives inside a region can be that
// region's early-exiting block, in which case the successor index is flipped.
if (PredParentRegion && PredParentRegion->getEarlyExiting() == PredVPBB) {
456+
idx = 1 - idx;
457+
}
446458
assert(!TermBr->getSuccessor(idx) &&
447459
"Trying to reset an existing successor block.");
448460
TermBr->setSuccessor(idx, NewBB);
@@ -499,6 +511,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
499511
!((SingleHPred = getSingleHierarchicalPredecessor()) &&
500512
SingleHPred->getExitingBasicBlock() == PrevVPBB &&
501513
PrevVPBB->getSingleHierarchicalSuccessor() &&
514+
PrevVPBB != getEnclosingLoopRegion()->getEarlyExiting() &&
502515
(SingleHPred->getParent() == getEnclosingLoopRegion() &&
503516
!IsLoopRegion(SingleHPred))) && /* B */
504517
!(Replica && getPredecessors().empty())) { /* C */
@@ -517,7 +530,8 @@ void VPBasicBlock::execute(VPTransformState *State) {
517530
UnreachableInst *Terminator = State->Builder.CreateUnreachable();
518531
// Register NewBB in its loop. In innermost loops it's the same for all
519532
// BBs.
520-
if (State->CurrentVectorLoop)
533+
if (State->CurrentVectorLoop &&
534+
this != getEnclosingLoopRegion()->getEarlyExit())
521535
State->CurrentVectorLoop->addBasicBlockToLoop(NewBB, *State->LI);
522536
State->Builder.SetInsertPoint(Terminator);
523537
State->CFG.PrevBB = NewBB;
@@ -635,7 +649,11 @@ const VPRecipeBase *VPBasicBlock::getTerminator() const {
635649
}
636650

637651
/// Return true if this block is its parent region's exiting block or its
/// early-exiting block; a block without a parent region is never exiting.
bool VPBasicBlock::isExiting() const {
// Pre-commit form (removed): only the regular exiting block was recognised.
638-
return getParent() && getParent()->getExitingBasicBlock() == this;
// Post-commit form (added): the early-exiting block counts as exiting too.
652+
const VPRegionBlock *VPRB = getParent();
653+
if (!VPRB)
654+
return false;
655+
return VPRB->getExitingBasicBlock() == this ||
656+
VPRB->getEarlyExiting() == this;
639657
}
640658

641659
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -876,13 +894,15 @@ static VPIRBasicBlock *createVPIRBasicBlockFor(BasicBlock *BB) {
876894
VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
877895
PredicatedScalarEvolution &PSE,
878896
bool RequiresScalarEpilogueCheck,
879-
bool TailFolded, Loop *TheLoop) {
897+
bool TailFolded, Loop *TheLoop,
898+
BasicBlock *EarlyExitingBB,
899+
BasicBlock *EarlyExitBB) {
880900
VPIRBasicBlock *Entry = createVPIRBasicBlockFor(TheLoop->getLoopPreheader());
881901
VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
882902
auto Plan = std::make_unique<VPlan>(Entry, VecPreheader);
883903

884904
// Create SCEV and VPValue for the trip count.
885-
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
905+
const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
886906
assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
887907
ScalarEvolution &SE = *PSE.getSE();
888908
const SCEV *TripCount =
@@ -902,6 +922,13 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
902922
VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
903923
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
904924

925+
if (EarlyExitingBB) {
926+
VPBasicBlock *EarlyExitVPBB = new VPBasicBlock("vector.early.exit");
927+
TopRegion->setEarlyExit(EarlyExitVPBB);
928+
VPBlockUtils::connectBlocks(TopRegion, EarlyExitVPBB);
929+
TopRegion->setOrigEarlyExit(EarlyExitBB);
930+
}
931+
905932
VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
906933
if (!RequiresScalarEpilogueCheck) {
907934
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
@@ -916,7 +943,7 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
916943
// 2) If we require a scalar epilogue, there is no conditional branch as
917944
// we unconditionally branch to the scalar preheader. Do nothing.
918945
// 3) Otherwise, construct a runtime check.
919-
BasicBlock *IRExitBlock = TheLoop->getUniqueExitBlock();
946+
BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
920947
auto *VPExitBlock = createVPIRBasicBlockFor(IRExitBlock);
921948
// The connection order corresponds to the operands of the conditional branch.
922949
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
@@ -992,7 +1019,8 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
9921019
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
9931020
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
9941021
/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
995-
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
1022+
static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
1023+
BasicBlock *IRBB) {
9961024
VPIRBasicBlock *IRVPBB = createVPIRBasicBlockFor(IRBB);
9971025
for (auto &R : make_early_inc_range(*VPBB)) {
9981026
assert(!R.isPhi() && "Tried to move phi recipe to end of block");
@@ -1006,6 +1034,7 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
10061034
VPBlockUtils::disconnectBlocks(VPBB, Succ);
10071035
}
10081036
delete VPBB;
1037+
return IRVPBB;
10091038
}
10101039

10111040
/// Generate the code inside the preheader and body of the vectorized loop.
@@ -1029,7 +1058,7 @@ void VPlan::execute(VPTransformState *State) {
10291058
// VPlan execution rather than earlier during VPlan construction.
10301059
BasicBlock *MiddleBB = State->CFG.ExitBB;
10311060
VPBasicBlock *MiddleVPBB =
1032-
cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
1061+
cast<VPBasicBlock>(getVectorLoopRegion()->getSuccessors()[0]);
10331062
// Find the VPBB for the scalar preheader, relying on the current structure
10341063
// when creating the middle block and its successors: if there's a single
10351064
// predecessor, it must be the scalar preheader. Otherwise, the second
@@ -1043,7 +1072,14 @@ void VPlan::execute(VPTransformState *State) {
10431072
assert(!isa<VPIRBasicBlock>(ScalarPhVPBB) &&
10441073
"scalar preheader cannot be wrapped already");
10451074
replaceVPBBWithIRVPBB(ScalarPhVPBB, ScalarPh);
1046-
replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
1075+
MiddleVPBB = replaceVPBBWithIRVPBB(MiddleVPBB, MiddleBB);
1076+
1077+
// Ensure the middle block is still the first successor.
1078+
for (auto *Succ : getVectorLoopRegion()->getSuccessors())
1079+
if (Succ == MiddleVPBB) {
1080+
getVectorLoopRegion()->moveSuccessorToFront(MiddleVPBB);
1081+
break;
1082+
}
10471083

10481084
// Disconnect the middle block from its single successor (the scalar loop
10491085
// header) in both the CFG and DT. The branch will be recreated during VPlan
@@ -1104,6 +1140,20 @@ void VPlan::execute(VPTransformState *State) {
11041140
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
11051141
}
11061142

1143+
// Patch up early exiting vector block to jump to the original scalar loop's
1144+
// early exit block.
1145+
if (getVectorLoopRegion()->getEarlyExit()) {
1146+
VPBasicBlock *EarlyExitVPBB =
1147+
cast<VPBasicBlock>(getVectorLoopRegion()->getEarlyExit());
1148+
BasicBlock *VectorEarlyExitBB = State->CFG.VPBB2IRBB[EarlyExitVPBB];
1149+
BasicBlock *OrigEarlyExitBB = getVectorLoopRegion()->getOrigEarlyExit();
1150+
BranchInst *BI = BranchInst::Create(OrigEarlyExitBB);
1151+
BI->insertBefore(VectorEarlyExitBB->getTerminator());
1152+
VectorEarlyExitBB->getTerminator()->eraseFromParent();
1153+
State->CFG.DTU.applyUpdates(
1154+
{{DominatorTree::Insert, VectorEarlyExitBB, OrigEarlyExitBB}});
1155+
}
1156+
11071157
State->CFG.DTU.flush();
11081158
assert(State->CFG.DTU.getDomTree().verify(
11091159
DominatorTree::VerificationLevel::Fast) &&
@@ -1212,9 +1262,10 @@ LLVM_DUMP_METHOD
12121262
void VPlan::dump() const { print(dbgs()); }
12131263
#endif
12141264

1215-
void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
1216-
assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists");
1217-
LiveOuts.insert({PN, new VPLiveOut(PN, V)});
1265+
/// Record V as the live-out value feeding phi PN. Entries are keyed by the
/// (PN, IncomingBlock) pair, so one phi may carry a separate live-out per
/// incoming block; adding the same pair twice trips the assertion.
void VPlan::addLiveOut(PHINode *PN, VPValue *V, VPBasicBlock *IncomingBlock) {
1266+
auto Key = std::pair<PHINode *, VPBasicBlock *>(PN, IncomingBlock);
1267+
assert(LiveOuts.count(Key) == 0 && "an exit value for PN already exists");
// NOTE(review): the VPLiveOut is allocated with a raw new; presumably the
// plan releases LiveOuts entries elsewhere -- confirm.
1268+
LiveOuts.insert({Key, new VPLiveOut(PN, V)});
12181269
}
12191270

12201271
static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
@@ -1285,8 +1336,9 @@ VPlan *VPlan::duplicate() {
12851336
remapOperands(Entry, NewEntry, Old2NewVPValues);
12861337

12871338
// Clone live-outs.
1288-
for (const auto &[_, LO] : LiveOuts)
1289-
NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
1339+
for (const auto &[Key, LO] : LiveOuts)
1340+
NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)],
1341+
Key.second);
12901342

12911343
// Initialize remaining fields of cloned VPlan.
12921344
NewPlan->VFs = VFs;

0 commit comments

Comments
 (0)