-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[VPlan] Model branch cond to enter scalar epilogue in VPlan. #92651
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
53e8e56
3a4ecfc
10e747a
4dfba5d
fe03d51
2372421
ab78e2a
446b313
5c6e07e
0a34761
a5c1fd6
045f051
2e4bb36
189aa60
014c393
c983acf
b165cca
852d28d
b506722
3236f1d
a3c7747
8e7f8dd
0e74f48
bd99d7b
887a56d
8cab6d2
2ba349c
fd9975c
dee166d
d5cd98d
ab6c49a
7867bfa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2972,22 +2972,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { | |
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, | ||
nullptr, Twine(Prefix) + "scalar.ph"); | ||
|
||
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); | ||
|
||
// Set up the middle block terminator. Two cases: | ||
// 1) If we know that we must execute the scalar epilogue, emit an | ||
// unconditional branch. | ||
// 2) Otherwise, we must have a single unique exit block (due to how we | ||
// implement the multiple exit case). In this case, set up a conditional | ||
// branch from the middle block to the loop scalar preheader, and the | ||
// exit block. completeLoopSkeleton will update the condition to use an | ||
// iteration check, if required to decide whether to execute the remainder. | ||
BranchInst *BrInst = | ||
Cost->requiresScalarEpilogue(VF.isVector()) | ||
? BranchInst::Create(LoopScalarPreHeader) | ||
: BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, | ||
Builder.getTrue()); | ||
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc()); | ||
auto *BrInst = new UnreachableInst(LoopMiddleBlock->getContext()); | ||
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); | ||
|
||
// Update dominator for loop exit. During skeleton creation, only the vector | ||
|
@@ -3094,51 +3079,6 @@ void InnerLoopVectorizer::createInductionResumeValues( | |
} | ||
} | ||
|
||
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { | ||
// The trip counts should be cached by now. | ||
Value *Count = getTripCount(); | ||
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); | ||
|
||
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); | ||
|
||
// Add a check in the middle block to see if we have completed | ||
// all of the iterations in the first vector loop. Three cases: | ||
// 1) If we require a scalar epilogue, there is no conditional branch as | ||
// we unconditionally branch to the scalar preheader. Do nothing. | ||
// 2) If (N - N%VF) == N, then we *don't* need to run the remainder. | ||
// Thus if tail is to be folded, we know we don't need to run the | ||
// remainder and we can use the previous value for the condition (true). | ||
// 3) Otherwise, construct a runtime check. | ||
if (!Cost->requiresScalarEpilogue(VF.isVector()) && | ||
!Cost->foldTailByMasking()) { | ||
// Here we use the same DebugLoc as the scalar loop latch terminator instead | ||
// of the corresponding compare because they may have ended up with | ||
// different line numbers and we want to avoid awkward line stepping while | ||
// debugging. Eg. if the compare has got a line number inside the loop. | ||
// TODO: At the moment, CreateICmpEQ will simplify conditions with constant | ||
// operands. Perform simplification directly on VPlan once the branch is | ||
// modeled there. | ||
IRBuilder<> B(LoopMiddleBlock->getTerminator()); | ||
B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc()); | ||
Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n"); | ||
BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator()); | ||
BI.setCondition(CmpN); | ||
if (hasBranchWeightMD(*ScalarLatchTerm)) { | ||
// Assume that `Count % VectorTripCount` is equally distributed. | ||
unsigned TripCount = UF * VF.getKnownMinValue(); | ||
assert(TripCount > 0 && "trip count should not be zero"); | ||
const uint32_t Weights[] = {1, TripCount - 1}; | ||
setBranchWeights(BI, Weights); | ||
} | ||
} | ||
|
||
#ifdef EXPENSIVE_CHECKS | ||
assert(DT->verify(DominatorTree::VerificationLevel::Fast)); | ||
#endif | ||
|
||
return LoopVectorPreHeader; | ||
} | ||
|
||
std::pair<BasicBlock *, Value *> | ||
InnerLoopVectorizer::createVectorizedLoopSkeleton( | ||
const SCEV2ValueTy &ExpandedSCEVs) { | ||
|
@@ -3198,7 +3138,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton( | |
// Emit phis for the new starting index of the scalar loop. | ||
createInductionResumeValues(ExpandedSCEVs); | ||
|
||
return {completeLoopSkeleton(), nullptr}; | ||
return {LoopVectorPreHeader, nullptr}; | ||
} | ||
|
||
// Fix up external users of the induction variable. At this point, we are | ||
|
@@ -3470,6 +3410,18 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, | |
VF.getKnownMinValue() * UF); | ||
} | ||
|
||
// Helper to reorder blocks so they match the original order even after the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "FIXME: " can be added. Unclear how this reordering works, even if temporarily to appease phi operand order: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed the helper for now and just went with updating the tests. |
||
// order of the predecessors changes. This is only used to avoid a number of | ||
// test changes due to reordering of incoming blocks in phi nodes and should be | ||
// removed soon, with the tests being updated. | ||
static void reorderIncomingBlocks(SmallVectorImpl<BasicBlock *> &Blocks, | ||
BasicBlock *LoopMiddleBlock) { | ||
if (Blocks.front() == LoopMiddleBlock) | ||
std::swap(Blocks.front(), Blocks.back()); | ||
if (Blocks.size() == 3) | ||
std::swap(Blocks[0], Blocks[1]); | ||
} | ||
|
||
void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO, | ||
VPTransformState &State) { | ||
// Extract the last vector element in the middle block. This will be the | ||
|
@@ -3488,7 +3440,9 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(VPLiveOut *LO, | |
Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin()); | ||
auto *ScalarPreheaderPhi = | ||
Builder.CreatePHI(ScalarHeaderPhi->getType(), 2, "scalar.recur.init"); | ||
for (auto *BB : predecessors(LoopScalarPreHeader)) { | ||
SmallVector<BasicBlock *> Blocks(predecessors(LoopScalarPreHeader)); | ||
reorderIncomingBlocks(Blocks, LoopMiddleBlock); | ||
for (auto *BB : Blocks) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could the operands of ScalarPreheaderPhi be reordered, after being set, instead of reordering the predecessor blocks? That directly fixes the desired result, and seems slightly easier to clean up later. Same for BCBlockPhi below. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think reordering the operands would be a bit more work, probably better to just directly update the tests. |
||
auto *Incoming = BB == LoopMiddleBlock ? ResumeScalarFOR : InitScalarFOR; | ||
ScalarPreheaderPhi->addIncoming(Incoming, BB); | ||
} | ||
|
@@ -7388,7 +7342,9 @@ static void createAndCollectMergePhiForReduction( | |
// If we are fixing reductions in the epilogue loop then we should already | ||
// have created a bc.merge.rdx Phi after the main vector body. Ensure that | ||
// we carry over the incoming values correctly. | ||
for (auto *Incoming : predecessors(LoopScalarPreHeader)) { | ||
SmallVector<BasicBlock *> Blocks(predecessors(LoopScalarPreHeader)); | ||
reorderIncomingBlocks(Blocks, LoopMiddleBlock); | ||
for (auto *Incoming : Blocks) { | ||
if (Incoming == LoopMiddleBlock) | ||
BCBlockPhi->addIncoming(FinalValue, Incoming); | ||
else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) | ||
|
@@ -7459,6 +7415,21 @@ LoopVectorizationPlanner::executePlan( | |
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = | ||
ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs | ||
: State.ExpandedSCEVs); | ||
#ifdef EXPENSIVE_CHECKS | ||
assert(DT->verify(DominatorTree::VerificationLevel::Fast)); | ||
#endif | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better move the expensive assert to the end of createVectorizedLoopSkeleton()'s than here after calling it? This appeared at the end of completeLoopSkeleton(), called at the end of all three create[Epilogue]VectorizedLoopSkeleton()'s. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we would have to move it to all implementations of |
||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good to add some explanation somewhere what IR blocks are wrapped when building VPlan(s), what IR blocks are built when creating the Skeleton and need VPlan to be wrap them now, and what blocks are created during VPlan execution. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Extended comment in createVectorizaedLoopSkeleton, moved the replacement to VPlan::execute and added comment there as well. |
||
VPBasicBlock *MiddleVPBB = | ||
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have MiddleVPBB be a VPIRBasicBlock and reset its IRBB now, temporarily until its IRBB gets created by VPlan (along with induction resume values)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Split off a patch that uses VPIRBasicBlock for middle.block in #95816. It might be cleaner to create a regular VPBasicBlock first if the IR basic block hasn't been created yet and later replace it with a VPIRBasicBlock? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Replacing VPBB with VPIRBB should also be fine. What happens if the replacement is missing? If IRBB is to be set late, it can be asserted to exist. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Moved to VPlan::execute like replacing middle.block |
||
|
||
using namespace llvm::VPlanPatternMatch; | ||
if (MiddleVPBB->begin() != MiddleVPBB->end() && | ||
match(&MiddleVPBB->back(), m_BranchOnCond(m_VPValue()))) { | ||
cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[1]) | ||
->resetBlock(OrigLoop->getLoopPreheader()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is searching for the VPIRBB representing the scalar preheader, and setting its underlying IRBB to the new loop preheader of the original scalar loop (after having split the original one to serve as the Entry-VPIRBB), right? Is there a better way to find this VPIRBB, which MiddleVPBB may bypass when folding tail or knowing that VFxUF divides the trip count? VPlan::getEntry() should return a VPIRBB wrapping OrigLoop->getLoopPreHeader(). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I m not sure if there's a better way at the moment. For now, we rely on the structure of the created middle block and it's successors: single successor must be to the scalar ph, 2 successors meas the second one is the scalar ph; if tail folding it will still have 2 successors, but as condition. Added a comment. |
||
} else | ||
cast<VPIRBasicBlock>(MiddleVPBB->getSuccessors()[0]) | ||
->resetBlock(OrigLoop->getLoopPreheader()); | ||
|
||
// Only use noalias metadata when using memory checks guaranteeing no overlap | ||
// across all iterations. | ||
|
@@ -7539,6 +7510,18 @@ LoopVectorizationPlanner::executePlan( | |
|
||
ILV.printDebugTracesAtEnd(); | ||
|
||
// Adjust branch weight of the branch in the middle block. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. LVP::executePlan() is already quite long, with various step enumerated. Should additional steps be listed, and/or folded away somehow? Specifically, this weight adjustment was also part of completeLoopSkeleton(). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, unfortunately it only fits here at the moment, assigned a number he |
||
auto *MiddleTerm = | ||
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator()); | ||
if (MiddleTerm->isConditional() && | ||
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thoughts independent of this patch:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where should we record it? Ideally on the branch-on-cond VPInst, but that would require supporting metadata on VPinst at least. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Assigning weights deserves further discussion; the indicator could be recorded in VPlan itself - whether weights should be assigned to all new branches (possibly adjusting weights of existing branches). |
||
// Assume that `Count % VectorTripCount` is equally distributed. | ||
unsigned TripCount = State.UF * State.VF.getKnownMinValue(); | ||
assert(TripCount > 0 && "trip count should not be zero"); | ||
const uint32_t Weights[] = {1, TripCount - 1}; | ||
setBranchWeights(*MiddleTerm, Weights); | ||
} | ||
|
||
return {State.ExpandedSCEVs, ReductionResumeValues}; | ||
} | ||
|
||
|
@@ -7595,7 +7578,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton( | |
// inductions in the epilogue loop are created before executing the plan for | ||
// the epilogue loop. | ||
|
||
return {completeLoopSkeleton(), nullptr}; | ||
return {LoopVectorPreHeader, nullptr}; | ||
} | ||
|
||
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { | ||
|
@@ -7719,8 +7702,11 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( | |
VecEpilogueIterationCountCheck, | ||
VecEpilogueIterationCountCheck->getSinglePredecessor()); | ||
|
||
DT->changeImmediateDominator(LoopScalarPreHeader, | ||
EPI.EpilogueIterationCountCheck); | ||
if (auto *N = DT->getNode(LoopScalarPreHeader)) | ||
DT->changeImmediateDominator(LoopScalarPreHeader, | ||
EPI.EpilogueIterationCountCheck); | ||
else | ||
DT->addNewBlock(LoopScalarPreHeader, EPI.EpilogueIterationCountCheck); | ||
if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())) | ||
// If there is an epilogue which must run, there's no edge from the | ||
// middle block to exit blocks and thus no need to update the immediate | ||
|
@@ -7784,7 +7770,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( | |
{VecEpilogueIterationCountCheck, | ||
EPI.VectorTripCount} /* AdditionalBypass */); | ||
|
||
return {completeLoopSkeleton(), EPResumeVal}; | ||
return {LoopVectorPreHeader, EPResumeVal}; | ||
} | ||
|
||
BasicBlock * | ||
|
@@ -8534,9 +8520,25 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { | |
// modified; a basic block for the vector pre-header, followed by a region for | ||
// the vector loop, followed by the middle basic block. The skeleton vector | ||
// loop region contains a header and latch basic blocks. | ||
|
||
// Add a check in the middle block to see if we have completed | ||
// all of the iterations in the first vector loop. Three cases: | ||
// 1) If we require a scalar epilogue, there is no conditional branch as | ||
// we unconditionally branch to the scalar preheader. Do nothing. | ||
// 2) If (N - N%VF) == N, then we *don't* need to run the remainder. | ||
// Thus if tail is to be folded, we know we don't need to run the | ||
// remainder and we can use the previous value for the condition (true). | ||
// 3) Otherwise, construct a runtime check. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this explanation about the three cases belong here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Moved to createInitialVPlan where the VPInstructions are created. |
||
bool RequiresScalarEpilogueCheck = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. More consistent to set There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changing the logic leads to a few regressions, better to change this as follow up? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree to have a follow up patch deal with regressions, but does retaining the original condition alone leads to any? I.e., There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now after updating all tests, there appears to be some changes only in the VPlan-native outer-loop vectorization path. Need to take a closer look. |
||
LoopVectorizationPlanner::getDecisionAndClampRange( | ||
[this](ElementCount VF) { | ||
return !CM.requiresScalarEpilogue(VF.isVector()); | ||
}, | ||
Range); | ||
VPlanPtr Plan = VPlan::createInitialVPlan( | ||
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), | ||
*PSE.getSE(), OrigLoop->getLoopPreheader()); | ||
*PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(), | ||
OrigLoop); | ||
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); | ||
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); | ||
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); | ||
|
@@ -8784,7 +8786,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { | |
// Create new empty VPlan | ||
auto Plan = VPlan::createInitialVPlan( | ||
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop), | ||
*PSE.getSE(), OrigLoop->getLoopPreheader()); | ||
*PSE.getSE(), true, false, OrigLoop); | ||
|
||
// Build hierarchical CFG | ||
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); | ||
|
@@ -8993,6 +8995,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( | |
} | ||
} | ||
Builder.setInsertPoint(&*LatchVPBB->begin()); | ||
VPBasicBlock *MiddleVPBB = | ||
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor()); | ||
VPBasicBlock::iterator IP = MiddleVPBB->begin(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. getFirstNonPhi() instead of begin()? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated to use getFirstNonPhi, thanks,
|
||
for (VPRecipeBase &R : | ||
Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { | ||
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R); | ||
|
@@ -9101,8 +9106,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( | |
// also modeled in VPlan. | ||
auto *FinalReductionResult = new VPInstruction( | ||
VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); | ||
cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor()) | ||
->appendRecipe(FinalReductionResult); | ||
FinalReductionResult->insertBefore(*MiddleVPBB, IP); | ||
IP = std::next(FinalReductionResult->getIterator()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this std::next needed - having inserted FinalReductionResult before IP implies that the latter already follows the former? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not needed in the current version, it is sufficient to use the original IP |
||
OrigExitingVPV->replaceUsesWithIf( | ||
FinalReductionResult, | ||
[](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); }); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, LoopMiddleBlock already unconditionally branches to LoopScalarPreHeader by construction of the latter, so this replacement seems redundant(?).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes seems like it, adjusted in b6476e5, thanks!