Skip to content

[VPlan] Introduce scalar loop header in plan, remove VPLiveOut. #109975

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Oct 31, 2024
Merged
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,7 @@ class LoopVectorizationPlanner {
// instructions leading from the loop exit instr to the phi need to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
// between the phi and users outside the vector region when folding the tail.
void adjustRecipesForReductions(VPlanPtr &Plan,
VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF);
Expand Down
131 changes: 60 additions & 71 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ class InnerLoopVectorizer {
virtual std::pair<BasicBlock *, Value *>
createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);

/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
/// Fix the vectorized code, taking care of header phi's, and more.
void fixVectorizedLoop(VPTransformState &State);

// Return true if any runtime check is added.
Expand Down Expand Up @@ -2713,7 +2713,8 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
| |
(opt) v <-- edge from middle to exit iff epilogue is not required.
| [ ] \
| [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
| [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
| | wrapped in VPIRBasicBlock).
\ |
\ v
>[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
Expand Down Expand Up @@ -2956,7 +2957,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
} else {
// TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
// TODO: Check in VPlan to see if IV users need fixing instead of checking
// the cost model.

// If we inserted an edge from the middle block to the unique exit block,
Expand All @@ -2970,10 +2971,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
IVEndValues[Entry.first], LoopMiddleBlock, State);
}

// Fix live-out phis not already fixed earlier.
for (const auto &KV : Plan.getLiveOuts())
KV.second->fixPhi(Plan, State);

for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);

Expand Down Expand Up @@ -8790,6 +8787,41 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
{CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
}

/// Create resume phis in the scalar preheader for first-order recurrences and
/// reductions and update the VPIRInstructions wrapping the original phis in the
/// scalar header.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
auto *ScalarPH = Plan.getScalarPreheader();
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
VPBuilder ScalarPHBuilder(ScalarPH);
VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
VPValue *OneVPV = Plan.getOrAddLiveIn(
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
for (VPRecipeBase &ScalarPhiR : *Plan.getScalarHeader()) {
auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
if (!ScalarPhiI)
break;
auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
if (!isa<VPFirstOrderRecurrencePHIRecipe, VPReductionPHIRecipe>(VectorPhiR))
continue;
// The backedge value provides the value to resume coming out of a loop,
// which for FORs is a vector whose last element needs to be extracted. The
// start value provides the value if the loop is bypassed.
bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
if (IsFOR)
ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
VPInstruction::ExtractFromEnd, {ResumeFromVectorLoop, OneVPV}, {},
"vector.recur.extract");
StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
VPInstruction::ResumePhi,
{ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
ScalarPhiIRI->addOperand(ResumePhiR);
}
}

// Collect VPIRInstructions for phis in the original exit block that are modeled
// in VPlan and add the exiting VPValue as operand. Some exiting values are not
// modeled explicitly yet and won't be included. Those are un-truncated
Expand Down Expand Up @@ -8819,8 +8851,7 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
// Exit values for inductions are computed and updated outside of VPlan and
// independent of induction recipes.
// TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
// live-outs.
// TODO: Compute induction exit values in VPlan.
if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
isa<VPWidenPointerInductionRecipe>(V) ||
Expand Down Expand Up @@ -8853,7 +8884,8 @@ addUsersInExitBlock(VPlan &Plan,
// modeling the corresponding LCSSA phis.
for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
VPValue *V = ExitIRI->getOperand(0);
// Pass live-in values used by exit phis directly through to the live-out.
// Pass live-in values used by exit phis directly through to their users in
// the exit block.
if (V->isLiveIn())
continue;

Expand All @@ -8865,39 +8897,17 @@ addUsersInExitBlock(VPlan &Plan,
}
}

/// Handle live-outs for first order reductions, both in the scalar preheader
/// and the original exit block:
/// 1. Feed a resume value for every FOR from the vector loop to the scalar
/// loop, if middle block branches to scalar preheader, by introducing
/// ExtractFromEnd and ResumePhi recipes in each, respectively, and a
/// VPLiveOut which uses the latter and corresponds to the scalar header.
/// 2. Feed the penultimate value of recurrences to their LCSSA phi users in
/// the original exit block using a VPLiveOut.
static void addLiveOutsForFirstOrderRecurrences(
/// Handle users in the exit block for first order reductions in the original
/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
/// users in the original exit block using the VPIRInstruction wrapping to the
/// LCSSA phi.
static void addExitUsersForFirstOrderRecurrences(
VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix) {
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();

// Start by finding out if middle block branches to scalar preheader, which is
// not a VPIRBasicBlock, unlike Exit block - the other possible successor of
// middle block.
// TODO: Should be replaced by
// Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
// scalar region is modeled as well.
auto *ScalarPHVPBB = Plan.getScalarPreheader();
auto *MiddleVPBB = cast<VPBasicBlock>(VectorRegion->getSingleSuccessor());
VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
// Order is strict: first is the exit block, second is the scalar preheader.
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
} else if (ExitUsersToFix.empty()) {
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
} else {
llvm_unreachable("unsupported CFG in VPlan");
}

VPBuilder ScalarPHBuilder(ScalarPHVPBB);
VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
VPValue *OneVPV = Plan.getOrAddLiveIn(
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
VPValue *TwoVPV = Plan.getOrAddLiveIn(
ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 2));

Expand Down Expand Up @@ -8973,26 +8983,16 @@ static void addLiveOutsForFirstOrderRecurrences(
// lo = lcssa.phi [s1, scalar.body],
// [vector.recur.extract.for.phi, middle.block]
//
// Extract the resume value and create a new VPLiveOut for it.
auto *Resume = MiddleBuilder.createNaryOp(VPInstruction::ExtractFromEnd,
{FOR->getBackedgeValue(), OneVPV},
{}, "vector.recur.extract");
auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
"scalar.recur.init");
auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr());
Plan.addLiveOut(FORPhi, ResumePhiRecipe);

// Now update VPIRInstructions modeling LCSSA phis in the exit block.
// Extract the penultimate value of the recurrence and use it as operand for
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(independent): ok to remove ExitIRI during traversal of ExitUsersToFix below? Better make_early_inc_range?
OTOH, is more than one LCSSA phi expected to have FOR as its operand? If not better break as soon as it is found and handled.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I thought the same thing, will adjust separately.

// the VPIRInstruction modeling the phi.
for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
if (ExitIRI->getOperand(0) != FOR)
continue;
VPValue *Ext = MiddleBuilder.createNaryOp(
VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
VPInstruction::ExtractFromEnd, {FOR->getBackedgeValue(), TwoVPV}, {},
"vector.recur.extract.for.phi");
ExitIRI->setOperand(0, Ext);
ExitIRI->setOperand(0, PenultimateElement);
ExitUsersToFix.remove(ExitIRI);
}
}
Expand Down Expand Up @@ -9166,11 +9166,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
"VPBasicBlock");
RecipeBuilder.fixHeaderPhis();

addScalarResumePhis(RecipeBuilder, *Plan);
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
addUsersInExitBlock(*Plan, ExitUsersToFix);

// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
Expand All @@ -9192,9 +9192,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Replace VPValues for known constant strides guaranteed by predicate scalar
// evolution.
auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
auto *R = dyn_cast<VPRecipeBase>(&U);
if (!R)
return false;
auto *R = cast<VPRecipeBase>(&U);
return R->getParent()->getParent() ||
R->getParent() ==
Plan->getVectorLoopRegion()->getSinglePredecessor();
Expand Down Expand Up @@ -9291,7 +9289,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
// instructions leading from the loop exit instr to the phi need to be converted
// to reductions, with one operand being vector and the other being the scalar
// reduction chain. For other reductions, a select is introduced between the phi
// and live-out recipes when folding the tail.
// and users outside the vector region when folding the tail.
//
// A ComputeReductionResult recipe is added to the middle block, also for
// in-loop reductions which compute their result in-loop, because generating
Expand Down Expand Up @@ -9325,8 +9323,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
for (VPUser *U : Cur->users()) {
auto *UserRecipe = cast<VPSingleDefRecipe>(U);
if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
assert(UserRecipe->getParent() == MiddleVPBB &&
"U must be either in the loop region or the middle block.");
assert((UserRecipe->getParent() == MiddleVPBB ||
UserRecipe->getParent() == Plan->getScalarPreheader()) &&
"U must be either in the loop region, the middle block or the "
"scalar preheader.");
continue;
}
Worklist.insert(UserRecipe);
Expand Down Expand Up @@ -9440,8 +9440,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(

const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
// If tail is folded by masking, introduce selects between the phi
// and the live-out instruction of each reduction, at the beginning of the
// dedicated latch block.
// and the users outside the vector region of each reduction, at the
// beginning of the dedicated latch block.
auto *OrigExitingVPV = PhiR->getBackedgeValue();
auto *NewExitingVPV = PhiR->getBackedgeValue();
if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
Expand Down Expand Up @@ -9513,17 +9513,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
});
FinalReductionResult->insertBefore(*MiddleVPBB, IP);

// Order is strict: if there are multiple successors, the first is the exit
// block, second is the scalar preheader.
VPBasicBlock *ScalarPHVPBB =
cast<VPBasicBlock>(MiddleVPBB->getSuccessors().back());
VPBuilder ScalarPHBuilder(ScalarPHVPBB);
auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()},
{}, "bc.merge.rdx");
auto *RedPhi = cast<PHINode>(PhiR->getUnderlyingInstr());
Plan->addLiveOut(RedPhi, ResumePhiRecipe);

// Adjust AnyOf reductions; replace the reduction phi for the selected value
// with a boolean reduction phi node to check if the condition is true in
// any iteration. The final value is selected by the final
Expand Down
Loading
Loading