Skip to content

Commit f48884d

Browse files
authored
[VPlan] Remove loop region in optimizeForVFAndUF. (#108378)
Update optimizeForVFAndUF to completely remove the vector loop region when possible. At the moment, we cannot remove the region if it contains:
* widened IVs: the recipe is needed to generate the step vector;
* reductions: ComputeReductionResults requires the reduction phi recipe for codegen.
Both cases can be addressed by more explicit modeling. The patch also includes a number of updates to allow executing VPlans without a vector loop region. Depends on #110004.
1 parent 2adcec7 commit f48884d

17 files changed

+567
-676
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 49 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2394,12 +2394,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
23942394
// End if-block.
23952395
VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
23962396
bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
2397-
assert((Parent || all_of(RepRecipe->operands(),
2398-
[](VPValue *Op) {
2399-
return Op->isDefinedOutsideLoopRegions();
2400-
})) &&
2401-
"Expected a recipe is either within a region or all of its operands "
2402-
"are defined outside the vectorized region.");
2397+
assert(
2398+
(Parent || !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2399+
all_of(RepRecipe->operands(),
2400+
[](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2401+
"Expected a recipe is either within a region or all of its operands "
2402+
"are defined outside the vectorized region.");
24032403
if (IfPredicateInstr)
24042404
PredicatedInstructions.push_back(Cloned);
24052405
}
@@ -3012,6 +3012,11 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
30123012
getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
30133013
}
30143014

3015+
// Don't apply optimizations below when no vector region remains, as they all
3016+
// require a vector loop at the moment.
3017+
if (!State.Plan->getVectorLoopRegion())
3018+
return;
3019+
30153020
for (Instruction *PI : PredicatedInstructions)
30163021
sinkScalarOperands(&*PI);
30173022

@@ -7744,6 +7749,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
77447749

77457750
// 1. Set up the skeleton for vectorization, including vector pre-header and
77467751
// middle block. The vector loop is created during VPlan execution.
7752+
VPBasicBlock *VectorPH =
7753+
cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
77477754
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
77487755
ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
77497756
if (VectorizingEpilogue)
@@ -7781,7 +7788,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
77817788
BestVPlan.prepareToExecute(
77827789
ILV.getTripCount(),
77837790
ILV.getOrCreateVectorTripCount(ILV.LoopVectorPreHeader), State);
7784-
replaceVPBBWithIRVPBB(BestVPlan.getVectorPreheader(), State.CFG.PrevBB);
7791+
replaceVPBBWithIRVPBB(VectorPH, State.CFG.PrevBB);
77857792

77867793
BestVPlan.execute(&State);
77877794

@@ -7807,30 +7814,31 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
78077814
// 2.6. Maintain Loop Hints
78087815
// Keep all loop hints from the original loop on the vector loop (we'll
78097816
// replace the vectorizer-specific hints below).
7810-
MDNode *OrigLoopID = OrigLoop->getLoopID();
7817+
if (auto *LoopRegion = BestVPlan.getVectorLoopRegion()) {
7818+
MDNode *OrigLoopID = OrigLoop->getLoopID();
78117819

7812-
std::optional<MDNode *> VectorizedLoopID =
7813-
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7814-
LLVMLoopVectorizeFollowupVectorized});
7815-
7816-
VPBasicBlock *HeaderVPBB =
7817-
BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7818-
Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7819-
if (VectorizedLoopID)
7820-
L->setLoopID(*VectorizedLoopID);
7821-
else {
7822-
// Keep all loop hints from the original loop on the vector loop (we'll
7823-
// replace the vectorizer-specific hints below).
7824-
if (MDNode *LID = OrigLoop->getLoopID())
7825-
L->setLoopID(LID);
7826-
7827-
LoopVectorizeHints Hints(L, true, *ORE);
7828-
Hints.setAlreadyVectorized();
7820+
std::optional<MDNode *> VectorizedLoopID =
7821+
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7822+
LLVMLoopVectorizeFollowupVectorized});
7823+
7824+
VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
7825+
Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7826+
if (VectorizedLoopID) {
7827+
L->setLoopID(*VectorizedLoopID);
7828+
} else {
7829+
// Keep all loop hints from the original loop on the vector loop (we'll
7830+
// replace the vectorizer-specific hints below).
7831+
if (MDNode *LID = OrigLoop->getLoopID())
7832+
L->setLoopID(LID);
7833+
7834+
LoopVectorizeHints Hints(L, true, *ORE);
7835+
Hints.setAlreadyVectorized();
7836+
}
7837+
TargetTransformInfo::UnrollingPreferences UP;
7838+
TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7839+
if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7840+
addRuntimeUnrollDisableMetaData(L);
78297841
}
7830-
TargetTransformInfo::UnrollingPreferences UP;
7831-
TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7832-
if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7833-
addRuntimeUnrollDisableMetaData(L);
78347842

78357843
// 3. Fix the vectorized code: take care of header phi's, live-outs,
78367844
// predication, updating analyses.
@@ -7839,15 +7847,18 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
78397847
ILV.printDebugTracesAtEnd();
78407848

78417849
// 4. Adjust branch weight of the branch in the middle block.
7842-
auto *MiddleTerm =
7843-
cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7844-
if (MiddleTerm->isConditional() &&
7845-
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7846-
// Assume that `Count % VectorTripCount` is equally distributed.
7847-
unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7848-
assert(TripCount > 0 && "trip count should not be zero");
7849-
const uint32_t Weights[] = {1, TripCount - 1};
7850-
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7850+
if (BestVPlan.getVectorLoopRegion()) {
7851+
auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7852+
auto *MiddleTerm =
7853+
cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
7854+
if (MiddleTerm->isConditional() &&
7855+
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7856+
// Assume that `Count % VectorTripCount` is equally distributed.
7857+
unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
7858+
assert(TripCount > 0 && "trip count should not be zero");
7859+
const uint32_t Weights[] = {1, TripCount - 1};
7860+
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
7861+
}
78517862
}
78527863

78537864
return State.ExpandedSCEVs;

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -555,7 +555,9 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) {
555555
template <typename T> static T *getEnclosingLoopRegionForRegion(T *P) {
556556
if (P && P->isReplicator()) {
557557
P = P->getParent();
558-
assert(!cast<VPRegionBlock>(P)->isReplicator() &&
558+
// Multiple loop regions can be nested, but replicate regions can only be
559+
// nested inside a loop region or must be outside any other region.
560+
assert((!P || !cast<VPRegionBlock>(P)->isReplicator()) &&
559561
"unexpected nested replicate regions");
560562
}
561563
return P;
@@ -934,7 +936,8 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
934936

935937
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
936938
// FIXME: Model VF * UF computation completely in VPlan.
937-
assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
939+
assert((!getVectorLoopRegion() || VFxUF.getNumUsers()) &&
940+
"VFxUF expected to always have users");
938941
unsigned UF = getUF();
939942
if (VF.getNumUsers()) {
940943
Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
@@ -988,12 +991,18 @@ void VPlan::execute(VPTransformState *State) {
988991
for (VPBlockBase *Block : RPOT)
989992
Block->execute(State);
990993

991-
VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
994+
State->CFG.DTU.flush();
995+
996+
auto *LoopRegion = getVectorLoopRegion();
997+
if (!LoopRegion)
998+
return;
999+
1000+
VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
9921001
BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];
9931002

9941003
// Fix the latch value of canonical, reduction and first-order recurrences
9951004
// phis in the vector loop.
996-
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
1005+
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
9971006
for (VPRecipeBase &R : Header->phis()) {
9981007
// Skip phi-like recipes that generate their backedge values themselves.
9991008
if (isa<VPWidenPHIRecipe>(&R))
@@ -1032,8 +1041,6 @@ void VPlan::execute(VPTransformState *State) {
10321041
Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
10331042
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
10341043
}
1035-
1036-
State->CFG.DTU.flush();
10371044
}
10381045

10391046
InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
@@ -1046,14 +1053,14 @@ VPRegionBlock *VPlan::getVectorLoopRegion() {
10461053
// TODO: Cache if possible.
10471054
for (VPBlockBase *B : vp_depth_first_shallow(getEntry()))
10481055
if (auto *R = dyn_cast<VPRegionBlock>(B))
1049-
return R;
1056+
return R->isReplicator() ? nullptr : R;
10501057
return nullptr;
10511058
}
10521059

10531060
const VPRegionBlock *VPlan::getVectorLoopRegion() const {
10541061
for (const VPBlockBase *B : vp_depth_first_shallow(getEntry()))
10551062
if (auto *R = dyn_cast<VPRegionBlock>(B))
1056-
return R;
1063+
return R->isReplicator() ? nullptr : R;
10571064
return nullptr;
10581065
}
10591066

@@ -1399,11 +1406,17 @@ void VPlanIngredient::print(raw_ostream &O) const {
13991406

14001407
#endif
14011408

1402-
bool VPValue::isDefinedOutsideLoopRegions() const {
1403-
return !hasDefiningRecipe() ||
1404-
!getDefiningRecipe()->getParent()->getEnclosingLoopRegion();
1409+
/// Returns true if \p VPV is defined by a recipe and either the plan has no
1410+
/// vector loop region or the defining recipe is inside a loop region.
1411+
static bool isDefinedInsideLoopRegions(const VPValue *VPV) {
1412+
const VPRecipeBase *DefR = VPV->getDefiningRecipe();
1413+
return DefR && (!DefR->getParent()->getPlan()->getVectorLoopRegion() ||
1414+
DefR->getParent()->getEnclosingLoopRegion());
14051415
}
14061416

1417+
bool VPValue::isDefinedOutsideLoopRegions() const {
1418+
return !isDefinedInsideLoopRegions(this);
1419+
}
14071420
void VPValue::replaceAllUsesWith(VPValue *New) {
14081421
replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
14091422
}

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3853,9 +3853,13 @@ class VPlan {
38533853
VPBasicBlock *getEntry() { return Entry; }
38543854
const VPBasicBlock *getEntry() const { return Entry; }
38553855

3856-
/// Returns the preheader of the vector loop region.
3856+
/// Returns the preheader of the vector loop region, if one exists, or null
3857+
/// otherwise.
38573858
VPBasicBlock *getVectorPreheader() {
3858-
return cast<VPBasicBlock>(getVectorLoopRegion()->getSinglePredecessor());
3859+
VPRegionBlock *VectorRegion = getVectorLoopRegion();
3860+
return VectorRegion
3861+
? cast<VPBasicBlock>(VectorRegion->getSinglePredecessor())
3862+
: nullptr;
38593863
}
38603864

38613865
/// Returns the VPRegionBlock of the vector loop.

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 42 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -794,12 +794,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
794794
return R.getVPSingleValue()->replaceAllUsesWith(R.getOperand(1));
795795
}
796796

797-
/// Try to simplify the recipes in \p Plan
798-
static void simplifyRecipes(VPlan &Plan) {
797+
/// Try to simplify the recipes in \p Plan. Use \p CanonicalIVTy as type for all
798+
/// un-typed live-ins in VPTypeAnalysis.
799+
static void simplifyRecipes(VPlan &Plan, Type *CanonicalIVTy) {
799800
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
800801
Plan.getEntry());
801-
Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
802-
VPTypeAnalysis TypeInfo(CanonicalIVType);
802+
VPTypeAnalysis TypeInfo(CanonicalIVTy);
803803
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
804804
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
805805
simplifyRecipe(R, TypeInfo);
@@ -812,8 +812,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
812812
PredicatedScalarEvolution &PSE) {
813813
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
814814
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
815-
VPBasicBlock *ExitingVPBB =
816-
Plan.getVectorLoopRegion()->getExitingBasicBlock();
815+
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
816+
VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
817817
auto *Term = &ExitingVPBB->back();
818818
// Try to simplify the branch condition if TC <= VF * UF when preparing to
819819
// execute the plan for the main vector loop. We only do this if the
@@ -837,14 +837,42 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
837837
!SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
838838
return;
839839

840-
LLVMContext &Ctx = SE.getContext();
841-
auto *BOC = new VPInstruction(
842-
VPInstruction::BranchOnCond,
843-
{Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
840+
// The vector loop region only executes once. If possible, completely remove
841+
// the region, otherwise replace the terminator controlling the latch with
842+
// (BranchOnCond true).
843+
auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
844+
auto *CanIVTy = Plan.getCanonicalIV()->getScalarType();
845+
if (all_of(
846+
Header->phis(),
847+
IsaPred<VPCanonicalIVPHIRecipe, VPFirstOrderRecurrencePHIRecipe>)) {
848+
for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
849+
auto *HeaderPhiR = cast<VPHeaderPHIRecipe>(&HeaderR);
850+
HeaderPhiR->replaceAllUsesWith(HeaderPhiR->getStartValue());
851+
HeaderPhiR->eraseFromParent();
852+
}
844853

845-
Term->eraseFromParent();
846-
ExitingVPBB->appendRecipe(BOC);
854+
VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
855+
VPBlockBase *Exit = VectorRegion->getSingleSuccessor();
856+
VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
857+
VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
858+
859+
for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
860+
B->setParent(nullptr);
847861

862+
VPBlockUtils::connectBlocks(Preheader, Header);
863+
VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
864+
simplifyRecipes(Plan, CanIVTy);
865+
} else {
866+
// The vector region contains header phis for which we cannot remove the
867+
// loop region yet.
868+
LLVMContext &Ctx = SE.getContext();
869+
auto *BOC = new VPInstruction(
870+
VPInstruction::BranchOnCond,
871+
{Plan.getOrAddLiveIn(ConstantInt::getTrue(Ctx))}, Term->getDebugLoc());
872+
ExitingVPBB->appendRecipe(BOC);
873+
}
874+
875+
Term->eraseFromParent();
848876
VPlanTransforms::removeDeadRecipes(Plan);
849877

850878
Plan.setVF(BestVF);
@@ -1258,10 +1286,10 @@ void VPlanTransforms::optimize(VPlan &Plan) {
12581286
removeRedundantCanonicalIVs(Plan);
12591287
removeRedundantInductionCasts(Plan);
12601288

1261-
simplifyRecipes(Plan);
1289+
simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType());
12621290
legalizeAndOptimizeInductions(Plan);
12631291
removeRedundantExpandSCEVRecipes(Plan);
1264-
simplifyRecipes(Plan);
1292+
simplifyRecipes(Plan, Plan.getCanonicalIV()->getScalarType());
12651293
removeDeadRecipes(Plan);
12661294

12671295
createAndOptimizeReplicateRegions(Plan);

llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -80,15 +80,13 @@ define void @powi_call(ptr %P) {
8080
; CHECK: [[VECTOR_PH]]:
8181
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
8282
; CHECK: [[VECTOR_BODY]]:
83-
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
84-
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
85-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[TMP0]]
83+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 0
8684
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
8785
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
8886
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.powi.v2f64.i32(<2 x double> [[WIDE_LOAD]], i32 3)
89-
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP2]], align 8
90-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
91-
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
87+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
88+
; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP4]], align 8
89+
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
9290
; CHECK: [[MIDDLE_BLOCK]]:
9391
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
9492
; CHECK: [[SCALAR_PH]]:
@@ -102,7 +100,7 @@ define void @powi_call(ptr %P) {
102100
; CHECK-NEXT: store double [[POWI]], ptr [[GEP]], align 8
103101
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
104102
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1
105-
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
103+
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
106104
; CHECK: [[EXIT]]:
107105
; CHECK-NEXT: ret void
108106
;
@@ -233,6 +231,5 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
233231
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
234232
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
235233
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
236-
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
237-
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
234+
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
238235
;.

0 commit comments

Comments
 (0)