-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[VPlan] Connect Entry to scalar preheader during initial construction. #140132
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-vectorizers Author: Florian Hahn (fhahn) ChangesUpdate initial construction to connect the Plan's entry to the scalar preheader during initial construction. This moves a small part of the Resume phis need 2 incoming values to start with, the second being the bypass value from the scalar ph (and used to replicate the incoming value for other bypass blocks). Adding the extra edge ensures we incoming values for resume phis match the incoming blocks. Still needs various VPlan printing tests to be updated. vplan-printing.ll shows the changes. Patch is 22.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140132.diff 5 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7ad02956a5b69..a43c72451eaff 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -490,7 +490,7 @@ class InnerLoopVectorizer {
MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI),
RTChecks(RTChecks), Plan(Plan),
- VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {}
+ VectorPHVPB(Plan.getEntry()->getSuccessors()[1]) {}
virtual ~InnerLoopVectorizer() = default;
@@ -2365,14 +2365,11 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
VPBlockBase *ScalarPH = Plan.getScalarPreheader();
VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
- if (PreVectorPH->getNumSuccessors() != 1) {
- assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
- assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
- "Unexpected successor");
- VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
- VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
- PreVectorPH = CheckVPIRBB;
- }
+ assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
+ assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
+ VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
+ VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
+ PreVectorPH = CheckVPIRBB;
VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
PreVectorPH->swapSuccessors();
@@ -2463,9 +2460,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
LoopBypassBlocks.push_back(TCCheckBlock);
-
- // TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
- introduceCheckBlockInVPlan(TCCheckBlock);
}
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
@@ -7838,7 +7832,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// 1. Set up the skeleton for vectorization, including vector pre-header and
// middle block. The vector loop is created during VPlan execution.
- VPBasicBlock *VectorPH = cast<VPBasicBlock>(Entry->getSingleSuccessor());
+ VPBasicBlock *VectorPH = cast<VPBasicBlock>(Entry->getSuccessors()[1]);
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
if (VectorizingEpilogue)
VPlanTransforms::removeDeadRecipes(BestVPlan);
@@ -8068,7 +8062,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
- introduceCheckBlockInVPlan(TCCheckBlock);
+ if (!ForEpilogue)
+ introduceCheckBlockInVPlan(TCCheckBlock);
return TCCheckBlock;
}
@@ -8198,7 +8193,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
Plan.setEntry(NewEntry);
// OldEntry is now dead and will be cleaned up when the plan gets destroyed.
- introduceCheckBlockInVPlan(Insert);
return Insert;
}
@@ -9158,7 +9152,7 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
DenseMap<VPValue *, VPValue *> &IVEndValues) {
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
auto *ScalarPH = Plan.getScalarPreheader();
- auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
+ auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
VPBuilder VectorPHBuilder(
cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5fd7a369bf735..df4ba857d4e3a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3995,7 +3995,8 @@ class VPlan {
/// that this relies on unneeded branches to the scalar tail loop being
/// removed.
bool hasScalarTail() const {
- return getScalarPreheader()->getNumPredecessors() != 0;
+ return getScalarPreheader()->getNumPredecessors() != 0 &&
+ getScalarPreheader()->getSinglePredecessor() != getEntry();
}
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 287bc93ce496a..78c6e2f98387f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -546,6 +546,9 @@ void VPlanTransforms::prepareForVectorization(
if (auto *LatchExitVPB = MiddleVPBB->getSingleSuccessor())
VPBlockUtils::disconnectBlocks(MiddleVPBB, LatchExitVPB);
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH);
+ Plan.getEntry()->swapSuccessors();
+
// The exit blocks are unreachable, remove their recipes to make sure no
// users remain that may pessimize transforms.
for (auto *EB : Plan.getExitBlocks()) {
@@ -558,6 +561,9 @@ void VPlanTransforms::prepareForVectorization(
// The connection order corresponds to the operands of the conditional branch,
// with the middle block already connected to the exit block.
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
+ // Also connect the entry block to the scalar preheader.
+ VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH);
+ Plan.getEntry()->swapSuccessors();
auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
// Here we use the same DebugLoc as the scalar loop latch terminator instead
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 806c20ef8cf73..0dcb6536e9985 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1733,7 +1733,7 @@ static void removeBranchOnCondTrue(VPlan &Plan) {
using namespace llvm::VPlanPatternMatch;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(Plan.getEntry()))) {
- if (VPBB->getNumSuccessors() != 2 ||
+ if (VPBB->getNumSuccessors() != 2 || isa<VPIRBasicBlock>(VPBB) ||
!match(&VPBB->back(), m_BranchOnCond(m_True())))
continue;
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 727784de2cb6d..3504558d1b545 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -15,7 +15,7 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
; CHECK-NEXT: Live-in ir<%n> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
-; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: Successor(s): vector loop
@@ -42,6 +42,9 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<for.end.loopexit>, scalar.ph
; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.end.loopexit>
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
; CHECK-NEXT: Successor(s): ir-bb<for.body>
@@ -50,9 +53,6 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.end.loopexit>
-; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
entry:
@@ -83,7 +83,7 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
; CHECK-NEXT: Live-in ir<%n> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
-; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: Successor(s): vector loop
@@ -113,6 +113,9 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<for.end.loopexit>, scalar.ph
; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.end.loopexit>
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
; CHECK-NEXT: Successor(s): ir-bb<for.body>
@@ -121,9 +124,6 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %for.body ], [ 0, %for.body.preheader ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.end.loopexit>
-; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
entry:
@@ -157,7 +157,7 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 smax %n)
-; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: Successor(s): vector loop
@@ -202,6 +202,9 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.end>
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
; CHECK-NEXT: Successor(s): ir-bb<for.body>
@@ -210,9 +213,6 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
; CHECK-NEXT: IR %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
; CHECK-NEXT: IR %cmp = icmp ult i64 %i, 5
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.end>
-; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
entry:
@@ -251,7 +251,7 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
; CHECK-NEXT: Live-in ir<256> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: vp<[[IV_END:%.+]]> = DERIVED-IV ir<0> + vp<[[VTC]]> * ir<4>
@@ -285,6 +285,9 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<for.end>, scalar.ph
; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<for.end>
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[IV_END]]>, ir<0>
; CHECK-NEXT: Successor(s): ir-bb<for.body>
@@ -293,9 +296,6 @@ define void @print_interleave_groups(i32 %C, i32 %D) {
; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
; CHECK: IR %cmp = icmp slt i64 %iv.next, 1024
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.end>
-; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
entry:
@@ -338,7 +338,7 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
; CHECK-NEXT: Live-in ir<128> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: Successor(s): vector loop
@@ -390,6 +390,9 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
; CHECK-NEXT: Successor(s): ir-bb<loop>
@@ -398,9 +401,6 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
; CHECK: IR %cmp1 = icmp slt i32 %lsd, 100
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>
-; CHECK-NEXT: No successors
; CHECK-NEXT:}
;
entry:
@@ -449,7 +449,7 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
; CHECK-NEXT: IR %inc = add i64 %div, 1
; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060))<nuw><nsw> /u (1 + (%y /u 492802768830814060))<nuw><nsw>))<nuw><nsw>
; CHECK-NEXT: EMIT vp<[[EXP_SCEV:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060))<nuw><nsw>
-; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: vp<[[IV_END:%.+]]> = DERIVED-IV ir<0> + vp<[[VTC]]> * vp<[[EXP_SCEV]]>
@@ -475,6 +475,9 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<loop.exit>, scalar.ph
; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<loop.exit>
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[IV_END]]>, ir<0>
; CHECK-NEXT: Successor(s): ir-bb<loop>
@@ -483,9 +486,6 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
; CHECK-NEXT: IR %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
; CHECK: IR %iv.next = add i64 %iv, %inc
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop.exit>
-; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
entry:
@@ -517,7 +517,7 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
; CHECK-NEXT: Live-in ir<1000> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: Successor(s): vector loop
@@ -543,6 +543,10 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>
+; CHECK-NEXT: IR %lcssa = phi i32 [ %add, %loop ] (extra operand: vp<[[EXIT]]> from middle.block)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
; CHECK-NEXT: Successor(s): ir-bb<loop>
@@ -551,10 +555,6 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
; CHECK-NEXT: IR %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
; CHECK: IR %ec = icmp eq i32 %iv.next, 1000
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>
-; CHECK-NEXT: IR %lcssa = phi i32 [ %add, %loop ] (extra operand: vp<[[EXIT]]> from middle.block)
-; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
entry:
@@ -583,7 +583,7 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
; CHECK-NEXT: Live-in ir<%n> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: Successor(s): vector loop
@@ -612,6 +612,9 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
; CHECK-NEXT: Successor(s): ir-bb<loop>
@@ -620,9 +623,6 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>
-; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
entry:
@@ -654,7 +654,7 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
; CHECK-NEXT: Live-in ir<%n> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: Successor(s): vector loop
@@ -682,6 +682,9 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
; CHECK-NEXT: Successor(s): ir-bb<loop>
@@ -690,10 +693,7 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit>
-; CHECK-NEXT: No successors
-; CHECK-NEXT: }
+; ; CHECK-NEXT: }
;
entry:
br label %loop
@@ -723,7 +723,7 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
; CHECK-NEXT: Live-in ir<%n> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: Successor(s): vector loop
@@ -772,6 +772,9 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<end>, scalar.ph
; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<end>
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
; CHECK-NEXT: Successor(s): ir-bb<for.body>
@@ -780,9 +783,6 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
; CHECK: IR %ifcond = fcmp oeq float %ld.value, 5.0
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<end>
-; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
entry:
@@ -823,7 +823,7 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) {
; CHECK-NEXT: Live-in ir<%n> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: Successor(s): vector.ph
+; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: Successor(s): vector loop
@@ -851,6 +851,9 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) {
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb<exit>
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph
; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.+]]> = resume-phi vp<[[VTC]]>, ir<0>
; CHECK-NEXT: Successor(s): ir-bb<loop>
@@ -859,9 +862,6 @@ define void @print_disjoint_flags(i64 %n, ptr noalias %x) {
; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph)
; CHECK: IR %exitcond = icmp eq i64 %iv.next, %n
; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<exit...
[truncated]
|
49578c3
to
4acf4d4
Compare
Update initial construction to connect the Plan's entry to the scalar preheader during initial construction. This moves a small part of the skeleton creation out of ILV and will also enable replacing VPInstruction::ResumePhi with regular VPPhi recipes. Resume phis need 2 incoming values to start with, the second being the bypass value from the scalar ph (and used to replicate the incoming value for other bypass blocks). Adding the extra edge ensures we incoming values for resume phis match the incoming blocks. Still needs various VPlan printing tests to be updated. vplan-printing.ll shows the changes.
4acf4d4
to
d481278
Compare
Use regular VPPhi instead of a separate opcode for resume phis. This removes an unneeded specialized opcode and unifies the code (verification, printing, updating when CFG is changed). Depends on llvm#140132.
@@ -490,7 +490,7 @@ class InnerLoopVectorizer { | |||
MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor), | |||
Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI), | |||
RTChecks(RTChecks), Plan(Plan), | |||
VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {} | |||
VectorPHVPB(Plan.getEntry()->getSuccessors()[1]) {} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So now Plan->entry has ScalarPH as its first successor and VectorPH as its second, rather than the latter as its only successor.
Good to assert there are two successors. Perhaps some API to retrieve "1of2" and "2of2", analogous to existing "1of1"?
Better look instead for first predecessor (among two) of first header block? Possibly as follow-up.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to get the single predecessor of the vector loop region, which should be clearer, thanks
return getScalarPreheader()->getNumPredecessors() != 0 && | ||
getScalarPreheader()->getSinglePredecessor() != getEntry(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps clearer as:
return getScalarPreheader()->getNumPredecessors() != 0 && | |
getScalarPreheader()->getSinglePredecessor() != getEntry(); | |
return !(getScalarPreheader()->getNumPredecessors() == 0 || | |
getScalarPreheader()->getSinglePredecessor() == getEntry()); |
I.e., scalar loop does not take care of leftover iterations if (a) it doesn't take care of anything, or (b) it only takes care of all iterations as an alternative to the vector loop?
The latter may reach scalar loop from various bypassing blocks?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated thanks
|
||
// TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. | ||
introduceCheckBlockInVPlan(TCCheckBlock); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Connecting entry VPBB to scalar preheader VPBB is done here as part of introducing the conditional branch at end of entry IRBB?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TCCheckBlock is the same block as the VPlan entry block. Before the change, we needed special logic to not create a new VPIRBB for this call.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TCCheckBlock is the same block as the VPlan entry block.
Worth an assert to clarify?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added thanks
@@ -8070,7 +8064,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, | |||
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); | |||
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); | |||
|
|||
introduceCheckBlockInVPlan(TCCheckBlock); | |||
if (!ForEpilogue) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (!ForEpilogue) | |
// Worth explaining? | |
if (!ForEpilogue) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added thanks
// Also connect the entry block to the scalar preheader. | ||
VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The CFG is less consistent with VPlan's recipes, connecting entry VPBB to scalarPH VPBB now, when it is already connected to vectorPH, w/o introducing a conditional branch recipe at the end of entry - the branch instruction is generated outside VPlan's execute.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, we still need to allow terminator-less VPIRBBs for various parts of the skeleton. One of the next steps after adding it early here is adding a branch recipe.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Temporary inconsistency is worth clarifying in a TODO.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added thanks
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks! Adding several comments.
@@ -490,7 +490,7 @@ class InnerLoopVectorizer { | |||
MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor), | |||
Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI), | |||
RTChecks(RTChecks), Plan(Plan), | |||
VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {} | |||
VectorPHVPB(Plan.getVectorLoopRegion()->getSinglePredecessor()) {} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note that ILV now depends on HCFG here, and elsewhere. Maybe better to retrieve the first hierarchical predecessor of first header block, to relax this dependence, possibly as follow up.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good!
@@ -2368,14 +2368,11 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { | |||
void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Clarify that CheckBlock now excludes the initial trip-count check, which is expected to be already introduced before calling introduceCheckBlockInVPlan()
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a note, thanks
|
||
// TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. | ||
introduceCheckBlockInVPlan(TCCheckBlock); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TCCheckBlock is the same block as the VPlan entry block.
Worth an assert to clarify?
// Only generate add a new block for the trip-count check for the main loop. | ||
// The epilogue will use the already existing VPlan entry block. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// Only generate add a new block for the trip-count check for the main loop. | |
// The epilogue will use the already existing VPlan entry block. | |
// When vectorizing the main loop, its trip-count check is placed in a new block, whereas the overall trip-count check is placed in the VPlan entry block. | |
// When vectorizing the epilogue loop, its trip-count check is placed in the VPlan entry block. |
sounds clearer? Referring to https://llvm.org/docs/Vectorizers.html#epilogue-vectorization
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated thanks
auto *ScalarPH = Plan.getScalarPreheader(); | ||
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor()); | ||
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggesting a getMiddleBlock(Plan)
utility?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good, will check where this could be useful separately.
@@ -540,6 +540,9 @@ void VPlanTransforms::prepareForVectorization( | |||
if (auto *LatchExitVPB = MiddleVPBB->getSingleSuccessor()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Independent: worth clarifying that not requiring a scalar epilog check means the scalar epilog is (always) required, i.e., case 1.
// Also connect the entry block to the scalar preheader. | ||
VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Temporary inconsistency is worth clarifying in a TODO.
@@ -1846,7 +1846,7 @@ static void removeBranchOnCondTrue(VPlan &Plan) { | |||
using namespace llvm::VPlanPatternMatch; | |||
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( | |||
vp_depth_first_shallow(Plan.getEntry()))) { | |||
if (VPBB->getNumSuccessors() != 2 || | |||
if (VPBB->getNumSuccessors() != 2 || isa<VPIRBasicBlock>(VPBB) || |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Excluding VPIRBB's only because of the temporary terminator-less entry block with two successors? If so, suffice to exclude VPBB == Plan.getEntry()
? But from an earlier comment: "we still need to allow terminator-less VPIRBBs for various parts of the skeleton."
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks! We don't run removeBranchOnCond after introducing the other, terminator-less VPIRBBs.
@@ -484,7 +479,6 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias | |||
; CHECK-EMPTY: | |||
; CHECK-NEXT: ir-bb<exit> | |||
; CHECK-NEXT: No successors | |||
; CHECK-NEXT: } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need to drop the check for }
? Here and above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, otherwise we would have to match scalar.ph and scalar header here as well, which isn't really relevant for the test
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/162/builds/23251 Here is the relevant piece of the build log for the reference
|
…onstruction. (#140132) Update initial construction to connect the Plan's entry to the scalar preheader during initial construction. This moves a small part of the skeleton creation out of ILV and will also enable replacing VPInstruction::ResumePhi with regular VPPhi recipes. Resume phis need 2 incoming values to start with, the second being the bypass value from the scalar ph (and used to replicate the incoming value for other bypass blocks). Adding the extra edge ensures we incoming values for resume phis match the incoming blocks. PR: llvm/llvm-project#140132
Use regular VPPhi instead of a separate opcode for resume phis. This removes an unneeded specialized opcode and unifies the code (verification, printing, updating when CFG is changed). Depends on llvm#140132.
…NFC). (#140405) Use regular VPPhi instead of a separate opcode for resume phis. This removes an unneeded specialized opcode and unifies the code (verification, printing, updating when CFG is changed). Depends on llvm/llvm-project#140132. PR: llvm/llvm-project#140405
llvm#140132) Update initial construction to connect the Plan's entry to the scalar preheader during initial construction. This moves a small part of the skeleton creation out of ILV and will also enable replacing VPInstruction::ResumePhi with regular VPPhi recipes. Resume phis need 2 incoming values to start with, the second being the bypass value from the scalar ph (and used to replicate the incoming value for other bypass blocks). Adding the extra edge ensures we incoming values for resume phis match the incoming blocks. PR: llvm#140132
…#140405) Use regular VPPhi instead of a separate opcode for resume phis. This removes an unneeded specialized opcode and unifies the code (verification, printing, updating when CFG is changed). Depends on llvm#140132. PR: llvm#140405
Update initial construction to connect the Plan's entry to the scalar preheader during initial construction. This moves a small part of the
skeleton creation out of ILV and will also enable replacing
VPInstruction::ResumePhi with regular VPPhi recipes.
Resume phis need 2 incoming values to start with, the second being the bypass value from the scalar ph (and used to replicate the incoming value for other bypass blocks). Adding the extra edge ensures we incoming values for resume phis match the incoming blocks.