@@ -487,12 +487,11 @@ class InnerLoopVectorizer {
487
487
/// on, while the old loop will be used as the scalar remainder. Control flow
488
488
/// is generated around the vectorized (and scalar epilogue) loops consisting
489
489
/// of various checks and bypasses. Return the pre-header block of the new
490
- /// loop and the start value for the canonical induction, if it is != 0. The
491
- /// latter is the case when vectorizing the epilogue loop. In the case of
492
- /// epilogue vectorization, this function is overriden to handle the more
493
- /// complex control flow around the loops. \p ExpandedSCEVs is used to
494
- /// look up SCEV expansions for expressions needed during skeleton creation.
495
- virtual std::pair<BasicBlock *, Value *>
490
+ /// loop. In the case of epilogue vectorization, this function is overridden to
491
+ /// handle the more complex control flow around the loops. \p ExpandedSCEVs is
492
+ /// used to look up SCEV expansions for expressions needed during skeleton
493
+ /// creation.
494
+ virtual BasicBlock *
496
495
createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
497
496
498
497
/// Fix the vectorized code, taking care of header phi's, and more.
@@ -747,15 +746,15 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
747
746
748
747
// Override this function to handle the more complex control flow around the
749
748
// three loops.
750
- std::pair< BasicBlock *, Value *> createVectorizedLoopSkeleton(
751
- const SCEV2ValueTy &ExpandedSCEVs) final {
749
+ BasicBlock *
750
+ createVectorizedLoopSkeleton( const SCEV2ValueTy &ExpandedSCEVs) final {
752
751
return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
753
752
}
754
753
755
754
/// The interface for creating a vectorized skeleton using one of two
756
755
/// different strategies, each corresponding to one execution of the vplan
757
756
/// as described above.
758
- virtual std::pair< BasicBlock *, Value *>
757
+ virtual BasicBlock *
759
758
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
760
759
761
760
/// Holds and updates state information required to vectorize the main loop
@@ -784,7 +783,7 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
784
783
EPI, LVL, CM, BFI, PSI, Check, Plan) {}
785
784
/// Implements the interface for creating a vectorized skeleton using the
786
785
/// *main loop* strategy (i.e. the first pass of VPlan execution).
787
- std::pair< BasicBlock *, Value *>
786
+ BasicBlock *
788
787
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
789
788
790
789
protected:
@@ -819,7 +818,7 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
819
818
}
820
819
/// Implements the interface for creating a vectorized skeleton using the
821
820
/// *epilogue loop* strategy (i.e. the second pass of VPlan execution).
822
- std::pair< BasicBlock *, Value *>
821
+ BasicBlock *
823
822
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
824
823
825
824
protected:
@@ -2716,6 +2715,7 @@ void InnerLoopVectorizer::createInductionResumeVPValues(
2716
2715
// Otherwise we provide the trip count from the main vector loop.
2717
2716
VPBasicBlock *ScalarPHVPBB = Plan.getScalarPreheader();
2718
2717
VPBuilder ScalarPHBuilder(ScalarPHVPBB, ScalarPHVPBB->begin());
2718
+ bool HasCanonical = false;
2719
2719
for (VPRecipeBase &R : *Plan.getScalarHeader()) {
2720
2720
auto *PhiR = cast<VPIRInstruction>(&R);
2721
2721
auto *Phi = dyn_cast<PHINode>(&PhiR->getInstruction());
@@ -2728,11 +2728,25 @@ void InnerLoopVectorizer::createInductionResumeVPValues(
2728
2728
createInductionResumeVPValue(PhiR, II, getExpandedStep(II, ExpandedSCEVs),
2729
2729
LoopBypassBlocks, ScalarPHBuilder,
2730
2730
MainVectorTripCount);
2731
+ auto *ConstStart = dyn_cast<ConstantInt>(II.getStartValue());
2732
+ auto *ConstStep = II.getConstIntStepValue();
2733
+ if (Phi->getType() == VectorTripCount->getType() && ConstStart &&
2734
+ ConstStart->isZero() && ConstStep && ConstStep->isOne())
2735
+ HasCanonical = true;
2731
2736
}
2737
+
2738
+ if (!IVSubset || HasCanonical)
2739
+ return;
2740
+ // When vectorizing the epilogue, create a resume phi for the canonical IV if
2741
+ // no suitable resume phi was already created.
2742
+ ScalarPHBuilder.createNaryOp(
2743
+ VPInstruction::ResumePhi,
2744
+ {Plan.getOrAddLiveIn(VectorTripCount),
2745
+ Plan.getOrAddLiveIn(ConstantInt::get(VectorTripCount->getType(), 0))},
2746
+ {}, "vec.epilog.resume.val");
2732
2747
}
2733
2748
2734
- std::pair<BasicBlock *, Value *>
2735
- InnerLoopVectorizer::createVectorizedLoopSkeleton(
2749
+ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
2736
2750
const SCEV2ValueTy &ExpandedSCEVs) {
2737
2751
/*
2738
2752
In this function we generate a new loop. The new loop will contain
@@ -2792,7 +2806,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
2792
2806
// Emit phis for the new starting index of the scalar loop.
2793
2807
createInductionResumeVPValues(ExpandedSCEVs);
2794
2808
2795
- return { LoopVectorPreHeader, nullptr} ;
2809
+ return LoopVectorPreHeader;
2796
2810
}
2797
2811
2798
2812
// Fix up external users of the induction variable. At this point, we are
@@ -7740,10 +7754,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7740
7754
7741
7755
// 1. Set up the skeleton for vectorization, including vector pre-header and
7742
7756
// middle block. The vector loop is created during VPlan execution.
7743
- Value *CanonicalIVStartValue;
7744
- std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7745
- ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
7746
- : State.ExpandedSCEVs);
7757
+ State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
7758
+ ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7747
7759
if (VectorizingEpilogue)
7748
7760
VPlanTransforms::removeDeadRecipes(BestVPlan);
7749
7761
@@ -7781,8 +7793,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7781
7793
7782
7794
// 2. Copy and widen instructions from the old loop into the new loop.
7783
7795
BestVPlan.prepareToExecute(ILV.getTripCount(),
7784
- ILV.getOrCreateVectorTripCount(nullptr),
7785
- CanonicalIVStartValue, State);
7796
+ ILV.getOrCreateVectorTripCount(nullptr), State);
7786
7797
VPlanTransforms::convertToConcreteRecipes(BestVPlan);
7787
7798
7788
7799
BestVPlan.execute(&State);
@@ -7859,8 +7870,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7859
7870
7860
7871
/// This function is partially responsible for generating the control flow
7861
7872
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7862
- std::pair<BasicBlock *, Value *>
7863
- EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7873
+ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7864
7874
const SCEV2ValueTy &ExpandedSCEVs) {
7865
7875
createVectorLoopSkeleton("");
7866
7876
@@ -7904,7 +7914,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7904
7914
}
7905
7915
createInductionResumeVPValues(ExpandedSCEVs, nullptr, &WideIVs);
7906
7916
7907
- return { LoopVectorPreHeader, nullptr} ;
7917
+ return LoopVectorPreHeader;
7908
7918
}
7909
7919
7910
7920
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7984,7 +7994,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7984
7994
7985
7995
/// This function is partially responsible for generating the control flow
7986
7996
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7987
- std::pair< BasicBlock *, Value *>
7997
+ BasicBlock *
7988
7998
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7989
7999
const SCEV2ValueTy &ExpandedSCEVs) {
7990
8000
createVectorLoopSkeleton("vec.epilog.");
@@ -8068,30 +8078,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
8068
8078
Phi->removeIncomingValue(EPI.MemSafetyCheck);
8069
8079
}
8070
8080
8071
- // Generate a resume phi for the canonical induction of the vector epilogue
8072
- // and put it in the vector epilogue preheader, unless such a phi already
8073
- // exists there - and can be reused.
8074
- PHINode *EPResumeVal = nullptr;
8075
- Type *IdxTy = Legal->getWidestInductionType();
8076
- Value *TC = EPI.VectorTripCount;
8077
- Constant *Init = ConstantInt::get(IdxTy, 0);
8078
-
8079
- for (PHINode &P : LoopVectorPreHeader->phis()) {
8080
- if (P.getType() == IdxTy &&
8081
- P.getIncomingValueForBlock(VecEpilogueIterationCountCheck) == TC &&
8082
- P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck) == Init) {
8083
- EPResumeVal = &P;
8084
- EPResumeVal->setName("vec.epilog.resume.val");
8085
- break;
8086
- }
8087
- }
8088
- if (!EPResumeVal) {
8089
- EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
8090
- EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
8091
- EPResumeVal->addIncoming(TC, VecEpilogueIterationCountCheck);
8092
- EPResumeVal->addIncoming(Init, EPI.MainLoopIterationCountCheck);
8093
- }
8094
-
8095
8081
// Generate induction resume values. These variables save the new starting
8096
8082
// indexes for the scalar loop. They are used to test if there are any tail
8097
8083
// iterations left once the vector loop has completed.
@@ -8100,7 +8086,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
8100
8086
// the trip count of the main vector loop, passed as the second argument.
8101
8087
createInductionResumeVPValues(ExpandedSCEVs, EPI.VectorTripCount);
8102
8088
8103
- return { LoopVectorPreHeader, EPResumeVal} ;
8089
+ return LoopVectorPreHeader;
8104
8090
}
8105
8091
8106
8092
BasicBlock *
@@ -9993,7 +9979,8 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9993
9979
/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
9994
9980
static void
9995
9981
preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
9996
- const SCEV2ValueTy &ExpandedSCEVs) {
9982
+ const SCEV2ValueTy &ExpandedSCEVs,
9983
+ const EpilogueLoopVectorizationInfo &EPI) {
9997
9984
VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
9998
9985
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9999
9986
Header->setName("vec.epilog.vector.body");
@@ -10016,12 +10003,53 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
10016
10003
ExpandR->eraseFromParent();
10017
10004
}
10018
10005
10019
- // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10020
- // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10021
- // before vectorizing the epilogue loop.
10006
+ // Ensure that the start values for all header phi recipes are updated before
10007
+ // vectorizing the epilogue loop.
10022
10008
for (VPRecipeBase &R : Header->phis()) {
10023
- if (isa<VPCanonicalIVPHIRecipe>(&R))
10009
+ if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
10010
+ // When vectorizing the epilogue loop, the canonical induction start
10011
+ // value needs to be changed from zero to the value after the main
10012
+ // vector loop. Find the resume value created during execution of the main
10013
+ // VPlan.
10014
+ // FIXME: Improve modeling for canonical IV start values in the epilogue
10015
+ // loop.
10016
+ BasicBlock *MainMiddle = find_singleton<BasicBlock>(
10017
+ predecessors(L->getLoopPreheader()),
10018
+ [&EPI](BasicBlock *BB, bool) -> BasicBlock * {
10019
+ if (BB != EPI.MainLoopIterationCountCheck &&
10020
+ BB != EPI.EpilogueIterationCountCheck &&
10021
+ BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
10022
+ return BB;
10023
+ return nullptr;
10024
+ });
10025
+ using namespace llvm::PatternMatch;
10026
+ Type *IdxTy = IV->getScalarType();
10027
+ PHINode *EPResumeVal = find_singleton<PHINode>(
10028
+ L->getLoopPreheader()->phis(),
10029
+ [&EPI, IdxTy, MainMiddle](PHINode &P, bool) -> PHINode * {
10030
+ if (P.getType() == IdxTy &&
10031
+ P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
10032
+ match(
10033
+ P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
10034
+ m_SpecificInt(0)))
10035
+ return &P;
10036
+ return nullptr;
10037
+ });
10038
+ assert(EPResumeVal && "must have a resume value for the canonical IV");
10039
+ VPValue *VPV = Plan.getOrAddLiveIn(EPResumeVal);
10040
+ assert(all_of(IV->users(),
10041
+ [](const VPUser *U) {
10042
+ return isa<VPScalarIVStepsRecipe>(U) ||
10043
+ isa<VPScalarCastRecipe>(U) ||
10044
+ isa<VPDerivedIVRecipe>(U) ||
10045
+ cast<VPInstruction>(U)->getOpcode() ==
10046
+ Instruction::Add;
10047
+ }) &&
10048
+ "the canonical IV should only be used by its increment or "
10049
+ "ScalarIVSteps when resetting the start value");
10050
+ IV->setOperand(0, VPV);
10024
10051
continue;
10052
+ }
10025
10053
10026
10054
Value *ResumeV = nullptr;
10027
10055
// TODO: Move setting of resume values to prepareToExecute.
@@ -10425,7 +10453,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10425
10453
ORE, EPI, &LVL, &CM, BFI, PSI,
10426
10454
Checks, BestEpiPlan);
10427
10455
EpilogILV.setTripCount(MainILV.getTripCount());
10428
- preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs);
10456
+ preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI );
10429
10457
10430
10458
assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10431
10459
"DT not preserved correctly");
0 commit comments