@@ -543,6 +543,11 @@ class InnerLoopVectorizer {
 protected:
   friend class LoopVectorizationPlanner;
 
+  /// Set up the values of the IVs correctly when exiting the vector loop.
+  virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
+                            Value *VectorTripCount, BasicBlock *MiddleBlock,
+                            VPTransformState &State);
+
   /// Iteratively sink the scalarized operands of a predicated instruction into
   /// the block that was created for it.
   void sinkScalarOperands(Instruction *PredInst);
@@ -780,6 +785,10 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
   void printDebugTracesAtStart() override;
   void printDebugTracesAtEnd() override;
+
+  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
+                    Value *VectorTripCount, BasicBlock *MiddleBlock,
+                    VPTransformState &State) override {};
 };
 
 // A specialized derived class of inner loop vectorizer that performs
@@ -2773,6 +2782,97 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
   return LoopVectorPreHeader;
 }
 
+// Fix up external users of the induction variable. At this point, we are
+// in LCSSA form, with all external PHIs that use the IV having one input value,
+// coming from the remainder loop. We need those PHIs to also have a correct
+// value for the IV when arriving directly from the middle block.
+void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
+                                       const InductionDescriptor &II,
+                                       Value *VectorTripCount,
+                                       BasicBlock *MiddleBlock,
+                                       VPTransformState &State) {
+  // There are two kinds of external IV usages - those that use the value
+  // computed in the last iteration (the PHI) and those that use the penultimate
+  // value (the value that feeds into the phi from the loop latch).
+  // We allow both, but they, obviously, have different values.
+
+  DenseMap<Value *, Value *> MissingVals;
+
+  Value *EndValue = cast<PHINode>(OrigPhi->getIncomingValueForBlock(
+                                      OrigLoop->getLoopPreheader()))
+                        ->getIncomingValueForBlock(MiddleBlock);
+
+  // An external user of the last iteration's value should see the value that
+  // the remainder loop uses to initialize its own IV.
+  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
+  for (User *U : PostInc->users()) {
+    Instruction *UI = cast<Instruction>(U);
+    if (!OrigLoop->contains(UI)) {
+      assert(isa<PHINode>(UI) && "Expected LCSSA form");
+      MissingVals[UI] = EndValue;
+    }
+  }
+
+  // An external user of the penultimate value needs to see EndValue - Step.
+  // The simplest way to get this is to recompute it from the constituent SCEVs,
+  // that is Start + (Step * (CRD - 1)).
+  for (User *U : OrigPhi->users()) {
+    auto *UI = cast<Instruction>(U);
+    if (!OrigLoop->contains(UI)) {
+      assert(isa<PHINode>(UI) && "Expected LCSSA form");
+      IRBuilder<> B(MiddleBlock->getTerminator());
+
+      // Fast-math-flags propagate from the original induction instruction.
+      if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
+        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
+
+      VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
+      assert(StepVPV && "step must have been expanded during VPlan execution");
+      Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
+                                        : State.get(StepVPV, VPLane(0));
+      Value *Escape = nullptr;
+      if (EndValue->getType()->isIntegerTy())
+        Escape = B.CreateSub(EndValue, Step);
+      else if (EndValue->getType()->isPointerTy())
+        Escape = B.CreatePtrAdd(EndValue, B.CreateNeg(Step));
+      else {
+        assert(EndValue->getType()->isFloatingPointTy() &&
+               "Unexpected induction type");
+        Escape = B.CreateBinOp(II.getInductionBinOp()->getOpcode() ==
+                                       Instruction::FAdd
+                                   ? Instruction::FSub
+                                   : Instruction::FAdd,
+                               EndValue, Step);
+      }
+      Escape->setName("ind.escape");
+      MissingVals[UI] = Escape;
+    }
+  }
+
+  assert((MissingVals.empty() ||
+          all_of(MissingVals,
+                 [MiddleBlock, this](const std::pair<Value *, Value *> &P) {
+                   return all_of(
+                       predecessors(cast<Instruction>(P.first)->getParent()),
+                       [MiddleBlock, this](BasicBlock *Pred) {
+                         return Pred == MiddleBlock ||
+                                Pred == OrigLoop->getLoopLatch();
+                       });
+                 })) &&
+         "Expected escaping values from latch/middle.block only");
+
+  for (auto &I : MissingVals) {
+    PHINode *PHI = cast<PHINode>(I.first);
+    // One corner case we have to handle is two IVs "chasing" each other,
+    // that is %IV2 = phi [...], [ %IV1, %latch ]
+    // In this case, if IV1 has an external use, we need to avoid adding both
+    // "last value of IV1" and "penultimate value of IV2". So, verify that we
+    // don't already have an incoming value for the middle block.
+    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
+      PHI->addIncoming(I.second, MiddleBlock);
+  }
+}
+
 namespace {
 
 struct CSEDenseMapInfo {
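For context, here is a hand-written LLVM IR sketch of the two kinds of escaping IV users that fixupIVUsers patches up. It is not taken from the patch, and all block and value names (%ind.end, %ind.escape, %for.body, and so on) are illustrative only: an LCSSA phi of the post-increment value receives the vector loop's end value, while a phi of the IV itself (the penultimate value) receives the recomputed "ind.escape" = EndValue - Step.

; middle.block is reached when the vector loop finishes; %ind.end is the value
; the scalar remainder loop would resume from (EndValue in the code above).
middle.block:
  %ind.escape = sub i64 %ind.end, %step
  br i1 %cmp.n, label %for.end, label %scalar.ph

for.end:
  ; Each LCSSA phi gains a second incoming value for the new middle.block edge.
  %use.of.postinc = phi i64 [ %iv.next, %for.body ], [ %ind.end, %middle.block ]
  %use.of.iv      = phi i64 [ %iv, %for.body ], [ %ind.escape, %middle.block ]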
@@ -2899,6 +2999,24 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
     for (PHINode &PN : Exit->phis())
       PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);
 
+  if (Cost->requiresScalarEpilogue(VF.isVector())) {
+    // No edge from the middle block to the unique exit block has been inserted
+    // and there is nothing to fix from the vector loop; phis should have
+    // incoming values from the scalar loop only.
+  } else {
+    // TODO: Check in VPlan to see if IV users need fixing instead of checking
+    // the cost model.
+
+    // If we inserted an edge from the middle block to the unique exit block,
+    // update uses outside the loop (phis) to account for the newly inserted
+    // edge.
+
+    // Fix-up external users of the induction variables.
+    for (const auto &Entry : Legal->getInductionVars())
+      fixupIVUsers(Entry.first, Entry.second,
+                   getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
+  }
+
   // Don't apply optimizations below when no vector region remains, as they all
   // require a vector loop at the moment.
   if (!State.Plan->getVectorLoopRegion())
@@ -8931,9 +9049,11 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
 /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
 /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
 /// the end value of the induction.
-static VPInstruction *addResumePhiRecipeForInduction(
-    VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
-    VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
+static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
+                                               VPBuilder &VectorPHBuilder,
+                                               VPBuilder &ScalarPHBuilder,
+                                               VPTypeAnalysis &TypeInfo,
+                                               VPValue *VectorTC) {
   auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
   // Truncated wide inductions resume from the last lane of their vector value
   // in the last vector iteration which is handled elsewhere.
@@ -8967,10 +9087,8 @@ static VPInstruction *addResumePhiRecipeForInduction(
 
 /// Create resume phis in the scalar preheader for first-order recurrences,
 /// reductions and inductions, and update the VPIRInstructions wrapping the
-/// original phis in the scalar header. End values for inductions are added to
-/// \p IVEndValues.
-static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
-                                DenseMap<VPValue *, VPValue *> &IVEndValues) {
+/// original phis in the scalar header.
+static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
   VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
   auto *ScalarPH = Plan.getScalarPreheader();
   auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
@@ -8987,16 +9105,11 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
     if (!ScalarPhiI)
       break;
 
-    // TODO: Extract final value from induction recipe initially, optimize to
-    // pre-computed end value together in optimizeInductionExitUsers.
     auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.getRecipe(ScalarPhiI));
     if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
-      if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
+      if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
               WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
               &Plan.getVectorTripCount())) {
-        assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi &&
-               "Expected a ResumePhi");
-        IVEndValues[WideIVR] = ResumePhi->getOperand(0);
         ScalarPhiIRI->addOperand(ResumePhi);
         continue;
       }
@@ -9027,6 +9140,65 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
   }
 }
 
+/// Return true if \p VPV is an optimizable IV or IV use. That is, if \p VPV is
+/// either an untruncated wide induction, or if it increments a wide induction
+/// by its step.
+static bool isOptimizableIVOrUse(VPValue *VPV) {
+  VPRecipeBase *Def = VPV->getDefiningRecipe();
+  if (!Def)
+    return false;
+  auto *WideIV = dyn_cast<VPWidenInductionRecipe>(Def);
+  if (WideIV) {
+    // VPV itself is a wide induction, separately compute the end value for exit
+    // users if it is not a truncated IV.
+    return isa<VPWidenPointerInductionRecipe>(WideIV) ||
+           !cast<VPWidenIntOrFpInductionRecipe>(WideIV)->getTruncInst();
+  }
+
+  // Check if VPV is an optimizable induction increment.
+  if (Def->getNumOperands() != 2)
+    return false;
+  WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
+  if (!WideIV)
+    WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
+  if (!WideIV)
+    return false;
+
+  using namespace VPlanPatternMatch;
+  auto &ID = WideIV->getInductionDescriptor();
+
+  // Check if VPV increments the induction by the induction step.
+  VPValue *IVStep = WideIV->getStepValue();
+  switch (ID.getInductionOpcode()) {
+  case Instruction::Add:
+    return match(VPV, m_c_Binary<Instruction::Add>(m_Specific(WideIV),
+                                                   m_Specific(IVStep)));
+  case Instruction::FAdd:
+    return match(VPV, m_c_Binary<Instruction::FAdd>(m_Specific(WideIV),
+                                                    m_Specific(IVStep)));
+  case Instruction::FSub:
+    return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
+                                                  m_Specific(IVStep)));
+  case Instruction::Sub: {
+    // IVStep will be the negated step of the subtraction. Check if Step == -1 *
+    // IVStep.
+    VPValue *Step;
+    if (!match(VPV, m_Binary<Instruction::Sub>(m_VPValue(), m_VPValue(Step))) ||
+        !Step->isLiveIn() || !IVStep->isLiveIn())
+      return false;
+    auto *StepCI = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
+    auto *IVStepCI = dyn_cast<ConstantInt>(IVStep->getLiveInIRValue());
+    return StepCI && IVStepCI &&
+           StepCI->getValue() == (-1 * IVStepCI->getValue());
+  }
+  default:
+    return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
+           match(VPV, m_GetElementPtr(m_Specific(WideIV),
+                                      m_Specific(WideIV->getStepValue())));
+  }
+  llvm_unreachable("should have been covered by switch above");
+}
+
 // Collect VPIRInstructions for phis in the exit blocks that are modeled
 // in VPlan and add the exiting VPValue as operand. Some exiting values are not
 // modeled explicitly yet and won't be included. Those are un-truncated
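As a rough, hand-written illustration of what isOptimizableIVOrUse is meant to accept (not taken from the patch, names are made up, and the function really operates on VPlan recipes rather than on IR): for an untruncated wide induction corresponding to the scalar IV below, both the IV and its increment by the step would be treated as optimizable and therefore skipped in collectUsersInExitBlocks, whereas a truncated use would not.

  %iv       = phi i64 [ 0, %ph ], [ %iv.next, %latch ]  ; untruncated wide IV: optimizable
  %iv.next  = add nuw nsw i64 %iv, 1                    ; IV incremented by its step: optimizable
  %iv.trunc = trunc i64 %iv to i32                      ; truncated IV use: not optimizable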
@@ -9056,6 +9228,12 @@ collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
       }
       Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
       VPValue *V = Builder.getVPValueOrAddLiveIn(IncomingValue);
+      // Exit values for inductions are computed and updated outside of VPlan
+      // and independent of induction recipes.
+      // TODO: Compute induction exit values in VPlan.
+      if (isOptimizableIVOrUse(V) &&
+          ExitVPBB->getSinglePredecessor() == MiddleVPBB)
+        continue;
       ExitUsersToFix.insert(ExitIRI);
       ExitIRI->addOperand(V);
     }
@@ -9075,7 +9253,6 @@ addUsersInExitBlocks(VPlan &Plan,
 
   auto *MiddleVPBB = Plan.getMiddleBlock();
   VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
-  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
 
   // Introduce extract for exiting values and update the VPIRInstructions
   // modeling the corresponding LCSSA phis.
@@ -9397,8 +9574,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     VPlanTransforms::handleUncountableEarlyExit(
         *Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
   }
-  DenseMap<VPValue *, VPValue *> IVEndValues;
-  addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
+  addScalarResumePhis(RecipeBuilder, *Plan);
   SetVector<VPIRInstruction *> ExitUsersToFix =
       collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
   addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
@@ -9481,7 +9657,6 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
     VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
                                        WithoutRuntimeCheck);
   }
-  VPlanTransforms::optimizeInductionExitUsers(*Plan, IVEndValues);
 
   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
   return Plan;
@@ -9533,10 +9708,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
     auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
     RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
   }
-  DenseMap<VPValue *, VPValue *> IVEndValues;
-  // TODO: IVEndValues are not used yet in the native path, to optimize exit
-  // values.
-  addScalarResumePhis(RecipeBuilder, *Plan, IVEndValues);
+  addScalarResumePhis(RecipeBuilder, *Plan);
 
   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
   return Plan;