@@ -2610,22 +2610,15 @@ void InnerLoopVectorizer::createInductionResumeValue(
2610
2610
assert(VectorTripCount && "Expected valid arguments");
2611
2611
2612
2612
Instruction *OldInduction = Legal->getPrimaryInduction();
2613
- Value *EndValue = nullptr;
2614
2613
Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
2615
2614
if (OrigPhi == OldInduction) {
2616
- // We know what the end value is.
2617
- EndValue = VectorTripCount;
2618
2615
} else {
2619
2616
IRBuilder<> B(LoopVectorPreHeader->getTerminator());
2620
2617
2621
2618
// Fast-math-flags propagate from the original induction instruction.
2622
2619
if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
2623
2620
B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2624
2621
2625
- EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
2626
- Step, II.getKind(), II.getInductionBinOp());
2627
- EndValue->setName("ind.end");
2628
-
2629
2622
// Compute the end value for the additional bypass (if applicable).
2630
2623
if (AdditionalBypass.first) {
2631
2624
B.SetInsertPoint(AdditionalBypass.first,
@@ -2637,26 +2630,6 @@ void InnerLoopVectorizer::createInductionResumeValue(
2637
2630
}
2638
2631
}
2639
2632
2640
- VPBasicBlock *MiddleVPBB =
2641
- cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
2642
-
2643
- VPBasicBlock *ScalarPHVPBB = nullptr;
2644
- if (MiddleVPBB->getNumSuccessors() == 2) {
2645
- // Order is strict: first is the exit block, second is the scalar preheader.
2646
- ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
2647
- } else {
2648
- ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
2649
- }
2650
-
2651
- VPBuilder ScalarPHBuilder(ScalarPHVPBB);
2652
- auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
2653
- VPInstruction::ResumePhi,
2654
- {Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())},
2655
- OrigPhi->getDebugLoc(), "bc.resume.val");
2656
-
2657
- auto *ScalarLoopHeader =
2658
- cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor());
2659
- addOperandToPhiInVPIRBasicBlock(ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
2660
2633
InductionBypassValues[OrigPhi] = {AdditionalBypass.first,
2661
2634
EndValueFromAdditionalBypass};
2662
2635
}
@@ -7704,10 +7677,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7704
7677
ILV.getOrCreateVectorTripCount(nullptr),
7705
7678
CanonicalIVStartValue, State);
7706
7679
7680
+ VPBasicBlock *MiddleVPBB =
7681
+ cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7682
+
7683
+ VPBasicBlock *ScalarPHVPBB = nullptr;
7684
+ if (MiddleVPBB->getNumSuccessors() == 2) {
7685
+ // Order is strict: first is the exit block, second is the scalar
7686
+ // preheader.
7687
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
7688
+ } else {
7689
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
7690
+ }
7691
+
7707
7692
BestVPlan.execute(&State);
7708
7693
7709
7694
// 2.5 Collect reduction resume values.
7710
- auto *ExitVPBB =
7695
+ VPBasicBlock *ExitVPBB =
7711
7696
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7712
7697
for (VPRecipeBase &R : *ExitVPBB) {
7713
7698
createAndCollectMergePhiForReduction(
@@ -7992,6 +7977,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7992
7977
// Generate a resume induction for the vector epilogue and put it in the
7993
7978
// vector epilogue preheader
7994
7979
Type *IdxTy = Legal->getWidestInductionType();
7980
+
7995
7981
PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
7996
7982
EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
7997
7983
EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
@@ -8879,6 +8865,74 @@ addUsersInExitBlock(VPlan &Plan,
8879
8865
}
8880
8866
}
8881
8867
8868
+ static void addResumeValuesForInductions(VPlan &Plan) {
8869
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
8870
+ VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8871
+
8872
+ VPBuilder Builder(
8873
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()));
8874
+ for (VPRecipeBase &R : Header->phis()) {
8875
+ PHINode *OrigPhi;
8876
+ const InductionDescriptor *ID;
8877
+ VPValue *Start;
8878
+ VPValue *Step;
8879
+ Type *ScalarTy;
8880
+ bool IsCanonical = false;
8881
+ if (auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
8882
+ if (WideIV->getTruncInst())
8883
+ continue;
8884
+ OrigPhi = cast<PHINode>(WideIV->getUnderlyingValue());
8885
+ ID = &WideIV->getInductionDescriptor();
8886
+ Start = WideIV->getStartValue();
8887
+ Step = WideIV->getStepValue();
8888
+ ScalarTy = WideIV->getScalarType();
8889
+ IsCanonical = WideIV->isCanonical();
8890
+ } else if (auto *WideIV = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
8891
+ OrigPhi = cast<PHINode>(WideIV->getUnderlyingValue());
8892
+ ID = &WideIV->getInductionDescriptor();
8893
+ Start = WideIV->getStartValue();
8894
+ Step = WideIV->getOperand(1);
8895
+ ScalarTy = Start->getLiveInIRValue()->getType();
8896
+ } else {
8897
+ continue;
8898
+ }
8899
+
8900
+ VPValue *EndValue = &Plan.getVectorTripCount();
8901
+ if (!IsCanonical) {
8902
+ EndValue = Builder.createDerivedIV(
8903
+ ID->getKind(),
8904
+ dyn_cast_or_null<FPMathOperator>(ID->getInductionBinOp()), Start,
8905
+ &Plan.getVectorTripCount(), Step);
8906
+ }
8907
+
8908
+ if (ScalarTy != TypeInfo.inferScalarType(EndValue)) {
8909
+ EndValue =
8910
+ Builder.createScalarCast(Instruction::Trunc, EndValue, ScalarTy);
8911
+ }
8912
+
8913
+ VPBasicBlock *MiddleVPBB =
8914
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
8915
+
8916
+ VPBasicBlock *ScalarPHVPBB = nullptr;
8917
+ if (MiddleVPBB->getNumSuccessors() == 2) {
8918
+ // Order is strict: first is the exit block, second is the scalar
8919
+ // preheader.
8920
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
8921
+ } else {
8922
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
8923
+ }
8924
+
8925
+ VPBuilder ScalarPHBuilder(ScalarPHVPBB);
8926
+ auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
8927
+ VPInstruction::ResumePhi, {EndValue, Start}, OrigPhi->getDebugLoc(),
8928
+ "bc.resume.val");
8929
+
8930
+ auto *ScalarLoopHeader =
8931
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor());
8932
+ addOperandToPhiInVPIRBasicBlock(ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
8933
+ }
8934
+ }
8935
+
8882
8936
/// Handle live-outs for first order reductions, both in the scalar preheader
8883
8937
/// and the original exit block:
8884
8938
/// 1. Feed a resume value for every FOR from the vector loop to the scalar
@@ -9174,6 +9228,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9174
9228
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
9175
9229
addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
9176
9230
addUsersInExitBlock(*Plan, ExitUsersToFix);
9231
+ addResumeValuesForInductions(*Plan);
9177
9232
9178
9233
// ---------------------------------------------------------------------------
9179
9234
// Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -9279,6 +9334,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9279
9334
bool HasNUW = true;
9280
9335
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
9281
9336
DebugLoc());
9337
+ addResumeValuesForInductions(*Plan);
9282
9338
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9283
9339
return Plan;
9284
9340
}
@@ -9562,7 +9618,8 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
9562
9618
State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
9563
9619
Kind, cast_if_present<BinaryOperator>(FPBinOp));
9564
9620
DerivedIV->setName("offset.idx");
9565
- assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9621
+ assert((isa<Constant>(CanonicalIV) || DerivedIV != CanonicalIV) &&
9622
+ "IV didn't need transforming?");
9566
9623
9567
9624
State.set(this, DerivedIV, VPLane(0));
9568
9625
}
@@ -10231,6 +10288,50 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10231
10288
EPI, &LVL, &CM, BFI, PSI, Checks,
10232
10289
*BestMainPlan);
10233
10290
10291
+ VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
10292
+ // Collect PHI nodes of wide inductions in the VPlan for the epilogue. Those will need their resume-values computed from the main vector loop. Others can be removed in the main VPlan.
10293
+ SmallPtrSet<PHINode *, 2> WidenedPhis;
10294
+ for (VPRecipeBase &R :
10295
+ BestEpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
10296
+ if (!isa<VPWidenIntOrFpInductionRecipe,
10297
+ VPWidenPointerInductionRecipe>(&R))
10298
+ continue;
10299
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R))
10300
+ WidenedPhis.insert(
10301
+ cast<VPWidenIntOrFpInductionRecipe>(&R)->getPHINode());
10302
+ else
10303
+ WidenedPhis.insert(
10304
+ cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
10305
+ }
10306
+ VPBasicBlock *MiddleVPBB = cast<VPBasicBlock>(
10307
+ BestMainPlan->getVectorLoopRegion()->getSingleSuccessor());
10308
+
10309
+ VPBasicBlock *ScalarPHVPBB = nullptr;
10310
+ if (MiddleVPBB->getNumSuccessors() == 2) {
10311
+ // Order is strict: first is the exit block, second is the scalar
10312
+ // preheader.
10313
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
10314
+ } else {
10315
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
10316
+ }
10317
+
10318
+ for (VPRecipeBase &R :
10319
+ *cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor())) {
10320
+ auto *VPIRInst = cast<VPIRInstruction>(&R);
10321
+ auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
10322
+ if (!IRI)
10323
+ break;
10324
+ if (WidenedPhis.contains(IRI) ||
10325
+ !LVL.getInductionVars().contains(IRI))
10326
+ continue;
10327
+ VPRecipeBase *ResumePhi =
10328
+ VPIRInst->getOperand(0)->getDefiningRecipe();
10329
+ VPIRInst->setOperand(0, BestMainPlan->getOrAddLiveIn(
10330
+ Constant::getNullValue(IRI->getType())));
10331
+ ResumePhi->eraseFromParent();
10332
+ }
10333
+ VPlanTransforms::removeDeadRecipes(*BestMainPlan);
10334
+
10234
10335
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10235
10336
*BestMainPlan, MainILV, DT, true);
10236
10337
++LoopsVectorized;
@@ -10239,7 +10340,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10239
10340
// edges from the first pass.
10240
10341
EPI.MainLoopVF = EPI.EpilogueVF;
10241
10342
EPI.MainLoopUF = EPI.EpilogueUF;
10242
- VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
10243
10343
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10244
10344
ORE, EPI, &LVL, &CM, BFI, PSI,
10245
10345
Checks, BestEpiPlan);
0 commit comments