@@ -467,11 +467,12 @@ class InnerLoopVectorizer {
467
467
ElementCount MinProfitableTripCount,
468
468
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
469
469
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
470
- ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
470
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
471
+ VPlan &Plan)
471
472
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
472
473
AC (AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
473
474
Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
474
- PSI(PSI), RTChecks(RTChecks) {
475
+ PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
475
476
// Query this against the original loop and save it here because the profile
476
477
// of the original loop header may change as the transformation happens.
477
478
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize (
@@ -522,7 +523,7 @@ class InnerLoopVectorizer {
522
523
// / and the resume values can come from an additional bypass block, the \p
523
524
// / AdditionalBypass pair provides information about the bypass block and the
524
525
// / end value on the edge from bypass to this loop.
525
- PHINode * createInductionResumeValue (
526
+ void createInductionResumeValue (
526
527
PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
527
528
ArrayRef<BasicBlock *> BypassBlocks,
528
529
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr , nullptr });
@@ -535,6 +536,11 @@ class InnerLoopVectorizer {
535
536
// / count of the original loop for both main loop and epilogue vectorization.
536
537
void setTripCount (Value *TC) { TripCount = TC; }
537
538
539
+ std::pair<BasicBlock *, Value *>
540
+ getInductionBypassValue (PHINode *OrigPhi) const {
541
+ return InductionBypassValues.find (OrigPhi)->second ;
542
+ }
543
+
538
544
protected:
539
545
friend class LoopVectorizationPlanner ;
540
546
@@ -680,6 +686,11 @@ class InnerLoopVectorizer {
680
686
// / Structure to hold information about generated runtime checks, responsible
681
687
// / for cleaning the checks, if vectorization turns out unprofitable.
682
688
GeneratedRTChecks &RTChecks;
689
+
690
+ // / Mapping of induction phis to their bypass values and bypass blocks.
691
+ DenseMap<PHINode *, std::pair<BasicBlock *, Value *>> InductionBypassValues;
692
+
693
+ VPlan &Plan;
683
694
};
684
695
685
696
// / Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -721,10 +732,10 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
721
732
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
722
733
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
723
734
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
724
- GeneratedRTChecks &Checks)
735
+ GeneratedRTChecks &Checks, VPlan &Plan )
725
736
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
726
737
EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
727
- CM, BFI, PSI, Checks),
738
+ CM, BFI, PSI, Checks, Plan ),
728
739
EPI (EPI) {}
729
740
730
741
// Override this function to handle the more complex control flow around the
@@ -761,9 +772,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
761
772
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
762
773
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
763
774
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
764
- GeneratedRTChecks &Check)
775
+ GeneratedRTChecks &Check, VPlan &Plan )
765
776
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
766
- EPI, LVL, CM, BFI, PSI, Check) {}
777
+ EPI, LVL, CM, BFI, PSI, Check, Plan ) {}
767
778
// / Implements the interface for creating a vectorized skeleton using the
768
779
// / *main loop* strategy (ie the first pass of vplan execution).
769
780
std::pair<BasicBlock *, Value *>
@@ -790,9 +801,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
790
801
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
791
802
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
792
803
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
793
- GeneratedRTChecks &Checks)
804
+ GeneratedRTChecks &Checks, VPlan &Plan )
794
805
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
795
- EPI, LVL, CM, BFI, PSI, Checks) {
806
+ EPI, LVL, CM, BFI, PSI, Checks, Plan ) {
796
807
TripCount = EPI.TripCount ;
797
808
}
798
809
// / Implements the interface for creating a vectorized skeleton using the
@@ -2555,7 +2566,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2555
2566
nullptr , Twine (Prefix) + " scalar.ph" );
2556
2567
}
2557
2568
2558
- PHINode *InnerLoopVectorizer::createInductionResumeValue (
2569
+ static void addOperandToPhiInVPIRBasicBlock (VPIRBasicBlock *VPBB, PHINode *P,
2570
+ VPValue *Op) {
2571
+ for (VPRecipeBase &R : *VPBB) {
2572
+ auto *IRI = cast<VPIRInstruction>(&R);
2573
+ if (&IRI->getInstruction () == P) {
2574
+ IRI->addOperand (Op);
2575
+ break ;
2576
+ }
2577
+ }
2578
+ }
2579
+
2580
+ void InnerLoopVectorizer::createInductionResumeValue (
2559
2581
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2560
2582
ArrayRef<BasicBlock *> BypassBlocks,
2561
2583
std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -2590,27 +2612,28 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
2590
2612
}
2591
2613
}
2592
2614
2593
- // Create phi nodes to merge from the backedge-taken check block.
2594
- PHINode *BCResumeVal =
2595
- PHINode::Create (OrigPhi->getType (), 3 , " bc.resume.val" ,
2596
- LoopScalarPreHeader->getFirstNonPHIIt ());
2597
- // Copy original phi DL over to the new one.
2598
- BCResumeVal->setDebugLoc (OrigPhi->getDebugLoc ());
2615
+ VPBasicBlock *MiddleVPBB =
2616
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getSingleSuccessor ());
2599
2617
2600
- // The new PHI merges the original incoming value, in case of a bypass,
2601
- // or the value at the end of the vectorized loop.
2602
- BCResumeVal->addIncoming (EndValue, LoopMiddleBlock);
2618
+ VPBasicBlock *ScalarPHVPBB = nullptr ;
2619
+ if (MiddleVPBB->getNumSuccessors () == 2 ) {
2620
+ // Order is strict: first is the exit block, second is the scalar preheader.
2621
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors ()[1 ]);
2622
+ } else {
2623
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor ());
2624
+ }
2603
2625
2604
- // Fix the scalar body counter (PHI node).
2605
- // The old induction's phi node in the scalar body needs the truncated
2606
- // value.
2607
- for (BasicBlock *BB : BypassBlocks)
2608
- BCResumeVal-> addIncoming (II. getStartValue ( ), BB );
2626
+ VPBuilder ScalarPHBuilder (ScalarPHVPBB);
2627
+ auto *ResumePhiRecipe = ScalarPHBuilder. createNaryOp (
2628
+ VPInstruction::ResumePhi,
2629
+ {Plan. getOrAddLiveIn (EndValue), Plan. getOrAddLiveIn (II. getStartValue ())},
2630
+ OrigPhi-> getDebugLoc ( ), " bc.resume.val " );
2609
2631
2610
- if (AdditionalBypass.first )
2611
- BCResumeVal->setIncomingValueForBlock (AdditionalBypass.first ,
2612
- EndValueFromAdditionalBypass);
2613
- return BCResumeVal;
2632
+ auto *ScalarLoopHeader =
2633
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ());
2634
+ addOperandToPhiInVPIRBasicBlock (ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
2635
+ InductionBypassValues[OrigPhi] = {AdditionalBypass.first ,
2636
+ EndValueFromAdditionalBypass};
2614
2637
}
2615
2638
2616
2639
// / Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2643,10 +2666,8 @@ void InnerLoopVectorizer::createInductionResumeValues(
2643
2666
for (const auto &InductionEntry : Legal->getInductionVars ()) {
2644
2667
PHINode *OrigPhi = InductionEntry.first ;
2645
2668
const InductionDescriptor &II = InductionEntry.second ;
2646
- PHINode *BCResumeVal = createInductionResumeValue (
2647
- OrigPhi, II, getExpandedStep (II, ExpandedSCEVs), LoopBypassBlocks,
2648
- AdditionalBypass);
2649
- OrigPhi->setIncomingValueForBlock (LoopScalarPreHeader, BCResumeVal);
2669
+ createInductionResumeValue (OrigPhi, II, getExpandedStep (II, ExpandedSCEVs),
2670
+ LoopBypassBlocks, AdditionalBypass);
2650
2671
}
2651
2672
}
2652
2673
@@ -7688,6 +7709,25 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7688
7709
// the second pass for the scalar loop. The induction resume values for the
7689
7710
// inductions in the epilogue loop are created before executing the plan for
7690
7711
// the epilogue loop.
7712
+ for (VPRecipeBase &R :
7713
+ Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
7714
+ // Create induction resume values for both widened pointer and
7715
+ // integer/fp inductions and update the start value of the induction
7716
+ // recipes to use the resume value.
7717
+ PHINode *IndPhi = nullptr ;
7718
+ const InductionDescriptor *ID;
7719
+ if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
7720
+ IndPhi = cast<PHINode>(Ind->getUnderlyingValue ());
7721
+ ID = &Ind->getInductionDescriptor ();
7722
+ } else if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
7723
+ IndPhi = WidenInd->getPHINode ();
7724
+ ID = &WidenInd->getInductionDescriptor ();
7725
+ } else
7726
+ continue ;
7727
+
7728
+ createInductionResumeValue (IndPhi, *ID, getExpandedStep (*ID, ExpandedSCEVs),
7729
+ LoopBypassBlocks);
7730
+ }
7691
7731
7692
7732
return {LoopVectorPreHeader, nullptr };
7693
7733
}
@@ -8865,14 +8905,9 @@ static void addLiveOutsForFirstOrderRecurrences(
8865
8905
VPInstruction::ResumePhi, {Resume, FOR->getStartValue ()}, {},
8866
8906
" scalar.recur.init" );
8867
8907
auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr ());
8868
- for (VPRecipeBase &R :
8869
- *cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ())) {
8870
- auto *IRI = cast<VPIRInstruction>(&R);
8871
- if (&IRI->getInstruction () == FORPhi) {
8872
- IRI->addOperand (ResumePhiRecipe);
8873
- break ;
8874
- }
8875
- }
8908
+ addOperandToPhiInVPIRBasicBlock (
8909
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ()), FORPhi,
8910
+ ResumePhiRecipe);
8876
8911
8877
8912
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
8878
8913
// Extract the penultimate value of the recurrence and use it as operand for
@@ -9599,7 +9634,7 @@ static bool processLoopInVPlanNativePath(
9599
9634
GeneratedRTChecks Checks (*PSE.getSE (), DT, LI, TTI,
9600
9635
F->getDataLayout (), AddBranchWeights);
9601
9636
InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width ,
9602
- VF.Width , 1 , LVL, &CM, BFI, PSI, Checks);
9637
+ VF.Width , 1 , LVL, &CM, BFI, PSI, Checks, BestPlan );
9603
9638
LLVM_DEBUG (dbgs () << " Vectorizing outer loop in \" "
9604
9639
<< L->getHeader ()->getParent ()->getName () << " \"\n " );
9605
9640
LVP.executePlan (VF.Width , 1 , BestPlan, LB, DT, false );
@@ -10087,11 +10122,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10087
10122
assert (IC > 1 && " interleave count should not be 1 or 0" );
10088
10123
// If we decided that it is not legal to vectorize the loop, then
10089
10124
// interleave it.
10125
+ VPlan &BestPlan = LVP.getPlanFor (VF.Width );
10090
10126
InnerLoopVectorizer Unroller (
10091
10127
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed (1 ),
10092
- ElementCount::getFixed (1 ), IC, &LVL, &CM, BFI, PSI, Checks);
10128
+ ElementCount::getFixed (1 ), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan );
10093
10129
10094
- VPlan &BestPlan = LVP.getPlanFor (VF.Width );
10095
10130
LVP.executePlan (VF.Width , IC, BestPlan, Unroller, DT, false );
10096
10131
10097
10132
ORE->emit ([&]() {
@@ -10113,10 +10148,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10113
10148
// to be vectorized by executing the plan (potentially with a different
10114
10149
// factor) again shortly afterwards.
10115
10150
EpilogueLoopVectorizationInfo EPI (VF.Width , IC, EpilogueVF.Width , 1 );
10151
+ std::unique_ptr<VPlan> BestMainPlan (BestPlan.duplicate ());
10116
10152
EpilogueVectorizerMainLoop MainILV (L, PSE, LI, DT, TLI, TTI, AC, ORE,
10117
- EPI, &LVL, &CM, BFI, PSI, Checks);
10153
+ EPI, &LVL, &CM, BFI, PSI, Checks,
10154
+ *BestMainPlan);
10118
10155
10119
- std::unique_ptr<VPlan> BestMainPlan (BestPlan.duplicate ());
10120
10156
auto ExpandedSCEVs = LVP.executePlan (EPI.MainLoopVF , EPI.MainLoopUF ,
10121
10157
*BestMainPlan, MainILV, DT, true );
10122
10158
++LoopsVectorized;
@@ -10125,11 +10161,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10125
10161
// edges from the first pass.
10126
10162
EPI.MainLoopVF = EPI.EpilogueVF ;
10127
10163
EPI.MainLoopUF = EPI.EpilogueUF ;
10164
+ VPlan &BestEpiPlan = LVP.getPlanFor (EPI.EpilogueVF );
10128
10165
EpilogueVectorizerEpilogueLoop EpilogILV (L, PSE, LI, DT, TLI, TTI, AC,
10129
10166
ORE, EPI, &LVL, &CM, BFI, PSI,
10130
- Checks);
10167
+ Checks, BestEpiPlan );
10131
10168
10132
- VPlan &BestEpiPlan = LVP.getPlanFor (EPI.EpilogueVF );
10133
10169
VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion ();
10134
10170
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock ();
10135
10171
Header->setName (" vec.epilog.vector.body" );
@@ -10178,23 +10214,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10178
10214
RdxDesc.getRecurrenceStartValue ());
10179
10215
}
10180
10216
} else {
10181
- // Create induction resume values for both widened pointer and
10182
- // integer/fp inductions and update the start value of the induction
10183
- // recipes to use the resume value.
10217
+ // Retrieve the induction resume values for wide inductions from
10218
+ // their original phi nodes in the scalar loop
10184
10219
PHINode *IndPhi = nullptr ;
10185
- const InductionDescriptor *ID;
10186
10220
if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10187
10221
IndPhi = cast<PHINode>(Ind->getUnderlyingValue ());
10188
- ID = &Ind->getInductionDescriptor ();
10189
10222
} else {
10190
10223
auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10191
10224
IndPhi = WidenInd->getPHINode ();
10192
- ID = &WidenInd->getInductionDescriptor ();
10193
10225
}
10194
-
10195
- ResumeV = MainILV.createInductionResumeValue (
10196
- IndPhi, *ID, getExpandedStep (*ID, ExpandedSCEVs),
10197
- {EPI.MainLoopIterationCountCheck });
10226
+ ResumeV = IndPhi->getIncomingValueForBlock (L->getLoopPreheader ());
10198
10227
}
10199
10228
assert (ResumeV && " Must have a resume value" );
10200
10229
VPValue *StartVal = BestEpiPlan.getOrAddLiveIn (ResumeV);
@@ -10206,13 +10235,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10206
10235
LVP.executePlan (EPI.EpilogueVF , EPI.EpilogueUF , BestEpiPlan, EpilogILV,
10207
10236
DT, true , &ExpandedSCEVs);
10208
10237
++LoopsEpilogueVectorized;
10238
+ BasicBlock *PH = L->getLoopPreheader ();
10209
10239
10240
+ for (const auto &[IVPhi, _] : LVL.getInductionVars ()) {
10241
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock (PH));
10242
+ const auto &[BB, V] = EpilogILV.getInductionBypassValue (IVPhi);
10243
+ Inc->setIncomingValueForBlock (BB, V);
10244
+ }
10210
10245
if (!MainILV.areSafetyChecksAdded ())
10211
10246
DisableRuntimeUnroll = true ;
10212
10247
} else {
10213
10248
InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width ,
10214
10249
VF.MinProfitableTripCount , IC, &LVL, &CM, BFI,
10215
- PSI, Checks);
10250
+ PSI, Checks, BestPlan );
10216
10251
LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
10217
10252
++LoopsVectorized;
10218
10253
0 commit comments