Skip to content

Commit 05cd1bd

Browse files
committed
[VPlan] Add hasScalarTail, use instead of !CM.foldTailByMasking() (NFC).
Now that VPlan is able to fold away redundant branches to the scalar preheader, we can check directly in VPlan whether the scalar tail may execute. `hasScalarTail` returns true if the tail may execute; we know the scalar tail won't execute if the scalar preheader has no predecessors, i.e. it is not reachable. This removes some late uses of the legacy cost model.
1 parent 8fddef8 commit 05cd1bd

File tree

3 files changed

+33
-20
lines changed

3 files changed

+33
-20
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -535,13 +535,13 @@ class LoopVectorizationPlanner {
535535
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
536536
/// that of B.
537537
bool isMoreProfitable(const VectorizationFactor &A,
538-
const VectorizationFactor &B) const;
538+
const VectorizationFactor &B, bool HasTail) const;
539539

540540
/// Returns true if the per-lane cost of VectorizationFactor A is lower than
541541
/// that of B in the context of vectorizing a loop with known \p MaxTripCount.
542542
bool isMoreProfitable(const VectorizationFactor &A,
543543
const VectorizationFactor &B,
544-
const unsigned MaxTripCount) const;
544+
const unsigned MaxTripCount, bool HasTail) const;
545545

546546
/// Determines if we have the infrastructure to vectorize the loop and its
547547
/// epilogue, assuming the main loop is vectorized by \p VF.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4253,9 +4253,10 @@ static unsigned getEstimatedRuntimeVF(ElementCount VF,
42534253
return EstimatedVF;
42544254
}
42554255

4256-
bool LoopVectorizationPlanner::isMoreProfitable(
4257-
const VectorizationFactor &A, const VectorizationFactor &B,
4258-
const unsigned MaxTripCount) const {
4256+
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
4257+
const VectorizationFactor &B,
4258+
const unsigned MaxTripCount,
4259+
bool HasTail) const {
42594260
InstructionCost CostA = A.Cost;
42604261
InstructionCost CostB = B.Cost;
42614262

@@ -4293,9 +4294,9 @@ bool LoopVectorizationPlanner::isMoreProfitable(
42934294
if (!MaxTripCount)
42944295
return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
42954296

4296-
auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4297-
InstructionCost VectorCost,
4298-
InstructionCost ScalarCost) {
4297+
auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
4298+
InstructionCost VectorCost,
4299+
InstructionCost ScalarCost) {
42994300
// If the trip count is a known (possibly small) constant, the trip count
43004301
// will be rounded up to an integer number of iterations under
43014302
// FoldTailByMasking. The total cost in that case will be
@@ -4304,20 +4305,23 @@ bool LoopVectorizationPlanner::isMoreProfitable(
43044305
// some extra overheads, but for the purpose of comparing the costs of
43054306
// different VFs we can use this to compare the total loop-body cost
43064307
// expected after vectorization.
4307-
if (CM.foldTailByMasking())
4308-
return VectorCost * divideCeil(MaxTripCount, VF);
4309-
return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4308+
if (HasTail)
4309+
return VectorCost * (MaxTripCount / VF) +
4310+
ScalarCost * (MaxTripCount % VF);
4311+
return VectorCost * divideCeil(MaxTripCount, VF);
43104312
};
43114313

43124314
auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
43134315
auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
43144316
return CmpFn(RTCostA, RTCostB);
43154317
}
43164318

4317-
bool LoopVectorizationPlanner::isMoreProfitable(
4318-
const VectorizationFactor &A, const VectorizationFactor &B) const {
4319+
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
4320+
const VectorizationFactor &B,
4321+
bool HasTail) const {
43194322
const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4320-
return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount);
4323+
return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
4324+
HasTail);
43214325
}
43224326

43234327
void LoopVectorizationPlanner::emitInvalidCostRemarks(
@@ -4607,7 +4611,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
46074611
continue;
46084612
}
46094613

4610-
if (isMoreProfitable(Candidate, ChosenFactor))
4614+
if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
46114615
ChosenFactor = Candidate;
46124616
}
46134617
}
@@ -4621,7 +4625,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
46214625
}
46224626

46234627
LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4624-
!isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4628+
!isMoreProfitable(ChosenFactor, ScalarCost,
4629+
!CM.foldTailByMasking())) dbgs()
46254630
<< "LV: Vectorization seems to be not beneficial, "
46264631
<< "but was forced by a user.\n");
46274632
return ChosenFactor;
@@ -4713,7 +4718,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47134718

47144719
if (EpilogueVectorizationForceVF > 1) {
47154720
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4716-
ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
4721+
ElementCount ForcedEC =
4722+
ElementCount::getFixed(EpilogueVectorizationForceVF);
47174723
if (hasPlanWithVF(ForcedEC))
47184724
return {ForcedEC, 0, 0};
47194725

@@ -4787,7 +4793,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47874793
}
47884794

47894795
if (Result.Width.isScalar() ||
4790-
isMoreProfitable(NextVF, Result, MaxTripCount))
4796+
isMoreProfitable(NextVF, Result, MaxTripCount, !CM.foldTailByMasking()))
47914797
Result = NextVF;
47924798
}
47934799

@@ -7540,11 +7546,11 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
75407546

75417547
InstructionCost Cost = cost(*P, VF);
75427548
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7543-
if (isMoreProfitable(CurrentFactor, BestFactor))
7549+
if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
75447550
BestFactor = CurrentFactor;
75457551

75467552
// If profitable add it to ProfitableVF list.
7547-
if (isMoreProfitable(CurrentFactor, ScalarFactor))
7553+
if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
75487554
ProfitableVFs.push_back(CurrentFactor);
75497555
}
75507556
}

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3768,6 +3768,13 @@ class VPlan {
37683768
/// successors of the block in VPlan. The returned block is owned by the VPlan
37693769
/// and deleted once the VPlan is destroyed.
37703770
VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB);
3771+
3772+
/// Returns true if the scalar tail may execute after the vector loop. Note
3773+
/// that this relies on unneeded branches to the scalar tail loop being
3774+
/// removed.
3775+
bool hasScalarTail() const {
3776+
return getScalarPreheader()->getNumPredecessors() != 0;
3777+
}
37713778
};
37723779

37733780
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

0 commit comments

Comments
 (0)