@@ -1695,14 +1695,14 @@ class LoopVectorizationCostModel {
1695
1695
/// elements is a power-of-2 larger than zero. If scalable vectorization is
1696
1696
/// disabled or unsupported, then the scalable part will be equal to
1697
1697
/// ElementCount::getScalable(0).
1698
- FixedScalableVFPair computeFeasibleMaxVF (unsigned ConstTripCount ,
1698
+ FixedScalableVFPair computeFeasibleMaxVF (unsigned MaxTripCount ,
1699
1699
ElementCount UserVF,
1700
1700
bool FoldTailByMasking);
1701
1701
1702
1702
/// \return the maximized element count based on the target's vector
1703
1703
/// registers and the loop trip-count, but limited to a maximum safe VF.
1704
1704
/// This is a helper function of computeFeasibleMaxVF.
1705
- ElementCount getMaximizedVFForTarget (unsigned ConstTripCount ,
1705
+ ElementCount getMaximizedVFForTarget (unsigned MaxTripCount ,
1706
1706
unsigned SmallestType,
1707
1707
unsigned WidestType,
1708
1708
ElementCount MaxSafeVF,
@@ -4809,7 +4809,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4809
4809
}
4810
4810
4811
4811
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF (
4812
- unsigned ConstTripCount , ElementCount UserVF, bool FoldTailByMasking) {
4812
+ unsigned MaxTripCount , ElementCount UserVF, bool FoldTailByMasking) {
4813
4813
MinBWs = computeMinimumValueSizes (TheLoop->getBlocks (), *DB, &TTI);
4814
4814
unsigned SmallestType, WidestType;
4815
4815
std::tie (SmallestType, WidestType) = getSmallestAndWidestTypes ();
@@ -4897,12 +4897,12 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4897
4897
FixedScalableVFPair Result (ElementCount::getFixed (1 ),
4898
4898
ElementCount::getScalable (0 ));
4899
4899
if (auto MaxVF =
4900
- getMaximizedVFForTarget (ConstTripCount , SmallestType, WidestType,
4900
+ getMaximizedVFForTarget (MaxTripCount , SmallestType, WidestType,
4901
4901
MaxSafeFixedVF, FoldTailByMasking))
4902
4902
Result.FixedVF = MaxVF;
4903
4903
4904
4904
if (auto MaxVF =
4905
- getMaximizedVFForTarget (ConstTripCount , SmallestType, WidestType,
4905
+ getMaximizedVFForTarget (MaxTripCount , SmallestType, WidestType,
4906
4906
MaxSafeScalableVF, FoldTailByMasking))
4907
4907
if (MaxVF.isScalable ()) {
4908
4908
Result.ScalableVF = MaxVF;
@@ -4926,6 +4926,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4926
4926
}
4927
4927
4928
4928
unsigned TC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
4929
+ unsigned MaxTC = PSE.getSE ()->getSmallConstantMaxTripCount (TheLoop);
4929
4930
LLVM_DEBUG (dbgs () << " LV: Found trip count: " << TC << ' \n ' );
4930
4931
if (TC == 1 ) {
4931
4932
reportVectorizationFailure (" Single iteration (non) loop" ,
@@ -4936,7 +4937,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4936
4937
4937
4938
switch (ScalarEpilogueStatus) {
4938
4939
case CM_ScalarEpilogueAllowed:
4939
- return computeFeasibleMaxVF (TC , UserVF, false );
4940
+ return computeFeasibleMaxVF (MaxTC , UserVF, false );
4940
4941
case CM_ScalarEpilogueNotAllowedUsePredicate:
4941
4942
[[fallthrough]];
4942
4943
case CM_ScalarEpilogueNotNeededUsePredicate:
@@ -4974,7 +4975,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4974
4975
LLVM_DEBUG (dbgs () << " LV: Cannot fold tail by masking: vectorize with a "
4975
4976
" scalar epilogue instead.\n " );
4976
4977
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4977
- return computeFeasibleMaxVF (TC , UserVF, false );
4978
+ return computeFeasibleMaxVF (MaxTC , UserVF, false );
4978
4979
}
4979
4980
return FixedScalableVFPair::getNone ();
4980
4981
}
@@ -4991,7 +4992,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4991
4992
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue ();
4992
4993
}
4993
4994
4994
- FixedScalableVFPair MaxFactors = computeFeasibleMaxVF (TC , UserVF, true );
4995
+ FixedScalableVFPair MaxFactors = computeFeasibleMaxVF (MaxTC , UserVF, true );
4995
4996
4996
4997
// Avoid tail folding if the trip count is known to be a multiple of any VF
4997
4998
// we choose.
@@ -5067,7 +5068,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5067
5068
}
5068
5069
5069
5070
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget (
5070
- unsigned ConstTripCount , unsigned SmallestType, unsigned WidestType,
5071
+ unsigned MaxTripCount , unsigned SmallestType, unsigned WidestType,
5071
5072
ElementCount MaxSafeVF, bool FoldTailByMasking) {
5072
5073
bool ComputeScalableMaxVF = MaxSafeVF.isScalable ();
5073
5074
const TypeSize WidestRegister = TTI.getRegisterBitWidth (
@@ -5106,24 +5107,24 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5106
5107
}
5107
5108
5108
5109
// When a scalar epilogue is required, at least one iteration of the scalar
5109
- // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a
5110
+ // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
5110
5111
// max VF that results in a dead vector loop.
5111
- if (ConstTripCount > 0 && requiresScalarEpilogue (true ))
5112
- ConstTripCount -= 1 ;
5113
-
5114
- if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
5115
- (!FoldTailByMasking || isPowerOf2_32 (ConstTripCount ))) {
5116
- // If loop trip count (TC) is known at compile time there is no point in
5117
- // choosing VF greater than TC (as done in the loop below). Select maximum
5118
- // power of two which doesn't exceed TC.
5119
- // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5120
- // when the TC is less than or equal to the known number of lanes.
5121
- auto ClampedConstTripCount = llvm::bit_floor (ConstTripCount );
5112
+ if (MaxTripCount > 0 && requiresScalarEpilogue (true ))
5113
+ MaxTripCount -= 1 ;
5114
+
5115
+ if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
5116
+ (!FoldTailByMasking || isPowerOf2_32 (MaxTripCount ))) {
5117
+ // If the upper bound of the loop trip count (TC) is known at compile time,
5118
+ // there is no point in choosing a VF greater than TC (as done in the loop
5119
+ // below). Select the maximum power of two which doesn't exceed TC. If
5120
+ // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
5121
+ // the TC is less than or equal to the known number of lanes.
5122
+ auto ClampedUpperTripCount = llvm::bit_floor (MaxTripCount );
5122
5123
LLVM_DEBUG (dbgs () << " LV: Clamping the MaxVF to maximum power of two not "
5123
5124
" exceeding the constant trip count: "
5124
- << ClampedConstTripCount << " \n " );
5125
+ << ClampedUpperTripCount << " \n " );
5125
5126
return ElementCount::get (
5126
- ClampedConstTripCount ,
5127
+ ClampedUpperTripCount ,
5127
5128
FoldTailByMasking ? MaxVectorElementCount.isScalable () : false );
5128
5129
}
5129
5130
0 commit comments