@@ -831,7 +831,9 @@ struct AAAMDSizeRangeAttribute
831
831
const std::string getAsStr (Attributor *) const override {
832
832
std::string Str;
833
833
raw_string_ostream OS (Str);
834
- OS << getName () << ' [' ;
834
+ OS << getName () << " Known[" ;
835
+ OS << getKnown ().getLower () << ' ,' << getKnown ().getUpper () - 1 ;
836
+ OS << " ] Assumed[" ;
835
837
OS << getAssumed ().getLower () << ' ,' << getAssumed ().getUpper () - 1 ;
836
838
OS << ' ]' ;
837
839
return OS.str ();
@@ -1044,60 +1046,40 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1044
1046
AAAMDWavesPerEU (const IRPosition &IRP, Attributor &A)
1045
1047
: AAAMDSizeRangeAttribute(IRP, A, " amdgpu-waves-per-eu" ) {}
1046
1048
1047
- bool isValidState () const override {
1048
- return !Assumed.isEmptySet () && IntegerRangeState::isValidState ();
1049
- }
1050
-
1051
1049
void initialize (Attributor &A) override {
1052
1050
Function *F = getAssociatedFunction ();
1053
1051
auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
1054
1052
1055
- if (const auto *AssumedGroupSize = A.getAAFor <AAAMDFlatWorkGroupSize>(
1056
- *this , IRPosition::function (*F), DepClassTy::REQUIRED);
1057
- AssumedGroupSize->isValidState ()) {
1053
+ // We allow consistent WavesPerEU for all functions here but for non-entry
1054
+ // points we will verify consistency in the end.
1055
+ unsigned ImpliedMin, ImpliedMax;
1056
+ std::tie (ImpliedMin, ImpliedMax) =
1057
+ InfoCache.getWavesPerEU (*F, InfoCache.getFlatWorkGroupSizes (*F));
1058
1058
1059
- unsigned Min, Max;
1060
- std::tie (Min, Max) = InfoCache.getWavesPerEU (
1061
- *F, {AssumedGroupSize->getAssumed ().getLower ().getZExtValue (),
1062
- AssumedGroupSize->getAssumed ().getUpper ().getZExtValue () - 1 });
1063
-
1064
- ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1065
- intersectKnown (Range);
1066
- }
1059
+ ConstantRange Range (APInt (32 , ImpliedMin), APInt (32 , ImpliedMax + 1 ));
1060
+ intersectKnown (Range);
1067
1061
1068
- if (AMDGPU::isEntryFunctionCC (F->getCallingConv ()))
1062
+ // For entries we cannot derive anything better.
1063
+ if (AMDGPU::isEntryFunctionCC (getAssociatedFunction ()->getCallingConv ()))
1069
1064
indicatePessimisticFixpoint ();
1070
1065
}
1071
1066
1072
1067
ChangeStatus updateImpl (Attributor &A) override {
1073
- auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
1074
1068
ChangeStatus Change = ChangeStatus::UNCHANGED;
1075
1069
1076
1070
auto CheckCallSite = [&](AbstractCallSite CS) {
1077
1071
Function *Caller = CS.getInstruction ()->getFunction ();
1078
- Function *Func = getAssociatedFunction ();
1072
+ [[maybe_unused]] Function *Func = getAssociatedFunction ();
1079
1073
LLVM_DEBUG (dbgs () << ' [' << getName () << " ] Call " << Caller->getName ()
1080
1074
<< " ->" << Func->getName () << ' \n ' );
1081
1075
1082
1076
const auto *CallerInfo = A.getAAFor <AAAMDWavesPerEU>(
1083
1077
*this , IRPosition::function (*Caller), DepClassTy::REQUIRED);
1084
- const auto *AssumedGroupSize = A.getAAFor <AAAMDFlatWorkGroupSize>(
1085
- *this , IRPosition::function (*Func), DepClassTy::REQUIRED);
1086
- if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState () ||
1087
- !AssumedGroupSize->isValidState ())
1078
+ if (!CallerInfo || !CallerInfo->isValidState ())
1088
1079
return false ;
1089
1080
1090
- unsigned Min, Max;
1091
- std::tie (Min, Max) = InfoCache.getEffectiveWavesPerEU (
1092
- *Caller,
1093
- {CallerInfo->getAssumed ().getLower ().getZExtValue (),
1094
- CallerInfo->getAssumed ().getUpper ().getZExtValue () - 1 },
1095
- {AssumedGroupSize->getAssumed ().getLower ().getZExtValue (),
1096
- AssumedGroupSize->getAssumed ().getUpper ().getZExtValue () - 1 });
1097
- ConstantRange CallerRange (APInt (32 , Min), APInt (32 , Max + 1 ));
1098
- IntegerRangeState CallerRangeState (CallerRange);
1099
- Change |= clampStateAndIndicateChange (this ->getState (), CallerRangeState);
1100
-
1081
+ Change |=
1082
+ clampStateAndIndicateChange (this ->getState (), CallerInfo->getState ());
1101
1083
return true ;
1102
1084
};
1103
1085
@@ -1113,8 +1095,28 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1113
1095
Attributor &A);
1114
1096
1115
1097
ChangeStatus manifest (Attributor &A) override {
1098
+ unsigned ImpliedMin = getAssumed ().getLower ().getZExtValue ();
1099
+ unsigned ImpliedMax = getAssumed ().getUpper ().getZExtValue () - 1 ;
1100
+
1116
1101
Function *F = getAssociatedFunction ();
1117
1102
auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
1103
+
1104
+ // Make non-kernel functions locally consistent.
1105
+ if (!AMDGPU::isEntryFunctionCC (getAssociatedFunction ()->getCallingConv ())) {
1106
+ const auto *AssumedGroupSize = A.getAAFor <AAAMDFlatWorkGroupSize>(
1107
+ *this , getIRPosition (), DepClassTy::OPTIONAL);
1108
+ std::pair<unsigned , unsigned > FlatWorkGroupSize;
1109
+ if (!AssumedGroupSize || !AssumedGroupSize->isValidState ())
1110
+ FlatWorkGroupSize = InfoCache.getFlatWorkGroupSizes (*F);
1111
+ else
1112
+ FlatWorkGroupSize = {
1113
+ AssumedGroupSize->getAssumed ().getLower ().getZExtValue (),
1114
+ AssumedGroupSize->getAssumed ().getUpper ().getZExtValue () - 1 };
1115
+
1116
+ std::tie (ImpliedMin, ImpliedMax) = InfoCache.getEffectiveWavesPerEU (
1117
+ *F, {ImpliedMin, ImpliedMax}, FlatWorkGroupSize);
1118
+ }
1119
+
1118
1120
unsigned Max = InfoCache.getMaxWavesPerEU (*F);
1119
1121
return emitAttributeIfNotDefault (A, 1 , Max);
1120
1122
}
@@ -1295,10 +1297,10 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1295
1297
A.getOrCreateAAFor <AAUniformWorkGroupSize>(IRPosition::function (*F));
1296
1298
A.getOrCreateAAFor <AAAMDMaxNumWorkgroups>(IRPosition::function (*F));
1297
1299
A.getOrCreateAAFor <AAAMDGPUNoAGPR>(IRPosition::function (*F));
1300
+ A.getOrCreateAAFor <AAAMDWavesPerEU>(IRPosition::function (*F));
1298
1301
CallingConv::ID CC = F->getCallingConv ();
1299
1302
if (!AMDGPU::isEntryFunctionCC (CC)) {
1300
1303
A.getOrCreateAAFor <AAAMDFlatWorkGroupSize>(IRPosition::function (*F));
1301
- A.getOrCreateAAFor <AAAMDWavesPerEU>(IRPosition::function (*F));
1302
1304
} else if (CC == CallingConv::AMDGPU_KERNEL) {
1303
1305
addPreloadKernArgHint (*F, TM);
1304
1306
}
0 commit comments