@@ -1111,47 +1111,25 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1111
1111
Function *F = getAssociatedFunction ();
1112
1112
auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
1113
1113
1114
- auto TakeRange = [&](std::pair<unsigned , unsigned > R) {
1115
- auto [Min, Max] = R;
1116
- ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1117
- IntegerRangeState RangeState (Range);
1118
- clampStateAndIndicateChange (this ->getState (), RangeState);
1119
- indicateOptimisticFixpoint ();
1120
- };
1121
-
1122
- std::pair<unsigned , unsigned > MaxWavesPerEURange{
1123
- 1U , InfoCache.getMaxWavesPerEU (*F)};
1124
-
1125
1114
// If the attribute exists, we will honor it if it is not the default.
1126
1115
if (auto Attr = InfoCache.getWavesPerEUAttr (*F)) {
1116
+ std::pair<unsigned , unsigned > MaxWavesPerEURange{
1117
+ 1U , InfoCache.getMaxWavesPerEU (*F)};
1127
1118
if (*Attr != MaxWavesPerEURange) {
1128
- TakeRange (*Attr);
1119
+ auto [Min, Max] = *Attr;
1120
+ ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1121
+ IntegerRangeState RangeState (Range);
1122
+ this ->getState () = RangeState;
1123
+ indicateOptimisticFixpoint ();
1129
1124
return ;
1130
1125
}
1131
1126
}
1132
1127
1133
- // Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the
1134
- // calculation of waves per EU involves flat work group size, we can't
1135
- // simply use an assumed flat work group size as a start point, because the
1136
- // update of flat work group size is in an inverse direction of waves per
1137
- // EU. However, we can still do something if it is an entry function. Since
1138
- // an entry function is a terminal node, and flat work group size either
1139
- // from attribute or default will be used anyway, we can take that value and
1140
- // calculate the waves per EU based on it. This result can't be updated by
1141
- // no means, but that could still allow us to propagate it.
1142
- if (AMDGPU::isEntryFunctionCC (F->getCallingConv ())) {
1143
- std::pair<unsigned , unsigned > FlatWorkGroupSize;
1144
- if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr (*F))
1145
- FlatWorkGroupSize = *Attr;
1146
- else
1147
- FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize (*F);
1148
- TakeRange (InfoCache.getEffectiveWavesPerEU (*F, MaxWavesPerEURange,
1149
- FlatWorkGroupSize));
1150
- }
1128
+ if (AMDGPU::isEntryFunctionCC (F->getCallingConv ()))
1129
+ indicatePessimisticFixpoint ();
1151
1130
}
1152
1131
1153
1132
ChangeStatus updateImpl (Attributor &A) override {
1154
- auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
1155
1133
ChangeStatus Change = ChangeStatus::UNCHANGED;
1156
1134
1157
1135
auto CheckCallSite = [&](AbstractCallSite CS) {
@@ -1160,24 +1138,21 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1160
1138
LLVM_DEBUG (dbgs () << ' [' << getName () << " ] Call " << Caller->getName ()
1161
1139
<< " ->" << Func->getName () << ' \n ' );
1162
1140
1163
- const auto *CallerInfo = A.getAAFor <AAAMDWavesPerEU>(
1141
+ const auto *CallerAA = A.getAAFor <AAAMDWavesPerEU>(
1164
1142
*this , IRPosition::function (*Caller), DepClassTy::REQUIRED);
1165
- const auto *AssumedGroupSize = A.getAAFor <AAAMDFlatWorkGroupSize>(
1166
- *this , IRPosition::function (*Func), DepClassTy::REQUIRED);
1167
- if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState () ||
1168
- !AssumedGroupSize->isValidState ())
1143
+ if (!CallerAA || !CallerAA->isValidState ())
1169
1144
return false ;
1170
1145
1171
- unsigned Min, Max ;
1172
- std::tie (Min, Max) = InfoCache. getEffectiveWavesPerEU (
1173
- *Caller,
1174
- {CallerInfo-> getAssumed (). getLower ().getZExtValue (),
1175
- CallerInfo ->getAssumed ().getUpper ().getZExtValue () - 1 },
1176
- {AssumedGroupSize-> getAssumed (). getLower (). getZExtValue (),
1177
- AssumedGroupSize-> getAssumed (). getUpper (). getZExtValue () - 1 } );
1178
- ConstantRange CallerRange ( APInt ( 32 , Min), APInt ( 32 , Max + 1 )) ;
1179
- IntegerRangeState CallerRangeState (CallerRange);
1180
- Change |= clampStateAndIndicateChange ( this -> getState (), CallerRangeState) ;
1146
+ auto Assumed = this -> getAssumed () ;
1147
+ unsigned Min = std::max (Assumed. getLower (). getZExtValue (),
1148
+ CallerAA-> getAssumed (). getLower (). getZExtValue ());
1149
+ unsigned Max = std::max (Assumed. getUpper ().getZExtValue (),
1150
+ CallerAA ->getAssumed ().getUpper ().getZExtValue ());
1151
+ ConstantRange Range ( APInt ( 32 , Min), APInt ( 32 , Max));
1152
+ IntegerRangeState RangeState (Range );
1153
+ this -> getState () = RangeState ;
1154
+ Change |= this -> getState () == Assumed ? ChangeStatus::UNCHANGED
1155
+ : ChangeStatus::CHANGED ;
1181
1156
1182
1157
return true ;
1183
1158
};
@@ -1336,6 +1311,59 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
1336
1311
}
1337
1312
}
1338
1313
1314
+ static void checkWavesPerEU (Module &M, TargetMachine &TM) {
1315
+ for (Function &F : M) {
1316
+ const GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(F);
1317
+
1318
+ auto FlatWgrpSizeAttr =
1319
+ AMDGPU::getIntegerPairAttribute (F, " amdgpu-flat-work-group-size" );
1320
+ auto WavesPerEUAttr = AMDGPU::getIntegerPairAttribute (
1321
+ F, " amdgpu-waves-per-eu" , /* OnlyFirstRequired=*/ true );
1322
+
1323
+ unsigned MinWavesPerEU = ST.getMinWavesPerEU ();
1324
+ unsigned MaxWavesPerEU = ST.getMaxWavesPerEU ();
1325
+
1326
+ unsigned MinFlatWgrpSize = 1U ;
1327
+ unsigned MaxFlatWgrpSize = 1024U ;
1328
+ if (FlatWgrpSizeAttr.has_value ()) {
1329
+ MinFlatWgrpSize = FlatWgrpSizeAttr->first ;
1330
+ MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second );
1331
+ }
1332
+
1333
+ // Start with the max range.
1334
+ unsigned Min = MinWavesPerEU;
1335
+ unsigned Max = MaxWavesPerEU;
1336
+
1337
+ // If the attribute exists, set them to the value from the attribute.
1338
+ if (WavesPerEUAttr.has_value ()) {
1339
+ Min = WavesPerEUAttr->first ;
1340
+ if (WavesPerEUAttr->second .has_value ())
1341
+ Max = *(WavesPerEUAttr->second );
1342
+ }
1343
+
1344
+ // Compute the range from flat workgroup size.
1345
+ auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
1346
+ ST.getWavesPerEU (F, std::make_pair (MinFlatWgrpSize, MaxFlatWgrpSize));
1347
+
1348
+ // For the lower bound, we have to "tighten" it.
1349
+ Min = std::max (Min, MinFromFlatWgrpSize);
1350
+ // For the upper bound, we have to "extend" it.
1351
+ Max = std::max (Max, MaxFromFlatWgrpSize);
1352
+
1353
+ // Clamp the range to the max range.
1354
+ Min = std::max (Min, MinWavesPerEU);
1355
+ Max = std::min (Max, MaxWavesPerEU);
1356
+
1357
+ // Update the attribute if it is not the max.
1358
+ if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
1359
+ SmallString<10 > Buffer;
1360
+ raw_svector_ostream OS (Buffer);
1361
+ OS << Min << ' ,' << Max;
1362
+ F.addFnAttr (" amdgpu-waves-per-eu" , OS.str ());
1363
+ }
1364
+ }
1365
+ }
1366
+
1339
1367
static bool runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
1340
1368
AMDGPUAttributorOptions Options,
1341
1369
ThinOrFullLTOPhase LTOPhase) {
@@ -1425,8 +1453,14 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1425
1453
}
1426
1454
}
1427
1455
1428
- ChangeStatus Change = A.run ();
1429
- return Change == ChangeStatus::CHANGED;
1456
+ bool Changed = A.run () == ChangeStatus::CHANGED;
1457
+
1458
+ if (Changed && (LTOPhase == ThinOrFullLTOPhase::None ||
1459
+ LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
1460
+ LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink))
1461
+ checkWavesPerEU (M, TM);
1462
+
1463
+ return Changed;
1430
1464
}
1431
1465
1432
1466
class AMDGPUAttributorLegacy : public ModulePass {
0 commit comments