@@ -1108,47 +1108,25 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1108
1108
Function *F = getAssociatedFunction ();
1109
1109
auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
1110
1110
1111
- auto TakeRange = [&](std::pair<unsigned , unsigned > R) {
1112
- auto [Min, Max] = R;
1113
- ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1114
- IntegerRangeState RangeState (Range);
1115
- clampStateAndIndicateChange (this ->getState (), RangeState);
1116
- indicateOptimisticFixpoint ();
1117
- };
1118
-
1119
- std::pair<unsigned , unsigned > MaxWavesPerEURange{
1120
- 1U , InfoCache.getMaxWavesPerEU (*F)};
1121
-
1122
1111
// If the attribute exists, we will honor it if it is not the default.
1123
1112
if (auto Attr = InfoCache.getWavesPerEUAttr (*F)) {
1113
+ std::pair<unsigned , unsigned > MaxWavesPerEURange{
1114
+ 1U , InfoCache.getMaxWavesPerEU (*F)};
1124
1115
if (*Attr != MaxWavesPerEURange) {
1125
- TakeRange (*Attr);
1116
+ auto [Min, Max] = *Attr;
1117
+ ConstantRange Range (APInt (32 , Min), APInt (32 , Max + 1 ));
1118
+ IntegerRangeState RangeState (Range);
1119
+ this ->getState () = RangeState;
1120
+ indicateOptimisticFixpoint ();
1126
1121
return ;
1127
1122
}
1128
1123
}
1129
1124
1130
- // Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the
1131
- // calculation of waves per EU involves flat work group size, we can't
1132
- // simply use an assumed flat work group size as a start point, because the
1133
- // update of flat work group size is in an inverse direction of waves per
1134
- // EU. However, we can still do something if it is an entry function. Since
1135
- // an entry function is a terminal node, and flat work group size either
1136
- // from attribute or default will be used anyway, we can take that value and
1137
- // calculate the waves per EU based on it. This result can't be updated by
1138
- // no means, but that could still allow us to propagate it.
1139
- if (AMDGPU::isEntryFunctionCC (F->getCallingConv ())) {
1140
- std::pair<unsigned , unsigned > FlatWorkGroupSize;
1141
- if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr (*F))
1142
- FlatWorkGroupSize = *Attr;
1143
- else
1144
- FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize (*F);
1145
- TakeRange (InfoCache.getEffectiveWavesPerEU (*F, MaxWavesPerEURange,
1146
- FlatWorkGroupSize));
1147
- }
1125
+ if (AMDGPU::isEntryFunctionCC (F->getCallingConv ()))
1126
+ indicatePessimisticFixpoint ();
1148
1127
}
1149
1128
1150
1129
ChangeStatus updateImpl (Attributor &A) override {
1151
- auto &InfoCache = static_cast <AMDGPUInformationCache &>(A.getInfoCache ());
1152
1130
ChangeStatus Change = ChangeStatus::UNCHANGED;
1153
1131
1154
1132
auto CheckCallSite = [&](AbstractCallSite CS) {
@@ -1157,24 +1135,21 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
1157
1135
LLVM_DEBUG (dbgs () << ' [' << getName () << " ] Call " << Caller->getName ()
1158
1136
<< " ->" << Func->getName () << ' \n ' );
1159
1137
1160
- const auto *CallerInfo = A.getAAFor <AAAMDWavesPerEU>(
1138
+ const auto *CallerAA = A.getAAFor <AAAMDWavesPerEU>(
1161
1139
*this , IRPosition::function (*Caller), DepClassTy::REQUIRED);
1162
- const auto *AssumedGroupSize = A.getAAFor <AAAMDFlatWorkGroupSize>(
1163
- *this , IRPosition::function (*Func), DepClassTy::REQUIRED);
1164
- if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState () ||
1165
- !AssumedGroupSize->isValidState ())
1140
+ if (!CallerAA || !CallerAA->isValidState ())
1166
1141
return false ;
1167
1142
1168
- unsigned Min, Max ;
1169
- std::tie (Min, Max) = InfoCache. getEffectiveWavesPerEU (
1170
- *Caller,
1171
- {CallerInfo-> getAssumed (). getLower ().getZExtValue (),
1172
- CallerInfo ->getAssumed ().getUpper ().getZExtValue () - 1 },
1173
- {AssumedGroupSize-> getAssumed (). getLower (). getZExtValue (),
1174
- AssumedGroupSize-> getAssumed (). getUpper (). getZExtValue () - 1 } );
1175
- ConstantRange CallerRange ( APInt ( 32 , Min), APInt ( 32 , Max + 1 )) ;
1176
- IntegerRangeState CallerRangeState (CallerRange);
1177
- Change |= clampStateAndIndicateChange ( this -> getState (), CallerRangeState) ;
1143
+ auto Assumed = this -> getAssumed () ;
1144
+ unsigned Min = std::max (Assumed. getLower (). getZExtValue (),
1145
+ CallerAA-> getAssumed (). getLower (). getZExtValue ());
1146
+ unsigned Max = std::max (Assumed. getUpper ().getZExtValue (),
1147
+ CallerAA ->getAssumed ().getUpper ().getZExtValue ());
1148
+ ConstantRange Range ( APInt ( 32 , Min), APInt ( 32 , Max));
1149
+ IntegerRangeState RangeState (Range );
1150
+ this -> getState () = RangeState ;
1151
+ Change |= this -> getState () == Assumed ? ChangeStatus::UNCHANGED
1152
+ : ChangeStatus::CHANGED ;
1178
1153
1179
1154
return true ;
1180
1155
};
@@ -1333,6 +1308,59 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
1333
1308
}
1334
1309
}
1335
1310
1311
+ static void checkWavesPerEU (Module &M, TargetMachine &TM) {
1312
+ for (Function &F : M) {
1313
+ const GCNSubtarget &ST = TM.getSubtarget <GCNSubtarget>(F);
1314
+
1315
+ auto FlatWgrpSizeAttr =
1316
+ AMDGPU::getIntegerPairAttribute (F, " amdgpu-flat-work-group-size" );
1317
+ auto WavesPerEUAttr = AMDGPU::getIntegerPairAttribute (
1318
+ F, " amdgpu-waves-per-eu" , /* OnlyFirstRequired=*/ true );
1319
+
1320
+ unsigned MinWavesPerEU = ST.getMinWavesPerEU ();
1321
+ unsigned MaxWavesPerEU = ST.getMaxWavesPerEU ();
1322
+
1323
+ unsigned MinFlatWgrpSize = 1U ;
1324
+ unsigned MaxFlatWgrpSize = 1024U ;
1325
+ if (FlatWgrpSizeAttr.has_value ()) {
1326
+ MinFlatWgrpSize = FlatWgrpSizeAttr->first ;
1327
+ MaxFlatWgrpSize = *(FlatWgrpSizeAttr->second );
1328
+ }
1329
+
1330
+ // Start with the max range.
1331
+ unsigned Min = MinWavesPerEU;
1332
+ unsigned Max = MaxWavesPerEU;
1333
+
1334
+ // If the attribute exists, set them to the value from the attribute.
1335
+ if (WavesPerEUAttr.has_value ()) {
1336
+ Min = WavesPerEUAttr->first ;
1337
+ if (WavesPerEUAttr->second .has_value ())
1338
+ Max = *(WavesPerEUAttr->second );
1339
+ }
1340
+
1341
+ // Compute the range from flat workgroup size.
1342
+ auto [MinFromFlatWgrpSize, MaxFromFlatWgrpSize] =
1343
+ ST.getWavesPerEU (F, std::make_pair (MinFlatWgrpSize, MaxFlatWgrpSize));
1344
+
1345
+ // For the lower bound, we have to "tighten" it.
1346
+ Min = std::max (Min, MinFromFlatWgrpSize);
1347
+ // For the upper bound, we have to "extend" it.
1348
+ Max = std::max (Max, MaxFromFlatWgrpSize);
1349
+
1350
+ // Clamp the range to the max range.
1351
+ Min = std::max (Min, MinWavesPerEU);
1352
+ Max = std::min (Max, MaxWavesPerEU);
1353
+
1354
+ // Update the attribute if it is not the max.
1355
+ if (Min != MinWavesPerEU || Max != MaxWavesPerEU) {
1356
+ SmallString<10 > Buffer;
1357
+ raw_svector_ostream OS (Buffer);
1358
+ OS << Min << ' ,' << Max;
1359
+ F.addFnAttr (" amdgpu-waves-per-eu" , OS.str ());
1360
+ }
1361
+ }
1362
+ }
1363
+
1336
1364
static bool runImpl (Module &M, AnalysisGetter &AG, TargetMachine &TM,
1337
1365
AMDGPUAttributorOptions Options,
1338
1366
ThinOrFullLTOPhase LTOPhase) {
@@ -1408,8 +1436,14 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
1408
1436
}
1409
1437
}
1410
1438
1411
- ChangeStatus Change = A.run ();
1412
- return Change == ChangeStatus::CHANGED;
1439
+ bool Changed = A.run () == ChangeStatus::CHANGED;
1440
+
1441
+ if (Changed && (LTOPhase == ThinOrFullLTOPhase::None ||
1442
+ LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
1443
+ LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink))
1444
+ checkWavesPerEU (M, TM);
1445
+
1446
+ return Changed;
1413
1447
}
1414
1448
1415
1449
class AMDGPUAttributorLegacy : public ModulePass {
0 commit comments