llvm · npanchen · May 14, 2025 · shiltian · May 14, 2025 · npanchen
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1170,13 +1170,19 @@ struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
           !AssumedGroupSize->isValidState())
         return false;
 
+      unsigned MinFWGSize =
+          AssumedGroupSize->getAssumed().getLower().getZExtValue();
+      unsigned MaxFWGSize =
+          AssumedGroupSize->getAssumed().getUpper().getZExtValue();
+      if (MinFWGSize == 0 && MaxFWGSize == 0)
+        std::tie(MinFWGSize, MaxFWGSize) =
+            InfoCache.getDefaultFlatWorkGroupSize(*Func);
       unsigned Min, Max;
       std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
           *Caller,
           {CallerInfo->getAssumed().getLower().getZExtValue(),
            CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
-          {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
-           AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
+          {MinFWGSize, MaxFWGSize - 1});
       ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
       IntegerRangeState CallerRangeState(CallerRange);
       Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-max-flat-wgs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-max-flat-wgs.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx942 -passes=amdgpu-attributor %s | FileCheck %s
+
+; CHECK-LABEL: define internal fastcc void @call1(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]]
+define internal fastcc void @call1() #0 { ; #0 requests flat-work-group-size "1, 1024"
+  tail call fastcc void @call2() ; call1 -> call2 -> call5 -> call1 closes a call cycle
+  ret void
+}
+
+; CHECK-LABEL: define internal fastcc void @call2(
+; CHECK-SAME: ) #[[ATTR0]]
+define internal fastcc void @call2() #1 { ; #1 requests flat-work-group-size "1, 1024"
+  tail call fastcc void @call5() ; forwards to @call5, which carries no attribute group
+  ret void
+}
+
+; CHECK-LABEL: define { ptr addrspace(1), ptr } @call3(
+; CHECK-SAME: ) #[[ATTR0]]
+define { ptr addrspace(1), ptr } @call3() #2 { ; not internal; #2 requests flat-work-group-size "1, 256"
+  tail call fastcc void @call5() ; second caller of @call5, entering the call1/call2/call5 cycle
+  ret { ptr addrspace(1), ptr } zeroinitializer
+}
+
+; CHECK-LABEL: define internal fastcc void @call5(
+; CHECK-SAME: ) #[[ATTR0]]
+define internal fastcc void @call5() { ; no attribute group, so no explicit flat-work-group-size here
+  tail call fastcc void @call1() ; back edge to @call1
+  ret void
+}
+
+attributes #0 = {"amdgpu-flat-work-group-size"="1, 1024" "target-cpu"="gfx942" } ; used by @call1
+attributes #1 = {"amdgpu-flat-work-group-size"="1, 1024" "target-cpu"="gfx942" } ; used by @call2
+attributes #2 = {"amdgpu-flat-work-group-size"="1, 256" "target-cpu"="gfx942" } ; used by @call3; the expected output lists "1,256" for all four functions
+
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx942" "uniform-work-group-size"="false" }