[OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP behavior #74080

mhalk · 2023-12-01T14:07:47Z

Added a mechanism to cap values provided via LIBOMPTARGET_STACK_SIZE to a GFX-dependent value.

Changed several minor properties to be in sync with HIP:

Default device stack size: 1024 / 1 KiB (hipLimitStackSize).
During AQL packet generation in case of a dyn callstack the maximum between user-provided and compiler-default is chosen.
Make sure we only allow 32bit values for stack size.

Added testcase where a dynamic stack is required due to recursion.

llvmbot · 2023-12-01T14:08:19Z

@llvm/pr-subscribers-backend-amdgpu

Author: Michael Halkenhäuser (mhalk)

Changes

Added a mechanism to cap values provided via LIBOMPTARGET_STACK_SIZE to a GFX-dependent value.

Changed several minor properties to be in sync with HIP:

Default device stack size: 1024 / 1 KiB (hipLimitStackSize).
During AQL packet generation in case of a dyn callstack the maximum between user-provided and compiler-default is chosen.
Make sure we only allow 32bit values for stack size.

Added testcase where a dynamic stack is required due to recursion.

Full diff: https://github.com/llvm/llvm-project/pull/74080.diff

2 Files Affected:

(modified) openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp (+55-6)
(added) openmp/libomptarget/test/offloading/dynamic_callstack.c (+80)

diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 69acfa54e6c96a3..34d5ebbf1173bac 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -666,7 +666,7 @@ struct AMDGPUQueueTy {
   /// signal and can define an optional input signal (nullptr if none).
   Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                          uint32_t NumThreads, uint64_t NumBlocks,
-                         uint32_t GroupSize, uint64_t StackSize,
+                         uint32_t GroupSize, uint32_t StackSize,
                          AMDGPUSignalTy *OutputSignal,
                          AMDGPUSignalTy *InputSignal) {
     assert(OutputSignal && "Invalid kernel output signal");
@@ -705,7 +705,8 @@ struct AMDGPUQueueTy {
     Packet->grid_size_y = 1;
     Packet->grid_size_z = 1;
     Packet->private_segment_size =
-        Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
+        Kernel.usesDynamicStack() ? std::max(Kernel.getPrivateSize(), StackSize)
+                                  : Kernel.getPrivateSize();
     Packet->group_segment_size = GroupSize;
     Packet->kernel_object = Kernel.getKernelObject();
     Packet->kernarg_address = KernelArgs;
@@ -1174,7 +1175,7 @@ struct AMDGPUStreamTy {
   /// the kernel args buffer to the specified memory manager.
   Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
                          uint32_t NumThreads, uint64_t NumBlocks,
-                         uint32_t GroupSize, uint64_t StackSize,
+                         uint32_t GroupSize, uint32_t StackSize,
                          AMDGPUMemoryManagerTy &MemoryManager) {
     if (Queue == nullptr)
       return Plugin::error("Target queue was nullptr");
@@ -1872,6 +1873,38 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     else
       return Plugin::error("Unexpected AMDGPU wavefront %d", WavefrontSize);
 
+    // To determine the correct scratch memory size per thread, we need to check
+    // the device architecure generation. According to AOT_OFFLOADARCHS we may
+    // assume that AMDGPU offload archs are prefixed with "gfx" and suffixed
+    // with a two char arch specialization. In-between is the 1-2 char
+    // generation number we want to extract.
+    std::string CUKind{ComputeUnitKind};
+    for (auto &C : CUKind)
+      C = (char)std::tolower(C);
+
+    int GfxGen = 0;
+    if ((CUKind.find("gfx") == 0) && CUKind.length() > 5 &&
+        CUKind.length() < 8) {
+      // Cut away suffix & prefix.
+      CUKind.erase(CUKind.length() - 2, 2);
+      CUKind.erase(0, 3);
+      // Make sure we only convert digits to a number.
+      if (std::find_if(CUKind.begin(), CUKind.end(), [](unsigned char c) {
+            return !std::isdigit(c);
+          }) == CUKind.end())
+        GfxGen = std::stoi(CUKind);
+    }
+
+    // See: 'getMaxWaveScratchSize' in 'llvm/lib/Target/AMDGPU/GCNSubtarget.h'.
+    // But we need to divide by WavefrontSize.
+    if (GfxGen < 11) {
+      // 13-bit field in units of 256-dword.
+      MaxThreadScratchSize = ((256 * 4) / WavefrontSize) * ((1 << 13) - 1);
+    } else {
+      // 15-bit field in units of 64-dword.
+      MaxThreadScratchSize = ((64 * 4) / WavefrontSize) * ((1 << 15) - 1);
+    }
+
     // Get maximum number of workitems per workgroup.
     uint16_t WorkgroupMaxDim[3];
     if (auto Err =
@@ -2623,7 +2656,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Plugin::success();
   }
   Error setDeviceStackSize(uint64_t Value) override {
-    StackSize = Value;
+    if (Value > MaxThreadScratchSize) {
+      // Cap device scratch size.
+      MESSAGE("Scratch memory size will be set to %d. Reason: Requested size "
+              "%ld would exceed available resources.",
+              MaxThreadScratchSize, Value);
+      StackSize = MaxThreadScratchSize;
+    } else {
+      // Apply device scratch size, since it is within limits.
+      StackSize = Value;
+    }
+
     return Plugin::success();
   }
   Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2782,7 +2825,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
   /// The current size of the stack that will be used in cases where it could
   /// not be statically determined.
-  uint64_t StackSize = 16 * 1024 /* 16 KB */;
+  /// Default: 1024, in conformity to hipLimitStackSize.
+  uint64_t StackSize = 1024 /* 1 KiB */;
+
+  // The maximum scratch memory size per thread.
+  // See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
+  uint32_t MaxThreadScratchSize;
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3198,7 +3246,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
 
   // Push the kernel launch into the stream.
   return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
-                                  GroupSize, StackSize, ArgsMemoryManager);
+                                  GroupSize, static_cast<uint32_t>(StackSize),
+                                  ArgsMemoryManager);
 }
 
 Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/openmp/libomptarget/test/offloading/dynamic_callstack.c b/openmp/libomptarget/test/offloading/dynamic_callstack.c
new file mode 100644
index 000000000000000..9de30d7b7b690b0
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/dynamic_callstack.c
@@ -0,0 +1,80 @@
+#include <omp.h>
+#include <stdio.h>
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mcode-object-version=5
+
+// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN: env LIBOMPTARGET_STACK_SIZE=4096 \
+// RUN:   %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN:   | %fcheck-amdgcn-amd-amdhsa
+
+// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN: env LIBOMPTARGET_STACK_SIZE=131073 \
+// RUN:   %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN:   | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_EXCEEDED
+
+// TODO: Realize the following run in an acceptable manner.
+//       Unfortunately with insufficient scratch mem size programs will hang.
+//       Therefore, a timeout mechanism would help tremendously.
+//       Additionally, we need to allow empty output / unsuccessful execution.
+
+// RUN?: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN?: env LIBOMPTARGET_STACK_SIZE=16 \
+// RUN?:   timeout 10 %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN?:   | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_INSUFFICIENT \
+// RUN?:   --allow-empty
+
+// REQUIRES: amdgcn-amd-amdhsa
+
+// Cause the compiler to set amdhsa_uses_dynamic_stack to '1' using recursion.
+// That is: stack requirement for main's target region may not be calculated.
+
+// This recursive function will eventually return 0.
+int recursiveFunc(const int Recursions) {
+  if (Recursions < 1)
+    return 0;
+
+  int j[Recursions];
+#pragma omp target private(j)
+  { ; }
+
+  return recursiveFunc(Recursions - 1);
+}
+
+int main() {
+  int N = 256;
+  int a[N];
+  int b[N];
+  int i;
+
+  for (i = 0; i < N; i++)
+    a[i] = 0;
+
+  for (i = 0; i < N; i++)
+    b[i] = i;
+
+#pragma omp target parallel for
+  {
+    for (int j = 0; j < N; j++)
+      a[j] = b[j] + recursiveFunc(j);
+  }
+
+  int rc = 0;
+  for (i = 0; i < N; i++)
+    if (a[i] != b[i]) {
+      rc++;
+      printf("Wrong value: a[%d]=%d\n", i, a[i]);
+    }
+
+  if (!rc)
+    printf("Success\n");
+
+  return rc;
+}
+
+/// CHECK: Success
+
+/// LIMIT_EXCEEDED: Scratch memory size will be set to
+/// LIMIT_EXCEEDED: Success
+
+/// LIMIT_INSUFFICIENT-NOT: Success

mhalk · 2023-12-01T14:13:38Z

As stated in the testcase, I would have liked to add a failing configuration, which demonstrates the impact of insufficiently sized scratch memory -- but those cases result in a hang.
If anybody has suggestions / an opinion on this, I would be very grateful.
Even when it comes to removing this run configuration altogether.

jhuber6 · 2023-12-01T14:58:45Z

@JonChesterfield was the one who originally encouraged me to leave the default at 16 KiB like it was before the COV5 change, so I'll leave that decision up to him.

jhuber6 · 2023-12-01T15:02:47Z

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

+    if (GfxGen < 11) {
+      // 13-bit field in units of 256-dword.
+      MaxThreadScratchSize = ((256 * 4) / WavefrontSize) * ((1 << 13) - 1);
+    } else {
+      // 15-bit field in units of 64-dword.
+      MaxThreadScratchSize = ((64 * 4) / WavefrontSize) * ((1 << 15) - 1);
+    }
+


Can't decide if this should be a ternary or something. LLVM style normally omits braces on single line blocks.

Thanks & good point, I'll keep this in mind when addressing further feedback.

jhuber6 · 2023-12-01T15:05:28Z

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

+    // assume that AMDGPU offload archs are prefixed with "gfx" and suffixed
+    // with a two char arch specialization. In-between is the 1-2 char
+    // generation number we want to extract.
+    std::string CUKind{ComputeUnitKind};


This code looks really complicated and should be using StringRef. Are we just trying to get the major version? That should be pretty trivial.

StringRef Arch(ComputeUnitKind) unsigned Version = 0u; if (!llvm::to_integer(Arch.drop_front(sizeof("gfx") - 1), Version)) return Plugin::error("Invalid GFX architecture string"); unsigned Major = Version / 100;

Yes we only want the major version, so 9 for gfx90a or 10 for gfx1030.
It is pretty complicated for what it actually does. With the C++ tools I was aware of, I wanted to make sure I do not produce garbage.
Thank you very much!

Sorry, I forgot a ! on that condition, the string conversion returns false on failure.

Thanks for the correction!
Also, e.g. "gfx90a" will fail, so I'd go for a slice:

StringRef Arch(ComputeUnitKind); unsigned GfxGen = 0u; if (!llvm::to_integer(Arch.slice(sizeof("gfx") - 1, Arch.size() - 2), GfxGen))

In any way, this is significantly easier to read.

jhuber6 · 2023-12-01T16:38:48Z

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

+    // But we need to divide by WavefrontSize.
+    // For generations pre-gfx11: use 13-bit field in units of 256-dword,
+    // otherwise: 15-bit field in units of 64-dword.
+    MaxThreadScratchSize = (GfxGen < 11)


if you get gfx1100 the GfxGen will be 1100 here.

Also I don't know if I like GfxGen here. It's called either "Arch" or ISAVersion elsewhere that I know of.

jhuber6 · 2023-12-01T16:40:15Z

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

+    // generation number we want to extract.
+    StringRef Arch(ComputeUnitKind);
+    unsigned GfxGen = 0u;
+    if (!llvm::to_integer(Arch.slice(sizeof("gfx") - 1, Arch.size() - 2),


Why did the drop_front not work again? Also you need to specify that it's a hex radix.

JonChesterfield · 2023-12-01T16:40:56Z

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

+    MaxThreadScratchSize = (GfxGen < 11)
+                               ? ((256 * 4) / WavefrontSize) * ((1 << 13) - 1)
+                               : ((64 * 4) / WavefrontSize) * ((1 << 15) - 1);
+


Let's not copy and paste this crazy-complicated arithmetic from elsewhere in the same repo. If it's already in a convenient header, lets include that header somewhere in the plugin. I don't need to debug future divergence in this logic between the compiler backend and the language runtime.

That function is in a utility class, but could easily be made static or put somewhere more common. It uses an internal generation enum, but we could probably just convert to that here.

JonChesterfield · 2023-12-01T17:40:54Z

I suggested we just pass too-large values to HSA and catch the error from HSA. Unfortunately it comes in the form of an invocation of callbackError which calls FATAL_MESSAGE which calls abort(), so that's not great. There's quite a lot of calls to FATAL_MESSAGE in the nextgen plugin infra which is a shame given the plugin put effort into returning error codes on other paths.

Leaning slightly towards accepting the copy&paste with string slice and just having things blow up on us later, what do you guys think?

carlobertolli · 2023-12-01T17:48:15Z

I don't think the arch specification can change in the future, so the existing code would not need to be duplicated. However, it might be extended for future architectures and that means we will need to propagate it here as well. That would a potential source of problems in the future.

How hard it is to move the common code into something that can be linked against libomptarget libs?

JonChesterfield · 2023-12-01T17:58:20Z

Make a header under llvm/lib/Target/AMDGPU and include it from the backend and from here. Provided it's an inline function in the header there's no messing around with cmake. The llvm libs are already in scope for openmp runtime. I'd say trivial technical complexity and moderate chance of the backend developers not liking the patch spawning moderate political complexity.

carlobertolli · 2023-12-01T18:12:55Z

@jayfoad @b-sumner any comments on moving the duplicated code above to a header in llvm/lib/Target/AMDGPU?

JonChesterfield · 2023-12-01T18:17:44Z

@mhalk found what we think is the corresponding logic in hip at https://github.com/ROCm-Developer-Tools/clr/blob/develop/rocclr/device/device.cpp#L777, which in traditional copy&paste style has failed to notice that gfx11 is different to gfx10

I think our dev strategy of copying information in source code form across the compiler back end, the openmp runtime, the hip runtime and the libc runtime is a bad thing and we need to cut that out. Header files in the compiler backend included by the language runtimes is my recommendation as they all depend on the compiler backend anyway, and the compiler backend doesn't depend on any of those.

b-sumner · 2023-12-01T18:44:16Z

@jayfoad @b-sumner any comments on moving the duplicated code above to a header in llvm/lib/Target/AMDGPU?

I don't think the HIP/OpenCL runtime wants this dependence. So it would require a new API to be introduced in COMgr. "The llvm libs are already in scope for openmp runtime" is surprising to me.

jhuber6 · 2023-12-01T18:50:25Z

@jayfoad @b-sumner any comments on moving the duplicated code above to a header in llvm/lib/Target/AMDGPU?

I don't think the HIP/OpenCL runtime wants this dependence. So it would require a new API to be introduced in COMgr. "The llvm libs are already in scope for openmp runtime" is surprising to me.

The OpenMP libraries live in tree, so it's fairly trivial to include LLVM. OpenMP uses the LLVM backends for JIT functionality, as well as a load of ELF handling. We also use LLVM error handling and data structures almost everywhere. The LLVM libraries are generally statically linked unless shared libraries are explicitly requested by the user, so it introduces no extra dependencies in a release.

jayfoad · 2023-12-01T20:06:52Z

@jayfoad @b-sumner any comments on moving the duplicated code above to a header in llvm/lib/Target/AMDGPU?

No strong opinions from me. Stuff like that that needs to be shared between different parts of the backend (e.g. between codegen and the assembler/disassembler) often ends up in lib/Target/AMDGPU/Utils/, so maybe you could put this functionality in e.g. lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h?

mhalk · 2023-12-04T12:50:25Z

@jayfoad @b-sumner any comments on moving the duplicated code above to a header in llvm/lib/Target/AMDGPU?

No strong opinions from me. Stuff like that that needs to be shared between different parts of the backend (e.g. between codegen and the assembler/disassembler) often ends up in lib/Target/AMDGPU/Utils/, so maybe you could put this functionality in e.g. lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h?

Thanks for the input!
This looks like a reasonable way to resolve this situation -- I will start to investigate.

JonChesterfield · 2023-12-04T13:41:46Z

Jay's suggestion sounds good to me. HIP has different tradeoffs from living in a different repo - maybe the right thing to do there is include the header in comgr and re-export the function, but it's all very heavy handed relative to the #include and done approach in tree runtimes get to use.

jdoerfert · 2023-12-04T18:24:24Z

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

@@ -3198,7 +3233,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,

  // Push the kernel launch into the stream.
  return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
-                                  GroupSize, StackSize, ArgsMemoryManager);
+                                  GroupSize, static_cast<uint32_t>(StackSize),


Change it to uint32_t everywhere instead. Let's assume we stop at 4GB stack size.

Thanks -- Sure, sounds reasonable.

jdoerfert · 2023-12-04T18:27:06Z

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

@@ -2782,7 +2812,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {

  /// The current size of the stack that will be used in cases where it could
  /// not be statically determined.
-  uint64_t StackSize = 16 * 1024 /* 16 KB */;
+  /// Default: 1024, in conformity to hipLimitStackSize.
+  uint64_t StackSize = 1024 /* 1 KiB */;


mhalk · 2023-12-04T19:21:44Z

Jay's suggestion sounds good to me. HIP has different tradeoffs from living in a different repo - maybe the right thing to do there is include the header in comgr and re-export the function, but it's all very heavy handed relative to the #include and done approach in tree runtimes get to use.

On my first attempt to rename & move the corresponding Generation enum into lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h I would need to adapt (at least) ~ 25 other files within the backend; and TBH I've yet to get it to work as expected.

@jayfoad @JonChesterfield @jhuber6
Q: How should I go about this change to avoid introducing (lingering?) errors into the backend? Any suggestions welcome!
(Is there a way to narrow that number? [I guess not.])

jhuber6 · 2023-12-04T19:22:38Z

Jay's suggestion sounds good to me. HIP has different tradeoffs from living in a different repo - maybe the right thing to do there is include the header in comgr and re-export the function, but it's all very heavy handed relative to the #include and done approach in tree runtimes get to use.

On my first attempt to rename & move the corresponding Generation enum into lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h I would need to adapt (at least) ~ 25 other files within the backend; and TBH I've yet to get it to work as expected.

@jayfoad @JonChesterfield @jhuber6 Q: How should I go about this change to avoid introducing (lingering?) errors into the backend? Any suggestions welcome! (Is there a way to narrow that number? [I guess not.])

Can you just make an inline function somewhere that the current implementation also calls?

mhalk · 2023-12-04T20:02:28Z

Jay's suggestion sounds good to me. HIP has different tradeoffs from living in a different repo - maybe the right thing to do there is include the header in comgr and re-export the function, but it's all very heavy handed relative to the #include and done approach in tree runtimes get to use.

On my first attempt to rename & move the corresponding Generation enum into lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h I would need to adapt (at least) ~ 25 other files within the backend; and TBH I've yet to get it to work as expected.
@jayfoad @JonChesterfield @jhuber6 Q: How should I go about this change to avoid introducing (lingering?) errors into the backend? Any suggestions welcome! (Is there a way to narrow that number? [I guess not.])

Can you just make an inline function somewhere that the current implementation also calls?

Another level of indirection sounds good to me, I'll check that.

mhalk · 2023-12-07T12:10:34Z

FYI: Just a quick update, as discussed offline, I'll first work on solely moving / extracting the Generation enum with the intent to do this in its own PR (as a precursor to this one). Once it's ready, I'll link it here.

Reinstated several minor changes in behavior w.r.t. conformity with HIP. 1. Default device stack size: 1024 / 1 KiB (hipLimitStackSize). 2. During AQL packet generation in case of a dyn callstack the maximum between user-provided and compiler-default is chosen. 3. Make sure we only allow 32bit values for stack size. Added calculation of maximum dyn callstack size per thread * If a value provided via LIBOMPTARGET_STACK_SIZE exceeds MaxThreadScratchSize, it will be capped See: * gerrit review 942931 / 968158 * llvm#72606 * llvm#74080 Change-Id: Ib0ef997b567f5f55097456c56d3f0bc2e287f848

arsenm · 2024-01-11T05:32:56Z

what's the status of this?

jhuber6 · 2024-01-22T21:25:59Z

Status? We should probably try to get this in before the release if possible.

arsenm

needs resolve to main

mhalk · 2024-03-01T17:46:57Z

Apologies for the long radio silence.

After reaching out to @arsenm the idea of moving the enum AMDGPUSubtarget::Generation does not seem attractive anymore.
As Matt suggested I'll try to implement a SubtargetFeature (named sth. like MaxWaveScratchSize) that may be queried via TargetParser, which holds the corresponding callstack size.
This will avoid duplication, if I can get it to work within the two locations where getMaxWaveScratchSize is needed.
(These changes should be reasonable within the same PR.)

I plan to be working on this next week.

mhalk · 2024-03-04T19:21:47Z

When starting to actually try and implement this, I came to the conclusion that when adding such a 'SubtargetFeature' it would really just duplicate GCNSubtarget::getMaxWaveScratchSize.
(IMHO this does not seem like a desirable / acceptable solution when looking at the former discussion.)

On top of that, TBH I only see bool features that may be queried via TargetParser but we want to encode an unsigned value. (So, if that is in fact a desired solution: how would one create such an unsigned feature? Or are there any examples I might have missed?)

If we would really want to avoid duplication IMO (maybe I'm wrong) we would have to either:

Use GCNSubtarget::getMaxWaveScratchSize (somehow?) at the particular location within the OpenMP amdgpu plugin (I have the Triple available, so theoretically I have the information I need).
Encode an unsigned value which holds the MaxWaveScratchSize and can be queried via TargetParser at both locations where this info is needed (OpenMP amdgpu plugin and GCNSubtarget class).

Any suggestions or ideas are very welcome & appreciated.

arsenm · 2024-03-05T09:38:13Z

On top of that, TBH I only see bool features that may be queried via TargetParser but we want to encode an unsigned value. (So, if that is in fact a desired solution: how would one create such an unsigned feature? Or are there any examples I might have missed?)

You can encode arbitrary values in a subtarget feature. The current main example would be FeatureLocalMemorySize*. Not sure if we directly expose those in the TargetParser

Added a mechanism to cap values provided via LIBOMPTARGET_STACK_SIZE to a GFX-dependent value. Changed several minor properties to be in sync with HIP: 1. Default device stack size: 1024 / 1 KiB (hipLimitStackSize). 2. During AQL packet generation in case of a dyn callstack the maximum between user-provided and compiler-default is chosen. 3. Make sure we only allow 32bit values for stack size. Added testcase where a dynamic stack is required due to recursion.

mhalk · 2024-03-05T14:34:13Z

You can encode arbitrary values in a subtarget feature. The current main example would be FeatureLocalMemorySize*. Not sure if we directly expose those in the TargetParser

Great, thank you @arsenm! That helped a lot.

Now I've added the corresponding three SubtargetFeatures and added the getters similar to (Feature)LocalMemorySize.
TBH it is still not entirely clear to me how to access the newly available (get)MaxWaveScratchSize via the TargetParser or AMDGPUBaseInfo.

When using the TargetParser I was able to construct the FeatureMap corresponding to my present device(s) but that did not contain the info (and from what it contained, MaxWaveScratchSize does not really seem to 'fit' in that group of device capabilities).

Do you have another pointer for me on how to proceed?

arsenm · 2024-03-05T14:47:56Z

When using the TargetParser I was able to construct the FeatureMap corresponding to my present device(s) but that did not contain the info (and from what it contained, MaxWaveScratchSize does not really seem to 'fit' in that group of device capabilities).

The TargetParser does seem to only be exposing boolean-ish looking names, which unfortunately would imply introducing N different feature names there. I've lost track of why you need this in the first place; if you over-commit on the stack size, won't the lower level API fail for you?

mhalk · 2024-03-05T15:12:13Z

The TargetParser does seem to only be exposing boolean-ish looking names, which unfortunately would imply introducing N different feature names there. I've lost track of why you need this in the first place; if you over-commit on the stack size, won't the lower level API fail for you?

When launching a kernel which requires a dynamic callstack we want to use the maximum target-specific StackSize.

Yes, such a fail will occur, but IIRC this is not desired:

AMDGPU fatal error 1: Received error in queue 0x7f24b2a44000:
HSA_STATUS_ERROR_OUT_OF_RESOURCES: The runtime failed to allocate the necessary resources.
This error may also occur when the core runtime library needs to spawn threads or create internal OS-specific events.
Aborted (core dumped)

(IMHO in a complex scenario this message might not be very helpful.)

With the current changes the user will be informed, without aborting and while using the max. scratch memory.
(Unfortunately, it's still a matter of relaying the info into the OpenMP amdgpu plugin, so basically the situation hasn't changed much.)

arsenm · 2024-03-05T15:18:04Z

Yes, such a fail will occur, but IIRC this is not desired:

I think it would be better if we fixed the low level diagnostics to not be garbage, rather than building another layer of diagnostics on top

mhalk requested review from carlobertolli, jdoerfert, kevinsala and jhuber6 December 1, 2023 14:07

llvmbot added backend:AMDGPU openmp:libomptarget OpenMP offload runtime labels Dec 1, 2023

jhuber6 requested a review from JonChesterfield December 1, 2023 14:58

jhuber6 reviewed Dec 1, 2023

View reviewed changes

mhalk force-pushed the feature/dyn_callstack_calc branch from 444628c to 9ca5835 Compare December 1, 2023 16:18

jhuber6 reviewed Dec 1, 2023

View reviewed changes

JonChesterfield reviewed Dec 1, 2023

View reviewed changes

carlobertolli requested a review from b-sumner December 1, 2023 17:45

carlobertolli requested a review from jayfoad December 1, 2023 18:10

jdoerfert reviewed Dec 4, 2023

View reviewed changes

arsenm requested changes Feb 6, 2024

View reviewed changes

mhalk force-pushed the feature/dyn_callstack_calc branch from 9ca5835 to 0f191fe Compare March 5, 2024 14:18

mhalk force-pushed the feature/dyn_callstack_calc branch from 0f191fe to 21c4c52 Compare March 5, 2024 14:24

[OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP behavior #74080

Are you sure you want to change the base?

[OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP behavior #74080

Uh oh!

Conversation

mhalk commented Dec 1, 2023

Uh oh!

llvmbot commented Dec 1, 2023

Uh oh!

mhalk commented Dec 1, 2023

Uh oh!

jhuber6 commented Dec 1, 2023

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

jhuber6 Dec 1, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

JonChesterfield commented Dec 1, 2023

Uh oh!

carlobertolli commented Dec 1, 2023

Uh oh!

JonChesterfield commented Dec 1, 2023

Uh oh!

carlobertolli commented Dec 1, 2023

Uh oh!

JonChesterfield commented Dec 1, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

b-sumner commented Dec 1, 2023

Uh oh!

jhuber6 commented Dec 1, 2023

Uh oh!

jayfoad commented Dec 1, 2023

Uh oh!

mhalk commented Dec 4, 2023

Uh oh!

JonChesterfield commented Dec 4, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

mhalk commented Dec 4, 2023

Uh oh!

jhuber6 commented Dec 4, 2023

Uh oh!

mhalk commented Dec 4, 2023

Uh oh!

mhalk commented Dec 7, 2023

Uh oh!

arsenm commented Jan 11, 2024

Uh oh!

jhuber6 commented Jan 22, 2024

Uh oh!

arsenm left a comment

Choose a reason for hiding this comment

Uh oh!

mhalk commented Mar 1, 2024

Uh oh!

jhuber6 Dec 1, 2023 •

edited

Loading

JonChesterfield commented Dec 1, 2023 •

edited

Loading

JonChesterfield commented Dec 4, 2023 •

edited

Loading

mhalk commented Mar 4, 2024 •

edited

Loading