Skip to content

Commit 7481b46

Browse files
committed
[OpenMP] Use default grid value for static grid size
If the user did not provide any static clause to override the grid size, we assume the default grid size as the upper bound and use it to improve code generation through vendor-specific attributes.

Fixes: llvm#64816
Differential Revision: https://reviews.llvm.org/D158382
1 parent c5488c8 commit 7481b46

File tree

2 files changed

+122
-0
lines changed

2 files changed

+122
-0
lines changed

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "llvm/Analysis/ScalarEvolution.h"
2424
#include "llvm/Analysis/TargetLibraryInfo.h"
2525
#include "llvm/Bitcode/BitcodeReader.h"
26+
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
2627
#include "llvm/IR/Attributes.h"
2728
#include "llvm/IR/CFG.h"
2829
#include "llvm/IR/CallingConv.h"
@@ -37,6 +38,7 @@
3738
#include "llvm/IR/Value.h"
3839
#include "llvm/MC/TargetRegistry.h"
3940
#include "llvm/Support/CommandLine.h"
41+
#include "llvm/Support/ErrorHandling.h"
4042
#include "llvm/Support/FileSystem.h"
4143
#include "llvm/Target/TargetMachine.h"
4244
#include "llvm/Target/TargetOptions.h"
@@ -4121,6 +4123,20 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc) {
41214123
Builder.CreateCall(Fn, {});
41224124
}
41234125

4126+
/// Return the GPU grid-value descriptor (default workgroup size, etc.) for
/// the architecture \p Kernel targets.
///
/// AMDGPU kernels are recognized by their calling convention and further
/// split by wavefront size, which is read from the function's
/// "target-features" attribute. Otherwise the module triple is consulted for
/// NVPTX. Any other architecture is a fatal error.
static const omp::GV &getGridValue(Function *Kernel) {
  const bool IsAMDKernel =
      Kernel->getCallingConv() == CallingConv::AMDGPU_KERNEL;
  if (IsAMDKernel) {
    StringRef FeatureStr =
        Kernel->getFnAttribute("target-features").getValueAsString();
    // Wave64 and Wave32 targets use different grid-value tables.
    return FeatureStr.count("+wavefrontsize64") != 0
               ? omp::getAMDGPUGridValues<64>()
               : omp::getAMDGPUGridValues<32>();
  }
  const Triple TT(Kernel->getParent()->getTargetTriple());
  if (TT.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}
4139+
41244140
void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
41254141
Function *OutlinedFn, int32_t NumTeams, int32_t NumThreads) {
41264142
if (Config.isTargetDevice()) {
@@ -4135,6 +4151,9 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
41354151
if (NumTeams > 0)
41364152
OutlinedFn->addFnAttr("omp_target_num_teams", std::to_string(NumTeams));
41374153

4154+
if (NumThreads == -1 && Config.isGPU())
4155+
NumThreads = getGridValue(OutlinedFn).GV_Default_WG_Size;
4156+
41384157
if (NumThreads > 0) {
41394158
if (OutlinedFn->getCallingConv() == CallingConv::AMDGPU_KERNEL) {
41404159
OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// clang-format off
2+
// RUN: %libomptarget-compile-generic
3+
// RUN: env LIBOMPTARGET_INFO=16 \
4+
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=DEFAULT
5+
6+
// UNSUPPORTED: nvptx64-nvidia-cuda
7+
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
8+
// UNSUPPORTED: aarch64-unknown-linux-gnu
9+
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
10+
// UNSUPPORTED: x86_64-pc-linux-gnu
11+
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
12+
13+
// Opaque helper: optnone keeps this as a real, unoptimized call so the
// compiler cannot fold away the loop bodies (or constant-fold N) below.
__attribute__((optnone)) int optnone() {
  return 1;
}
14+
15+
// Each target region below is followed (at runtime, via LIBOMPTARGET_INFO)
// by a kernel-launch info line that FileCheck matches with the prefix used
// in the RUN lines above. The regions exercise how the compiler derives the
// launch bounds (thread count and MaxFlatWorkGroupSize) from the default
// grid size, thread_limit/num_threads clauses, and ompx_attribute.
int main() {
  // N is large and opaque (optnone factor) so trip counts cannot be folded.
  int N = optnone() * 4098 * 32;

  // No clauses: expect the target's default workgroup size as both the
  // launch width and the flat workgroup upper bound.
  // DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
#pragma omp target teams distribute parallel for simd
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
#pragma omp target teams distribute parallel for simd
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
#pragma omp target teams distribute parallel for simd
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: [[NT:(128|256)]] (MaxFlatWorkGroupSize: [[NT]]
#pragma omp target
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // Non-constant thread_limit: runtime launches 42 threads but the static
  // bound stays at the architecture maximum (1024).
  // DEFAULT: 42 (MaxFlatWorkGroupSize: 1024
#pragma omp target thread_limit(optnone() * 42)
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // Explicit amdgpu_flat_work_group_size via ompx_attribute pins the bound.
  // DEFAULT: 42 (MaxFlatWorkGroupSize: 42
#pragma omp target thread_limit(optnone() * 42) ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // FIXME: Use the attribute value to imply a thread_limit
  // DEFAULT: {{(128|256)}} (MaxFlatWorkGroupSize: 42
#pragma omp target ompx_attribute(__attribute__((amdgpu_flat_work_group_size(42, 42))))
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // Runtime-valued clauses on the inner construct: the static bound cannot
  // be tightened, so it remains the architecture maximum.
  // DEFAULT: MaxFlatWorkGroupSize: 1024
#pragma omp target
#pragma omp teams distribute parallel for num_threads(optnone() * 42)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: MaxFlatWorkGroupSize: 1024
#pragma omp target teams distribute parallel for thread_limit(optnone() * 42)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: MaxFlatWorkGroupSize: 1024
#pragma omp target teams distribute parallel for num_threads(optnone() * 42)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // Compile-time-constant clauses: the constant becomes both the launch
  // width and the flat workgroup bound.
  // DEFAULT: 9 (MaxFlatWorkGroupSize: 9
#pragma omp target
#pragma omp teams distribute parallel for num_threads(9)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: 4 (MaxFlatWorkGroupSize: 4
#pragma omp target thread_limit(4)
#pragma omp teams distribute parallel for
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: 4 (MaxFlatWorkGroupSize: 4
#pragma omp target
#pragma omp teams distribute parallel for thread_limit(4)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: 9 (MaxFlatWorkGroupSize: 9
#pragma omp target teams distribute parallel for num_threads(9)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
  // DEFAULT: 4 (MaxFlatWorkGroupSize: 4
#pragma omp target teams distribute parallel for simd thread_limit(4)
  for (int i = 0; i < N; ++i) {
    optnone();
  }
}
103+

0 commit comments

Comments
 (0)