[SYCL][HIP] Add AMDGPU reflect pass to choose between safe and unsafe AMDGPU atomics (#11467)

hdelan · web-flow · commit 34135a3757ff · 2024-04-24T16:20:38.000+01:00
AMDGPU reflect pass is needed to choose between safe and unsafe atomics at the libclc level. In the long run we will delete this patch as work is being done to ensure correct lowering of atomic instructions. See patches: llvm/llvm-project#85052 llvm/llvm-project#69229 This work is necessary as malloc shared atomics rely on PCIe atomics which can have patchy and unreliable support. Therefore, we want to be able to choose at compile time whether we should use safe atomics using CAS (which PCIe should support), or if we want to rely of the availability of the newest PCIe atomics, if malloc shared atomics are desired. Also changes the implementation of `atomic_or`, `atomic_and` so that they can choose between the safe or unsafe version based on the AMDGPU reflect value.
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
@@ -328,6 +328,10 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
         # Disables NVVM reflection to defer to after linking
         list( APPEND flags -Xclang -target-feature -Xclang +ptx72
              -march=sm_86 -mllvm --nvvm-reflect-enable=false)
+      elseif( ARCH STREQUAL amdgcn )
+        # AMDGCN needs libclc to be compiled to high bc version since all atomic
+        # clang builtins need to be accessible
+        list( APPEND flags -mcpu=gfx940 -mllvm --amdgpu-oclc-reflect-enable=false )
       elseif( ARCH  STREQUAL x86_64)
         # TODO: This is used by SYCL Native Cpu, we should define an option to set this flags
         list( APPEND flags -Xclang -target-feature -Xclang +avx
@@ -358,6 +362,8 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
       # simultaneously, we choose declare the builtins using the private space,
       # which will also work for the generic address space.
       set( supports_generic_addrspace FALSE )
+    elseif( ARCH STREQUAL amdgcn )
+      set( opt_flags -O3 --amdgpu-oclc-reflect-enable=false )
     elseif( ARCH STREQUAL x86_64)
       set( opt_flags )
       set( supports_generic_addrspace FALSE )
diff --git a/libclc/amdgcn-amdhsa/libspirv/atomic/atomic_and.cl b/libclc/amdgcn-amdhsa/libspirv/atomic/atomic_and.cl
@@ -10,13 +10,8 @@
 #include <spirv/spirv.h>
 #include <spirv/spirv_types.h>
 
-AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, int, i, __hip_atomic_fetch_and)
-AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, unsigned int, j, __hip_atomic_fetch_and)
-AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, long, l, __hip_atomic_fetch_and)
-AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, unsigned long, m, __hip_atomic_fetch_and)
+#define __CLC_OP &
+#define __SPIRV_BUILTIN _Z17__spirv_AtomicAnd
+#define __HIP_BUILTIN __hip_atomic_fetch_and
 
-#undef AMDGPU_ATOMIC
-#undef AMDGPU_ATOMIC_IMPL
-#undef AMDGPU_ARCH_GEQ
-#undef AMDGPU_ARCH_BETWEEN
-#undef GET_ATOMIC_SCOPE_AND_ORDER
+#include "atomic_safe.def"
diff --git a/libclc/amdgcn-amdhsa/libspirv/atomic/atomic_helpers.h b/libclc/amdgcn-amdhsa/libspirv/atomic/atomic_helpers.h
@@ -9,6 +9,8 @@
 #include <spirv/spirv.h>
 #include <spirv/spirv_types.h>
 
+extern int __oclc_amdgpu_reflect(__constant char *);
+
 #define AMDGPU_ARCH_GEQ(LOWER) __oclc_ISA_version >= LOWER
 #define AMDGPU_ARCH_BETWEEN(LOWER, UPPER)                                      \
   __oclc_ISA_version >= LOWER &&__oclc_ISA_version < UPPER
@@ -72,14 +74,22 @@
   AMDGPU_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, local, U3AS3, 1, BUILTIN)  \
   AMDGPU_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, , , 0, BUILTIN)
 
-#define AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, AS, AS_MANGLED,                                          \
-                               SUB1, OP)                                                                               \
+// Safe atomics will either choose a slow CAS atomic impl (default) or a fast
+// native atomic if --amdgpu-unsafe-int-atomics is passed to LLVM.
+//
+// Safe atomics using CAS may be necessary if PCIe does not support atomic
+// operations such as and, or, xor
+#define AMDGPU_SAFE_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, AS, AS_MANGLED,                                         \
+                                SUB1, OP, USE_BUILTIN_COND, BUILTIN)                                                   \
   _CLC_DEF TYPE                                                                                                        \
       FUNC_NAME##P##AS_MANGLED##TYPE_MANGLED##N5__spv5Scope4FlagENS##SUB1##_19MemorySemanticsMask4FlagE##TYPE_MANGLED( \
           volatile AS TYPE *p, enum Scope scope,                                                                       \
           enum MemorySemanticsMask semantics, TYPE val) {                                                              \
     int atomic_scope = 0, memory_order = 0;                                                                            \
     GET_ATOMIC_SCOPE_AND_ORDER(scope, atomic_scope, semantics, memory_order)                                           \
+    if (USE_BUILTIN_COND)                                                                                              \
+      return BUILTIN(p, val, memory_order, atomic_scope);                                                              \
+    /* CAS atomics*/                                                                                                   \
     TYPE oldval = __hip_atomic_load(p, memory_order, atomic_scope);                                                    \
     TYPE newval = 0;                                                                                                   \
     do {                                                                                                               \
@@ -89,7 +99,13 @@
     return oldval;                                                                                                     \
   }
 
-#define AMDGPU_CAS_ATOMIC(FUNC_NAME, TYPE, TYPE_MANGLED, OP)                   \
-  AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, global, U3AS1, 1, OP)  \
-  AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, local, U3AS3, 1, OP)   \
-  AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, , , 0, OP)
+#define AMDGPU_SAFE_ATOMIC(FUNC_NAME, TYPE, TYPE_MANGLED, OP, BUILTIN)         \
+  AMDGPU_SAFE_ATOMIC_IMPL(                                                     \
+      FUNC_NAME, TYPE, TYPE_MANGLED, global, U3AS1, 1, OP,                     \
+      __oclc_amdgpu_reflect("AMDGPU_OCLC_UNSAFE_INT_ATOMICS"), BUILTIN)        \
+  AMDGPU_SAFE_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, local, U3AS3, 1, OP,  \
+                          true /* local AS should always use builtin*/,        \
+                          BUILTIN)                                             \
+  AMDGPU_SAFE_ATOMIC_IMPL(                                                     \
+      FUNC_NAME, TYPE, TYPE_MANGLED, , , 0, OP,                                \
+      __oclc_amdgpu_reflect("AMDGPU_OCLC_UNSAFE_INT_ATOMICS"), BUILTIN)
diff --git a/libclc/amdgcn-amdhsa/libspirv/atomic/atomic_or.cl b/libclc/amdgcn-amdhsa/libspirv/atomic/atomic_or.cl
@@ -10,13 +10,8 @@
 #include <spirv/spirv.h>
 #include <spirv/spirv_types.h>
 
-AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, int, i, __hip_atomic_fetch_or)
-AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, unsigned int, j, __hip_atomic_fetch_or)
-AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, long, l, __hip_atomic_fetch_or)
-AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, unsigned long, m, __hip_atomic_fetch_or)
+#define __CLC_OP |
+#define __SPIRV_BUILTIN _Z16__spirv_AtomicOr
+#define __HIP_BUILTIN __hip_atomic_fetch_or
 
-#undef AMDGPU_ATOMIC
-#undef AMDGPU_ATOMIC_IMPL
-#undef AMDGPU_ARCH_GEQ
-#undef AMDGPU_ARCH_BETWEEN
-#undef GET_ATOMIC_SCOPE_AND_ORDER
+#include "atomic_safe.def"
diff --git a/libclc/amdgcn-amdhsa/libspirv/atomic/atomic_safe.def b/libclc/amdgcn-amdhsa/libspirv/atomic/atomic_safe.def
@@ -0,0 +1,9 @@
+// Before including, define: __SPIRV_BUILTIN, __CLC_OP, __HIP_BUILTIN
+// and include atomic_helpers.h to get AMDGPU_SAFE_ATOMIC
+
+AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, int, i, __CLC_OP, __HIP_BUILTIN)
+AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, unsigned int, j, __CLC_OP,
+                   __HIP_BUILTIN)
+AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, long, l, __CLC_OP, __HIP_BUILTIN)
+AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, unsigned long, m, __CLC_OP,
+                   __HIP_BUILTIN)
diff --git a/libclc/amdgcn-amdhsa/libspirv/atomic/atomic_xor.cl b/libclc/amdgcn-amdhsa/libspirv/atomic/atomic_xor.cl
@@ -10,16 +10,8 @@
 #include <spirv/spirv.h>
 #include <spirv/spirv_types.h>
 
-#define __CLC_XOR ^
+#define __CLC_OP ^
+#define __SPIRV_BUILTIN _Z17__spirv_AtomicXor
+#define __HIP_BUILTIN __hip_atomic_fetch_xor
 
-AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, int, i, __CLC_XOR)
-AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, unsigned int, j, __CLC_XOR)
-AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, long, l, __CLC_XOR)
-AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, unsigned long, m, __CLC_XOR)
-
-#undef __CLC_XOR
-#undef AMDGPU_ATOMIC
-#undef AMDGPU_ATOMIC_IMPL
-#undef AMDGPU_ARCH_GEQ
-#undef AMDGPU_ARCH_BETWEEN
-#undef GET_ATOMIC_SCOPE_AND_ORDER
+#include "atomic_safe.def"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -130,6 +130,12 @@ struct AMDGPULowerKernelAttributesPass
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
+struct AMDGPUOclcReflectPass : public PassInfoMixin<AMDGPUOclcReflectPass> {
+public:
+  PreservedAnalyses run(Function &M, FunctionAnalysisManager &AM);
+  static bool isRequired() { return true; }
+};
+
 void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &);
 extern char &AMDGPULowerModuleLDSLegacyPassID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOclcReflect.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOclcReflect.cpp
@@ -0,0 +1,92 @@
+//===- AMDGPUOclcReflect.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass searches for occurences of the AMDGPU_OCLC_REFLECT function, and
+// replaces the calls with some val dependent on the operand of the func. This
+// can be used to reflect across different implementations of functions at
+// compile time based on a compiler flag or some other means. This pass
+// currently supports use cases:
+//
+// 1. Choose a safe or unsafe version of atomic_xor at compile time, which can
+//    be chosen at compile time by setting the flag
+//    --amdgpu-oclc-unsafe-int-atomics=true.
+//
+// This pass is similar to the NVPTX pass NVVMReflect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define AMDGPU_OCLC_REFLECT "__oclc_amdgpu_reflect"
+
+static cl::opt<bool>
+    AMDGPUReflectEnabled("amdgpu-oclc-reflect-enable", cl::init(true),
+                         cl::Hidden,
+                         cl::desc("AMDGPU reflection, enabled by default"));
+static cl::opt<bool> AMDGPUUnsafeIntAtomicsEnable(
+    "amdgpu-oclc-unsafe-int-atomics", cl::init(false), cl::Hidden,
+    cl::desc("Should unsafe int atomics be chosen. Disabled by default."));
+
+PreservedAnalyses AMDGPUOclcReflectPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  if (!AMDGPUReflectEnabled)
+    return PreservedAnalyses::all();
+
+  if (F.getName() == AMDGPU_OCLC_REFLECT) {
+    assert(F.isDeclaration() &&
+           "__oclc_amdgpu_reflect function should not have a body");
+    return PreservedAnalyses::all();
+  }
+
+  SmallVector<CallInst *, 4> ToRemove;
+
+  for (Instruction &I : instructions(F)) {
+    auto *Call = dyn_cast<CallInst>(&I);
+    if (!Call)
+      continue;
+    if (Function *Callee = Call->getCalledFunction();
+        !Callee || Callee->getName() != AMDGPU_OCLC_REFLECT)
+      continue;
+
+    assert(Call->arg_size() == 1 &&
+           "Wrong number of operands to __oclc_amdgpu_reflect function");
+
+    ToRemove.push_back(Call);
+  }
+
+  if (!ToRemove.size())
+    return PreservedAnalyses::all();
+
+  for (CallInst *Call : ToRemove) {
+    const Value *Str = Call->getArgOperand(0);
+    const Value *Operand = cast<Constant>(Str)->getOperand(0);
+    StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
+    ReflectArg = ReflectArg.drop_back(1);
+
+    if (ReflectArg == "AMDGPU_OCLC_UNSAFE_INT_ATOMICS") {
+      int ReflectVal = AMDGPUUnsafeIntAtomicsEnable ? 1 : 0;
+      Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
+    } else {
+      report_fatal_error("Invalid arg passed to __oclc_amdgpu_reflect");
+    }
+    Call->eraseFromParent();
+  }
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  PA.preserve<DominatorTreeAnalysis>();
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -36,6 +36,7 @@ FUNCTION_PASS("amdgpu-lower-kernel-arguments",
               AMDGPULowerKernelArgumentsPass(*this))
 FUNCTION_PASS("amdgpu-lower-kernel-attributes",
               AMDGPULowerKernelAttributesPass())
+FUNCTION_PASS("amdgpu-oclc-reflect", AMDGPUOclcReflectPass())
 FUNCTION_PASS("amdgpu-simplifylib", AMDGPUSimplifyLibCallsPass())
 FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
 FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -663,6 +663,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
   PB.registerPipelineStartEPCallback(
       [](ModulePassManager &PM, OptimizationLevel Level) {
         FunctionPassManager FPM;
+        FPM.addPass(AMDGPUOclcReflectPass());
         PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
         if (EnableHipStdPar)
           PM.addPass(HipStdParAcceleratorCodeSelectionPass());
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -82,6 +82,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUInsertSingleUseVDST.cpp
   AMDGPUMarkLastScratchLoad.cpp
   AMDGPUMIRFormatter.cpp
+  AMDGPUOclcReflect.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUPerfHintAnalysis.cpp
   AMDGPUPostLegalizerCombiner.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-oclc-reflect.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-oclc-reflect.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -p amdgpu-oclc-reflect %s | FileCheck %s -check-prefixes=CHECK,CHECK-SAFE-ATOMICS
+; RUN: opt -S -p amdgpu-oclc-reflect -amdgpu-oclc-unsafe-int-atomics=true %s | FileCheck %s -check-prefixes=CHECK,CHECK-UNSAFE-ATOMICS
+
+target triple = "amdgcn-amd-amdhsa"
+
+@.str = private unnamed_addr addrspace(4) constant [31 x i8] c"AMDGPU_OCLC_UNSAFE_INT_ATOMICS\00", align 1
+
+declare hidden i32 @__oclc_amdgpu_reflect(ptr addrspace(4) noundef) local_unnamed_addr
+
+define i32 @foo() {
+; CHECK-SAFE-ATOMICS-LABEL: define i32 @foo() {
+; CHECK-SAFE-ATOMICS-NEXT:    ret i32 0
+;
+; CHECK-UNSAFE-ATOMICS-LABEL: define i32 @foo() {
+; CHECK-UNSAFE-ATOMICS-NEXT:    ret i32 1
+;
+  %call = tail call i32 @__oclc_amdgpu_reflect(ptr addrspace(4) noundef @.str)
+  ret i32 %call
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/sycl/test/check_device_code/hip/atomic/amdgpu_unsafe_atomics.cpp b/sycl/test/check_device_code/hip/atomic/amdgpu_unsafe_atomics.cpp
@@ -0,0 +1,24 @@
+// REQUIRES: hip
+// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx906 %s -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SAFE
+// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx906 %s -mllvm --amdgpu-oclc-unsafe-int-atomics=true -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE
+
+#include <sycl/sycl.hpp>
+
+int main() {
+  sycl::queue{}.single_task([=] {
+    int a;
+    sycl::atomic_ref<int, sycl::memory_order_relaxed, sycl::memory_scope_device>
+        atomicInt(a);
+    atomicInt.fetch_xor(1);
+    atomicInt.fetch_and(1);
+    atomicInt.fetch_or(1);
+    // CHECK: __CLANG_OFFLOAD_BUNDLE____START__ sycl-amdgcn-amd-amdhsa-
+    // CHECK-SAFE: cmpxchg volatile
+    // CHECK-SAFE-NOT: atomicrmw
+    // CHECK-UNSAFE: atomicrmw volatile xor
+    // CHECK-UNSAFE: atomicrmw volatile and
+    // CHECK-UNSAFE: atomicrmw volatile or
+    // CHECK-UNSAFE-NOT: cmpxchg
+    // CHECK: __CLANG_OFFLOAD_BUNDLE____END__ sycl-amdgcn-amd-amdhsa-
+  });
+}