Skip to content

Commit 34135a3

Browse files
authored
[SYCL][HIP] Add AMDGPU reflect pass to choose between safe and unsafe AMDGPU atomics (#11467)
The AMDGPU reflect pass is needed to choose between safe and unsafe atomics at the libclc level. In the long run we will delete this patch, as work is being done to ensure correct lowering of atomic instructions. See patches: llvm/llvm-project#85052 and llvm/llvm-project#69229. This work is necessary because malloc shared atomics rely on PCIe atomics, which can have patchy and unreliable support. Therefore, we want to be able to choose at compile time whether we should use safe atomics using CAS (which PCIe should support), or whether we want to rely on the availability of the newest PCIe atomics, if malloc shared atomics are desired. Also changes the implementation of `atomic_or` and `atomic_and` so that they can choose between the safe or unsafe version based on the AMDGPU reflect value.
1 parent daeb58b commit 34135a3

File tree

13 files changed

+196
-36
lines changed

13 files changed

+196
-36
lines changed

libclc/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,10 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
328328
# Disables NVVM reflection to defer to after linking
329329
list( APPEND flags -Xclang -target-feature -Xclang +ptx72
330330
-march=sm_86 -mllvm --nvvm-reflect-enable=false)
331+
elseif( ARCH STREQUAL amdgcn )
332+
# AMDGCN needs libclc to be compiled to high bc version since all atomic
333+
# clang builtins need to be accessible
334+
list( APPEND flags -mcpu=gfx940 -mllvm --amdgpu-oclc-reflect-enable=false )
331335
elseif( ARCH STREQUAL x86_64)
332336
# TODO: This is used by SYCL Native CPU; we should define an option to set these flags
333337
list( APPEND flags -Xclang -target-feature -Xclang +avx
@@ -358,6 +362,8 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
358362
# simultaneously, we choose to declare the builtins using the private space,
359363
# which will also work for the generic address space.
360364
set( supports_generic_addrspace FALSE )
365+
elseif( ARCH STREQUAL amdgcn )
366+
set( opt_flags -O3 --amdgpu-oclc-reflect-enable=false )
361367
elseif( ARCH STREQUAL x86_64)
362368
set( opt_flags )
363369
set( supports_generic_addrspace FALSE )

libclc/amdgcn-amdhsa/libspirv/atomic/atomic_and.cl

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,8 @@
1010
#include <spirv/spirv.h>
1111
#include <spirv/spirv_types.h>
1212

13-
AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, int, i, __hip_atomic_fetch_and)
14-
AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, unsigned int, j, __hip_atomic_fetch_and)
15-
AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, long, l, __hip_atomic_fetch_and)
16-
AMDGPU_ATOMIC(_Z17__spirv_AtomicAnd, unsigned long, m, __hip_atomic_fetch_and)
13+
#define __CLC_OP &
14+
#define __SPIRV_BUILTIN _Z17__spirv_AtomicAnd
15+
#define __HIP_BUILTIN __hip_atomic_fetch_and
1716

18-
#undef AMDGPU_ATOMIC
19-
#undef AMDGPU_ATOMIC_IMPL
20-
#undef AMDGPU_ARCH_GEQ
21-
#undef AMDGPU_ARCH_BETWEEN
22-
#undef GET_ATOMIC_SCOPE_AND_ORDER
17+
#include "atomic_safe.def"

libclc/amdgcn-amdhsa/libspirv/atomic/atomic_helpers.h

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
#include <spirv/spirv.h>
1010
#include <spirv/spirv_types.h>
1111

12+
extern int __oclc_amdgpu_reflect(__constant char *);
13+
1214
#define AMDGPU_ARCH_GEQ(LOWER) __oclc_ISA_version >= LOWER
1315
#define AMDGPU_ARCH_BETWEEN(LOWER, UPPER) \
1416
__oclc_ISA_version >= LOWER &&__oclc_ISA_version < UPPER
@@ -72,14 +74,22 @@
7274
AMDGPU_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, local, U3AS3, 1, BUILTIN) \
7375
AMDGPU_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, , , 0, BUILTIN)
7476

75-
#define AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, AS, AS_MANGLED, \
76-
SUB1, OP) \
77+
// Safe atomics will either choose a slow CAS atomic impl (default) or a fast
78+
// native atomic if --amdgpu-oclc-unsafe-int-atomics is passed to LLVM.
79+
//
80+
// Safe atomics using CAS may be necessary if PCIe does not support atomic
81+
// operations such as and, or, xor
82+
#define AMDGPU_SAFE_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, AS, AS_MANGLED, \
83+
SUB1, OP, USE_BUILTIN_COND, BUILTIN) \
7784
_CLC_DEF TYPE \
7885
FUNC_NAME##P##AS_MANGLED##TYPE_MANGLED##N5__spv5Scope4FlagENS##SUB1##_19MemorySemanticsMask4FlagE##TYPE_MANGLED( \
7986
volatile AS TYPE *p, enum Scope scope, \
8087
enum MemorySemanticsMask semantics, TYPE val) { \
8188
int atomic_scope = 0, memory_order = 0; \
8289
GET_ATOMIC_SCOPE_AND_ORDER(scope, atomic_scope, semantics, memory_order) \
90+
if (USE_BUILTIN_COND) \
91+
return BUILTIN(p, val, memory_order, atomic_scope); \
92+
/* CAS atomics*/ \
8393
TYPE oldval = __hip_atomic_load(p, memory_order, atomic_scope); \
8494
TYPE newval = 0; \
8595
do { \
@@ -89,7 +99,13 @@
8999
return oldval; \
90100
}
91101

92-
#define AMDGPU_CAS_ATOMIC(FUNC_NAME, TYPE, TYPE_MANGLED, OP) \
93-
AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, global, U3AS1, 1, OP) \
94-
AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, local, U3AS3, 1, OP) \
95-
AMDGPU_CAS_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, , , 0, OP)
102+
#define AMDGPU_SAFE_ATOMIC(FUNC_NAME, TYPE, TYPE_MANGLED, OP, BUILTIN) \
103+
AMDGPU_SAFE_ATOMIC_IMPL( \
104+
FUNC_NAME, TYPE, TYPE_MANGLED, global, U3AS1, 1, OP, \
105+
__oclc_amdgpu_reflect("AMDGPU_OCLC_UNSAFE_INT_ATOMICS"), BUILTIN) \
106+
AMDGPU_SAFE_ATOMIC_IMPL(FUNC_NAME, TYPE, TYPE_MANGLED, local, U3AS3, 1, OP, \
107+
true /* local AS should always use builtin*/, \
108+
BUILTIN) \
109+
AMDGPU_SAFE_ATOMIC_IMPL( \
110+
FUNC_NAME, TYPE, TYPE_MANGLED, , , 0, OP, \
111+
__oclc_amdgpu_reflect("AMDGPU_OCLC_UNSAFE_INT_ATOMICS"), BUILTIN)

libclc/amdgcn-amdhsa/libspirv/atomic/atomic_or.cl

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,8 @@
1010
#include <spirv/spirv.h>
1111
#include <spirv/spirv_types.h>
1212

13-
AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, int, i, __hip_atomic_fetch_or)
14-
AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, unsigned int, j, __hip_atomic_fetch_or)
15-
AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, long, l, __hip_atomic_fetch_or)
16-
AMDGPU_ATOMIC(_Z16__spirv_AtomicOr, unsigned long, m, __hip_atomic_fetch_or)
13+
#define __CLC_OP |
14+
#define __SPIRV_BUILTIN _Z16__spirv_AtomicOr
15+
#define __HIP_BUILTIN __hip_atomic_fetch_or
1716

18-
#undef AMDGPU_ATOMIC
19-
#undef AMDGPU_ATOMIC_IMPL
20-
#undef AMDGPU_ARCH_GEQ
21-
#undef AMDGPU_ARCH_BETWEEN
22-
#undef GET_ATOMIC_SCOPE_AND_ORDER
17+
#include "atomic_safe.def"
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Before including, define: __SPIRV_BUILTIN, __CLC_OP, __HIP_BUILTIN
2+
// and include atomic_helpers.h to get AMDGPU_SAFE_ATOMIC
3+
4+
AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, int, i, __CLC_OP, __HIP_BUILTIN)
5+
AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, unsigned int, j, __CLC_OP,
6+
__HIP_BUILTIN)
7+
AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, long, l, __CLC_OP, __HIP_BUILTIN)
8+
AMDGPU_SAFE_ATOMIC(__SPIRV_BUILTIN, unsigned long, m, __CLC_OP,
9+
__HIP_BUILTIN)

libclc/amdgcn-amdhsa/libspirv/atomic/atomic_xor.cl

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,8 @@
1010
#include <spirv/spirv.h>
1111
#include <spirv/spirv_types.h>
1212

13-
#define __CLC_XOR ^
13+
#define __CLC_OP ^
14+
#define __SPIRV_BUILTIN _Z17__spirv_AtomicXor
15+
#define __HIP_BUILTIN __hip_atomic_fetch_xor
1416

15-
AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, int, i, __CLC_XOR)
16-
AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, unsigned int, j, __CLC_XOR)
17-
AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, long, l, __CLC_XOR)
18-
AMDGPU_CAS_ATOMIC(_Z17__spirv_AtomicXor, unsigned long, m, __CLC_XOR)
19-
20-
#undef __CLC_XOR
21-
#undef AMDGPU_ATOMIC
22-
#undef AMDGPU_ATOMIC_IMPL
23-
#undef AMDGPU_ARCH_GEQ
24-
#undef AMDGPU_ARCH_BETWEEN
25-
#undef GET_ATOMIC_SCOPE_AND_ORDER
17+
#include "atomic_safe.def"

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,12 @@ struct AMDGPULowerKernelAttributesPass
130130
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
131131
};
132132

133+
// Function pass that rewrites calls to __oclc_amdgpu_reflect into constants
// (implemented in AMDGPUOclcReflect.cpp).
struct AMDGPUOclcReflectPass : public PassInfoMixin<AMDGPUOclcReflectPass> {
134+
public:
135+
// NOTE(review): the first parameter is a Function even though it is named M
// here; the out-of-line definition names it F.
PreservedAnalyses run(Function &M, FunctionAnalysisManager &AM);
136+
// Always reports the pass as required; presumably this keeps the pass manager
// from skipping it -- confirm against the new pass manager documentation.
static bool isRequired() { return true; }
137+
};
138+
133139
void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &);
134140
extern char &AMDGPULowerModuleLDSLegacyPassID;
135141

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
//===- AMDGPUOclcReflect.cpp ----------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This pass searches for occurrences of the AMDGPU_OCLC_REFLECT function, and
10+
// replaces the calls with some val dependent on the operand of the func. This
11+
// can be used to reflect across different implementations of functions at
12+
// compile time based on a compiler flag or some other means. This pass
13+
// currently supports use cases:
14+
//
15+
// 1. Choose a safe (CAS) or unsafe (native) version of the integer atomics
16+
// and/or/xor at compile time; the unsafe version is selected by setting
17+
// --amdgpu-oclc-unsafe-int-atomics=true.
18+
//
19+
// This pass is similar to the NVPTX pass NVVMReflect.
20+
//
21+
//===----------------------------------------------------------------------===//
22+
23+
#include "AMDGPU.h"
24+
#include "llvm/IR/Constants.h"
25+
#include "llvm/IR/Dominators.h"
26+
#include "llvm/IR/InstIterator.h"
27+
#include "llvm/IR/Instructions.h"
28+
#include "llvm/Pass.h"
29+
#include "llvm/Support/CommandLine.h"
30+
31+
using namespace llvm;
32+
33+
#define AMDGPU_OCLC_REFLECT "__oclc_amdgpu_reflect"
34+
35+
static cl::opt<bool>
36+
AMDGPUReflectEnabled("amdgpu-oclc-reflect-enable", cl::init(true),
37+
cl::Hidden,
38+
cl::desc("AMDGPU reflection, enabled by default"));
39+
static cl::opt<bool> AMDGPUUnsafeIntAtomicsEnable(
40+
"amdgpu-oclc-unsafe-int-atomics", cl::init(false), cl::Hidden,
41+
cl::desc("Should unsafe int atomics be chosen. Disabled by default."));
42+
43+
// Resolves every direct call to __oclc_amdgpu_reflect inside F: each call is
// replaced by an integer constant selected by its string argument and is then
// erased. Returns PreservedAnalyses::all() when nothing was rewritten;
// otherwise only the CFG analyses and the dominator tree are reported as
// preserved. AM is not used.
PreservedAnalyses AMDGPUOclcReflectPass::run(Function &F,
44+
FunctionAnalysisManager &AM) {
45+
// Reflection can be switched off with -amdgpu-oclc-reflect-enable=false, in
// which case the reflect calls are left in place.
if (!AMDGPUReflectEnabled)
46+
return PreservedAnalyses::all();
47+
48+
// The reflect function itself has nothing to rewrite; it is only expected to
// be a declaration.
if (F.getName() == AMDGPU_OCLC_REFLECT) {
49+
assert(F.isDeclaration() &&
50+
"__oclc_amdgpu_reflect function should not have a body");
51+
return PreservedAnalyses::all();
52+
}
53+
54+
// Calls are collected first and rewritten in a second loop, so no instruction
// is erased while instructions(F) is being iterated.
SmallVector<CallInst *, 4> ToRemove;
55+
56+
for (Instruction &I : instructions(F)) {
57+
auto *Call = dyn_cast<CallInst>(&I);
58+
if (!Call)
59+
continue;
// Indirect calls (no known callee) and calls to any other function are
// skipped.
60+
if (Function *Callee = Call->getCalledFunction();
61+
!Callee || Callee->getName() != AMDGPU_OCLC_REFLECT)
62+
continue;
63+
64+
// NOTE(review): this is only checked in builds with assertions enabled.
assert(Call->arg_size() == 1 &&
65+
"Wrong number of operands to __oclc_amdgpu_reflect function");
66+
67+
ToRemove.push_back(Call);
68+
}
69+
70+
if (!ToRemove.size())
71+
return PreservedAnalyses::all();
72+
73+
for (CallInst *Call : ToRemove) {
74+
const Value *Str = Call->getArgOperand(0);
75+
// The argument must be a Constant whose operand 0 is a
// ConstantDataSequential; both cast<> calls assert otherwise.
// NOTE(review): presumably the argument is the global holding the string
// and operand 0 is its initializer -- confirm; a constant-expression
// argument would not match this shape.
const Value *Operand = cast<Constant>(Str)->getOperand(0);
76+
StringRef ReflectArg = cast<ConstantDataSequential>(Operand)->getAsString();
77+
// Drops the last character, presumably the trailing NUL of the C string.
ReflectArg = ReflectArg.drop_back(1);
78+
79+
if (ReflectArg == "AMDGPU_OCLC_UNSAFE_INT_ATOMICS") {
80+
// 1 when -amdgpu-oclc-unsafe-int-atomics is set, 0 otherwise (the default).
int ReflectVal = AMDGPUUnsafeIntAtomicsEnable ? 1 : 0;
81+
Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
82+
} else {
83+
// Any other key aborts compilation.
report_fatal_error("Invalid arg passed to __oclc_amdgpu_reflect");
84+
}
85+
Call->eraseFromParent();
86+
}
87+
88+
// Only call instructions were replaced and erased; no blocks or branches were
// touched, so CFG-based analyses are kept.
PreservedAnalyses PA;
89+
PA.preserveSet<CFGAnalyses>();
90+
PA.preserve<DominatorTreeAnalysis>();
91+
return PA;
92+
}

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ FUNCTION_PASS("amdgpu-lower-kernel-arguments",
3636
AMDGPULowerKernelArgumentsPass(*this))
3737
FUNCTION_PASS("amdgpu-lower-kernel-attributes",
3838
AMDGPULowerKernelAttributesPass())
39+
FUNCTION_PASS("amdgpu-oclc-reflect", AMDGPUOclcReflectPass())
3940
FUNCTION_PASS("amdgpu-simplifylib", AMDGPUSimplifyLibCallsPass())
4041
FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
4142
FUNCTION_PASS("amdgpu-promote-alloca-to-vector",

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -663,6 +663,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
663663
PB.registerPipelineStartEPCallback(
664664
[](ModulePassManager &PM, OptimizationLevel Level) {
665665
FunctionPassManager FPM;
666+
FPM.addPass(AMDGPUOclcReflectPass());
666667
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
667668
if (EnableHipStdPar)
668669
PM.addPass(HipStdParAcceleratorCodeSelectionPass());

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ add_llvm_target(AMDGPUCodeGen
8282
AMDGPUInsertSingleUseVDST.cpp
8383
AMDGPUMarkLastScratchLoad.cpp
8484
AMDGPUMIRFormatter.cpp
85+
AMDGPUOclcReflect.cpp
8586
AMDGPUOpenCLEnqueuedBlockLowering.cpp
8687
AMDGPUPerfHintAnalysis.cpp
8788
AMDGPUPostLegalizerCombiner.cpp
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2+
; RUN: opt -S -p amdgpu-oclc-reflect %s | FileCheck %s -check-prefixes=CHECK,CHECK-SAFE-ATOMICS
3+
; RUN: opt -S -p amdgpu-oclc-reflect -amdgpu-oclc-unsafe-int-atomics=true %s | FileCheck %s -check-prefixes=CHECK,CHECK-UNSAFE-ATOMICS
4+
5+
target triple = "amdgcn-amd-amdhsa"
6+
7+
@.str = private unnamed_addr addrspace(4) constant [31 x i8] c"AMDGPU_OCLC_UNSAFE_INT_ATOMICS\00", align 1
8+
9+
declare hidden i32 @__oclc_amdgpu_reflect(ptr addrspace(4) noundef) local_unnamed_addr
10+
11+
define i32 @foo() {
12+
; CHECK-SAFE-ATOMICS-LABEL: define i32 @foo() {
13+
; CHECK-SAFE-ATOMICS-NEXT: ret i32 0
14+
;
15+
; CHECK-UNSAFE-ATOMICS-LABEL: define i32 @foo() {
16+
; CHECK-UNSAFE-ATOMICS-NEXT: ret i32 1
17+
;
18+
%call = tail call i32 @__oclc_amdgpu_reflect(ptr addrspace(4) noundef @.str)
19+
ret i32 %call
20+
}
21+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
22+
; CHECK: {{.*}}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// REQUIRES: hip
2+
// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx906 %s -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SAFE
3+
// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx906 %s -mllvm --amdgpu-oclc-unsafe-int-atomics=true -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE
4+
5+
#include <sycl/sycl.hpp>
6+
7+
int main() {
8+
sycl::queue{}.single_task([=] {
9+
int a;
10+
sycl::atomic_ref<int, sycl::memory_order_relaxed, sycl::memory_scope_device>
11+
atomicInt(a);
12+
atomicInt.fetch_xor(1);
13+
atomicInt.fetch_and(1);
14+
atomicInt.fetch_or(1);
15+
// CHECK: __CLANG_OFFLOAD_BUNDLE____START__ sycl-amdgcn-amd-amdhsa-
16+
// CHECK-SAFE: cmpxchg volatile
17+
// CHECK-SAFE-NOT: atomicrmw
18+
// CHECK-UNSAFE: atomicrmw volatile xor
19+
// CHECK-UNSAFE: atomicrmw volatile and
20+
// CHECK-UNSAFE: atomicrmw volatile or
21+
// CHECK-UNSAFE-NOT: cmpxchg
22+
// CHECK: __CLANG_OFFLOAD_BUNDLE____END__ sycl-amdgcn-amd-amdhsa-
23+
});
24+
}

0 commit comments

Comments
 (0)