Skip to content

Commit 2186199

Browse files
authored
[OpenMP] Cleanup and fixes for ABI agnostic DeviceRTL (#71234)
Fixes the DeviceRTL compilation to ensure it is ABI agnostic. Uses already available global variable "oclc_ABI_version" instead of "llvm.amdgcn.abi.verion". It also adds some minor fields in ImplicitArg structure.
1 parent 1f21e49 commit 2186199

File tree

7 files changed

+24
-18
lines changed

7 files changed

+24
-18
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17432,19 +17432,19 @@ Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
1743217432
/// Emit code based on Code Object ABI version.
1743317433
/// COV_4 : Emit code to use dispatch ptr
1743417434
/// COV_5 : Emit code to use implicitarg ptr
17435-
/// COV_NONE : Emit code to load a global variable "llvm.amdgcn.abi.version"
17435+
/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
1743617436
/// and use its value for COV_4 or COV_5 approach. It is used for
1743717437
/// compiling device libraries in an ABI-agnostic way.
1743817438
///
17439-
/// Note: "llvm.amdgcn.abi.version" is supposed to be emitted and intialized by
17439+
/// Note: "__oclc_ABI_version" is supposed to be emitted and intialized by
1744017440
/// clang during compilation of user code.
1744117441
Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
1744217442
llvm::LoadInst *LD;
1744317443

1744417444
auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
1744517445

1744617446
if (Cov == clang::TargetOptions::COV_None) {
17447-
StringRef Name = "llvm.amdgcn.abi.version";
17447+
StringRef Name = "__oclc_ABI_version";
1744817448
auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
1744917449
if (!ABIVersionC)
1745017450
ABIVersionC = new llvm::GlobalVariable(

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -362,11 +362,15 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
362362
/// AMDGPU ROCm device libraries.
363363
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
364364
CodeGen::CodeGenModule &CGM) const {
365-
StringRef Name = "llvm.amdgcn.abi.version";
365+
StringRef Name = "__oclc_ABI_version";
366366
llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
367367
if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
368368
return;
369369

370+
if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
371+
clang::TargetOptions::COV_None)
372+
return;
373+
370374
auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
371375
llvm::Constant *COV = llvm::ConstantInt::get(
372376
Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

clang/test/CodeGen/amdgpu-abi-version.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
// RUN: %clang_cc1 -cc1 -triple amdgcn-amd-amdhsa -emit-llvm -mcode-object-version=none %s -o - | FileCheck %s
33

44
//.
5-
// CHECK: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 0
5+
// CHECK: @__oclc_ABI_version = external addrspace(4) global i32
66
//.
77
// CHECK-LABEL: define dso_local i32 @foo(
88
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
99
// CHECK-NEXT: entry:
1010
// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
1111
// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
12-
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) @llvm.amdgcn.abi.version, align 4
12+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
1313
// CHECK-NEXT: [[TMP1:%.*]] = icmp sge i32 [[TMP0]], 500
1414
// CHECK-NEXT: [[TMP2:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
1515
// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i32 12

clang/test/CodeGen/amdgpu-address-spaces.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ int [[clang::address_space(999)]] bbb = 1234;
2929
// CHECK: @u = addrspace(5) global i32 undef, align 4
3030
// CHECK: @aaa = addrspace(6) global i32 1000, align 4
3131
// CHECK: @bbb = addrspace(999) global i32 1234, align 4
32-
// CHECK: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400
32+
// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400
3333
//.
3434
// CHECK-LABEL: define dso_local amdgpu_kernel void @foo(
3535
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {

clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@
1717

1818
#include "Inputs/cuda.h"
1919

20-
// LINKED4: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400
20+
// LINKED4: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400
2121
// LINKED4-LABEL: bar
22-
// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
22+
// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}}
2323
// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
2424
// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
2525
// LINKED4: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
@@ -28,7 +28,7 @@
2828
// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
2929
// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
3030

31-
// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
31+
// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}}
3232
// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
3333
// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
3434
// LINKED4: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
@@ -37,7 +37,7 @@
3737
// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
3838
// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
3939

40-
// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
40+
// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}}
4141
// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
4242
// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
4343
// LINKED4: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
@@ -47,9 +47,9 @@
4747
// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
4848
// LINKED4: "amdgpu_code_object_version", i32 400
4949

50-
// LINKED5: llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
50+
// LINKED5: __oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
5151
// LINKED5-LABEL: bar
52-
// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
52+
// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}}
5353
// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
5454
// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
5555
// LINKED5: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
@@ -58,7 +58,7 @@
5858
// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
5959
// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
6060

61-
// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
61+
// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}}
6262
// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
6363
// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
6464
// LINKED5: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
@@ -67,7 +67,7 @@
6767
// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
6868
// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
6969

70-
// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
70+
// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}}
7171
// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
7272
// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
7373
// LINKED5: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16

clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333

3434

3535
// COVNONE-LABEL: test_get_workgroup_size
36-
// COVNONE: load i32, ptr addrspace(4) @llvm.amdgcn.abi.version
36+
// COVNONE: load i32, ptr addrspace(4) @__oclc_ABI_version
3737
// COVNONE: [[ABI5_X:%.*]] = icmp sge i32 %{{.*}}, 500
3838
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
3939
// COVNONE: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
@@ -42,7 +42,7 @@
4242
// COVNONE: select i1 [[ABI5_X]], ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
4343
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
4444

45-
// COVNONE: load i32, ptr addrspace(4) @llvm.amdgcn.abi.version
45+
// COVNONE: load i32, ptr addrspace(4) @__oclc_ABI_version
4646
// COVNONE: [[ABI5_Y:%.*]] = icmp sge i32 %{{.*}}, 500
4747
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
4848
// COVNONE: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
@@ -51,7 +51,7 @@
5151
// COVNONE: select i1 [[ABI5_Y]], ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
5252
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
5353

54-
// COVNONE: load i32, ptr addrspace(4) @llvm.amdgcn.abi.version
54+
// COVNONE: load i32, ptr addrspace(4) @__oclc_ABI_version
5555
// COVNONE: [[ABI5_Z:%.*]] = icmp sge i32 %{{.*}}, 500
5656
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
5757
// COVNONE: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3086,6 +3086,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
30863086
// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
30873087
if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
30883088
ImplArgs->BlockCountX = NumBlocks;
3089+
ImplArgs->BlockCountY = 1;
3090+
ImplArgs->BlockCountZ = 1;
30893091
ImplArgs->GroupSizeX = NumThreads;
30903092
ImplArgs->GroupSizeY = 1;
30913093
ImplArgs->GroupSizeZ = 1;

0 commit comments

Comments
 (0)