Skip to content

Commit 25efe63

Browse files
AlexVlxsearlmc1
authored andcommitted
[clang][CodeGen][SPIRV] Translate amdgpu_flat_work_group_size into max_work_group_size. (llvm#116820)
HIPAMD relies on the `amdgpu_flat_work_group_size` attribute to implement key functionality such as the `__launch_bounds__` `__global__` function annotation. This attribute is not available / directly translatable to SPIR-V, hence as it is AMDGCN flavoured SPIR-V suffers from information loss. This patch addresses that limitation by converting the unsupported attribute into the `max_work_group_size` attribute which maps to [`MaxWorkgroupSizeINTEL`](https://github.com/KhronosGroup/SPIRV-Registry/blob/main/extensions/INTEL/SPV_INTEL_kernel_attributes.asciidoc), which is available in / handled by SPIR-V. When reverse translating from SPIR-V to AMDGCN LLVMIR we invert the map and add the original AMDGPU attribute. Change-Id: I5d95cd17d7169a61dc26fb410a838263e4497374
1 parent 4cadbfd commit 25efe63

File tree

3 files changed

+64
-16
lines changed

3 files changed

+64
-16
lines changed

clang/lib/CodeGen/Targets/SPIR.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo {
6060
void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
6161
LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
6262
const VarDecl *D) const override;
63+
void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
64+
CodeGen::CodeGenModule &M) const override;
6365
llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
6466
SyncScope Scope,
6567
llvm::AtomicOrdering Ordering,
@@ -241,6 +243,41 @@ SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
241243
return DefaultGlobalAS;
242244
}
243245

246+
void SPIRVTargetCodeGenInfo::setTargetAttributes(
247+
const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
248+
if (!M.getLangOpts().HIP ||
249+
M.getTarget().getTriple().getVendor() != llvm::Triple::AMD)
250+
return;
251+
if (GV->isDeclaration())
252+
return;
253+
254+
auto F = dyn_cast<llvm::Function>(GV);
255+
if (!F)
256+
return;
257+
258+
auto FD = dyn_cast_or_null<FunctionDecl>(D);
259+
if (!FD)
260+
return;
261+
if (!FD->hasAttr<CUDAGlobalAttr>())
262+
return;
263+
264+
unsigned N = M.getLangOpts().GPUMaxThreadsPerBlock;
265+
if (auto FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>())
266+
N = FlatWGS->getMax()->EvaluateKnownConstInt(M.getContext()).getExtValue();
267+
268+
// We encode the maximum flat WG size in the first component of the 3D
269+
// max_work_group_size attribute, which will get reverse translated into the
270+
// original AMDGPU attribute when targeting AMDGPU.
271+
auto Int32Ty = llvm::IntegerType::getInt32Ty(M.getLLVMContext());
272+
llvm::Metadata *AttrMDArgs[] = {
273+
llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, N)),
274+
llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1)),
275+
llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(Int32Ty, 1))};
276+
277+
F->setMetadata("max_work_group_size",
278+
llvm::MDNode::get(M.getLLVMContext(), AttrMDArgs));
279+
}
280+
244281
llvm::SyncScope::ID
245282
SPIRVTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &, SyncScope Scope,
246283
llvm::AtomicOrdering,

clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
// CHECK-NEXT: ret void
3535
//
3636
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi(
37-
// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] {
37+
// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] {
3838
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
3939
// CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
4040
// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
@@ -60,7 +60,7 @@
6060
// OPT-NEXT: ret void
6161
//
6262
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel1Pi(
63-
// OPT-SPIRV-SAME: ptr addrspace(1) nocapture noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] {
63+
// OPT-SPIRV-SAME: ptr addrspace(1) nocapture noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0:[0-9]+]] !max_work_group_size [[META5:![0-9]+]] {
6464
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
6565
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE]] to ptr addrspace(4)
6666
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4
@@ -104,7 +104,7 @@ __global__ void kernel1(int *x) {
104104
// CHECK-NEXT: ret void
105105
//
106106
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri(
107-
// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
107+
// CHECK-SPIRV-SAME: ptr addrspace(1) noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
108108
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
109109
// CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
110110
// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
@@ -129,7 +129,7 @@ __global__ void kernel1(int *x) {
129129
// OPT-NEXT: ret void
130130
//
131131
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel2Ri(
132-
// OPT-SPIRV-SAME: ptr addrspace(1) nocapture noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
132+
// OPT-SPIRV-SAME: ptr addrspace(1) nocapture noundef align 4 dereferenceable(4) [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
133133
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
134134
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE]] to ptr addrspace(4)
135135
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4
@@ -173,7 +173,7 @@ __global__ void kernel2(int &x) {
173173
// CHECK-NEXT: ret void
174174
//
175175
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i(
176-
// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] {
176+
// CHECK-SPIRV-SAME: ptr addrspace(2) noundef [[X:%.*]], ptr addrspace(1) noundef [[Y:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
177177
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
178178
// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(2), align 8
179179
// CHECK-SPIRV-NEXT: [[Y_ADDR:%.*]] = alloca ptr addrspace(1), align 8
@@ -197,7 +197,7 @@ __global__ void kernel2(int &x) {
197197
// OPT-NEXT: ret void
198198
//
199199
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel3PU3AS2iPU3AS1i(
200-
// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
200+
// OPT-SPIRV-SAME: ptr addrspace(2) nocapture noundef readonly [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[Y:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
201201
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
202202
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(2) [[X]], align 4
203203
// OPT-SPIRV-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[Y]], align 4
@@ -305,7 +305,7 @@ struct S {
305305
// CHECK-NEXT: ret void
306306
//
307307
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S(
308-
// CHECK-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
308+
// CHECK-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
309309
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
310310
// CHECK-SPIRV-NEXT: [[S:%.*]] = alloca [[STRUCT_S]], align 8
311311
// CHECK-SPIRV-NEXT: [[S1:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(4)
@@ -341,7 +341,7 @@ struct S {
341341
// OPT-NEXT: ret void
342342
//
343343
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel41S(
344-
// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2:[0-9]+]] {
344+
// OPT-SPIRV-SAME: [[STRUCT_S:%.*]] [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2:[0-9]+]] !max_work_group_size [[META5]] {
345345
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
346346
// OPT-SPIRV-NEXT: [[S_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 0
347347
// OPT-SPIRV-NEXT: [[S_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [[STRUCT_S]] [[S_COERCE]], 1
@@ -405,7 +405,7 @@ __global__ void kernel4(struct S s) {
405405
// CHECK-NEXT: ret void
406406
//
407407
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S(
408-
// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
408+
// CHECK-SPIRV-SAME: ptr addrspace(1) noundef [[S_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
409409
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
410410
// CHECK-SPIRV-NEXT: [[S:%.*]] = alloca ptr addrspace(4), align 8
411411
// CHECK-SPIRV-NEXT: [[S_ADDR:%.*]] = alloca ptr addrspace(4), align 8
@@ -447,7 +447,7 @@ __global__ void kernel4(struct S s) {
447447
// OPT-NEXT: ret void
448448
//
449449
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel5P1S(
450-
// OPT-SPIRV-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] {
450+
// OPT-SPIRV-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] !max_work_group_size [[META5]] {
451451
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
452452
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[S_COERCE]] to ptr addrspace(4)
453453
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[TMP0]], align 8
@@ -511,7 +511,7 @@ struct T {
511511
// CHECK-NEXT: ret void
512512
//
513513
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T(
514-
// CHECK-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
514+
// CHECK-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
515515
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
516516
// CHECK-SPIRV-NEXT: [[T:%.*]] = alloca [[STRUCT_T]], align 8
517517
// CHECK-SPIRV-NEXT: [[T1:%.*]] = addrspacecast ptr [[T]] to ptr addrspace(4)
@@ -549,7 +549,7 @@ struct T {
549549
// OPT-NEXT: ret void
550550
//
551551
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel61T(
552-
// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] {
552+
// OPT-SPIRV-SAME: [[STRUCT_T:%.*]] [[T_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] !max_work_group_size [[META5]] {
553553
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
554554
// OPT-SPIRV-NEXT: [[T_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_T]] [[T_COERCE]], 0, 0
555555
// OPT-SPIRV-NEXT: [[T_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_T]] [[T_COERCE]], 0, 1
@@ -604,7 +604,7 @@ __global__ void kernel6(struct T t) {
604604
// CHECK-NEXT: ret void
605605
//
606606
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi(
607-
// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
607+
// CHECK-SPIRV-SAME: ptr addrspace(1) noalias noundef [[X_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
608608
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
609609
// CHECK-SPIRV-NEXT: [[X:%.*]] = alloca ptr addrspace(4), align 8
610610
// CHECK-SPIRV-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(4), align 8
@@ -630,7 +630,7 @@ __global__ void kernel6(struct T t) {
630630
// OPT-NEXT: ret void
631631
//
632632
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel7Pi(
633-
// OPT-SPIRV-SAME: ptr addrspace(1) noalias nocapture noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] {
633+
// OPT-SPIRV-SAME: ptr addrspace(1) noalias nocapture noundef [[X_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
634634
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
635635
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(1) [[X_COERCE]] to ptr addrspace(4)
636636
// OPT-SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4
@@ -676,7 +676,7 @@ struct SS {
676676
// CHECK-NEXT: ret void
677677
//
678678
// CHECK-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS(
679-
// CHECK-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] {
679+
// CHECK-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) addrspace(4) #[[ATTR0]] !max_work_group_size [[META5]] {
680680
// CHECK-SPIRV-NEXT: [[ENTRY:.*:]]
681681
// CHECK-SPIRV-NEXT: [[A:%.*]] = alloca [[STRUCT_SS]], align 8
682682
// CHECK-SPIRV-NEXT: [[A1:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
@@ -697,7 +697,7 @@ struct SS {
697697
// OPT-NEXT: ret void
698698
//
699699
// OPT-SPIRV-LABEL: define spir_kernel void @_Z7kernel82SS(
700-
// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] {
700+
// OPT-SPIRV-SAME: [[STRUCT_SS:%.*]] [[A_COERCE:%.*]]) local_unnamed_addr addrspace(4) #[[ATTR2]] !max_work_group_size [[META5]] {
701701
// OPT-SPIRV-NEXT: [[ENTRY:.*:]]
702702
// OPT-SPIRV-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_SS]] [[A_COERCE]], 0
703703
// OPT-SPIRV-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[A_COERCE_FCA_0_EXTRACT]], align 4
@@ -724,5 +724,9 @@ __global__ void kernel8(struct SS a) {
724724
*a.x += 3.f;
725725
}
726726
//.
727+
// CHECK-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1}
728+
//.
727729
// OPT: [[META4]] = !{}
728730
//.
731+
// OPT-SPIRV: [[META5]] = !{i32 1024, i32 1, i32 1}
732+
//.

clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \
55
// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
66
// RUN: | FileCheck -check-prefixes=CHECK,MAX1024 %s
7+
// RUN: %clang_cc1 -triple spirv64-amd-amdhsa --gpu-max-threads-per-block=1024 \
8+
// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
9+
// RUN: | FileCheck -check-prefixes=CHECK-SPIRV,MAX1024-SPIRV %s
710
// RUN: %clang_cc1 -triple nvptx \
811
// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \
912
// RUN: -check-prefix=NAMD
@@ -21,12 +24,14 @@
2124

2225
__global__ void flat_work_group_size_default() {
2326
// CHECK: define{{.*}} amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
27+
// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z28flat_work_group_size_defaultv(){{.*}} !max_work_group_size [[MAX_WORK_GROUP_SIZE_DEFAULT:![0-9]+]]
2428
// NOUB: define{{.*}} void @_Z28flat_work_group_size_defaultv() [[NOUB:#[0-9]+]]
2529
}
2630

2731
__attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
2832
__global__ void flat_work_group_size_32_64() {
2933
// CHECK: define{{.*}} amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]
34+
// CHECK-SPIRV: define{{.*}} spir_kernel void @_Z26flat_work_group_size_32_64v(){{.*}} !max_work_group_size [[MAX_WORK_GROUP_SIZE_64:![0-9]+]]
3035
}
3136
__attribute__((amdgpu_waves_per_eu(2))) // expected-no-diagnostics
3237
__global__ void waves_per_eu_2() {
@@ -82,7 +87,9 @@ template __global__ void template_32_4_a_max_num_work_groups<2>();
8287

8388
// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
8489
// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
90+
// MAX1024-SPIRV-DAG: [[MAX_WORK_GROUP_SIZE_DEFAULT]] = !{i32 1024, i32 1, i32 1}
8591
// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
92+
// CHECK-SPIRV-DAG: [[MAX_WORK_GROUP_SIZE_64]] = !{i32 64, i32 1, i32 1}
8693
// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
8794
// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
8895
// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"

0 commit comments

Comments
 (0)