Skip to content

Commit 0d2c55c

Browse files
authored
AMDGPU: Move enqueued block handling into clang (#128519)
The previous implementation wasn't maintaining a faithful IR representation of how this really works. The value returned by createEnqueuedBlockKernel wasn't actually used as a function, and was hacked up later to be a pointer to the runtime handle global variable. In reality, the enqueued block is a struct where the first field is a pointer to the kernel descriptor, not the kernel itself. We were also relying on passing around a reference to a global using a string attribute containing its name. It's better to base this on a proper IR symbol reference during final emission. This now avoids using a function attribute on kernels and avoids using the additional "runtime-handle" attribute to populate the final metadata. Instead, the runtime handle reference is associated with the kernel through the !associated global metadata. We can then get a final, correctly mangled name at the end. I couldn't figure out how to get rename-with-external-symbol behavior using a combination of comdats and aliases, so this leaves an IR pass to externalize the runtime handles for codegen. If anything breaks, it's most likely this, so avoiding that pass is left for a later step. A special section name is used to enable this behavior. This also means it's possible to declare enqueuable kernels in source without going through the dedicated block syntax or other dedicated compiler support. We could move towards initializing the runtime handle in the compiler/linker. I have a working patch where the linker sets up the first field of the handle, avoiding the need to export the block kernel symbol for the runtime. We would need new relocations to get the private and group sizes, but that would avoid the runtime's special-case handling that requires the device_enqueue_symbol metadata field. https://reviews.llvm.org/D141700
1 parent dbd82f3 commit 0d2c55c

18 files changed

+492
-461
lines changed

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 57 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,20 @@ void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
614614
FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
615615
}
616616

617+
/// Return IR struct type for rtinfo struct in rocm-device-libs used for device
618+
/// enqueue.
619+
///
620+
/// ptr addrspace(1) kernel_object, i32 private_segment_size,
621+
/// i32 group_segment_size
622+
623+
static llvm::StructType *
624+
getAMDGPURuntimeHandleType(llvm::LLVMContext &C,
625+
llvm::Type *KernelDescriptorPtrTy) {
626+
llvm::Type *Int32 = llvm::Type::getInt32Ty(C);
627+
return llvm::StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
628+
"block.runtime.handle.t");
629+
}
630+
617631
/// Create an OpenCL kernel for an enqueued block.
618632
///
619633
/// The type of the first argument (the block literal) is the struct type
@@ -653,23 +667,29 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
653667
ArgNames.push_back(
654668
llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
655669
}
656-
std::string Name = Invoke->getName().str() + "_kernel";
670+
671+
llvm::Module &Mod = CGF.CGM.getModule();
672+
const llvm::DataLayout &DL = Mod.getDataLayout();
673+
674+
llvm::Twine Name = Invoke->getName() + "_kernel";
657675
auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
676+
677+
// The kernel itself can be internal, the runtime does not directly access the
678+
// kernel address (only the kernel descriptor).
658679
auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
659-
&CGF.CGM.getModule());
680+
&Mod);
660681
F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
661682

662683
llvm::AttrBuilder KernelAttrs(C);
663684
// FIXME: The invoke isn't applying the right attributes either
664685
// FIXME: This is missing setTargetAttributes
665686
CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
666-
KernelAttrs.addAttribute("enqueued-block");
667687
F->addFnAttrs(KernelAttrs);
668688

669689
auto IP = CGF.Builder.saveIP();
670690
auto *BB = llvm::BasicBlock::Create(C, "entry", F);
671691
Builder.SetInsertPoint(BB);
672-
const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
692+
const auto BlockAlign = DL.getPrefTypeAlign(BlockTy);
673693
auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
674694
BlockPtr->setAlignment(BlockAlign);
675695
Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
@@ -692,7 +712,39 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
692712
if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
693713
F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));
694714

695-
return F;
715+
llvm::StructType *HandleTy = getAMDGPURuntimeHandleType(
716+
C, llvm::PointerType::get(C, DL.getDefaultGlobalsAddressSpace()));
717+
llvm::Constant *RuntimeHandleInitializer =
718+
llvm::ConstantAggregateZero::get(HandleTy);
719+
720+
llvm::Twine RuntimeHandleName = F->getName() + ".runtime.handle";
721+
722+
// The runtime needs access to the runtime handle as an external symbol. The
723+
// runtime handle will need to be made external later, in
724+
// AMDGPUExportOpenCLEnqueuedBlocks. The kernel itself has a hidden reference
725+
// inside the runtime handle, and is not directly referenced.
726+
727+
// TODO: We would initialize the first field by declaring F->getName() + ".kd"
728+
// to reference the kernel descriptor. The runtime wouldn't need to bother
729+
// setting it. We would need to have a final symbol name though.
730+
// TODO: Can we directly use an external symbol with getGlobalIdentifier?
731+
auto *RuntimeHandle = new llvm::GlobalVariable(
732+
Mod, HandleTy,
733+
/*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
734+
/*Initializer=*/RuntimeHandleInitializer, RuntimeHandleName,
735+
/*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
736+
DL.getDefaultGlobalsAddressSpace(),
737+
/*isExternallyInitialized=*/true);
738+
739+
llvm::MDNode *HandleAsMD =
740+
llvm::MDNode::get(C, llvm::ValueAsMetadata::get(RuntimeHandle));
741+
F->setMetadata(llvm::LLVMContext::MD_associated, HandleAsMD);
742+
743+
RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");
744+
745+
CGF.CGM.addUsedGlobal(F);
746+
CGF.CGM.addUsedGlobal(RuntimeHandle);
747+
return RuntimeHandle;
696748
}
697749

698750
void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(

clang/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ if( NOT CLANG_BUILT_STANDALONE )
136136
llvm-dis
137137
llvm-dwarfdump
138138
llvm-ifs
139+
llvm-link
139140
llvm-lto2
140141
llvm-mc
141142
llvm-modextract
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// Make sure that invoking blocks in static functions with the same name in
2+
// different modules are linked together.
3+
4+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -fno-ident -DKERNEL_NAME=test_kernel_first -DTYPE=float -DCONST=256.0f -emit-llvm-bc -o %t.0.bc %s
5+
// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -fno-ident -DKERNEL_NAME=test_kernel_second -DTYPE=int -DCONST=128.0f -emit-llvm-bc -o %t.1.bc %s
6+
7+
// Make sure nothing strange happens with the linkage choices.
8+
// RUN: opt -passes=globalopt -o %t.opt.0.bc %t.0.bc
9+
// RUN: opt -passes=globalopt -o %t.opt.1.bc %t.1.bc
10+
11+
// Check the result of linking
12+
// RUN: llvm-link -S %t.opt.0.bc %t.opt.1.bc -o - | FileCheck %s
13+
14+
// Make sure that a block invoke used with the same name works in multiple
15+
// translation units
16+
17+
// CHECK: @llvm.used = appending addrspace(1) global [4 x ptr] [ptr @__static_invoker_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle to ptr), ptr @__static_invoker_block_invoke_kernel.2, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle.3 to ptr)], section "llvm.metadata"
18+
19+
20+
// CHECK: @__static_invoker_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
21+
// CHECK: @__static_invoker_block_invoke_kernel.runtime.handle.3 = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
22+
23+
// CHECK: define internal amdgpu_kernel void @__static_invoker_block_invoke_kernel(<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1) }> %0) #{{[0-9]+}} !associated ![[ASSOC_FIRST_MD:[0-9]+]]
24+
25+
26+
// CHECK-LABEL: define internal void @__static_invoker_block_invoke(ptr noundef %.block_descriptor)
27+
// CHECK: call float @llvm.fmuladd.f32
28+
29+
30+
// CHECK-LABEL: define dso_local amdgpu_kernel void @test_kernel_first(
31+
32+
33+
// CHECK-LABEL: define internal fastcc void @static_invoker(ptr addrspace(1) noundef %outptr, ptr addrspace(1) noundef %argptr)
34+
// CHECK: call i32 @__enqueue_kernel_basic(ptr addrspace(1) %{{[0-9]+}}, i32 %{{[0-9]+}}, ptr addrspace(5) %tmp, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle to ptr), ptr %{{.+}})
35+
36+
// CHECK: declare i32 @__enqueue_kernel_basic(ptr addrspace(1), i32, ptr addrspace(5), ptr, ptr) local_unnamed_addr
37+
38+
39+
// CHECK: define internal amdgpu_kernel void @__static_invoker_block_invoke_kernel.2(<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1) }> %0) #{{[0-9]+}} !associated ![[ASSOC_SECOND_MD:[0-9]+]]
40+
// CHECK: call void @__static_invoker_block_invoke.4(ptr %
41+
42+
43+
// CHECK-LABEL: define internal void @__static_invoker_block_invoke.4(ptr noundef %.block_descriptor)
44+
// CHECK: mul nsw i32
45+
// CHECK: sitofp
46+
// CHECK: fadd
47+
// CHECK: fptosi
48+
49+
// CHECK-LABEL: define dso_local amdgpu_kernel void @test_kernel_second(ptr addrspace(1) noundef align 4 %outptr, ptr addrspace(1) noundef align 4 %argptr, ptr addrspace(1) noundef align 4 %difference)
50+
51+
// CHECK-LABEL: define internal fastcc void @static_invoker.5(ptr addrspace(1) noundef %outptr, ptr addrspace(1) noundef %argptr) unnamed_addr #{{[0-9]+}} {
52+
// CHECK: call i32 @__enqueue_kernel_basic(ptr addrspace(1) %{{[0-9]+}}, i32 %{{[0-9]+}}, ptr addrspace(5) %tmp, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle.3 to ptr), ptr %{{.+}})
53+
54+
55+
typedef struct {int a;} ndrange_t;
56+
57+
static void static_invoker(global TYPE* outptr, global TYPE* argptr) {
58+
queue_t default_queue;
59+
unsigned flags = 0;
60+
ndrange_t ndrange;
61+
62+
enqueue_kernel(default_queue, flags, ndrange,
63+
^(void) {
64+
global TYPE* f = argptr;
65+
outptr[0] = f[1] * f[2] + CONST;
66+
});
67+
}
68+
69+
kernel void KERNEL_NAME(global TYPE *outptr, global TYPE *argptr, global TYPE *difference) {
70+
queue_t default_queue;
71+
unsigned flags = 0;
72+
ndrange_t ndrange;
73+
74+
static_invoker(outptr, argptr);
75+
76+
*difference = CONST;
77+
}
78+
79+
// CHECK: ![[ASSOC_FIRST_MD]] = !{ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle}
80+
// CHECK: ![[ASSOC_SECOND_MD]] = !{ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle.3}

0 commit comments

Comments
 (0)