Commit bb8f27c

nbpatel authored and legrosbuffle committed
[MLIR] Pass count of parameters & gpu binary size to runtime wrappers (llvm#66154)
This PR is a breakdown of the larger PR llvm#65539, which enables Intel GPU integration. In this PR we pass the parameter count and the GPU binary size to the runtime wrappers, since the SyclRuntimeWrappers (which will come in a subsequent PR) need the SPIR-V size for compilation and the number of parameters to iterate over the kernel params.
1 parent c83a946 commit bb8f27c
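
For context on why the new arguments matter: the CUDA and ROCm wrappers below can simply ignore them, but a SPIR-V-based backend cannot. Below is a minimal hypothetical sketch, assuming a Level Zero backend, of how a module-load wrapper might consume the binary size. It is not the actual SyclRuntimeWrappers (those land in the subsequent PR), and getDefaultZeContext/getDefaultZeDevice are illustrative helpers, not real APIs.

// Hypothetical sketch, not the actual SyclRuntimeWrappers. Level Zero's
// zeModuleCreate takes the module bytes plus an explicit byte count, which
// is why mgpuModuleLoad now receives the blob size from the lowering.
#include <level_zero/ze_api.h>

#include <cstddef>
#include <cstdint>

// Assumed helpers (not real APIs): return a process-wide context/device.
ze_context_handle_t getDefaultZeContext();
ze_device_handle_t getDefaultZeDevice();

extern "C" ze_module_handle_t mgpuModuleLoad(void *data, size_t gpuBlobSize) {
  ze_module_desc_t desc = {};
  desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC;
  desc.format = ZE_MODULE_FORMAT_IL_SPIRV; // the blob is a SPIR-V binary
  desc.pInputModule = static_cast<const uint8_t *>(data);
  desc.inputSize = gpuBlobSize; // SPIR-V carries no total size in its header
  ze_module_handle_t module = nullptr;
  zeModuleCreate(getDefaultZeContext(), getDefaultZeDevice(), &desc, &module,
                 /*phBuildLog=*/nullptr);
  return module;
}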

File tree

4 files changed: +30 -11 lines changed


mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp

Lines changed: 21 additions & 4 deletions

@@ -101,7 +101,7 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
   FunctionCallBuilder moduleLoadCallBuilder = {
       "mgpuModuleLoad",
       llvmPointerType /* void *module */,
-      {llvmPointerType /* void *cubin */}};
+      {llvmPointerType /* void *cubin */, llvmInt64Type /* size_t size */}};
   FunctionCallBuilder moduleUnloadCallBuilder = {
       "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
   FunctionCallBuilder moduleGetFunctionCallBuilder = {
@@ -125,7 +125,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       llvmInt32Type,          /* unsigned int sharedMemBytes */
       llvmPointerType,        /* void *hstream */
       llvmPointerPointerType, /* void **kernelParams */
-      llvmPointerPointerType  /* void **extra */
+      llvmPointerPointerType, /* void **extra */
+      llvmInt64Type           /* size_t paramsCount */
   }};
   FunctionCallBuilder streamCreateCallBuilder = {
       "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
@@ -1134,7 +1135,23 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
       loc, rewriter, nameBuffer.str(), binaryAttr.getValue(),
       LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers());
 
-  auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
+  // Pass the binary size. SPIRV requires binary size.
+  auto gpuBlob = binaryAttr.getValue();
+  auto gpuBlobSize = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type,
+      mlir::IntegerAttr::get(llvmInt64Type,
+                             static_cast<int64_t>(gpuBlob.size())));
+
+  auto module =
+      moduleLoadCallBuilder.create(loc, rewriter, {data, gpuBlobSize});
+
+  // Pass the count of the parameters to runtime wrappers
+  auto paramsCount = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type,
+      mlir::IntegerAttr::get(
+          llvmInt64Type,
+          static_cast<int64_t>(launchOp.getNumKernelOperands())));
+
   // Get the function from the module. The name corresponds to the name of
   // the kernel function.
   auto kernelName = generateKernelNameConstant(
@@ -1158,7 +1175,7 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
       {function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(),
        adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
        adaptor.getBlockSizeZ(), dynamicSharedMemorySize, stream, kernelParams,
-       /*extra=*/nullpointer});
+       /*extra=*/nullpointer, paramsCount});
 
   if (launchOp.getAsyncToken()) {
     // Async launch: make dependent ops use the same stream.

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Lines changed: 3 additions & 2 deletions

@@ -119,7 +119,8 @@ static bool cusparseLt_initiated = false;
 #endif // MLIR_ENABLE_CUDA_CUSPARSELT
 #endif // MLIR_ENABLE_CUDA_CUSPARSE
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule
+mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
   ScopedContext scopedContext;
   CUmodule module = nullptr;
   CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
@@ -165,7 +166,7 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                  intptr_t gridZ, intptr_t blockX, intptr_t blockY,
                  intptr_t blockZ, int32_t smem, CUstream stream, void **params,
-                 void **extra) {
+                 void **extra, size_t /*paramsCount*/) {
   ScopedContext scopedContext;
   int32_t maxShmem = 0;
   CUdevice device = getDefaultCuDevice();
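
The CUDA wrapper can ignore paramsCount because cuLaunchKernel recovers the argument layout from the kernel's own metadata; a Level Zero-style backend instead sets arguments one at a time and needs the explicit count. A minimal hypothetical sketch follows (again not the actual SyclRuntimeWrappers; it assumes every kernel argument is pointer-sized, the common case for memref-lowered kernels):

// Hypothetical sketch, not the actual SyclRuntimeWrappers. Shows why the
// launch wrapper needs paramsCount: zeKernelSetArgumentValue is per-argument.
#include <level_zero/ze_api.h>

#include <cstddef>
#include <cstdint>

static void setKernelArgs(ze_kernel_handle_t kernel, void **params,
                          size_t paramsCount) {
  for (size_t i = 0; i < paramsCount; ++i)
    // Each params[i] points at the i-th argument's value (the kernelParams
    // convention used by the MLIR lowering). Assumes pointer-sized arguments.
    zeKernelSetArgumentValue(kernel, static_cast<uint32_t>(i),
                             /*argSize=*/sizeof(void *),
                             /*pArgValue=*/params[i]);
}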

mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp

Lines changed: 2 additions & 2 deletions

@@ -32,7 +32,7 @@
 
 thread_local static int32_t defaultDevice = 0;
 
-extern "C" hipModule_t mgpuModuleLoad(void *data) {
+extern "C" hipModule_t mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
   hipModule_t module = nullptr;
   HIP_REPORT_IF_ERROR(hipModuleLoadData(&module, data));
   return module;
@@ -62,7 +62,7 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX,
                                  intptr_t blockX, intptr_t blockY,
                                  intptr_t blockZ, int32_t smem,
                                  hipStream_t stream, void **params,
-                                 void **extra) {
+                                 void **extra, size_t /*paramsCount*/) {
   HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ,
                                             blockX, blockY, blockZ, smem,
                                             stream, params, extra));

mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir

Lines changed: 4 additions & 3 deletions

@@ -34,8 +34,9 @@ module attributes {gpu.container_module} {
     // CHECK: [[ADDRESSOF:%.*]] = llvm.mlir.addressof @[[GLOBAL]]
     // CHECK: [[BINARY:%.*]] = llvm.getelementptr [[ADDRESSOF]]{{\[}}0, 0]
     // CHECK-SAME: -> !llvm.ptr
-
-    // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]])
+    // CHECK: [[BINARYSIZE:%.*]] = llvm.mlir.constant
+    // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]], [[BINARYSIZE]])
+    // CHECK: [[PARAMSCOUNT:%.*]] = llvm.mlir.constant
     // CHECK: [[FUNC:%.*]] = llvm.call @mgpuModuleGetFunction([[MODULE]], {{.*}})
 
     // CHECK: [[STREAM:%.*]] = llvm.call @mgpuStreamCreate
@@ -56,7 +57,7 @@ module attributes {gpu.container_module} {
 
     // CHECK: llvm.call @mgpuLaunchKernel([[FUNC]], [[C8]], [[C8]], [[C8]],
    // CHECK-SAME: [[C8]], [[C8]], [[C8]], [[C256]], [[STREAM]],
-    // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]])
+    // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]], [[PARAMSCOUNT]])
     // CHECK: llvm.call @mgpuStreamSynchronize
     // CHECK: llvm.call @mgpuStreamDestroy
    // CHECK: llvm.call @mgpuModuleUnload
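
Taken together, the test confirms the end-to-end contract: the lowering materializes the two new i64 constants with llvm.mlir.constant and threads them into the mgpuModuleLoad and mgpuLaunchKernel calls, matching the widened wrapper signatures above.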
