Skip to content

Commit 5bcf162

Browse files
nbpatel
authored and legrosbuffle committed
[MLIR] Pass hostShared flag in gpu.alloc op to runtime wrappers (llvm#66401)
This PR is a breakdown of the big PR llvm#65539, which enables Intel GPU integration. In this PR we pass the hostShared flag to the runtime wrappers (required by SyclRuntimeWrappers, which will come in a subsequent PR) to indicate whether the allocation is done on host-shared GPU memory or device-only memory.
1 parent 0576c26 commit 5bcf162

File tree

5 files changed

+25
-11
lines changed

5 files changed

+25
-11
lines changed

mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
168168
"mgpuMemAlloc",
169169
llvmPointerType /* void * */,
170170
{llvmIntPtrType /* intptr_t sizeBytes */,
171-
llvmPointerType /* void *stream */}};
171+
llvmPointerType /* void *stream */,
172+
llvmInt8Type /* bool isHostShared */}};
172173
FunctionCallBuilder deallocCallBuilder = {
173174
"mgpuMemFree",
174175
llvmVoidType,
@@ -787,19 +788,23 @@ LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
787788
LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
788789
gpu::AllocOp allocOp, OpAdaptor adaptor,
789790
ConversionPatternRewriter &rewriter) const {
790-
if (adaptor.getHostShared())
791-
return rewriter.notifyMatchFailure(
792-
allocOp, "host_shared allocation is not supported");
793791

794792
MemRefType memRefType = allocOp.getType();
795793

796794
if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) ||
797-
!isConvertibleAndHasIdentityMaps(memRefType) ||
798-
failed(isAsyncWithOneDependency(rewriter, allocOp)))
795+
!isConvertibleAndHasIdentityMaps(memRefType))
799796
return failure();
800797

801798
auto loc = allocOp.getLoc();
802799

800+
bool isShared = allocOp.getHostShared();
801+
802+
if (isShared && allocOp.getAsyncToken())
803+
return rewriter.notifyMatchFailure(
804+
allocOp, "Host Shared allocation cannot be done async");
805+
else if (!isShared && failed(isAsyncWithOneDependency(rewriter, allocOp)))
806+
return failure();
807+
803808
// Get shape of the memref as values: static sizes are constant
804809
// values and dynamic sizes are passed to 'alloc' as operands.
805810
SmallVector<Value, 4> shape;
@@ -812,8 +817,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
812817
// descriptor.
813818
Type elementPtrType = this->getElementPtrType(memRefType);
814819
auto stream = adaptor.getAsyncDependencies().front();
820+
821+
auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>(
822+
loc, llvmInt8Type, rewriter.getI8IntegerAttr(isShared));
823+
815824
Value allocatedPtr =
816-
allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult();
825+
allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared})
826+
.getResult();
817827
if (!getTypeConverter()->useOpaquePointers())
818828
allocatedPtr =
819829
rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,8 @@ extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event,
232232
CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
233233
}
234234

235-
extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {
235+
extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/,
236+
bool /*isHostShared*/) {
236237
ScopedContext scopedContext;
237238
CUdeviceptr ptr = 0;
238239
if (sizeBytes != 0)

mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,8 @@ extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) {
104104
HIP_REPORT_IF_ERROR(hipEventRecord(event, stream));
105105
}
106106

107-
extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) {
107+
extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/,
108+
bool /*isHostShared*/) {
108109
void *ptr;
109110
HIP_REPORT_IF_ERROR(hipMalloc(&ptr, sizeBytes));
110111
return ptr;

mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
88
%0 = gpu.wait async
99
// CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
1010
// CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
11-
// CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
11+
// CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
12+
// CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
1213
%1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
1314
// CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
1415
// CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]])

mlir/test/Conversion/GPUCommon/typed-pointers.mlir

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
88
%0 = gpu.wait async
99
// CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
1010
// CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
11-
// CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
11+
// CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
12+
// CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
1213
%1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
1314
// CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
1415
// CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]]

0 commit comments

Comments (0)