[MLIR] Pass hostShared flag in gpu.alloc op to runtime wrappers #66401
Conversation
@llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-mlir

Changes

This PR is a breakdown of the big PR https://github.com//pull/65539, which enables Intel GPU integration. In this PR we pass the hostShared flag to the runtime wrappers (required by the SyclRuntimeWrappers that will come in a subsequent PR) to indicate whether the allocation is done in host-shared GPU memory or device-only memory.

Full diff: https://github.com//pull/66401.diff

5 Files Affected:
```diff
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index eddf3e9a47d0bc8..428d5d1d4b0e944 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -167,7 +167,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       "mgpuMemAlloc",
       llvmPointerType /* void * */,
       {llvmIntPtrType /* intptr_t sizeBytes */,
-       llvmPointerType /* void *stream */}};
+       llvmPointerType /* void *stream */,
+       llvmInt64Type /* bool isHostShared */}};
   FunctionCallBuilder deallocCallBuilder = {
       "mgpuMemFree",
       llvmVoidType,
@@ -786,9 +787,6 @@ LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
 LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::AllocOp allocOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
-  if (adaptor.getHostShared())
-    return rewriter.notifyMatchFailure(
-        allocOp, "host_shared allocation is not supported");
 
   MemRefType memRefType = allocOp.getType();
 
@@ -799,6 +797,8 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
 
   auto loc = allocOp.getLoc();
 
+  bool isShared = allocOp.getHostShared();
+
   // Get shape of the memref as values: static sizes are constant
   // values and dynamic sizes are passed to 'alloc' as operands.
   SmallVector<Value, 4> shape;
@@ -811,8 +811,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
   // descriptor.
   Type elementPtrType = this->getElementPtrType(memRefType);
   auto stream = adaptor.getAsyncDependencies().front();
+
+  auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared));
+
   Value allocatedPtr =
-      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult();
+      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared})
+          .getResult();
   if (!getTypeConverter()->useOpaquePointers())
     allocatedPtr =
         rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 1dba677ebe66365..a0172f85a67a5c0 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -210,7 +210,8 @@ extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event,
   CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
 }
 
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/,
+                              bool /*isHostShared*/) {
   ScopedContext scopedContext;
   CUdeviceptr ptr;
   CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index bd3868a8e196f6f..292159536f5522f 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -99,7 +99,8 @@ extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) {
   HIP_REPORT_IF_ERROR(hipEventRecord(event, stream));
 }
 
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/,
+                              bool /*isHostShared*/) {
   void *ptr;
   HIP_REPORT_IF_ERROR(hipMalloc(&ptr, sizeBytes));
   return ptr;
diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
index 2506c6ceb990ef5..f365dcb02daf4c2 100644
--- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
     %0 = gpu.wait async
     // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
     // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
-    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+    // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
     %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
     // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
     // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]])
diff --git a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
index 2fa6c854c567819..e27162c7dbc1902 100644
--- a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
+++ b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
     %0 = gpu.wait async
     // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
    // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
-    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+    // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
     %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
     // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
     // CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]]
```
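For context on why the flag matters, here is a minimal sketch of how a SYCL-based mgpuMemAlloc could consume it. The actual SyclRuntimeWrappers arrive in a later PR; the global queue, the void * stream parameter, and the exact signature below are assumptions for illustration only.

```cpp
// Hypothetical sketch of a SYCL-backed wrapper honoring the isHostShared flag.
// Not the real SyclRuntimeWrappers implementation; queue handling is simplified.
#include <cstdint>
#include <sycl/sycl.hpp>

static sycl::queue &getDefaultQueue() {
  // One lazily constructed queue stands in for whatever stream/queue plumbing
  // the real wrappers use.
  static sycl::queue queue{sycl::default_selector_v};
  return queue;
}

extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, void * /*stream*/,
                              bool isHostShared) {
  sycl::queue &queue = getDefaultQueue();
  // host_shared -> USM shared allocation, accessible from both host and device;
  // otherwise a device-only USM allocation.
  if (isHostShared)
    return sycl::malloc_shared(sizeBytes, queue);
  return sycl::malloc_device(sizeBytes, queue);
}
```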
```diff
   Value allocatedPtr =
-      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult();
+      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared})
```
I thought we had a consensus about avoiding the use of `host_shared` and `async` because of USM's limitation in handling asynchronous data allocation. Are we still on the same page with this?
We need to relax the checks in that case in the GPUToLLVMConversion pass to allow lowering of non-async gpu.alloc; we might also need to change the gpu-async-region pass to handle this.
Sure, I think it's worth doing! What's your take on this? Personally, I think having a complete PR is the way to go; otherwise, we will have an improperly implemented op.
I relaxed the checks in the GPUToLLVMConversion pass. Regarding touching other passes, can we do that in a follow-up PR once all the PRs relating to #65539 are merged?
Awesome, thanks! Sure, that sounds good to me. Let me know if there is anything else to do for the CUDA runtime lowering.
Thanks. Any other feedback on the PR, @grypp? Otherwise, can we merge it if it looks good to you?
```diff
@@ -167,7 +167,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       "mgpuMemAlloc",
       llvmPointerType /* void * */,
       {llvmIntPtrType /* intptr_t sizeBytes */,
-       llvmPointerType /* void *stream */}};
+       llvmPointerType /* void *stream */,
+       llvmInt64Type /* bool isHostShared */}};
```
I think it should be `i8` to match C++ `bool`, or, better, just use `i32` (`int`) on both sides.
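To illustrate the width concern the comment above raises: the conversion currently materializes the flag as an i64 constant while the wrappers declare a C++ bool, so the two sides of the call do not use the same width. A hedged sketch of the suggested direction follows; the function name and the plain-malloc body are placeholders, not code from this patch.

```cpp
// Sketch only: a wrapper whose flag parameter is a fixed-width integer that
// matches the constant emitted by the conversion pass (i32 on both sides).
// The function name and the malloc body are placeholders, not patch code.
#include <cstdint>
#include <cstdlib>

extern "C" void *mgpuMemAllocExample(uint64_t sizeBytes, void * /*stream*/,
                                     int32_t isHostShared) {
  // With an i32 constant on the MLIR side and int32_t here, both sides of the
  // call agree on the argument type; a C++ bool parameter would typically be
  // lowered as i1/i8 instead.
  (void)isHostShared;
  return std::malloc(sizeBytes);
}
```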
LGTM, but wait for @grypp's approval.
Thanks for doing this again. It is definitely better now.
@grypp do I need to do anything to get it merged? I see the CI is failing for Windows... can you help merge this?
Is the HEAD correct? See the issues in the code formatter.
Let's try to rebase this PR on a clean HEAD.
Done.
@joker-eph can you help merge this as well? It's approved.
…#66401) This PR is a breakdown of the big PR llvm#65539, which enables Intel GPU integration. In this PR we pass the hostShared flag to the runtime wrappers (required by the SyclRuntimeWrappers that will come in a subsequent PR) to indicate whether the allocation is done in host-shared GPU memory or device-only memory.
If gpu.alloc has no async dependency (in the case where gpu.alloc has a hostShared allocation), create a new stream and synchronize. This PR is a follow-up to #66401.
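A rough sketch of the behavior that follow-up describes, expressed as the sequence of runtime-wrapper calls the lowering would emit for a host_shared gpu.alloc with no async dependencies. StreamHandle and the extern declarations below are simplified stand-ins for the real wrapper signatures (CUstream / hipStream_t), so treat this as an assumption rather than the actual pass output.

```cpp
// Assumed call sequence for a blocking host_shared allocation: create a
// stream, allocate, synchronize, then drop the stream. Declarations are
// simplified stand-ins for the MLIR runtime wrappers.
#include <cstdint>

using StreamHandle = void *;
extern "C" StreamHandle mgpuStreamCreate();
extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, StreamHandle stream,
                              bool isHostShared);
extern "C" void mgpuStreamSynchronize(StreamHandle stream);
extern "C" void mgpuStreamDestroy(StreamHandle stream);

void *allocHostSharedBlocking(uint64_t sizeBytes) {
  StreamHandle stream = mgpuStreamCreate();  // no async token to reuse
  void *ptr = mgpuMemAlloc(sizeBytes, stream, /*isHostShared=*/true);
  mgpuStreamSynchronize(stream);             // make the allocation ready to use
  mgpuStreamDestroy(stream);
  return ptr;
}
```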