[MLIR] Pass hostShared flag in gpu.alloc op to runtime wrappers #66401
Conversation
@llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-mlir

Changes

This PR is a breakdown of the big PR https://github.com//pull/65539, which enables Intel GPU integration. In this PR we pass the hostShared flag to the runtime wrappers (required by the SyclRuntimeWrappers that will come in a subsequent PR) to indicate whether the allocation is done in host-shared GPU memory or device-only memory.

Full diff: https://github.com//pull/66401.diff

5 Files Affected:
```diff
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index eddf3e9a47d0bc8..428d5d1d4b0e944 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -167,7 +167,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       "mgpuMemAlloc",
       llvmPointerType /* void * */,
       {llvmIntPtrType /* intptr_t sizeBytes */,
-       llvmPointerType /* void *stream */}};
+       llvmPointerType /* void *stream */,
+       llvmInt64Type /* bool isHostShared */}};
   FunctionCallBuilder deallocCallBuilder = {
       "mgpuMemFree",
       llvmVoidType,
@@ -786,9 +787,6 @@ LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
 LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::AllocOp allocOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
-  if (adaptor.getHostShared())
-    return rewriter.notifyMatchFailure(
-        allocOp, "host_shared allocation is not supported");
 
   MemRefType memRefType = allocOp.getType();
 
@@ -799,6 +797,8 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
 
   auto loc = allocOp.getLoc();
 
+  bool isShared = allocOp.getHostShared();
+
   // Get shape of the memref as values: static sizes are constant
   // values and dynamic sizes are passed to 'alloc' as operands.
   SmallVector<Value, 4> shape;
@@ -811,8 +811,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
   // descriptor.
   Type elementPtrType = this->getElementPtrType(memRefType);
   auto stream = adaptor.getAsyncDependencies().front();
+
+  auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared));
+
   Value allocatedPtr =
-      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult();
+      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared})
+          .getResult();
   if (!getTypeConverter()->useOpaquePointers())
     allocatedPtr =
         rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 1dba677ebe66365..a0172f85a67a5c0 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -210,7 +210,8 @@ extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event,
   CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
 }
 
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/,
+                              bool /*isHostShared*/) {
   ScopedContext scopedContext;
   CUdeviceptr ptr;
   CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index bd3868a8e196f6f..292159536f5522f 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -99,7 +99,8 @@ extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) {
   HIP_REPORT_IF_ERROR(hipEventRecord(event, stream));
 }
 
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/,
+                              bool /*isHostShared*/) {
   void *ptr;
   HIP_REPORT_IF_ERROR(hipMalloc(&ptr, sizeBytes));
   return ptr;
diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
index 2506c6ceb990ef5..f365dcb02daf4c2 100644
--- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
     %0 = gpu.wait async
     // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
     // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
-    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+    // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
     %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
     // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
     // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]])
diff --git a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
index 2fa6c854c567819..e27162c7dbc1902 100644
--- a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
+++ b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
     %0 = gpu.wait async
     // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
    // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
-    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+    // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
     %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
     // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
     // CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]]
```
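For context on why the flag matters, here is a minimal sketch of how a SYCL-based mgpuMemAlloc could consume it. The actual SyclRuntimeWrappers arrive in a later PR; the global queue, the void * stream parameter, and the exact signature below are assumptions for illustration only.

```cpp
// Hypothetical sketch of a SYCL-backed wrapper honoring the isHostShared flag.
// Not the real SyclRuntimeWrappers implementation; queue handling is simplified.
#include <cstdint>
#include <sycl/sycl.hpp>

static sycl::queue &getDefaultQueue() {
  // One lazily constructed queue stands in for whatever stream/queue plumbing
  // the real wrappers use.
  static sycl::queue queue{sycl::default_selector_v};
  return queue;
}

extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, void * /*stream*/,
                              bool isHostShared) {
  sycl::queue &queue = getDefaultQueue();
  // host_shared -> USM shared allocation, accessible from both host and device;
  // otherwise a device-only USM allocation.
  if (isHostShared)
    return sycl::malloc_shared(sizeBytes, queue);
  return sycl::malloc_device(sizeBytes, queue);
}
```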
```diff
   Value allocatedPtr =
-      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult();
+      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared})
```
I thought we had a consensus about avoiding the use of `host_shared` and `async` because of USM's limitation in handling asynchronous data allocation. Are we still on the same page with this?
We need to relax the checks in that case in the GPUToLLVMConversion pass to allow lowering of non-async gpu.alloc; we might also need to change the gpu-async-region pass to handle this.
Sure, I think it's worth doing! What's your take on this? Personally, I think having a complete PR is the way to go; otherwise, we will have an improperly implemented op.
I relaxed the checks in the GPUToLLVMConversion pass. Regarding touching other passes, can we do that in a follow-up PR once all the PRs relating to #65539 are merged?
Awesome, thanks! Sure, that sounds good to me. Let me know if there is anything else to do for the CUDA runtime lowering.
Thanks. Any other feedback on the PR, @grypp? Otherwise, can we merge it if it looks good to you?
```diff
@@ -167,7 +167,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       "mgpuMemAlloc",
       llvmPointerType /* void * */,
       {llvmIntPtrType /* intptr_t sizeBytes */,
-       llvmPointerType /* void *stream */}};
+       llvmPointerType /* void *stream */,
+       llvmInt64Type /* bool isHostShared */}};
```
I think it should be `i8` to match C++ `bool`, or, better, just use `i32` (`int`) on both sides.
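To illustrate the width concern the comment above raises: the conversion currently materializes the flag as an i64 constant while the wrappers declare a C++ bool, so the two sides of the call do not use the same width. A hedged sketch of the suggested direction follows; the function name and the plain-malloc body are placeholders, not code from this patch.

```cpp
// Sketch only: a wrapper whose flag parameter is a fixed-width integer that
// matches the constant emitted by the conversion pass (i32 on both sides).
// The function name and the malloc body are placeholders, not patch code.
#include <cstdint>
#include <cstdlib>

extern "C" void *mgpuMemAllocExample(uint64_t sizeBytes, void * /*stream*/,
                                     int32_t isHostShared) {
  // With an i32 constant on the MLIR side and int32_t here, both sides of the
  // call agree on the argument type; a C++ bool parameter would typically be
  // lowered as i1/i8 instead.
  (void)isHostShared;
  return std::malloc(sizeBytes);
}
```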
LGTM, but wait for @grypp's approval.
Thanks for doing this again. It is definitely better now.
@grypp do I need to do anything to get it merged? I see the CI is failing for Windows... can you help merge this?
Is the HEAD correct? See the issues in the code formatter.
Let's try to rebase this PR on a clean HEAD.
Done.
@joker-eph can you help merge this as well? It's approved.
…#66401) This PR is a breakdown of the big PR llvm#65539, which enables Intel GPU integration. In this PR we pass the hostShared flag to the runtime wrappers (required by the SyclRuntimeWrappers that will come in a subsequent PR) to indicate whether the allocation is done in host-shared GPU memory or device-only memory.
If gpu.alloc has no async dependency (in the case where gpu.alloc has a hostShared allocation), create a new stream and synchronize. This PR is a follow-up to #66401.
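A rough sketch of the behavior that follow-up describes, expressed as the sequence of runtime-wrapper calls the lowering would emit for a host_shared gpu.alloc with no async dependencies. StreamHandle and the extern declarations below are simplified stand-ins for the real wrapper signatures (CUstream / hipStream_t), so treat this as an assumption rather than the actual pass output.

```cpp
// Assumed call sequence for a blocking host_shared allocation: create a
// stream, allocate, synchronize, then drop the stream. Declarations are
// simplified stand-ins for the MLIR runtime wrappers.
#include <cstdint>

using StreamHandle = void *;
extern "C" StreamHandle mgpuStreamCreate();
extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, StreamHandle stream,
                              bool isHostShared);
extern "C" void mgpuStreamSynchronize(StreamHandle stream);
extern "C" void mgpuStreamDestroy(StreamHandle stream);

void *allocHostSharedBlocking(uint64_t sizeBytes) {
  StreamHandle stream = mgpuStreamCreate();  // no async token to reuse
  void *ptr = mgpuMemAlloc(sizeBytes, stream, /*isHostShared=*/true);
  mgpuStreamSynchronize(stream);             // make the allocation ready to use
  mgpuStreamDestroy(stream);
  return ptr;
}
```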