Commit ced9f4f

[MLIR] Modify lowering of gpu.alloc op to llvm (#69969)
If gpu.alloc has no async dependency (as is the case for hostShared allocations), create a new stream and synchronize. This PR is a follow-up to #66401.
1 parent 01ac180 commit ced9f4f
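
For readers skimming the diff, here is a minimal sketch of the two input forms the lowering now distinguishes, modeled on the existing test and the one added in this commit; the function and value names are illustrative, not part of the change:

module attributes {gpu.container_module} {
  // Async form: the alloc carries async dependencies, so the lowering takes
  // the stream from the first dependency and forwards it via the async token.
  func.func @alloc_async(%size : index) {
    %t0 = gpu.wait async
    %memref, %t1 = gpu.alloc async [%t0] (%size) : memref<?xf32>
    %t2 = gpu.dealloc async [%t1] %memref : memref<?xf32>
    gpu.wait [%t2]
    return
  }

  // Synchronous host_shared form: there are no async dependencies, so the
  // lowering now passes a null stream to the allocation call and does not
  // produce a stream result.
  func.func @alloc_host_shared(%size : index) {
    %memref = gpu.alloc host_shared (%size) : memref<?xf32>
    %t0 = gpu.wait async
    %t1 = gpu.dealloc async [%t0] %memref : memref<?xf32>
    gpu.wait [%t1]
    return
  }
}

In the second form, the old code unconditionally read getAsyncDependencies().front(), which is invalid when the dependency list is empty; the new code substitutes llvm.mlir.zero as the stream and replaces the op with only the MemRef descriptor.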

File tree

2 files changed: +29 -2 lines changed

mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp

Lines changed: 11 additions & 2 deletions
@@ -836,7 +836,11 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
   // Allocate the underlying buffer and store a pointer to it in the MemRef
   // descriptor.
   Type elementPtrType = this->getElementPtrType(memRefType);
-  auto stream = adaptor.getAsyncDependencies().front();
+
+  auto nullPtr = rewriter.create<mlir::LLVM::ZeroOp>(loc, llvmPointerType);
+  Value stream = adaptor.getAsyncDependencies().empty()
+                     ? nullPtr
+                     : adaptor.getAsyncDependencies().front();
 
   auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>(
       loc, llvmInt8Type, rewriter.getI8IntegerAttr(isShared));
@@ -855,7 +859,12 @@
   auto memRefDescriptor = this->createMemRefDescriptor(
       loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);
 
-  rewriter.replaceOp(allocOp, {memRefDescriptor, stream});
+  if (allocOp.getAsyncToken()) {
+    // Async alloc: make dependent ops use the same stream.
+    rewriter.replaceOp(allocOp, {memRefDescriptor, stream});
+  } else {
+    rewriter.replaceOp(allocOp, {memRefDescriptor});
+  }
 
   return success();
 }

mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir

Lines changed: 18 additions & 0 deletions
@@ -19,4 +19,22 @@ module attributes {gpu.container_module} {
     gpu.wait [%3]
     return
   }
+
+  // CHECK-LABEL: llvm.func @alloc_sync
+  // CHECK-SAME: %[[size:.*]]: i64
+  func.func @alloc_sync(%size : index) {
+    // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
+    // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
+    // CHECK: %[[nullptr:.*]] = llvm.mlir.zero
+    // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[nullptr]], %[[isHostShared]])
+    %0 = gpu.alloc host_shared (%size) : memref<?xf32>
+    // CHECK: %[[stream:.*]] = llvm.call @mgpuStreamCreate()
+    %1 = gpu.wait async
+    %2 = gpu.dealloc async [%1] %0 : memref<?xf32>
+    // CHECK: llvm.call @mgpuStreamSynchronize(%[[stream]])
+    // CHECK: llvm.call @mgpuStreamDestroy(%[[stream]])
+    gpu.wait [%2]
+    return
+  }
 }
