Commit 512a8d4

enable XeVM test using gpu-runner
1 parent 9a50a42 commit 512a8d4

25 files changed: +354, -40 lines

include/gc/Transforms/Passes.td

Lines changed: 1 addition & 1 deletion
@@ -74,7 +74,6 @@ def ConvertMemRefToCPURuntime : Pass<"convert-memref-to-cpuruntime", "func::Func
     "cpuruntime::CPURuntimeDialect"
   ];
 }
-#ifdef GC_USE_GPU
 
 #ifdef GC_USE_IMEX
 def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
@@ -96,6 +95,7 @@ def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
 }
 #endif
 
+#ifdef GC_USE_GPU
 def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
   let summary = "Convert the GPU operations to GpuOclRuntime calls.";
   let description = [{

lib/gc/Target/LLVM/XeVM/Target.cpp

Lines changed: 8 additions & 0 deletions
@@ -182,6 +182,14 @@ XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
     return std::nullopt;
   }
 
+  gpuMod.walk([&](LLVM::LLVMFuncOp funcOp) {
+    if (funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) {
+      funcOp.setIntelReqdSubGroupSize(16);
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+
   // TODO: reroute to another serializer for a different target?
   SpirSerializer serializer(*module, cast<XeVMTargetAttr>(attribute), options);
   serializer.init();
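
For context, a minimal MLIR sketch of the intended effect of this walk, assuming the printed attribute name matches the setIntelReqdSubGroupSize setter (the snippet itself is not part of the commit):

  gpu.module @entry_kernel {
    // Only the first function carrying the gpu.kernel attribute is tagged;
    // the walk interrupts after the first match.
    llvm.func @entry_kernel(%arg0: !llvm.ptr) attributes {gpu.kernel, intel_reqd_sub_group_size = 16 : i32} {
      llvm.return
    }
  }

The SPIR-V serializer is then expected to translate this attribute into a required sub-group size of 16 for the kernel.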

lib/gc/Transforms/GPU/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ endif()
 set(OCL_PASSES_LIBS "")
 if(GC_ENABLE_GPU)
   add_subdirectory(OCL)
-  set(OCL_PASSES_LIBS GcGpuOCLPasses)
+  set(OCL_PASSES_LIBS GcGpuOclPasses)
 endif()
 
 gc_add_mlir_library(GcGpuPasses

lib/gc/Transforms/GPU/OCL/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-gc_add_mlir_library(GcGpuOCLPasses
+gc_add_mlir_library(GcGpuOclPasses
   GpuToGpuOcl.cpp
 
   DEPENDS

lib/gc/Transforms/GPU/OCL/GpuToGpuOcl.cpp

Lines changed: 44 additions & 14 deletions
@@ -250,6 +250,8 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
           isa<MemRefType>(type)) {
         MemRefDescriptor desc(arg);
         args.emplace_back(desc.alignedPtr(rewriter, loc));
+      } else if (isa<LLVM::LLVMPointerType>(type)) {
+        args.emplace_back(arg);
       } else {
         // Store the arg on the stack and pass the pointer
         auto ptr = rewriter.create<LLVM::AllocaOp>(
@@ -352,6 +354,42 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
         .getResult();
   }
 
+  StringAttr getBinaryAttr(ConversionPatternRewriter &rewriter,
+                           gpu::LaunchFuncOp &gpuLaunch,
+                           StringAttr kernelModName) const {
+    StringAttr binaryAttr;
+    Operation *binaryStorageOp;
+#ifdef GC_USE_IMEX
+    binaryStorageOp = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
+        gpuLaunch, kernelModName);
+    if (!binaryStorageOp) {
+      gpuLaunch.emitOpError() << "Module " << kernelModName << " not found!";
+      return {};
+    }
+    binaryAttr = binaryStorageOp->getAttrOfType<StringAttr>("gpu.binary");
+    rewriter.eraseOp(binaryStorageOp);
+#else
+    binaryStorageOp = SymbolTable::lookupNearestSymbolFrom<gpu::BinaryOp>(
+        gpuLaunch, kernelModName);
+    if (!binaryStorageOp) {
+      gpuLaunch.emitOpError() << "Binary " << kernelModName << " not found!";
+      return {};
+    }
+    auto objects = cast<gpu::BinaryOp>(binaryStorageOp).getObjects();
+    if (objects.size() != 1) {
+      gpuLaunch.emitOpError() << "Many targets present in " << kernelModName
+                              << ", please use xevm only.";
+      return {};
+    }
+    binaryAttr = cast<gpu::ObjectAttr>(objects[0]).getObject();
+#endif
+    if (!binaryAttr) {
+      binaryStorageOp->emitOpError() << "missing binary.";
+      return {};
+    }
+    return binaryAttr;
+  }
+
   // Create a new kernel and save the pointer to the global variable
   // ...name_Ptr.
   bool createKernel(
@@ -360,24 +398,12 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
       StringRef funcName,
       const std::function<SmallString<128> &(const char *chars)> &str) const {
     auto kernelModName = gpuLaunch.getKernelModuleName();
-    auto kernelMod = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
-        gpuLaunch, kernelModName);
-    if (!kernelMod) {
-      gpuLaunch.emitOpError() << "Module " << kernelModName << " not found!";
-      return false;
-    }
-    const auto binaryAttr = kernelMod->getAttrOfType<StringAttr>("gpu.binary");
-    if (!binaryAttr) {
-      kernelMod.emitOpError() << "missing 'gpu.binary' attribute";
-      return false;
-    }
-
+    auto binaryAttr = getBinaryAttr(rewriter, gpuLaunch, kernelModName);
     rewriter.setInsertionPointToStart(mod.getBody());
     // The kernel pointer is stored here
     rewriter.create<LLVM::GlobalOp>(loc, helper.ptrType, /*isConstant=*/false,
                                     LLVM::Linkage::Internal, str("Ptr"),
                                     rewriter.getZeroAttr(helper.ptrType));
-    rewriter.eraseOp(kernelMod);
 
     auto function = rewriter.create<LLVM::LLVMFuncOp>(
         loc, funcName,
@@ -415,7 +441,7 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
     for (auto arg : gpuLaunch.getKernelOperands()) {
       auto type = arg.getType();
       size_t size;
-      if (isa<MemRefType>(type)) {
+      if (isa<MemRefType>(type) || isa<LLVM::LLVMPointerType>(type)) {
         size = 0; // A special case for pointers
       } else if (type.isIndex()) {
         size = helper.idxType.getIntOrFloatBitWidth() / 8;
@@ -452,6 +478,8 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
         assert(getConstantIntValue(cast.getOperand(0)));
         value = helper.idxConstant(
             rewriter, loc, getConstantIntValue(cast.getOperand(0)).value());
+      } else {
+        value = rewriter.clone(*value.getDefiningOp())->getResult(0);
       }
       rewriter.create<LLVM::StoreOp>(loc, value, elementPtr);
     }
@@ -527,6 +555,8 @@ struct GpuToGpuOcl final : gc::impl::GpuToGpuOclBase<GpuToGpuOcl> {
       return;
     }
 
+    if (!helper.kernelNames.size())
+      return;
     // Add gpuOclDestructor() function that destroys all the kernels
     auto mod = llvm::dyn_cast<ModuleOp>(getOperation());
     assert(mod);
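
For reference, a minimal sketch of the two binary-storage forms the new getBinaryAttr helper distinguishes; the payload strings are placeholders and the #xevm.target spelling is an assumption rather than something taken from this diff:

  // GC_USE_IMEX path: the serialized binary is a string attribute on the
  // gpu.module, which is erased once the attribute has been read.
  gpu.module @entry_kernel attributes {gpu.binary = "..."} {
  }

  // Upstream path: gpu-module-to-binary leaves a gpu.binary op; exactly one
  // object is expected, and its payload becomes the kernel binary.
  gpu.binary @entry_kernel [#gpu.object<#xevm.target, "...">]

In the upstream form, a gpu.binary carrying more than one object is rejected with the "please use xevm only" error above.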

lib/gc/Transforms/GPU/Pipeline.cpp

Lines changed: 8 additions & 7 deletions
@@ -138,7 +138,6 @@ void registerIMEXPipeline() {
 #ifdef GC_USE_GPU
 void populateGPUPipeline(OpPassManager &pm,
                          const GPUPipelineOptions &pipelineOpts) {
-
   pm.addNestedPass<func::FuncOp>(createAddContextArg());
 
   pm.addPass(createConvertSCFToCFPass());
@@ -148,20 +147,22 @@
   pm.addPass(createArithToLLVMConversionPass());
   pm.addPass(createConvertFuncToLLVMPass());
   pm.addPass(createConvertMathToLLVMPass());
-  pm.addPass(createCSEPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());
 
+  // Convert allocs, etc.
+  pm.addPass(createGpuToGpuOcl({pipelineOpts.callFinish}));
   pm.addPass(createGpuKernelOutliningPass());
   pm.addPass(createConvertXeVMToLLVMPass());
   pm.addPass(createGpuXeVMAttachTarget());
-  pm.addPass(createConvertGpuOpsToLLVMSPVOps());
-  pm.addPass(createGpuToLLVMConversionPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToLLVMSPVOps());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertIndexToLLVMPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createArithToLLVMConversionPass());
   pm.addPass(createReconcileUnrealizedCastsPass());
-  pm.addPass(createCSEPass());
-  // Convert allocs, etc.
-  pm.addPass(createGpuToGpuOcl({pipelineOpts.callFinish}));
   pm.addPass(createGpuModuleToBinaryPass());
   // Convert launch given a binary.
   pm.addPass(createGpuToGpuOcl({pipelineOpts.callFinish}));
+  pm.addPass(createFinalizeMemRefToLLVMConversionPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());
 }
 
 void registerGPUPipeline() {
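
For orientation, a rough sketch of the host IR between the two GpuToGpuOcl runs (simplified; the values, types and the #xevm.target spelling are assumptions): the first run has already lowered allocations to GpuOclRuntime calls, GpuModuleToBinary has serialized the device module into a single-object gpu.binary, and the launch is still a gpu.launch_func:

  // State between the two GpuToGpuOcl runs (simplified sketch).
  %mem = llvm.call @gcGpuOclMallocShared(%ctx, %size) : (!llvm.ptr, i64) -> !llvm.ptr
  gpu.binary @entry_kernel [#gpu.object<#xevm.target, "...">]
  gpu.launch_func @entry_kernel::@entry_kernel
      blocks in (%c2, %c2, %c1) threads in (%c1, %c1, %c1)
      args(%mem : !llvm.ptr)

The second GpuToGpuOcl run then rewrites the launch into gcGpuOclKernelCreate/gcGpuOclKernelLaunch calls, after which FinalizeMemRefToLLVM and the final ReconcileUnrealizedCasts clean up the remaining memref descriptors and casts.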

src/gc-gpu-runner/CMakeLists.txt

Lines changed: 9 additions & 7 deletions
@@ -15,14 +15,16 @@
 # SPDX-License-Identifier: Apache-2.0
 ################################################################################
 
-if(NOT GC_ENABLE_TOOLS OR NOT GC_ENABLE_GPU)
+if(GC_ENABLE_TOOLS AND GC_ENABLE_GPU)
+  gc_add_mlir_tool(gc-gpu-runner GpuRunner.cpp)
+  target_link_libraries(gc-gpu-runner PRIVATE
+    GcJitWrapper
+    GcGpuOclRuntime
+  )
+  mlir_check_all_link_libraries(gc-gpu-runner)
+else()
   message(STATUS "Gpu runner is not enabled.")
   return()
 endif()
 
-gc_add_mlir_tool(gc-gpu-runner GpuRunner.cpp)
-target_link_libraries(gc-gpu-runner PRIVATE
-  GcJitWrapper
-  GcGpuOclRuntime
-)
-mlir_check_all_link_libraries(gc-gpu-runner)
+

src/gc-opt/gc-opt.cpp

Lines changed: 2 additions & 1 deletion
@@ -46,10 +46,11 @@ namespace mlir::gc {
 void registerCPUPipeline();
 #ifdef GC_USE_GPU
 void registerGPUPipeline();
+#endif
+
 #ifdef GC_USE_IMEX
 void registerIMEXPipeline();
 #endif
-#endif
 
 } // namespace mlir::gc

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+// RUN: gc-opt %s --gpu-to-gpuocl | FileCheck %s
+
+module @test attributes {gpu.container_module} {
+  llvm.func @entry(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64) attributes {llvm.emit_c_interface} {
+    %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %5 = llvm.insertvalue %arg4, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %6 = llvm.insertvalue %arg5, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %7 = llvm.insertvalue %arg6, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %8 = builtin.unrealized_conversion_cast %7 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<64x64xf32>
+    %gpu_mem = gpu.alloc host_shared () : memref<64x64xf32>
+    gpu.memcpy %gpu_mem, %8 : memref<64x64xf32>, memref<64x64xf32>
+    %9 = llvm.mlir.constant(32 : index) : i64
+    %10 = builtin.unrealized_conversion_cast %9 : i64 to index
+    %11 = llvm.mlir.constant(2 : index) : i64
+    %12 = builtin.unrealized_conversion_cast %11 : i64 to index
+    %13 = llvm.mlir.constant(1 : index) : i64
+    %14 = builtin.unrealized_conversion_cast %13 : i64 to index
+    gpu.launch_func @entry_kernel::@entry_kernel blocks in (%12, %12, %14) threads in (%14, %14, %14) args(%10 : index, %gpu_mem : memref<64x64xf32>)
+    gpu.memcpy %8, %gpu_mem : memref<64x64xf32>, memref<64x64xf32>
+    gpu.dealloc %gpu_mem : memref<64x64xf32>
+    llvm.return
+  }
+
+  gpu.module @entry_kernel attributes {gpu.binary = "Some SPIRV here \00"} {
+    gpu.func @entry_kernel(%arg0: index, %arg1: memref<64x64xf32>) kernel attributes {} {
+      gpu.return
+    }
+  }
+}
+
+// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_SPIRV
+// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_Name
+// CHECK: llvm.mlir.global internal @gcGpuOclKernel_entry_kernel_Ptr
+
+// CHECK: llvm.func @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
+// CHECK: [[NEW_PTR:%.+]] = llvm.call @gcGpuOclKernelCreate([[CTX]]
+// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[CMPXCHG:%.+]] = llvm.cmpxchg [[PTR_ADDR]], [[ZERO]], [[NEW_PTR]]
+// CHECK: [[FLAG:%.+]] = llvm.extractvalue [[CMPXCHG]][1]
+// CHECK: llvm.cond_br [[FLAG]], [[BB1:\^.+]], [[BB2:\^.+]]
+// CHECK: [[BB1]]:
+// CHECK: llvm.return [[NEW_PTR]]
+// CHECK: [[BB2]]:
+// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
+// CHECK: [[ADDR:%.+]] = llvm.getelementptr [[ARRAY]]
+// CHECK: llvm.store [[NEW_PTR]], [[ADDR]]
+// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
+// CHECK: [[OLD_PTR:%.+]] = llvm.extractvalue [[CMPXCHG]][0]
+// CHECK: llvm.return [[OLD_PTR]]
+
+// CHECK: llvm.func internal @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline}
+// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
+// CHECK: [[ICMP:%.+]] = llvm.icmp "eq" [[PTR]], [[ZERO]]
+// CHECK: llvm.cond_br [[ICMP]], [[BB1:\^.+]], [[BB2:\^.+]]
+// CHECK: [[BB1]]:
+// CHECK: [[NEW_PTR:%.+]] = llvm.call @createGcGpuOclKernel_entry_kernel([[CTX]])
+// CHECK: llvm.return [[NEW_PTR]]
+// CHECK: [[BB2]]:
+// CHECK: llvm.return [[PTR]]
+
+// CHECK: llvm.func @entry(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, [[CTX:%.+]]: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64)
+// CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK: llvm.call @gcGpuOclMallocShared([[CTX]], [[SIZE]])
+// CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK: [[SRC:%.+]] = llvm.extractvalue
+// CHECK: [[DST:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][1]
+// CHECK: llvm.call @gcGpuOclMemcpy([[CTX]], [[SRC]], [[DST]], [[SIZE]])
+// CHECK: [[KERNEL:%.+]] = llvm.call @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]) : (!llvm.ptr) -> !llvm.ptr
+// CHECK: llvm.call @gcGpuOclKernelLaunch([[CTX]], [[KERNEL]],
+// CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK: [[SRC:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][1]
+// CHECK: [[DST:%.+]] = llvm.extractvalue
+// CHECK: llvm.call @gcGpuOclMemcpy([[CTX]], [[SRC]], [[DST]], [[SIZE]])
+// CHECK: [[GPU_PTR:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][0]
+// CHECK: llvm.call @gcGpuOclDealloc([[CTX]], [[GPU_PTR]])
+
+// CHECK: llvm.func @gcGpuOclKernelCreate
+// CHECK: llvm.func @gcGpuOclKernelDestroy
+// CHECK: llvm.func @gcGpuOclKernelLaunch
+
+
+// CHECK: llvm.func @gcGpuOclModuleDestructor()
+// CHECK: llvm.fence acquire
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
+// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
+// CHECK: [[ADDR:%.+]] = llvm.getelementptr [[ARRAY]]
+// CHECK: llvm.store [[PTR]], [[ADDR]]
+// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
