Commit b10a889

enable XeVM test using gpu-runner
1 parent 5540c6f commit b10a889

20 files changed: +353 −29 lines

lib/gc/Target/LLVM/XeVM/Target.cpp

Lines changed: 8 additions & 0 deletions
@@ -182,6 +182,14 @@ XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
     return std::nullopt;
   }

+  gpuMod.walk([&](LLVM::LLVMFuncOp funcOp) {
+    if (funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) {
+      funcOp.setIntelReqdSubGroupSize(16);
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+
   // TODO: reroute to another serializer for a different target?
   SpirSerializer serializer(*module, cast<XeVMTargetAttr>(attribute), options);
   serializer.init();
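
The walk tags only the first function carrying the gpu.kernel attribute (it interrupts after the first match) with a required sub-group size of 16 before SPIR-V serialization. A minimal, illustrative sketch of the result, assuming the setter prints as an intel_reqd_sub_group_size attribute on the LLVM-dialect function (not output from this commit):

// Illustrative only: a kernel inside the gpu.module prior to serialization;
// the attribute spelling is an assumption.
llvm.func @entry_kernel(%arg0: !llvm.ptr) attributes {
    gpu.kernel, intel_reqd_sub_group_size = 16 : i32} {
  llvm.return
}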

lib/gc/Transforms/GPU/OCL/GpuToGpuOcl.cpp

Lines changed: 57 additions & 14 deletions
@@ -246,10 +246,13 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {

     int i = 0;
     for (auto arg : kernelArgs) {
-      if (auto type = gpuLaunch.getKernelOperand(i++).getType();
+      if (auto type = gpuLaunch.getKernelOperand(i).getType();
           isa<MemRefType>(type)) {
         MemRefDescriptor desc(arg);
         args.emplace_back(desc.alignedPtr(rewriter, loc));
+      } else if (auto type = gpuLaunch.getKernelOperand(i).getType();
+                 isa<LLVM::LLVMPointerType>(type)) {
+        args.emplace_back(arg);
       } else {
         // Store the arg on the stack and pass the pointer
         auto ptr = rewriter.create<LLVM::AllocaOp>(
@@ -258,6 +261,7 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
         rewriter.create<LLVM::StoreOp>(loc, arg, ptr);
         args.emplace_back(ptr);
       }
+      i++;
     }

     const auto gpuOclLaunch =
@@ -352,32 +356,67 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
             .getResult();
   }

-  // Create a new kernel and save the pointer to the global variable
-  // ...name_Ptr.
-  bool createKernel(
-      gpu::LaunchFuncOp &gpuLaunch, OpAdaptor &adaptor,
-      ConversionPatternRewriter &rewriter, const Location &loc, ModuleOp &mod,
-      StringRef funcName,
-      const std::function<SmallString<128> &(const char *chars)> &str) const {
-    auto kernelModName = gpuLaunch.getKernelModuleName();
+  StringAttr getBinaryAttrIMEX(ConversionPatternRewriter &rewriter,
+                               gpu::LaunchFuncOp &gpuLaunch,
+                               StringAttr kernelModName) const {
+    StringAttr binaryAttr;
     auto kernelMod = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
         gpuLaunch, kernelModName);
     if (!kernelMod) {
       gpuLaunch.emitOpError() << "Module " << kernelModName << " not found!";
-      return false;
+      return {};
     }
-    const auto binaryAttr = kernelMod->getAttrOfType<StringAttr>("gpu.binary");
+    binaryAttr = kernelMod->getAttrOfType<StringAttr>("gpu.binary");
     if (!binaryAttr) {
       kernelMod.emitOpError() << "missing 'gpu.binary' attribute";
-      return false;
+      return {};
     }
+    rewriter.eraseOp(kernelMod);
+    return binaryAttr;
+  }
+
+  StringAttr getBinaryAttrUpstream(ConversionPatternRewriter &rewriter,
+                                   gpu::LaunchFuncOp &gpuLaunch,
+                                   StringAttr kernelModName) const {
+    StringAttr binaryAttr;
+    auto gpuBin = SymbolTable::lookupNearestSymbolFrom<gpu::BinaryOp>(
+        gpuLaunch, kernelModName);
+    if (!gpuBin) {
+      gpuLaunch.emitOpError() << "Binary " << kernelModName << " not found!";
+      return {};
+    }
+    if (gpuBin.getObjects().size() != 1) {
+      gpuLaunch.emitOpError() << "Many targets present in " << kernelModName
+                              << ", please use xevm only.";
+      return {};
+    }
+    binaryAttr = cast<gpu::ObjectAttr>(gpuBin.getObjects()[0]).getObject();
+    if (!binaryAttr) {
+      gpuBin.emitOpError() << "missing binary object.";
+      return {};
+    }
+    return binaryAttr;
+  }
+
+  // Create a new kernel and save the pointer to the global variable
+  // ...name_Ptr.
+  bool createKernel(
+      gpu::LaunchFuncOp &gpuLaunch, OpAdaptor &adaptor,
+      ConversionPatternRewriter &rewriter, const Location &loc, ModuleOp &mod,
+      StringRef funcName,
+      const std::function<SmallString<128> &(const char *chars)> &str) const {
+    auto kernelModName = gpuLaunch.getKernelModuleName();
+#ifdef GC_USE_IMEX
+    auto binaryAttr = getBinaryAttrIMEX(rewriter, gpuLaunch, kernelModName);
+#else
+    auto binaryAttr = getBinaryAttrUpstream(rewriter, gpuLaunch, kernelModName);
+#endif

     rewriter.setInsertionPointToStart(mod.getBody());
     // The kernel pointer is stored here
     rewriter.create<LLVM::GlobalOp>(loc, helper.ptrType, /*isConstant=*/false,
                                     LLVM::Linkage::Internal, str("Ptr"),
                                     rewriter.getZeroAttr(helper.ptrType));
-    rewriter.eraseOp(kernelMod);

     auto function = rewriter.create<LLVM::LLVMFuncOp>(
         loc, funcName,
@@ -415,7 +454,7 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
     for (auto arg : gpuLaunch.getKernelOperands()) {
       auto type = arg.getType();
       size_t size;
-      if (isa<MemRefType>(type)) {
+      if (isa<MemRefType>(type) || isa<LLVM::LLVMPointerType>(type)) {
         size = 0; // A special case for pointers
       } else if (type.isIndex()) {
         size = helper.idxType.getIntOrFloatBitWidth() / 8;
@@ -452,6 +491,8 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {
        assert(getConstantIntValue(cast.getOperand(0)));
        value = helper.idxConstant(
            rewriter, loc, getConstantIntValue(cast.getOperand(0)).value());
+      } else {
+        value = rewriter.clone(*value.getDefiningOp())->getResult(0);
      }
      rewriter.create<LLVM::StoreOp>(loc, value, elementPtr);
    }
@@ -527,6 +568,8 @@ struct GpuToGpuOcl final : gc::impl::GpuToGpuOclBase<GpuToGpuOcl> {
      return;
    }

+    if (!helper.kernelNames.size())
+      return;
    // Add gpuOclDestructor() function that destroys all the kernels
    auto mod = llvm::dyn_cast<ModuleOp>(getOperation());
    assert(mod);
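
The two new helpers correspond to the two forms in which a compiled kernel can reach this pass. A rough sketch of both, mirroring the test inputs later in this commit (the SPIR-V payloads are placeholders):

// IMEX path (GC_USE_IMEX): the SPIR-V lives in a 'gpu.binary' string attribute
// on the gpu.module; getBinaryAttrIMEX reads it and then erases the module.
gpu.module @entry_kernel attributes {gpu.binary = "Some SPIRV here \00"} {
  gpu.func @entry_kernel(%arg0: index, %arg1: memref<64x64xf32>) kernel {
    gpu.return
  }
}

// Upstream path: gpu-module-to-binary emits a gpu.binary op holding exactly one
// #gpu.object for the XeVM target; getBinaryAttrUpstream extracts its payload.
gpu.binary @entry_kernel [#gpu.object<#xevm.target, "...SPIR-V bytes...">]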

lib/gc/Transforms/GPU/Pipeline.cpp

Lines changed: 8 additions & 7 deletions
@@ -138,7 +138,6 @@ void registerIMEXPipeline() {
 #ifdef GC_USE_GPU
 void populateGPUPipeline(OpPassManager &pm,
                          const GPUPipelineOptions &pipelineOpts) {
-
   pm.addNestedPass<func::FuncOp>(createAddContextArg());

   pm.addPass(createConvertSCFToCFPass());
@@ -148,20 +147,22 @@ void populateGPUPipeline(OpPassManager &pm,
   pm.addPass(createArithToLLVMConversionPass());
   pm.addPass(createConvertFuncToLLVMPass());
   pm.addPass(createConvertMathToLLVMPass());
-  pm.addPass(createCSEPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());

+  // Convert allocs, etc.
+  pm.addPass(createGpuToGpuOcl({pipelineOpts.callFinish}));
   pm.addPass(createGpuKernelOutliningPass());
   pm.addPass(createConvertXeVMToLLVMPass());
   pm.addPass(createGpuXeVMAttachTarget());
-  pm.addPass(createConvertGpuOpsToLLVMSPVOps());
-  pm.addPass(createGpuToLLVMConversionPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToLLVMSPVOps());
+  pm.addNestedPass<gpu::GPUModuleOp>(createConvertIndexToLLVMPass());
+  pm.addNestedPass<gpu::GPUModuleOp>(createArithToLLVMConversionPass());
   pm.addPass(createReconcileUnrealizedCastsPass());
-  pm.addPass(createCSEPass());
-  // Convert allocs, etc.
-  pm.addPass(createGpuToGpuOcl({pipelineOpts.callFinish}));
   pm.addPass(createGpuModuleToBinaryPass());
   // Convert launch given a binary.
   pm.addPass(createGpuToGpuOcl({pipelineOpts.callFinish}));
+  pm.addPass(createFinalizeMemRefToLLVMConversionPass());
+  pm.addPass(createReconcileUnrealizedCastsPass());
 }

 void registerGPUPipeline() {
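
The pipeline now runs createGpuToGpuOcl twice: the first run converts host-side allocs and copies, the second converts launches once a binary exists. A rough sketch of the IR shapes in between (illustrative, based on the passes above and the tests in this commit, not compiler output):

// 1) After createGpuKernelOutliningPass and createGpuXeVMAttachTarget,
//    the outlined module carries an XeVM target attribute:
gpu.module @entry_kernel [#xevm.target] {
  gpu.func @entry_kernel(%arg0: index) kernel {
    gpu.return
  }
}

// 2) After createGpuModuleToBinaryPass, the module collapses into a gpu.binary
//    with a single #gpu.object, which the second createGpuToGpuOcl run lowers
//    to gcGpuOcl* runtime calls at the launch site:
gpu.binary @entry_kernel [#gpu.object<#xevm.target, "...">]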

New test file — 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+// RUN: gc-opt %s --gpu-to-gpuocl | FileCheck %s
+
+module @test attributes {gpu.container_module} {
+  llvm.func @entry(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64) attributes {llvm.emit_c_interface} {
+    %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %5 = llvm.insertvalue %arg4, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %6 = llvm.insertvalue %arg5, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %7 = llvm.insertvalue %arg6, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %8 = builtin.unrealized_conversion_cast %7 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<64x64xf32>
+    %gpu_mem = gpu.alloc host_shared () : memref<64x64xf32>
+    gpu.memcpy %gpu_mem, %8 : memref<64x64xf32>, memref<64x64xf32>
+    %9 = llvm.mlir.constant(32 : index) : i64
+    %10 = builtin.unrealized_conversion_cast %9 : i64 to index
+    %11 = llvm.mlir.constant(2 : index) : i64
+    %12 = builtin.unrealized_conversion_cast %11 : i64 to index
+    %13 = llvm.mlir.constant(1 : index) : i64
+    %14 = builtin.unrealized_conversion_cast %13 : i64 to index
+    gpu.launch_func @entry_kernel::@entry_kernel blocks in (%12, %12, %14) threads in (%14, %14, %14) args(%10 : index, %gpu_mem : memref<64x64xf32>)
+    gpu.memcpy %8, %gpu_mem : memref<64x64xf32>, memref<64x64xf32>
+    gpu.dealloc %gpu_mem : memref<64x64xf32>
+    llvm.return
+  }
+
+  gpu.module @entry_kernel attributes {gpu.binary = "Some SPIRV here \00"} {
+    gpu.func @entry_kernel(%arg0: index, %arg1: memref<64x64xf32>) kernel attributes {} {
+      gpu.return
+    }
+  }
+}
+
+// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_SPIRV
+// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_Name
+// CHECK: llvm.mlir.global internal @gcGpuOclKernel_entry_kernel_Ptr
+
+// CHECK: llvm.func @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
+// CHECK: [[NEW_PTR:%.+]] = llvm.call @gcGpuOclKernelCreate([[CTX]]
+// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[CMPXCHG:%.+]] = llvm.cmpxchg [[PTR_ADDR]], [[ZERO]], [[NEW_PTR]]
+// CHECK: [[FLAG:%.+]] = llvm.extractvalue [[CMPXCHG]][1]
+// CHECK: llvm.cond_br [[FLAG]], [[BB1:\^.+]], [[BB2:\^.+]]
+// CHECK: [[BB1]]:
+// CHECK: llvm.return [[NEW_PTR]]
+// CHECK: [[BB2]]:
+// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
+// CHECK: [[ADDR:%.+]] = llvm.getelementptr [[ARRAY]]
+// CHECK: llvm.store [[NEW_PTR]], [[ADDR]]
+// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
+// CHECK: [[OLD_PTR:%.+]] = llvm.extractvalue [[CMPXCHG]][0]
+// CHECK: llvm.return [[OLD_PTR]]
+
+// CHECK: llvm.func internal @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline}
+// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
+// CHECK: [[ICMP:%.+]] = llvm.icmp "eq" [[PTR]], [[ZERO]]
+// CHECK: llvm.cond_br [[ICMP]], [[BB1:\^.+]], [[BB2:\^.+]]
+// CHECK: [[BB1]]:
+// CHECK: [[NEW_PTR:%.+]] = llvm.call @createGcGpuOclKernel_entry_kernel([[CTX]])
+// CHECK: llvm.return [[NEW_PTR]]
+// CHECK: [[BB2]]:
+// CHECK: llvm.return [[PTR]]
+
+// CHECK: llvm.func @entry(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, [[CTX:%.+]]: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64)
+// CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK: llvm.call @gcGpuOclMallocShared([[CTX]], [[SIZE]])
+// CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK: [[SRC:%.+]] = llvm.extractvalue
+// CHECK: [[DST:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][1]
+// CHECK: llvm.call @gcGpuOclMemcpy([[CTX]], [[SRC]], [[DST]], [[SIZE]])
+// CHECK: [[KERNEL:%.+]] = llvm.call @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]) : (!llvm.ptr) -> !llvm.ptr
+// CHECK: llvm.call @gcGpuOclKernelLaunch([[CTX]], [[KERNEL]],
+// CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK: [[SRC:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][1]
+// CHECK: [[DST:%.+]] = llvm.extractvalue
+// CHECK: llvm.call @gcGpuOclMemcpy([[CTX]], [[SRC]], [[DST]], [[SIZE]])
+// CHECK: [[GPU_PTR:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][0]
+// CHECK: llvm.call @gcGpuOclDealloc([[CTX]], [[GPU_PTR]])
+
+// CHECK: llvm.func @gcGpuOclKernelCreate
+// CHECK: llvm.func @gcGpuOclKernelDestroy
+// CHECK: llvm.func @gcGpuOclKernelLaunch
+
+
+// CHECK: llvm.func @gcGpuOclModuleDestructor()
+// CHECK: llvm.fence acquire
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
+// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
+// CHECK: [[ADDR:%.+]] = llvm.getelementptr [[ARRAY]]
+// CHECK: llvm.store [[PTR]], [[ADDR]]
+// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])

New test file — 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+// RUN: gc-opt %s --gc-gpu-pipeline | FileCheck %s
+
+module @test attributes {gpu.container_module} {
+  llvm.func @entry(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64) attributes {llvm.emit_c_interface} {
+    %0 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %1 = llvm.insertvalue %arg0, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %2 = llvm.insertvalue %arg1, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %3 = llvm.insertvalue %arg2, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %4 = llvm.insertvalue %arg3, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %5 = llvm.insertvalue %arg4, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %6 = llvm.insertvalue %arg5, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %7 = llvm.insertvalue %arg6, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %8 = builtin.unrealized_conversion_cast %7 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<64x64xf32>
+    %gpu_mem = gpu.alloc host_shared () : memref<64x64xf32>
+    gpu.memcpy %gpu_mem, %8 : memref<64x64xf32>, memref<64x64xf32>
+    %9 = llvm.mlir.constant(32 : index) : i64
+    %10 = builtin.unrealized_conversion_cast %9 : i64 to index
+    %11 = llvm.mlir.constant(2 : index) : i64
+    %12 = builtin.unrealized_conversion_cast %11 : i64 to index
+    %13 = llvm.mlir.constant(1 : index) : i64
+    %14 = builtin.unrealized_conversion_cast %13 : i64 to index
+
+    %floaat = llvm.mlir.constant(1.1 : f32) : f32
+    %a_ptr_as_idx = memref.extract_aligned_pointer_as_index %gpu_mem : memref<64x64xf32> -> index
+    %a_ptr_as_i64 = arith.index_cast %a_ptr_as_idx : index to i64
+    %a_ptr = llvm.inttoptr %a_ptr_as_i64 : i64 to !llvm.ptr
+    %a_ptr_casted = llvm.addrspacecast %a_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch blocks(%arg10, %arg11, %arg12) in (%arg16 = %12, %arg17 = %12, %arg18 = %12) threads(%arg13, %arg14, %arg15) in (%arg19 = %14, %arg20 = %14, %arg21 = %14) {
+      llvm.store %floaat, %a_ptr_casted : f32, !llvm.ptr<1>
+      gpu.terminator
+    }
+    gpu.memcpy %8, %gpu_mem : memref<64x64xf32>, memref<64x64xf32>
+    gpu.dealloc %gpu_mem : memref<64x64xf32>
+    llvm.return
+  }
+}
+
+// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_SPIRV
+// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_Name
+// CHECK: llvm.mlir.global internal @gcGpuOclKernel_entry_kernel_Ptr
+
+// CHECK: llvm.func @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
+// CHECK: [[NEW_PTR:%.+]] = llvm.call @gcGpuOclKernelCreate([[CTX]]
+// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[CMPXCHG:%.+]] = llvm.cmpxchg [[PTR_ADDR]], [[ZERO]], [[NEW_PTR]]
+// CHECK: [[FLAG:%.+]] = llvm.extractvalue [[CMPXCHG]][1]
+// CHECK: llvm.cond_br [[FLAG]], [[BB1:\^.+]], [[BB2:\^.+]]
+// CHECK: [[BB1]]:
+// CHECK: llvm.return [[NEW_PTR]]
+// CHECK: [[BB2]]:
+// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
+// CHECK: [[ADDR:%.+]] = llvm.getelementptr [[ARRAY]]
+// CHECK: llvm.store [[NEW_PTR]], [[ADDR]]
+// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
+// CHECK: [[OLD_PTR:%.+]] = llvm.extractvalue [[CMPXCHG]][0]
+// CHECK: llvm.return [[OLD_PTR]]
+
+// CHECK: llvm.func internal @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr attributes {always_inline}
+// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
+// CHECK: [[ICMP:%.+]] = llvm.icmp "eq" [[PTR]], [[ZERO]]
+// CHECK: llvm.cond_br [[ICMP]], [[BB1:\^.+]], [[BB2:\^.+]]
+// CHECK: [[BB1]]:
+// CHECK: [[NEW_PTR:%.+]] = llvm.call @createGcGpuOclKernel_entry_kernel([[CTX]])
+// CHECK: llvm.return [[NEW_PTR]]
+// CHECK: [[BB2]]:
+// CHECK: llvm.return [[PTR]]
+
+// CHECK: llvm.func @entry(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, [[CTX:%.+]]: !llvm.ptr, %arg8: !llvm.ptr, %arg9: i64)
+// CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK: llvm.call @gcGpuOclMallocShared([[CTX]], [[SIZE]])
+// CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK: [[SRC:%.+]] = llvm.extractvalue
+// CHECK: [[DST:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][1]
+// CHECK: llvm.call @gcGpuOclMemcpy([[CTX]], [[SRC]], [[DST]], [[SIZE]])
+// CHECK: [[KERNEL:%.+]] = llvm.call @getGcGpuOclKernel_entry_kernel([[CTX:%.+]]) : (!llvm.ptr) -> !llvm.ptr
+// CHECK: llvm.call @gcGpuOclKernelLaunch([[CTX]], [[KERNEL]],
+// CHECK: [[SIZE:%.+]] = llvm.mlir.constant(16384 : i64) : i64
+// CHECK: [[SRC:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][1]
+// CHECK: [[DST:%.+]] = llvm.extractvalue
+// CHECK: llvm.call @gcGpuOclMemcpy([[CTX]], [[SRC]], [[DST]], [[SIZE]])
+// CHECK: [[GPU_PTR:%.+]] = llvm.extractvalue [[GPU_MEMREF:%.+]][0]
+// CHECK: llvm.call @gcGpuOclDealloc([[CTX]], [[GPU_PTR]])
+
+// CHECK: llvm.func @gcGpuOclKernelCreate
+// CHECK: llvm.func @gcGpuOclKernelDestroy
+// CHECK: llvm.func @gcGpuOclKernelLaunch
+
+
+// CHECK: llvm.func @gcGpuOclModuleDestructor()
+// CHECK: llvm.fence acquire
+// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
+// CHECK: [[PTR:%.+]] = llvm.load [[PTR_ADDR]]
+// CHECK: [[ONE:%.+]] = llvm.mlir.constant(1 : i64) : i64
+// CHECK: [[ARRAY:%.+]] = llvm.alloca [[ONE]]
+// CHECK: [[ADDR:%.+]] = llvm.getelementptr [[ARRAY]]
+// CHECK: llvm.store [[PTR]], [[ADDR]]
+// CHECK: llvm.call @gcGpuOclKernelDestroy([[ONE]], [[ARRAY]])
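
Note that this pipeline test has no gpu.module in its input: the CHECK lines refer to @entry_kernel because gpu-kernel-outlining names the outlined module and function after the enclosing func (@entry). An illustrative sketch of the intermediate outlined form, under the assumption that non-constant values used in the launch region become kernel arguments (constants may instead be cloned into the body):

// Illustrative only: the gpu.launch region above, outlined into a kernel that
// the rest of the pipeline serializes and registers as entry_kernel.
gpu.module @entry_kernel {
  gpu.func @entry_kernel(%arg0: f32, %arg1: !llvm.ptr<1>) kernel {
    llvm.store %arg0, %arg1 : f32, !llvm.ptr<1>
    gpu.return
  }
}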

test/mlir/test/gc/Transforms/GPU/OCL/gpu-to-gpuocl.mlir

Lines changed: 1 addition & 5 deletions
@@ -25,11 +25,7 @@ module @test attributes {gpu.container_module} {
     llvm.return
   }

-  gpu.module @entry_kernel attributes {gpu.binary = "Some SPIRV here \00"} {
-    gpu.func @entry_kernel(%arg0: index, %arg1: memref<64x64xf32>) kernel attributes {} {
-      gpu.return
-    }
-  }
+  gpu.binary @entry_kernel [#gpu.object<#xevm.target, "\03\02#\07\00\04\01\00\14\00+\00\0F\00\00\00\00\00\00\00\11\00\02\00\06\00\00\00\11\00\02\00\04\00\00\00\11\00\02\00\0B\00\00\00\0B\00\05\00\01\00\00\00OpenCL.std\00\00\0E\00\03\00\02\00\00\00\02\00\00\00\0F\00\07\00\06\00\00\00\05\00\00\00entry_kernel\00\00\00\00\10\00\03\00\05\00\00\00\1F\00\00\00\03\00\03\00\04\00\00\00\A0\86\01\00\05\00\06\00\05\00\00\00entry_kernel\00\00\00\00\15\00\04\00\02\00\00\00@\00\00\00\00\00\00\00\13\00\02\00\03\00\00\00!\00\04\00\04\00\00\00\03\00\00\00\02\00\00\006\00\05\00\03\00\00\00\05\00\00\00\04\00\00\00\04\00\00\007\00\03\00\02\00\00\00\06\00\00\00\F8\00\02\00\07\00\00\00\FD\00\01\008\00\01\00">]
 }

 // CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_SPIRV
