// RUN: mlir-opt %s | mlir-opt -canonicalize -cse | FileCheck %s

// Verifies that nvgpu.tma.async.store round-trips through the parser and
// survives -canonicalize -cse unchanged: the shared-memory view, the
// thread-0 predicate, and the TMA descriptor operand must all be preserved.
gpu.module @main_kernel {

  // CHECK-LABEL: @main_kernel(
  // CHECK-SAME: %[[arg0:.*]]: !nvgpu.tensormap.descriptor
  gpu.func @main_kernel(%arg0: !nvgpu.tensormap.descriptor<
      tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none,
      oob = zero, interleave = none>) kernel attributes
      {gpu.known_block_size = array<i32: 128, 1, 1>,
       gpu.known_grid_size = array<i32: 1, 1, 1>}
  {
    // CHECK: %[[c0:.+]] = arith.constant 0 : index
    // CHECK: %[[S0:.+]] = gpu.thread_id x
    // CHECK: %[[S1:.+]] = arith.cmpi eq, %[[S0]], %[[c0]] : index
    // CHECK: %[[S2:.+]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
    // CHECK: %[[S3:.+]] = memref.view %[[S2]][%[[c0]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x32xf32, #gpu.address_space<workgroup>>
    // CHECK: nvgpu.tma.async.store %[[S3]] to %[[arg0]][%[[c0]], %[[c0]]], predicate = %[[S1]] : memref<128x32xf32, #gpu.address_space<workgroup>> -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
    %c0 = arith.constant 0 : index
    %0 = gpu.thread_id x
    // Predicate the store so only thread 0 of the block issues the TMA copy.
    %1 = arith.cmpi eq, %0, %c0 : index
    %2 = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
    %view = memref.view %2[%c0][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x32xf32, #gpu.address_space<workgroup>>
    nvgpu.tma.async.store %view to %arg0[%c0, %c0], predicate = %1 : memref<128x32xf32, #gpu.address_space<workgroup>> -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
    nvvm.cp.async.bulk.commit.group
    nvvm.cp.async.bulk.wait_group 0
    gpu.return
  }
}