
Commit 3477bcf (parent: 285bc69)

[mlir][nvgpu] Mark TMA descriptor as MemWriteAt in tma.async.store (#79427)
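Context, added here for the reader (not part of the original commit message): `nvgpu.tma.async.store` returns no results, so effect-aware passes such as `-canonicalize` and `-cse` rely on its declared memory effects to decide whether it is trivially dead. Previously only the `$src` operand declared an effect (a read), which left the op eligible for dead-code elimination. Attaching `MemWriteAt<0, FullEffect>` to the `$tensorMapDescriptor` operand records that the op writes the memory addressed through the descriptor, so these passes must preserve it.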

2 files changed, 31 insertions(+), 1 deletion(-)


mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td

Lines changed: 1 addition & 1 deletion
@@ -671,7 +671,7 @@ def NVGPU_TmaAsyncStoreOp : NVGPU_Op<"tma.async.store", [AttrSizedOperandSegment
     tile shape. The descriptor is created by `nvgpu.tma.create.descriptor`
   }];
   let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$src,
-                       NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
+                       Arg<NVGPU_TensorMapDescriptor, "", [MemWriteAt<0, FullEffect>]>:$tensorMapDescriptor,
                        Variadic<Index>:$coordinates,
                        Optional<I1>:$predicate);
   let assemblyFormat = [{
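To make the hazard concrete, here is a minimal, hypothetical reduced example (the function and value names are illustrative, not from this commit; the syntax mirrors the test below). Before this change the store declared no write effect; having no results, it could be erased as trivially dead by a pass such as `-canonicalize`:

// Hypothetical IR: a TMA store whose only observable behavior is the
// write to global memory described by the tensor-map descriptor.
func.func @tma_store_example(
    %desc: !nvgpu.tensormap.descriptor<tensor = memref<128x32xf32, 3>,
        swizzle = none, l2promo = none, oob = zero, interleave = none>,
    %buf: memref<128x32xf32, #gpu.address_space<workgroup>>) {
  %c0 = arith.constant 0 : index
  // With MemWriteAt on the descriptor operand, effect-aware passes now see
  // a memory write here and must keep the op.
  nvgpu.tma.async.store %buf to %desc[%c0, %c0]
      : memref<128x32xf32, #gpu.address_space<workgroup>>
      -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none,
         oob = zero, interleave = none>
  return
}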
New test file

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+// RUN: mlir-opt %s | mlir-opt -canonicalize -cse | FileCheck %s
+
+gpu.module @main_kernel {
+
+  // CHECK-LABEL: @main_kernel(
+  // CHECK-SAME: %[[arg0:.*]]: !nvgpu.tensormap.descriptor
+  gpu.func @main_kernel(%arg0: !nvgpu.tensormap.descriptor<
+      tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none,
+      oob = zero, interleave = none>) kernel attributes
+      { gpu.known_block_size = array<i32: 128, 1, 1>,
+        gpu.known_grid_size = array<i32: 1, 1, 1>
+      }
+  {
+    // CHECK: %[[c0:.+]] = arith.constant 0 : index
+    // CHECK: %[[S0:.+]] = gpu.thread_id x
+    // CHECK: %[[S1:.+]] = arith.cmpi eq, %[[S0]], %[[c0]] : index
+    // CHECK: %[[S2:.+]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+    // CHECK: %[[S3:.+]] = memref.view %[[S2]][%[[c0]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x32xf32, #gpu.address_space<workgroup>>
+    // CHECK: nvgpu.tma.async.store %[[S3]] to %[[arg0]][%[[c0]], %[[c0]]], predicate = %[[S1]] : memref<128x32xf32, #gpu.address_space<workgroup>> -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+    %c0 = arith.constant 0 : index
+    %0 = gpu.thread_id x
+    %1 = arith.cmpi eq, %0, %c0 : index
+    %2 = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
+    %view = memref.view %2[%c0][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x32xf32, #gpu.address_space<workgroup>>
+    nvgpu.tma.async.store %view to %arg0[%c0, %c0], predicate = %1 : memref<128x32xf32, #gpu.address_space<workgroup>> -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
+    nvvm.cp.async.bulk.commit.group
+    nvvm.cp.async.bulk.wait_group 0
+    gpu.return
+  }
+}
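A note on the test's design: the RUN line first round-trips the IR through plain `mlir-opt`, then applies `-canonicalize -cse` before FileCheck runs. The `// CHECK: nvgpu.tma.async.store` line therefore fails if either pass deletes the store, which is exactly the regression the new `MemWriteAt` effect guards against: with only a read effect on `$src`, a result-less op is fair game for dead-code elimination.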
