Skip to content

Commit 40d0058

Browse files
authored
[NVPTX] Add TMA bulk tensor reduction intrinsics (#116854)
This patch adds NVVM intrinsics and NVPTX codegen for: * cp.async.bulk.tensor.reduce.1D -> 5D variants, supporting both Tile and Im2Col modes. * These intrinsics optionally support cache_hints as indicated by the boolean flag argument. * Lit tests are added for all combinations of these intrinsics in cp-async-bulk-tensor-reduce.ll. * The generated PTX is verified with a 12.3 ptxas executable. * Added docs for these intrinsics in NVPTXUsage.rst file. PTX Spec reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor Signed-off-by: Durgadoss R <[email protected]>
1 parent 0d15d46 commit 40d0058

File tree

9 files changed

+835
-47
lines changed

9 files changed

+835
-47
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -633,7 +633,7 @@ specified by the ``i32 %d0 ... i32 %d4`` arguments.
633633
For more information, refer PTX ISA
634634
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.
635635

636-
'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``'
636+
'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[3-5]d``'
637637
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
638638

639639
Syntax:
@@ -648,7 +648,7 @@ Syntax:
648648
Overview:
649649
"""""""""
650650

651-
The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``' intrinsics
651+
The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[3-5]d``' intrinsics
652652
correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
653653
of PTX instructions. These instructions initiate an asynchronous prefetch
654654
of tensor data from global memory to the L2 cache. In im2col mode, some
@@ -663,6 +663,76 @@ the same functionality as described in the ``tile`` mode intrinsics above.
663663
For more information, refer PTX ISA
664664
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.
665665

666+
'``llvm.nvvm.cp.async.bulk.tensor.reduce.[red_op].tile.[1-5]d``'
667+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
668+
669+
Syntax:
670+
"""""""
671+
672+
.. code-block:: llvm
673+
674+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
675+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
676+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
677+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
678+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
679+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
680+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
681+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
682+
683+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.2d(..., i32 %d0, i32 %d1, ...)
684+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
685+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
686+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
687+
688+
Overview:
689+
"""""""""
690+
691+
The '``@llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.tile.[1-5]d``' intrinsics
692+
correspond to the ``cp.reduce.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
693+
These instructions initiate an asynchronous reduction operation of tensor data
694+
in global memory with the tensor data in shared{::cta} memory, using ``tile`` mode.
695+
The dimension of the tensor data ranges from 1d to 5d with the coordinates
696+
specified by the ``i32 %d0 ... i32 %d4`` arguments. The supported reduction
697+
operations are {add, min, max, inc, dec, and, or, xor} as described in the
698+
``tile.1d`` intrinsics.
699+
700+
* The last argument to these intrinsics is a boolean flag
701+
indicating support for cache_hint. This flag argument must
702+
be a compile-time constant. When set, it indicates a valid
703+
cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
704+
variant of the PTX instruction.
705+
706+
For more information, refer PTX ISA
707+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor>`_.
708+
709+
'``llvm.nvvm.cp.async.bulk.tensor.reduce.[red_op].im2col.[3-5]d``'
710+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
711+
712+
Syntax:
713+
"""""""
714+
715+
.. code-block:: llvm
716+
717+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.im2col.3d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch)
718+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
719+
declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
720+
721+
Overview:
722+
"""""""""
723+
724+
The '``@llvm.nvvm.cp.async.bulk.tensor.reduce.<red_op>.im2col.[3-5]d``' intrinsics
725+
correspond to the ``cp.reduce.async.bulk.tensor.[3-5]d.*`` set of PTX instructions.
726+
These instructions initiate an asynchronous reduction operation of tensor data
727+
in global memory with the tensor data in shared{::cta} memory, using ``im2col`` mode.
728+
In this mode, the tensor has to be at least three-dimensional. The supported reduction
729+
operations are the same as the ones in the tile mode. The last argument to
730+
these intrinsics is a boolean flag, with the same functionality as described in the
731+
``tile`` mode intrinsics above.
732+
733+
For more information, refer PTX ISA
734+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-reduce-async-bulk-tensor>`_.
735+
666736
Other Intrinsics
667737
----------------
668738

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,25 @@ class CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
635635
ImmArg<ArgIndex<FlagsStartIdx>>];
636636
}
637637

638+
// Helper class for generating the "llvm.nvvm.cp.async.bulk.tensor.reduce.*"
// family of intrinsics (TMA copy-with-reduction from shared{::cta} to
// global memory).
//   dim  : tensor dimensionality, 1-5 (one i32 coordinate per dimension)
//   mode : "tile" or "im2col"
//   op   : reduction kind ("add", "min", "max", "inc", "dec", "and", "or", "xor")
class CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, string mode, string op> {
  string Suffix = op # "_" # mode # "_" # dim # "d";
  string Name = "int_nvvm_cp_async_bulk_tensor_reduce_" # Suffix;

  // One i32 coordinate argument per tensor dimension.
  list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
  // Full argument list:
  //   (src_smem_ptr, tensormap_ptr, d0..d{dim-1}, cache_hint, flag_cache_hint)
  list<LLVMType> ArgsTy = !listconcat(
    [llvm_shared_ptr_ty,  // src_smem_ptr
     llvm_ptr_ty],        // tensormap_ptr
    TensorDimsTy,         // actual tensor dims
    [llvm_i64_ty,         // cache_hint
     llvm_i1_ty]          // Flag for cache_hint
  );
  // Index of the i1 cache-hint flag: 2 pointer args + dim coords + 1 i64,
  // so the flag lands at argument index dim + 3. It must be an ImmArg
  // (compile-time constant) per the intrinsic contract.
  int FlagsStartIdx = !add(dim, 3);
  list<IntrinsicProperty> IntrProp = [IntrConvergent,
    ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
    NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
    ImmArg<ArgIndex<FlagsStartIdx>>];
}
656+
638657
let TargetPrefix = "nvvm" in {
639658
def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
640659
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4936,4 +4955,14 @@ foreach dim = [1, 2, 3, 4, 5] in {
49364955
}
49374956
}
49384957

4958+
// Intrinsics for TMA Copy with reduction.
// Emits one intrinsic per (dim, mode, red_op) combination. Tile mode covers
// 1D-5D; im2col mode requires at least a 3D tensor, so it is only generated
// for dim >= 3 (mirroring the `!if(!ge(dim, 3), ...)` guard below).
foreach dim = [1, 2, 3, 4, 5] in {
  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
    foreach red_op = ["add", "min", "max", "inc", "dec", "and", "or", "xor"] in {
      // Single-element foreach is the idiom for binding a local "variable".
      foreach reduce = [CP_ASYNC_BULK_TENSOR_REDUCE_INTR<dim, mode, red_op>] in
        def reduce.Name : DefaultAttrsIntrinsic<[], reduce.ArgsTy, reduce.IntrProp>;
    }
  }
}
4967+
49394968
} // let TargetPrefix = "nvvm"
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
//===--- NVVMIntrinsicFlags.h -----------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the definitions of the enumerations and flags
/// associated with NVVM Intrinsics.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_IR_NVVMINTRINSICFLAGS_H
#define LLVM_IR_NVVMINTRINSICFLAGS_H

// Required for uint8_t: headers must be self-contained and may not rely on
// transitive includes for fixed-width integer types.
#include <cstdint>

namespace llvm {
namespace nvvm {

// Reduction Ops supported with TMA Copy from Shared
// to Global Memory for the "cp.reduce.async.bulk.tensor.*"
// family of PTX instructions.
// The numeric values are an internal encoding (used as an immediate operand
// during ISel/printing); they are not part of the PTX ISA itself.
enum class TMAReductionOp : uint8_t {
  ADD = 0,
  MIN = 1,
  MAX = 2,
  INC = 3,
  DEC = 4,
  AND = 5,
  OR = 6,
  XOR = 7,
};

} // namespace nvvm
} // namespace llvm
#endif // LLVM_IR_NVVMINTRINSICFLAGS_H

llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "NVPTX.h"
1515
#include "NVPTXUtilities.h"
1616
#include "llvm/ADT/StringRef.h"
17+
#include "llvm/IR/NVVMIntrinsicFlags.h"
1718
#include "llvm/MC/MCExpr.h"
1819
#include "llvm/MC/MCInst.h"
1920
#include "llvm/MC/MCInstrInfo.h"
@@ -416,3 +417,40 @@ void NVPTXInstPrinter::printPrmtMode(const MCInst *MI, int OpNum,
416417
return;
417418
}
418419
}
420+
421+
void NVPTXInstPrinter::printTmaReductionMode(const MCInst *MI, int OpNum,
422+
raw_ostream &O,
423+
const char *Modifier) {
424+
const MCOperand &MO = MI->getOperand(OpNum);
425+
using RedTy = llvm::nvvm::TMAReductionOp;
426+
427+
switch (static_cast<RedTy>(MO.getImm())) {
428+
case RedTy::ADD:
429+
O << ".add";
430+
return;
431+
case RedTy::MIN:
432+
O << ".min";
433+
return;
434+
case RedTy::MAX:
435+
O << ".max";
436+
return;
437+
case RedTy::INC:
438+
O << ".inc";
439+
return;
440+
case RedTy::DEC:
441+
O << ".dec";
442+
return;
443+
case RedTy::AND:
444+
O << ".and";
445+
return;
446+
case RedTy::OR:
447+
O << ".or";
448+
return;
449+
case RedTy::XOR:
450+
O << ".xor";
451+
return;
452+
default:
453+
llvm_unreachable(
454+
"Invalid Reduction Op in printCpAsyncBulkTensorReductionMode");
455+
}
456+
}

llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ class NVPTXInstPrinter : public MCInstPrinter {
5454
raw_ostream &O, const char *Modifier = nullptr);
5555
void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O,
5656
const char *Modifier = nullptr);
57+
void printTmaReductionMode(const MCInst *MI, int OpNum, raw_ostream &O,
58+
const char *Modifier = nullptr);
5759
};
5860

5961
}

0 commit comments

Comments
 (0)