Skip to content

Commit 4edd711

Browse files
authored
[NVPTX] Add TMA bulk tensor prefetch intrinsics (#115527)
This patch adds NVVM intrinsics and NVPTX codegen for: * cp.async.bulk.tensor.prefetch.1D -> 5D variants, supporting both Tile and Im2Col modes. These intrinsics optionally support cache_hints as indicated by the boolean flag argument. * Lit tests are added for all combinations of these intrinsics in cp-async-bulk-tensor-prefetch.ll. * The generated PTX is verified with a 12.3 ptxas executable. * Added docs for these intrinsics in NVPTXUsage.rst file. * PTX Spec reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor Signed-off-by: Durgadoss R <[email protected]>
1 parent 59770a4 commit 4edd711

File tree

6 files changed

+369
-14
lines changed

6 files changed

+369
-14
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -599,6 +599,70 @@ described in the ``s2g.tile`` mode intrinsics above.
599599
For more information, refer to the PTX ISA
600600
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
601601

602+
'``llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``'
603+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
604+
605+
Syntax:
606+
"""""""
607+
608+
.. code-block:: llvm
609+
610+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
611+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(..., i32 %d0, i32 %d1, ...)
612+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
613+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
614+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
615+
616+
Overview:
617+
"""""""""
618+
619+
The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``' intrinsics
620+
correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
621+
of PTX instructions. These instructions initiate an asynchronous prefetch
622+
of tensor data from global memory to the L2 cache. In tile mode, the
623+
multi-dimensional layout of the source tensor is preserved at the destination.
624+
The dimension of the tensor data ranges from 1d to 5d with the coordinates
625+
specified by the ``i32 %d0 ... i32 %d4`` arguments.
626+
627+
* The last argument to these intrinsics is a boolean flag
628+
indicating support for cache_hint. This flag argument must
629+
be a compile-time constant. When set, it indicates a valid
630+
cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
631+
variant of the PTX instruction.
632+
633+
For more information, refer to the PTX ISA
634+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.
635+
636+
'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``'
637+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
638+
639+
Syntax:
640+
"""""""
641+
642+
.. code-block:: llvm
643+
644+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch)
645+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
646+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
647+
648+
Overview:
649+
"""""""""
650+
651+
The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``' intrinsics
652+
correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
653+
of PTX instructions. These instructions initiate an asynchronous prefetch
654+
of tensor data from global memory to the L2 cache. In im2col mode, some
655+
dimensions of the source tensor are unrolled into a single dimensional
656+
column at the destination. In this mode, the tensor has to be at least
657+
three-dimensional. Along with the tensor coordinates, im2col offsets are
658+
also specified (denoted by ``i16 %im2col0 ... i16 %im2col2``). The number
659+
of im2col offsets is two less than the number of dimensions of the tensor
660+
operation. The last argument to these intrinsics is a boolean flag, with
661+
the same functionality as described in the ``tile`` mode intrinsics above.
662+
663+
For more information, refer to the PTX ISA
664+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.
665+
602666
Other Intrinsics
603667
----------------
604668

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,28 @@ class CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, string mode> {
613613
ImmArg<ArgIndex<FlagsStartIdx>>];
614614
}
615615

616+
class CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
617+
string Name = "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # "d";
618+
619+
bit IsIm2Col = !if(!eq(mode, "im2col"), 1, 0);
620+
int NumIm2ColOffsets = !if(IsIm2Col, !add(dim, -2), 0);
621+
list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
622+
list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
623+
list<LLVMType> ArgsTy = !listconcat(
624+
[llvm_ptr_ty], // tensormap_ptr
625+
TensorDimsTy, // actual tensor dims
626+
Im2ColOffsetsTy, // im2col offsets
627+
[llvm_i64_ty, // cache_hint
628+
llvm_i1_ty] // Flag for cache_hint
629+
);
630+
631+
int TempFlagsStartIdx = !add(dim, 2);
632+
int FlagsStartIdx = !add(TempFlagsStartIdx, NumIm2ColOffsets);
633+
list<IntrinsicProperty> IntrProp = [IntrConvergent,
634+
ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
635+
ImmArg<ArgIndex<FlagsStartIdx>>];
636+
}
637+
616638
let TargetPrefix = "nvvm" in {
617639
def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
618640
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4902,6 +4924,8 @@ foreach dim = [1, 2, 3, 4, 5] in {
49024924
def g2s.Name : DefaultAttrsIntrinsic<[], g2s.ArgsTy, g2s.IntrProp>;
49034925
foreach s2g = [CP_ASYNC_BULK_TENSOR_S2G_INTR<dim, mode>] in
49044926
def s2g.Name : DefaultAttrsIntrinsic<[], s2g.ArgsTy, s2g.IntrProp>;
4927+
foreach prefetch = [CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<dim, mode>] in
4928+
def prefetch.Name : DefaultAttrsIntrinsic<[], prefetch.ArgsTy, prefetch.IntrProp>;
49054929
}
49064930
}
49074931

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 90 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4175,6 +4175,10 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
41754175
return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, ); \
41764176
}()
41774177

4178+
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode) \
4179+
(IsCacheHint ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
4180+
: NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
4181+
41784182
static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
41794183
bool IsCacheHint, bool IsIm2Col) {
41804184
if (IsIm2Col) {
@@ -4242,6 +4246,55 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
42424246
}
42434247
}
42444248

4249+
static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
4250+
bool IsIm2Col) {
4251+
if (IsIm2Col) {
4252+
switch (Dim) {
4253+
case 3:
4254+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL);
4255+
case 4:
4256+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL);
4257+
case 5:
4258+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL);
4259+
default:
4260+
llvm_unreachable("Invalid Dimension in im2col mode for "
4261+
"GetCpAsyncBulkTensorPrefetchOpcode.");
4262+
}
4263+
} else {
4264+
switch (Dim) {
4265+
case 1:
4266+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE);
4267+
case 2:
4268+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE);
4269+
case 3:
4270+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE);
4271+
case 4:
4272+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE);
4273+
case 5:
4274+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE);
4275+
default:
4276+
llvm_unreachable("Invalid Dimension in tile mode for "
4277+
"GetCpAsyncBulkTensorPrefetchOpcode.");
4278+
}
4279+
}
4280+
}
4281+
4282+
static size_t GetDimsFromIntrinsic(unsigned IID) {
4283+
switch (IID) {
4284+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
4285+
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
4286+
return 3;
4287+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
4288+
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
4289+
return 4;
4290+
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
4291+
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
4292+
return 5;
4293+
default:
4294+
llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
4295+
}
4296+
}
4297+
42454298
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
42464299
bool IsIm2Col) {
42474300
// We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
@@ -4250,21 +4303,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
42504303
// multicast_flag, cache_hint_flag}
42514304
// NumOperands = {Chain, IID} + {Actual intrinsic args}
42524305
// = {2} + {7 + dims + im2col_offsets}
4253-
auto getDimsFromIntrinsic = [](unsigned IID) {
4254-
switch (IID) {
4255-
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
4256-
return 3;
4257-
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
4258-
return 4;
4259-
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
4260-
return 5;
4261-
default:
4262-
llvm_unreachable(
4263-
"Invalid im2col intrinsic in SelectCpAsyncBulkTensorG2SCommon.");
4264-
}
4265-
};
42664306
size_t NumOps = N->getNumOperands();
4267-
size_t NumDims = IsIm2Col ? getDimsFromIntrinsic(N->getConstantOperandVal(1))
4307+
size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
42684308
: (NumOps - 9);
42694309
// Offsets is always 'NumDims - 2' and only for im2col mode
42704310
size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
@@ -4316,6 +4356,30 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
43164356
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
43174357
}
43184358

4359+
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
4360+
bool IsIm2Col) {
4361+
// We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
4362+
// {src, dims{d0...dN}, im2col_offsets{dims-2}
4363+
// cache_hint, cache_hint_flag}
4364+
// NumOperands = {Chain, IID} + {Actual intrinsic args}
4365+
// = {2} + {3 + dims + im2col_offsets}
4366+
size_t NumOps = N->getNumOperands();
4367+
size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
4368+
: (NumOps - 5);
4369+
// Offsets is always 'NumDims - 2' and only for im2col mode
4370+
size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
4371+
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
4372+
size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);
4373+
4374+
SDLoc DL(N);
4375+
SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
4376+
Ops.push_back(N->getOperand(0)); // Chain operand
4377+
4378+
unsigned Opcode =
4379+
GetCpAsyncBulkTensorPrefetchOpcode(NumDims, IsCacheHint, IsIm2Col);
4380+
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
4381+
}
4382+
43194383
bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
43204384
unsigned IID = N->getConstantOperandVal(1);
43214385
switch (IID) {
@@ -4345,5 +4409,17 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
43454409
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
43464410
SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
43474411
return true;
4412+
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
4413+
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
4414+
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
4415+
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
4416+
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
4417+
SelectCpAsyncBulkTensorPrefetchCommon(N);
4418+
return true;
4419+
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
4420+
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
4421+
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
4422+
SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
4423+
return true;
43484424
}
43494425
}

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
9494
void SelectI128toV2I64(SDNode *N);
9595
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
9696
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
97+
void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
9798
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
9899
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
99100
}

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,52 @@ foreach dim = [1, 2, 3, 4, 5] in {
605605
}
606606
}
607607

608+
// TMA Prefetch from Global memory to L2 cache
609+
class PREFETCH_STRINGS<int dim, string mode, bit ch> {
610+
string prefix = "cp.async.bulk.prefetch.tensor";
611+
string dir = "L2.global";
612+
string inst_name = prefix
613+
# "." # dim # "d"
614+
# "." # dir
615+
# "." # mode
616+
# !if(ch, ".L2::cache_hint", "");
617+
string intr_name = "CP_ASYNC_BULK_TENSOR_PREFETCH_"
618+
# dim # "D"
619+
# !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
620+
}
621+
622+
multiclass CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
623+
defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i));
624+
defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
625+
defvar asm_str_default = " [$tmap, {{" # dims_str # "}}]";
626+
627+
defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0);
628+
defvar im2col_dag = !if(!eq(mode, "im2col"),
629+
!dag(ins, !listsplat(Int16Regs, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)),
630+
(ins));
631+
defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", ");
632+
defvar im2col_asm_str = ", {{" # im2col_str # "}}";
633+
634+
defvar asm_str = !if(!eq(mode, "im2col"),
635+
!strconcat(asm_str_default, im2col_asm_str), asm_str_default);
636+
637+
def "": NVPTXInst<(outs),
638+
!con((ins Int64Regs:$tmap), dims_dag, im2col_dag),
639+
!strconcat(PREFETCH_STRINGS<dim, mode, 0>.inst_name, asm_str, ";"), []>,
640+
Requires<[hasPTX<80>, hasSM<90>]>;
641+
def _CH: NVPTXInst<(outs),
642+
!con((ins Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)),
643+
!strconcat(PREFETCH_STRINGS<dim, mode, 1>.inst_name, asm_str, ", $ch;"), []>,
644+
Requires<[hasPTX<80>, hasSM<90>]>;
645+
}
646+
647+
foreach dim = [1, 2, 3, 4, 5] in {
648+
foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
649+
defm PREFETCH_STRINGS<dim, mode, 0>.intr_name :
650+
CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<dim, mode>;
651+
}
652+
}
653+
608654
//-----------------------------------
609655
// MBarrier Functions
610656
//-----------------------------------

0 commit comments

Comments
 (0)