@@ -4175,6 +4175,10 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
4175
4175
return CP_ASYNC_BULK_TENSOR_OPCODE (G2S, dim, mode, ); \
4176
4176
}()
4177
4177
4178
// Expands to the cp.async.bulk.tensor prefetch opcode for the given
// dimension token (e.g. 3D) and mode token (TILE or IM2COL).
//
// The expansion reads a boolean named `IsCacheHint` from the scope of the use
// site and selects the `_CH` opcode variant when it is true.
//
// NOTE: there must be no whitespace between the macro name and `(`; otherwise
// the preprocessor treats this as an object-like macro and every use breaks.
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode)                    \
  (IsCacheHint ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH      \
               : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
4181
+
4178
4182
static unsigned GetCpAsyncBulkTensorS2GOpcode (size_t Dim, bool IsShared32,
4179
4183
bool IsCacheHint, bool IsIm2Col) {
4180
4184
if (IsIm2Col) {
@@ -4242,6 +4246,55 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
4242
4246
}
4243
4247
}
4244
4248
4249
+ static unsigned GetCpAsyncBulkTensorPrefetchOpcode (size_t Dim, bool IsCacheHint,
4250
+ bool IsIm2Col) {
4251
+ if (IsIm2Col) {
4252
+ switch (Dim) {
4253
+ case 3 :
4254
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (3D, IM2COL);
4255
+ case 4 :
4256
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (4D, IM2COL);
4257
+ case 5 :
4258
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (5D, IM2COL);
4259
+ default :
4260
+ llvm_unreachable (" Invalid Dimension in im2col mode for "
4261
+ " GetCpAsyncBulkTensorPrefetchOpcode." );
4262
+ }
4263
+ } else {
4264
+ switch (Dim) {
4265
+ case 1 :
4266
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (1D, TILE);
4267
+ case 2 :
4268
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (2D, TILE);
4269
+ case 3 :
4270
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (3D, TILE);
4271
+ case 4 :
4272
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (4D, TILE);
4273
+ case 5 :
4274
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (5D, TILE);
4275
+ default :
4276
+ llvm_unreachable (" Invalid Dimension in tile mode for "
4277
+ " GetCpAsyncBulkTensorPrefetchOpcode." );
4278
+ }
4279
+ }
4280
+ }
4281
+
4282
+ static size_t GetDimsFromIntrinsic (unsigned IID) {
4283
+ switch (IID) {
4284
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
4285
+ case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
4286
+ return 3 ;
4287
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
4288
+ case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
4289
+ return 4 ;
4290
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
4291
+ case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
4292
+ return 5 ;
4293
+ default :
4294
+ llvm_unreachable (" Invalid im2col intrinsic in GetDimsFromIntrinsic." );
4295
+ }
4296
+ }
4297
+
4245
4298
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon (SDNode *N,
4246
4299
bool IsIm2Col) {
4247
4300
// We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
@@ -4250,21 +4303,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
4250
4303
// multicast_flag, cache_hint_flag}
4251
4304
// NumOperands = {Chain, IID} + {Actual intrinsic args}
4252
4305
// = {2} + {7 + dims + im2col_offsets}
4253
- auto getDimsFromIntrinsic = [](unsigned IID) {
4254
- switch (IID) {
4255
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
4256
- return 3 ;
4257
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
4258
- return 4 ;
4259
- case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
4260
- return 5 ;
4261
- default :
4262
- llvm_unreachable (
4263
- " Invalid im2col intrinsic in SelectCpAsyncBulkTensorG2SCommon." );
4264
- }
4265
- };
4266
4306
size_t NumOps = N->getNumOperands ();
4267
- size_t NumDims = IsIm2Col ? getDimsFromIntrinsic (N->getConstantOperandVal (1 ))
4307
+ size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic (N->getConstantOperandVal (1 ))
4268
4308
: (NumOps - 9 );
4269
4309
// Offsets is always 'NumDims - 2' and only for im2col mode
4270
4310
size_t NumOffsets = IsIm2Col ? (NumDims - 2 ) : 0 ;
@@ -4316,6 +4356,30 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
4316
4356
ReplaceNode (N, CurDAG->getMachineNode (Opcode, DL, N->getVTList (), Ops));
4317
4357
}
4318
4358
4359
+ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon (SDNode *N,
4360
+ bool IsIm2Col) {
4361
+ // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
4362
+ // {src, dims{d0...dN}, im2col_offsets{dims-2}
4363
+ // cache_hint, cache_hint_flag}
4364
+ // NumOperands = {Chain, IID} + {Actual intrinsic args}
4365
+ // = {2} + {3 + dims + im2col_offsets}
4366
+ size_t NumOps = N->getNumOperands ();
4367
+ size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic (N->getConstantOperandVal (1 ))
4368
+ : (NumOps - 5 );
4369
+ // Offsets is always 'NumDims - 2' and only for im2col mode
4370
+ size_t NumOffsets = IsIm2Col ? (NumDims - 2 ) : 0 ;
4371
+ bool IsCacheHint = N->getConstantOperandVal (NumOps - 1 ) == 1 ;
4372
+ size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1 );
4373
+
4374
+ SDLoc DL (N);
4375
+ SmallVector<SDValue, 12 > Ops (N->ops ().slice (2 , NumArgs));
4376
+ Ops.push_back (N->getOperand (0 )); // Chain operand
4377
+
4378
+ unsigned Opcode =
4379
+ GetCpAsyncBulkTensorPrefetchOpcode (NumDims, IsCacheHint, IsIm2Col);
4380
+ ReplaceNode (N, CurDAG->getMachineNode (Opcode, DL, N->getVTList (), Ops));
4381
+ }
4382
+
4319
4383
bool NVPTXDAGToDAGISel::tryIntrinsicVoid (SDNode *N) {
4320
4384
unsigned IID = N->getConstantOperandVal (1 );
4321
4385
switch (IID) {
@@ -4345,5 +4409,17 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
4345
4409
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
4346
4410
SelectCpAsyncBulkTensorG2SCommon (N, /* IsIm2Col=*/ true );
4347
4411
return true ;
4412
+ case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
4413
+ case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
4414
+ case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
4415
+ case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
4416
+ case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
4417
+ SelectCpAsyncBulkTensorPrefetchCommon (N);
4418
+ return true ;
4419
+ case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
4420
+ case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
4421
+ case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
4422
+ SelectCpAsyncBulkTensorPrefetchCommon (N, /* IsIm2Col=*/ true );
4423
+ return true ;
4348
4424
}
4349
4425
}
0 commit comments