Skip to content

Commit baf2786

Browse files
authored
[MLIR][NVGPU] Move max threads/blocks size to dialect (NFC) (llvm#124454)
This PR moves maximum number of threads in a block and block in a grid to nvgpu dialect to avoid replicated code. The limits are defined here: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
1 parent f0b8ff1 commit baf2786

File tree

2 files changed

+21
-16
lines changed

2 files changed

+21
-16
lines changed

mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,20 @@
2222

2323
#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc"
2424

25+
// Maximum warp size
2526
constexpr int kWarpSize = 32;
2627

28+
// Maximum number of threads in a block and block in a grid
29+
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability
30+
constexpr int kMaxTotalBlockdim = 1024;
31+
constexpr int kMaxBlockdimx = 1024;
32+
constexpr int kMaxBlockdimy = 1024;
33+
constexpr int kMaxBlockdimz = 64;
34+
constexpr int kMaxTotalGriddim = 2147483647;
35+
constexpr int kMaxGriddimx = 2147483647;
36+
constexpr int kMaxGriddimy = 65535;
37+
constexpr int kMaxGriddimz = 65535;
38+
2739
/// M size of wgmma.mma_async instruction
2840
constexpr int kWgmmaSizeM = 64;
2941

mlir/lib/Dialect/GPU/TransformOps/Utils.cpp

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
1515
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
1616
#include "mlir/Dialect/MemRef/IR/MemRef.h"
17+
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
1718
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
1819
#include "mlir/Dialect/SCF/IR/SCF.h"
1920
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
@@ -237,25 +238,17 @@ DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
237238
std::optional<int64_t> blockDimZ) {
238239

239240
// TODO: pass a configuration object to set the limits properly.
240-
static constexpr int maxTotalBlockdim = 1024;
241-
static constexpr int maxBlockdimx = 1024;
242-
static constexpr int maxBlockdimy = 1024;
243-
static constexpr int maxBlockdimz = 64;
244-
static constexpr int maxTotalGriddim = 2147483647;
245-
static constexpr int maxGriddimx = 2147483647;
246-
static constexpr int maxGriddimy = 65535;
247-
static constexpr int maxGriddimz = 65535;
248241

249242
if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
250-
maxTotalBlockdim ||
243+
kMaxTotalBlockdim ||
251244
(gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
252-
maxTotalGriddim ||
253-
blockDimX.value_or(1) > maxBlockdimx ||
254-
blockDimY.value_or(1) > maxBlockdimy ||
255-
blockDimZ.value_or(1) > maxBlockdimz ||
256-
gridDimY.value_or(1) > maxGriddimy ||
257-
gridDimZ.value_or(1) > maxGriddimz ||
258-
gridDimX.value_or(1) > maxGriddimx) {
245+
kMaxTotalGriddim ||
246+
blockDimX.value_or(1) > kMaxBlockdimx ||
247+
blockDimY.value_or(1) > kMaxBlockdimy ||
248+
blockDimZ.value_or(1) > kMaxBlockdimz ||
249+
gridDimY.value_or(1) > kMaxGriddimy ||
250+
gridDimZ.value_or(1) > kMaxGriddimz ||
251+
gridDimX.value_or(1) > kMaxGriddimx) {
259252
return transformOp.emitSilenceableError()
260253
<< "Trying to launch a GPU kernel with grid_dims = ("
261254
<< gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "

0 commit comments

Comments
 (0)