Skip to content

Commit 6b21cf8

Browse files
authored
[flang][cuda] Compute grid x when calling a kernel with <<<*, block>>> (#115538)
`-1, 1, 1` is passed when calling a kernel with the `<<<*, block>>>` syntax. Query the device to compute the grid.x value.
1 parent 023483f commit 6b21cf8

File tree

1 file changed

+98
-0
lines changed

1 file changed

+98
-0
lines changed

flang/runtime/CUDA/kernel.cpp

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,55 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
2525
blockDim.x = blockX;
2626
blockDim.y = blockY;
2727
blockDim.z = blockZ;
28+
unsigned nbNegGridDim{0};
29+
if (gridX < 0) {
30+
++nbNegGridDim;
31+
}
32+
if (gridY < 0) {
33+
++nbNegGridDim;
34+
}
35+
if (gridZ < 0) {
36+
++nbNegGridDim;
37+
}
38+
if (nbNegGridDim == 1) {
39+
int maxBlocks, nbBlocks, dev, multiProcCount;
40+
cudaError_t err1, err2;
41+
nbBlocks = blockDim.x * blockDim.y * blockDim.z;
42+
cudaGetDevice(&dev);
43+
err1 = cudaDeviceGetAttribute(
44+
&multiProcCount, cudaDevAttrMultiProcessorCount, dev);
45+
err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
46+
&maxBlocks, kernel, nbBlocks, smem);
47+
if (err1 == cudaSuccess && err2 == cudaSuccess) {
48+
maxBlocks = multiProcCount * maxBlocks;
49+
}
50+
if (maxBlocks > 0) {
51+
if (gridDim.x > 0) {
52+
maxBlocks = maxBlocks / gridDim.x;
53+
}
54+
if (gridDim.y > 0) {
55+
maxBlocks = maxBlocks / gridDim.y;
56+
}
57+
if (gridDim.z > 0) {
58+
maxBlocks = maxBlocks / gridDim.z;
59+
}
60+
if (maxBlocks < 1) {
61+
maxBlocks = 1;
62+
}
63+
if (gridX < 0) {
64+
gridDim.x = maxBlocks;
65+
}
66+
if (gridY < 0) {
67+
gridDim.y = maxBlocks;
68+
}
69+
if (gridZ < 0) {
70+
gridDim.z = maxBlocks;
71+
}
72+
}
73+
} else if (nbNegGridDim > 1) {
74+
Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
75+
terminator.Crash("Too many invalid grid dimensions");
76+
}
2877
cudaStream_t stream = 0; // TODO stream managment
2978
CUDA_REPORT_IF_ERROR(
3079
cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, stream));
@@ -41,6 +90,55 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
4190
config.blockDim.x = blockX;
4291
config.blockDim.y = blockY;
4392
config.blockDim.z = blockZ;
93+
unsigned nbNegGridDim{0};
94+
if (gridX < 0) {
95+
++nbNegGridDim;
96+
}
97+
if (gridY < 0) {
98+
++nbNegGridDim;
99+
}
100+
if (gridZ < 0) {
101+
++nbNegGridDim;
102+
}
103+
if (nbNegGridDim == 1) {
104+
int maxBlocks, nbBlocks, dev, multiProcCount;
105+
cudaError_t err1, err2;
106+
nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z;
107+
cudaGetDevice(&dev);
108+
err1 = cudaDeviceGetAttribute(
109+
&multiProcCount, cudaDevAttrMultiProcessorCount, dev);
110+
err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
111+
&maxBlocks, kernel, nbBlocks, smem);
112+
if (err1 == cudaSuccess && err2 == cudaSuccess) {
113+
maxBlocks = multiProcCount * maxBlocks;
114+
}
115+
if (maxBlocks > 0) {
116+
if (config.gridDim.x > 0) {
117+
maxBlocks = maxBlocks / config.gridDim.x;
118+
}
119+
if (config.gridDim.y > 0) {
120+
maxBlocks = maxBlocks / config.gridDim.y;
121+
}
122+
if (config.gridDim.z > 0) {
123+
maxBlocks = maxBlocks / config.gridDim.z;
124+
}
125+
if (maxBlocks < 1) {
126+
maxBlocks = 1;
127+
}
128+
if (gridX < 0) {
129+
config.gridDim.x = maxBlocks;
130+
}
131+
if (gridY < 0) {
132+
config.gridDim.y = maxBlocks;
133+
}
134+
if (gridZ < 0) {
135+
config.gridDim.z = maxBlocks;
136+
}
137+
}
138+
} else if (nbNegGridDim > 1) {
139+
Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
140+
terminator.Crash("Too many invalid grid dimensions");
141+
}
44142
config.dynamicSmemBytes = smem;
45143
config.stream = 0; // TODO stream managment
46144
cudaLaunchAttribute launchAttr[1];

0 commit comments

Comments
 (0)