Open
Description
The execution of CUDA kernel freezes on cooperative_groups::this_grid().sync();
if blockCount > 2 * smCount
, despite the maximum allowed blockCount
being reported as 16 * smCount
. The bug only exhibits itself if optimization is disabled. Any other optomization level, except -O0
works fine.
Tested with:
- clang version 17.0.6 (Fedora 17.0.6-2.fc39)
- CUDA versions 12.1 and 12.5, obtained from Nvidia website
Here is the code to reproduce the issue, along with the command, used to compile it.
clang++ --std=c++17 ./test.cu -o test -O0 --cuda-gpu-arch=sm_86 -L/usr/local/cuda-12.1/lib64 -lcudart_static -ldl -lrt -pthread
// System includes
#include <stdio.h>
// CUDA runtime
#include <cuda_runtime.h>
//Cooperative groups
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
__global__ void testKernel() {
auto grid = cooperative_groups::this_grid();
grid.sync();
}
int main(int argc, char **argv) {
int blockSize=1;
int devID = 0;
cudaDeviceProp props;
// Get GPU information
cudaGetDevice(&devID);
cudaGetDeviceProperties(&props, devID);
printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name,
props.major, props.minor);
int numBlocksPerSm = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocksPerSm, testKernel, blockSize,
0);
int smCount = props.multiProcessorCount;
void *kernelArgs[] = {};
printf("SmCount: %i, maximum blocks per SM: %i.\n\n", smCount, numBlocksPerSm);
{
printf("Launching with 2 * SmCount: %i blocks.\n", 2 * smCount);
dim3 dimGrid(2 * smCount);
dim3 dimBlock(blockSize);
cudaLaunchCooperativeKernel((void *)testKernel, dimGrid,
dimBlock, kernelArgs, 0, NULL);
cudaDeviceSynchronize();
printf("Test passed.\n\n");
}
{
printf("Launching with 2 * SmCount + 1: %i blocks. Clang with -O0 will hang up here.\n", 2 * smCount + 1);
dim3 dimGrid(2 * smCount + 1);
dim3 dimBlock(blockSize);
cudaLaunchCooperativeKernel((void *)testKernel, dimGrid,
dimBlock, kernelArgs, 0, NULL);
cudaDeviceSynchronize();
printf("Test passed.\n\n");
}
printf("All tests passed.\n");
return EXIT_SUCCESS;
}