@@ -25,6 +25,55 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
25
25
blockDim.x = blockX;
26
26
blockDim.y = blockY;
27
27
blockDim.z = blockZ;
28
+ unsigned nbNegGridDim{0 };
29
+ if (gridX < 0 ) {
30
+ ++nbNegGridDim;
31
+ }
32
+ if (gridY < 0 ) {
33
+ ++nbNegGridDim;
34
+ }
35
+ if (gridZ < 0 ) {
36
+ ++nbNegGridDim;
37
+ }
38
+ if (nbNegGridDim == 1 ) {
39
+ int maxBlocks, nbBlocks, dev, multiProcCount;
40
+ cudaError_t err1, err2;
41
+ nbBlocks = blockDim.x * blockDim.y * blockDim.z ;
42
+ cudaGetDevice (&dev);
43
+ err1 = cudaDeviceGetAttribute (
44
+ &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
45
+ err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor (
46
+ &maxBlocks, kernel, nbBlocks, smem);
47
+ if (err1 == cudaSuccess && err2 == cudaSuccess) {
48
+ maxBlocks = multiProcCount * maxBlocks;
49
+ }
50
+ if (maxBlocks > 0 ) {
51
+ if (gridDim.x > 0 ) {
52
+ maxBlocks = maxBlocks / gridDim.x ;
53
+ }
54
+ if (gridDim.y > 0 ) {
55
+ maxBlocks = maxBlocks / gridDim.y ;
56
+ }
57
+ if (gridDim.z > 0 ) {
58
+ maxBlocks = maxBlocks / gridDim.z ;
59
+ }
60
+ if (maxBlocks < 1 ) {
61
+ maxBlocks = 1 ;
62
+ }
63
+ if (gridX < 0 ) {
64
+ gridDim.x = maxBlocks;
65
+ }
66
+ if (gridY < 0 ) {
67
+ gridDim.y = maxBlocks;
68
+ }
69
+ if (gridZ < 0 ) {
70
+ gridDim.z = maxBlocks;
71
+ }
72
+ }
73
+ } else if (nbNegGridDim > 1 ) {
74
+ Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
75
+ terminator.Crash (" Too many invalid grid dimensions" );
76
+ }
28
77
cudaStream_t stream = 0 ; // TODO stream managment
29
78
CUDA_REPORT_IF_ERROR (
30
79
cudaLaunchKernel (kernel, gridDim, blockDim, params, smem, stream));
@@ -41,6 +90,55 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
41
90
config.blockDim .x = blockX;
42
91
config.blockDim .y = blockY;
43
92
config.blockDim .z = blockZ;
93
+ unsigned nbNegGridDim{0 };
94
+ if (gridX < 0 ) {
95
+ ++nbNegGridDim;
96
+ }
97
+ if (gridY < 0 ) {
98
+ ++nbNegGridDim;
99
+ }
100
+ if (gridZ < 0 ) {
101
+ ++nbNegGridDim;
102
+ }
103
+ if (nbNegGridDim == 1 ) {
104
+ int maxBlocks, nbBlocks, dev, multiProcCount;
105
+ cudaError_t err1, err2;
106
+ nbBlocks = config.blockDim .x * config.blockDim .y * config.blockDim .z ;
107
+ cudaGetDevice (&dev);
108
+ err1 = cudaDeviceGetAttribute (
109
+ &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
110
+ err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor (
111
+ &maxBlocks, kernel, nbBlocks, smem);
112
+ if (err1 == cudaSuccess && err2 == cudaSuccess) {
113
+ maxBlocks = multiProcCount * maxBlocks;
114
+ }
115
+ if (maxBlocks > 0 ) {
116
+ if (config.gridDim .x > 0 ) {
117
+ maxBlocks = maxBlocks / config.gridDim .x ;
118
+ }
119
+ if (config.gridDim .y > 0 ) {
120
+ maxBlocks = maxBlocks / config.gridDim .y ;
121
+ }
122
+ if (config.gridDim .z > 0 ) {
123
+ maxBlocks = maxBlocks / config.gridDim .z ;
124
+ }
125
+ if (maxBlocks < 1 ) {
126
+ maxBlocks = 1 ;
127
+ }
128
+ if (gridX < 0 ) {
129
+ config.gridDim .x = maxBlocks;
130
+ }
131
+ if (gridY < 0 ) {
132
+ config.gridDim .y = maxBlocks;
133
+ }
134
+ if (gridZ < 0 ) {
135
+ config.gridDim .z = maxBlocks;
136
+ }
137
+ }
138
+ } else if (nbNegGridDim > 1 ) {
139
+ Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
140
+ terminator.Crash (" Too many invalid grid dimensions" );
141
+ }
44
142
config.dynamicSmemBytes = smem;
45
143
config.stream = 0 ; // TODO stream managment
46
144
cudaLaunchAttribute launchAttr[1 ];
0 commit comments