module DynamicExpressionsCUDAExt
-using CUDA
+using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx
using DynamicExpressions: OperatorEnum, AbstractExpressionNode
using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna
using DynamicExpressions.AsArrayModule: as_array

import DynamicExpressions.EvaluateEquationModule: eval_tree_array

+# An array type used exclusively for testing purposes.
+struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N}
+    a::A
+end
+Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims...))
+Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...)
+Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...)
+Base.size(x::FakeCuArray) = size(x.a)
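+# The overloads above (`size`, `getindex`, `setindex!`, plus `similar`) give
+# FakeCuArray just enough of the AbstractArray interface to stand in for a
+# CuArray on the CPU in the methods below.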
+
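+# Union alias so the same methods accept either a real CuArray or the test stand-in: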
+const MaybeCuArray{T,N} = Union{CuArray{T,2},FakeCuArray{T,N}}
+
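+# Place `a` in the same array family as the reference array (GPU or fake-GPU):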
+to_device(a, ::CuArray) = CuArray(a)
+to_device(a, ::FakeCuArray) = FakeCuArray(a)
+
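+# A minimal sketch of exercising this extension without a GPU (assumes the
+# README-style DynamicExpressions API; the expression and data are illustrative):
+#
+#   using DynamicExpressions: OperatorEnum, Node, eval_tree_array
+#   operators = OperatorEnum(; binary_operators=[+, *], unary_operators=[cos])
+#   x1 = Node{Float64}(; feature=1)
+#   tree = x1 * cos(x1)
+#   X = FakeCuArray(randn(Float64, 2, 100))
+#   y, completed = eval_tree_array(tree, X, operators)
+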
function eval_tree_array (
-    tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; kws...
+    tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws...
) where {T<:Number}
    (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...)
    return (only(outs), only(is_good))
end

function eval_tree_array(
-    trees::NTuple{M,N},
-    gcX::CuArray{T,2},
+    trees::Tuple{N,Vararg{N,M}},  # at least one tree; M + 1 trees in total
+    gcX::MaybeCuArray{T,2},
    operators::OperatorEnum;
    buffer=nothing,
    gpu_workspace=nothing,
@@ -29,15 +43,19 @@ function eval_tree_array(

    ## Floating point arrays:
    gworkspace = if gpu_workspace === nothing
-        CuArray{T}(undef, num_elem, num_nodes + 1)
+        similar(gcX, num_elem, num_nodes + 1)  # allocate on the same device as gcX
    else
        gpu_workspace
    end
    gval = @view gworkspace[:, end]
    copyto!(gval, val)

    ## Index arrays (much faster to have `@view` here)
-    gbuffer = gpu_buffer === nothing ? CuArray(buffer) : copyto!(gpu_buffer, buffer)
+    gbuffer = if gpu_buffer === nothing
+        to_device(buffer, gcX)  # match the device of the input features
+    else
+        copyto!(gpu_buffer, buffer)
+    end
    gdegree = @view gbuffer[1, :]
    gfeature = @view gbuffer[2, :]
    gop = @view gbuffer[3, :]
@@ -61,10 +79,10 @@ function eval_tree_array(
    )
    #! format: on

-    out = ntuple(i -> @view(gworkspace[:, roots[i]]), Val(M))
+    out = ntuple(i -> @view(gworkspace[:, roots[i]]), Val(M + 1))  # M + 1 == length(trees)
    is_good = ntuple(
        i -> true, # Up to user to find NaNs
-        Val(M),
+        Val(M + 1),
    )

    return (out, is_good)
@@ -87,12 +105,24 @@ function _launch_gpu_kernel!(
    gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin))
    for launch in one(I):I(num_launches)
        #! format: off
-        @cuda threads=num_threads blocks=num_blocks gpu_kernel!(
-            buffer,
-            launch, num_elem, num_nodes, execution_order,
-            cX, idx_self, idx_l, idx_r,
-            degree, constant, val, feature, op
-        )
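+        # A real CuArray buffer gets an actual CUDA kernel launch; the FakeCuArray
+        # test path emulates the same launch with plain CPU threads.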
+        if buffer isa CuArray
+            @cuda threads=num_threads blocks=num_blocks gpu_kernel!(
+                buffer,
+                launch, num_elem, num_nodes, execution_order,
+                cX, idx_self, idx_l, idx_r,
+                degree, constant, val, feature, op
+            )
+        else
+            Threads.@threads for i in 1:(num_threads * num_blocks)
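+                # Each iteration emulates one GPU thread; the linear index `i` is
+                # passed explicitly rather than derived from threadIdx/blockIdx.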
+                gpu_kernel!(
+                    buffer,
+                    launch, num_elem, num_nodes, execution_order,
+                    cX, idx_self, idx_l, idx_r,
+                    degree, constant, val, feature, op,
+                    i
+                )
+            end
+        end
        #! format: on
    end
    return nothing