@@ -1,45 +1,31 @@
-module DynamicExpressionsCUDAExt
+module DynamicExpressionsKernelAbstractionsExt
 
-using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx
+using KernelAbstractions: @index, @kernel, @Const, get_backend
 using DynamicExpressions: OperatorEnum, AbstractExpressionNode
 using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna
 using DynamicExpressions.AsArrayModule: as_array
 
-import DynamicExpressions.EvaluateEquationModule: eval_tree_array
+import DynamicExpressions.ExtensionInterfaceModule: gpu_eval_tree_array
 
-# array type for exclusively testing purposes
-struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N}
-    a::A
-end
-Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims...))
-Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...)
-Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...)
-Base.size(x::FakeCuArray) = size(x.a)
-
-const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}}
-
-to_device(a, ::CuArray) = CuArray(a)
-to_device(a, ::FakeCuArray) = FakeCuArray(a)
-
-function eval_tree_array(
-    tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws...
+function gpu_eval_tree_array(
+    tree::AbstractExpressionNode{T}, gcX, operators::OperatorEnum; kws...
 ) where {T<:Number}
-    (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...)
+    (outs, is_good) = gpu_eval_tree_array((tree,), gcX, operators; kws...)
     return (only(outs), only(is_good))
 end
 
-function eval_tree_array(
+function gpu_eval_tree_array(
     trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}},
-    gcX::MaybeCuArray{T,2},
+    gcX,
     operators::OperatorEnum;
+    backend=get_backend(gcX),
     buffer=nothing,
    gpu_workspace=nothing,
     gpu_buffer=nothing,
     roots=nothing,
     num_nodes=nothing,
     num_launches=nothing,
     update_buffers::Val{_update_buffers}=Val(true),
-    kws...,
 ) where {T<:Number,N<:AbstractExpressionNode{T},_update_buffers}
     if _update_buffers
         (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, trees; buffer)
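
Note: the deleted FakeCuArray shim existed only so the CUDA kernels could be exercised on the CPU in tests. With KernelAbstractions, the same @kernel definition compiles for every backend, so plain Arrays on the CPU() backend cover that use case. A minimal sketch of the pattern (the square! kernel is hypothetical, not part of this diff):

    using KernelAbstractions: @kernel, @index, get_backend, synchronize

    @kernel function square!(x)
        i = @index(Global, Linear)   # flat, 1-based work-item id
        x[i] = x[i]^2
    end

    x = rand(Float32, 16)             # a CuArray works identically
    backend = get_backend(x)          # CPU() here; CUDABackend() for device arrays
    square!(backend, 4)(x; ndrange=length(x))
    synchronize(backend)              # kernel launches are asynchronous
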
@@ -82,6 +68,7 @@ function eval_tree_array(
 
     #! format: off
     _launch_gpu_kernel!(
+        backend,
         num_threads, num_blocks, num_launches, gworkspace,
         # Thread info:
         num_elem, num_nodes, gexecution_order,
@@ -99,6 +86,7 @@
 
 #! format: off
 function _launch_gpu_kernel!(
+    backend,
     num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2},
     # Thread info:
     num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I},
@@ -114,24 +102,12 @@ function _launch_gpu_kernel!(
     gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin))
     for launch in one(I):I(num_launches)
         #! format: off
-        if buffer isa CuArray
-            @cuda threads=num_threads blocks=num_blocks gpu_kernel!(
-                buffer,
-                launch, num_elem, num_nodes, execution_order,
-                cX, idx_self, idx_l, idx_r,
-                degree, constant, val, feature, op
-            )
-        else
-            Threads.@threads for i in 1:(num_threads * num_blocks)
-                gpu_kernel!(
-                    buffer,
-                    launch, num_elem, num_nodes, execution_order,
-                    cX, idx_self, idx_l, idx_r,
-                    degree, constant, val, feature, op,
-                    i
-                )
-            end
-        end
+        gpu_kernel!(backend, num_threads)(
+            buffer,
+            launch, num_elem, num_nodes, execution_order,
+            cX, idx_self, idx_l, idx_r,
+            degree, constant, val, feature, op;
+            ndrange=num_threads * num_blocks)
         #! format: on
     end
     return nothing
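
The single KernelAbstractions launch above replaces both the @cuda path and the hand-rolled Threads.@threads fallback. The launch parameters map over as follows (a sketch; T and B stand for the old threads and blocks values, args... for the kernel arguments):

    kernel = gpu_kernel!(backend, T)   # workgroup size, the analogue of CUDA `threads`
    kernel(args...; ndrange=T * B)     # total work-items, the analogue of `threads * blocks`

KernelAbstractions requires an ndrange at call time (or a static one at kernel construction); the workgroup size alone does not determine how many work-items run.
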
@@ -146,19 +122,17 @@ end
 for nuna in 0:10, nbin in 0:10
     @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin})
         #! format: off
-        function (
+        @kernel function k(
             # Storage:
             buffer,
             # Thread info:
-            launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray,
+            @Const(launch)::Integer, @Const(num_elem)::Integer, @Const(num_nodes)::Integer, @Const(execution_order)::AbstractArray,
             # Input data and tree
-            cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray,
-            degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray,
-            # Override for unittesting:
-            i=nothing,
+            @Const(cX)::AbstractArray, @Const(idx_self)::AbstractArray, @Const(idx_l)::AbstractArray, @Const(idx_r)::AbstractArray,
+            @Const(degree)::AbstractArray, @Const(constant)::AbstractArray, @Const(val)::AbstractArray, @Const(feature)::AbstractArray, @Const(op)::AbstractArray,
         )
         #! format: on
-        i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i
+        i = @index(Global, Linear)
         if i > num_elem * num_nodes
             return nothing
         end
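
@index(Global, Linear) returns the flat global work-item id that the deleted expression (blockIdx().x - 1) * blockDim().x + threadIdx().x computed by hand, and the i=nothing testing override becomes unnecessary since the identical kernel runs on the CPU backend. For reference, the related index queries (illustrative, not part of this diff):

    # inside a @kernel body:
    i = @index(Global, Linear)   # flat id across the whole ndrange
    g = @index(Group, Linear)    # workgroup id, roughly blockIdx().x
    l = @index(Local, Linear)    # id within the workgroup, roughly threadIdx().x
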
@@ -186,8 +160,8 @@ for nuna in 0:10, nbin in 0:10
                 l_idx = idx_l[node]
                 Base.Cartesian.@nif(
                     $nuna,
-                    i -> i == cur_op,
-                    i -> let op = operators.unaops[i]
+                    j -> j == cur_op,
+                    j -> let op = operators.unaops[j]
                         buffer[elem, cur_idx] = op(buffer[elem, l_idx])
                     end
                 )
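
The @nif binding is renamed from i to j because Base.Cartesian.@nif substitutes its variable textually into the generated branches, and i is now the work-item index bound by @index above. As a sketch of the unrolling, Base.Cartesian.@nif(2, j -> j == cur_op, j -> f(j)) expands to roughly:

    if 1 == cur_op
        f(1)
    else        # the last case is the fallback; its condition is not tested
        f(2)
    end
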
@@ -197,8 +171,8 @@ for nuna in 0:10, nbin in 0:10
                 r_idx = idx_r[node]
                 Base.Cartesian.@nif(
                     $nbin,
-                    i -> i == cur_op,
-                    i -> let op = operators.binops[i]
+                    j -> j == cur_op,
+                    j -> let op = operators.binops[j]
                         buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx])
                     end
                 )
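
With the extension generalized this way, the entry point can be driven from any backend array. A usage sketch (an assumption-laden illustration: it presumes a working CUDA device and that gpu_eval_tree_array stays in DynamicExpressions.ExtensionInterfaceModule as imported above):

    using CUDA, DynamicExpressions
    using DynamicExpressions.ExtensionInterfaceModule: gpu_eval_tree_array

    operators = OperatorEnum(; binary_operators=(+, -, *), unary_operators=(cos, sin))
    x1 = Node{Float32}(; feature=1)
    tree = cos(x1 * 3.1f0) - x1

    X = CUDA.rand(Float32, 2, 1000)   # features × samples, already on device
    y, completed = gpu_eval_tree_array(tree, X, operators)
    # y is the on-device output; `completed` flags whether evaluation stayed finite
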