@@ -1,45 +1,31 @@
-module DynamicExpressionsCUDAExt
+module DynamicExpressionsKernelAbstractionsExt
 
-using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx
+using KernelAbstractions: @index, @kernel, @Const, get_backend
 using DynamicExpressions: OperatorEnum, AbstractExpressionNode
 using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna
 using DynamicExpressions.AsArrayModule: as_array
 
-import DynamicExpressions.EvaluateEquationModule: eval_tree_array
+import DynamicExpressions.ExtensionInterfaceModule: gpu_eval_tree_array
 
-# array type for exclusively testing purposes
-struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N}
-    a::A
-end
-Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims...))
-Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...)
-Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...)
-Base.size(x::FakeCuArray) = size(x.a)
-
-const MaybeCuArray{T,N} = Union{CuArray{T,N},FakeCuArray{T,N}}
-
-to_device(a, ::CuArray) = CuArray(a)
-to_device(a, ::FakeCuArray) = FakeCuArray(a)
-
-function eval_tree_array(
-    tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws...
+function gpu_eval_tree_array(
+    tree::AbstractExpressionNode{T}, gcX, operators::OperatorEnum; kws...
 ) where {T<:Number}
-    (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...)
+    (outs, is_good) = gpu_eval_tree_array((tree,), gcX, operators; kws...)
     return (only(outs), only(is_good))
 end
 
-function eval_tree_array(
+function gpu_eval_tree_array(
     trees::Union{Tuple{N,Vararg{N}},AbstractVector{N}},
-    gcX::MaybeCuArray{T,2},
+    gcX,
     operators::OperatorEnum;
+    backend=get_backend(gcX),
     buffer=nothing,
    gpu_workspace=nothing,
     gpu_buffer=nothing,
     roots=nothing,
     num_nodes=nothing,
     num_launches=nothing,
     update_buffers::Val{_update_buffers}=Val(true),
-    kws...,
 ) where {T<:Number,N<:AbstractExpressionNode{T},_update_buffers}
     if _update_buffers
         (; val, roots, buffer, num_nodes, num_launches) = as_array(Int32, trees; buffer)
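
Note: the deleted FakeCuArray shim existed only so the CUDA kernels could be exercised on the CPU in tests. With KernelAbstractions, the same @kernel definition compiles for every backend, so plain Arrays on the CPU() backend cover that use case. A minimal sketch of the pattern (the square! kernel is hypothetical, not part of this diff):

    using KernelAbstractions: @kernel, @index, get_backend, synchronize

    @kernel function square!(x)
        i = @index(Global, Linear)   # flat, 1-based work-item id
        x[i] = x[i]^2
    end

    x = rand(Float32, 16)             # a CuArray works identically
    backend = get_backend(x)          # CPU() here; CUDABackend() for device arrays
    square!(backend, 4)(x; ndrange=length(x))
    synchronize(backend)              # kernel launches are asynchronous
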
@@ -82,6 +68,7 @@ function eval_tree_array(
 
     #! format: off
     _launch_gpu_kernel!(
+        backend,
         num_threads, num_blocks, num_launches, gworkspace,
         # Thread info:
         num_elem, num_nodes, gexecution_order,
@@ -99,6 +86,7 @@
 
 #! format: off
 function _launch_gpu_kernel!(
+    backend,
     num_threads, num_blocks, num_launches::Integer, buffer::AbstractArray{T,2},
     # Thread info:
     num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray{I},
@@ -114,24 +102,12 @@ function _launch_gpu_kernel!(
     gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin))
     for launch in one(I):I(num_launches)
         #! format: off
-        if buffer isa CuArray
-            @cuda threads=num_threads blocks=num_blocks gpu_kernel!(
-                buffer,
-                launch, num_elem, num_nodes, execution_order,
-                cX, idx_self, idx_l, idx_r,
-                degree, constant, val, feature, op
-            )
-        else
-            Threads.@threads for i in 1:(num_threads * num_blocks)
-                gpu_kernel!(
-                    buffer,
-                    launch, num_elem, num_nodes, execution_order,
-                    cX, idx_self, idx_l, idx_r,
-                    degree, constant, val, feature, op,
-                    i
-                )
-            end
-        end
+        gpu_kernel!(backend, num_threads)(
+            buffer,
+            launch, num_elem, num_nodes, execution_order,
+            cX, idx_self, idx_l, idx_r,
+            degree, constant, val, feature, op;
+            ndrange=num_threads * num_blocks)
         #! format: on
     end
     return nothing
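
The single KernelAbstractions launch above replaces both the @cuda path and the hand-rolled Threads.@threads fallback. The launch parameters map over as follows (a sketch; T and B stand for the old threads and blocks values, args... for the kernel arguments):

    kernel = gpu_kernel!(backend, T)   # workgroup size, the analogue of CUDA `threads`
    kernel(args...; ndrange=T * B)     # total work-items, the analogue of `threads * blocks`

KernelAbstractions requires an ndrange at call time (or a static one at kernel construction); the workgroup size alone does not determine how many work-items run.
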
@@ -146,19 +122,17 @@ end
 for nuna in 0:10, nbin in 0:10
     @eval function create_gpu_kernel(operators::OperatorEnum, ::Val{$nuna}, ::Val{$nbin})
         #! format: off
-        function (
+        @kernel function k(
             # Storage:
             buffer,
             # Thread info:
-            launch::Integer, num_elem::Integer, num_nodes::Integer, execution_order::AbstractArray,
+            @Const(launch)::Integer, @Const(num_elem)::Integer, @Const(num_nodes)::Integer, @Const(execution_order)::AbstractArray,
             # Input data and tree
-            cX::AbstractArray, idx_self::AbstractArray, idx_l::AbstractArray, idx_r::AbstractArray,
-            degree::AbstractArray, constant::AbstractArray, val::AbstractArray, feature::AbstractArray, op::AbstractArray,
-            # Override for unittesting:
-            i=nothing,
+            @Const(cX)::AbstractArray, @Const(idx_self)::AbstractArray, @Const(idx_l)::AbstractArray, @Const(idx_r)::AbstractArray,
+            @Const(degree)::AbstractArray, @Const(constant)::AbstractArray, @Const(val)::AbstractArray, @Const(feature)::AbstractArray, @Const(op)::AbstractArray,
         )
         #! format: on
-        i = i === nothing ? (blockIdx().x - 1) * blockDim().x + threadIdx().x : i
+        i = @index(Global, Linear)
         if i > num_elem * num_nodes
             return nothing
         end
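
@index(Global, Linear) returns the flat global work-item id that the deleted expression (blockIdx().x - 1) * blockDim().x + threadIdx().x computed by hand, and the i=nothing testing override becomes unnecessary since the identical kernel runs on the CPU backend. For reference, the related index queries (illustrative, not part of this diff):

    # inside a @kernel body:
    i = @index(Global, Linear)   # flat id across the whole ndrange
    g = @index(Group, Linear)    # workgroup id, roughly blockIdx().x
    l = @index(Local, Linear)    # id within the workgroup, roughly threadIdx().x
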
@@ -186,8 +160,8 @@ for nuna in 0:10, nbin in 0:10
                 l_idx = idx_l[node]
                 Base.Cartesian.@nif(
                     $nuna,
-                    i -> i == cur_op,
-                    i -> let op = operators.unaops[i]
+                    j -> j == cur_op,
+                    j -> let op = operators.unaops[j]
                         buffer[elem, cur_idx] = op(buffer[elem, l_idx])
                     end
                 )
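
The @nif binding is renamed from i to j because Base.Cartesian.@nif substitutes its variable textually into the generated branches, and i is now the work-item index bound by @index above. As a sketch of the unrolling, Base.Cartesian.@nif(2, j -> j == cur_op, j -> f(j)) expands to roughly:

    if 1 == cur_op
        f(1)
    else        # the last case is the fallback; its condition is not tested
        f(2)
    end
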
@@ -197,8 +171,8 @@ for nuna in 0:10, nbin in 0:10
                 r_idx = idx_r[node]
                 Base.Cartesian.@nif(
                     $nbin,
-                    i -> i == cur_op,
-                    i -> let op = operators.binops[i]
+                    j -> j == cur_op,
+                    j -> let op = operators.binops[j]
                         buffer[elem, cur_idx] = op(buffer[elem, l_idx], buffer[elem, r_idx])
                     end
                 )
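
With the extension generalized this way, the entry point can be driven from any backend array. A usage sketch (an assumption-laden illustration: it presumes a working CUDA device and that gpu_eval_tree_array stays in DynamicExpressions.ExtensionInterfaceModule as imported above):

    using CUDA, DynamicExpressions
    using DynamicExpressions.ExtensionInterfaceModule: gpu_eval_tree_array

    operators = OperatorEnum(; binary_operators=(+, -, *), unary_operators=(cos, sin))
    x1 = Node{Float32}(; feature=1)
    tree = cos(x1 * 3.1f0) - x1

    X = CUDA.rand(Float32, 2, 1000)   # features × samples, already on device
    y, completed = gpu_eval_tree_array(tree, X, operators)
    # y is the on-device output; `completed` flags whether evaluation stayed finite
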