module DynamicExpressionsCUDAExt
-using CUDA
+using CUDA: @cuda, CuArray, blockDim, blockIdx, threadIdx
using DynamicExpressions: OperatorEnum, AbstractExpressionNode
using DynamicExpressions.EvaluateEquationModule: get_nbin, get_nuna
using DynamicExpressions.AsArrayModule: as_array

import DynamicExpressions.EvaluateEquationModule: eval_tree_array

+# An array type used exclusively for testing purposes.
+struct FakeCuArray{T,N,A<:AbstractArray{T,N}} <: AbstractArray{T,N}
+    a::A
+end
+Base.similar(x::FakeCuArray, dims::Integer...) = FakeCuArray(similar(x.a, dims...))
+Base.getindex(x::FakeCuArray, i::Int...) = getindex(x.a, i...)
+Base.setindex!(x::FakeCuArray, v, i::Int...) = setindex!(x.a, v, i...)
+Base.size(x::FakeCuArray) = size(x.a)
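+# The overloads above (`size`, `getindex`, `setindex!`, plus `similar`) give
+# FakeCuArray just enough of the AbstractArray interface to stand in for a
+# CuArray on the CPU in the methods below.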
+
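+# Union alias so the same methods accept either a real CuArray or the test stand-in: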
+const MaybeCuArray{T,N} = Union{CuArray{T,2},FakeCuArray{T,N}}
+
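+# Place `a` in the same array family as the reference array (GPU or fake-GPU):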
+to_device(a, ::CuArray) = CuArray(a)
+to_device(a, ::FakeCuArray) = FakeCuArray(a)
+
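+# A minimal sketch of exercising this extension without a GPU (assumes the
+# README-style DynamicExpressions API; the expression and data are illustrative):
+#
+#   using DynamicExpressions: OperatorEnum, Node, eval_tree_array
+#   operators = OperatorEnum(; binary_operators=[+, *], unary_operators=[cos])
+#   x1 = Node{Float64}(; feature=1)
+#   tree = x1 * cos(x1)
+#   X = FakeCuArray(randn(Float64, 2, 100))
+#   y, completed = eval_tree_array(tree, X, operators)
+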
function eval_tree_array (
-    tree::AbstractExpressionNode{T}, gcX::CuArray{T,2}, operators::OperatorEnum; kws...
+    tree::AbstractExpressionNode{T}, gcX::MaybeCuArray{T,2}, operators::OperatorEnum; kws...
) where {T<:Number}
    (outs, is_good) = eval_tree_array((tree,), gcX, operators; kws...)
    return (only(outs), only(is_good))
end

function eval_tree_array(
-    trees::NTuple{M,N},
-    gcX::CuArray{T,2},
+    trees::Tuple{N,Vararg{N,M}},  # at least one tree; M + 1 trees in total
+    gcX::MaybeCuArray{T,2},
    operators::OperatorEnum;
    buffer=nothing,
    gpu_workspace=nothing,
@@ -29,15 +43,19 @@ function eval_tree_array(

    ## Floating point arrays:
    gworkspace = if gpu_workspace === nothing
-        CuArray{T}(undef, num_elem, num_nodes + 1)
+        similar(gcX, num_elem, num_nodes + 1)  # allocate on the same device as gcX
    else
        gpu_workspace
    end
    gval = @view gworkspace[:, end]
    copyto!(gval, val)

    ## Index arrays (much faster to have `@view` here)
-    gbuffer = gpu_buffer === nothing ? CuArray(buffer) : copyto!(gpu_buffer, buffer)
+    gbuffer = if gpu_buffer === nothing
+        to_device(buffer, gcX)  # match the device of the input features
+    else
+        copyto!(gpu_buffer, buffer)
+    end
    gdegree = @view gbuffer[1, :]
    gfeature = @view gbuffer[2, :]
    gop = @view gbuffer[3, :]
@@ -61,10 +79,10 @@ function eval_tree_array(
    )
    #! format: on

-    out = ntuple(i -> @view(gworkspace[:, roots[i]]), Val(M))
+    out = ntuple(i -> @view(gworkspace[:, roots[i]]), Val(M + 1))  # M + 1 == length(trees)
    is_good = ntuple(
        i -> true, # Up to user to find NaNs
-        Val(M),
+        Val(M + 1),
    )

    return (out, is_good)
@@ -87,12 +105,24 @@ function _launch_gpu_kernel!(
    gpu_kernel! = create_gpu_kernel(operators, Val(nuna), Val(nbin))
    for launch in one(I):I(num_launches)
        #! format: off
-        @cuda threads=num_threads blocks=num_blocks gpu_kernel!(
-            buffer,
-            launch, num_elem, num_nodes, execution_order,
-            cX, idx_self, idx_l, idx_r,
-            degree, constant, val, feature, op
-        )
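+        # A real CuArray buffer gets an actual CUDA kernel launch; the FakeCuArray
+        # test path emulates the same launch with plain CPU threads.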
+        if buffer isa CuArray
+            @cuda threads=num_threads blocks=num_blocks gpu_kernel!(
+                buffer,
+                launch, num_elem, num_nodes, execution_order,
+                cX, idx_self, idx_l, idx_r,
+                degree, constant, val, feature, op
+            )
+        else
+            Threads.@threads for i in 1:(num_threads * num_blocks)
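+                # Each iteration emulates one GPU thread; the linear index `i` is
+                # passed explicitly rather than derived from threadIdx/blockIdx.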
+                gpu_kernel!(
+                    buffer,
+                    launch, num_elem, num_nodes, execution_order,
+                    cX, idx_self, idx_l, idx_r,
+                    degree, constant, val, feature, op,
+                    i
+                )
+            end
+        end
        #! format: on
    end
    return nothing