Handle vector parameters with scalar output

ricardoV94 · aseyboldt · ricardoV94 · commit 08db524fb480 · 2024-04-23T04:51:34.000+02:00
Co-authored-by: Adrian Seyboldt <aseyboldt@users.noreply.github.com>
diff --git a/pytensor/link/numba/dispatch/random.py b/pytensor/link/numba/dispatch/random.py
@@ -198,11 +198,11 @@ def numba_funcify_RandomVariable(op: RandomVariable, node, **kwargs):
     # scalar_op_fn = locals()['scalar_op_fn']
 
     # @numba_basic.numba_njit
-    # def scalar_op_fn(rng, mu, scale):
+    # def core_op_fn(rng, mu, scale):
     #     return rng.normal(mu, scale)
 
     @numba_basic.numba_njit
-    def scalar_op_fn(rng, p):
+    def core_op_fn(rng, p):
         unif_sample = rng.uniform(0, 1)
         return np.searchsorted(np.cumsum(p), unif_sample)
 
@@ -227,7 +227,7 @@ def random_wrapper(rng, size, dtype, *inputs):
             rng = copy(rng)
 
         draws = _vectorized(
-            scalar_op_fn,
+            core_op_fn,
             input_bc_patterns_enc,
             output_bc_patterns_enc,
             output_dtypes_enc,
diff --git a/pytensor/link/numba/dispatch/vectorize_codegen.py b/pytensor/link/numba/dispatch/vectorize_codegen.py
@@ -12,7 +12,6 @@
 from numba.core.base import BaseContext
 from numba.core.types.misc import NoneType
 from numba.np import arrayobj
-from numba.np.ufunc.wrappers import _ArrayArgLoader
 
 
 def compute_itershape(
@@ -158,7 +157,7 @@ def make_loop_call(
     input_types: tuple[Any, ...],
     output_types: tuple[Any, ...],
 ):
-    # safe = (False, False)
+    safe = (False, False)
 
     n_outputs = len(outputs)
 
@@ -183,14 +182,6 @@ def extract_array(aryty, obj):
     # input_scope_set = mod.add_metadata([input_scope, output_scope])
     # output_scope_set = mod.add_metadata([input_scope, output_scope])
 
-    typ = input_types[0]
-    inp = inputs[0]
-    shape = cgutils.unpack_tuple(builder, inp.shape)
-    strides = cgutils.unpack_tuple(builder, inp.strides)
-    loader = _ArrayArgLoader(typ.dtype, typ.ndim, shape[-1], False, shape, strides)
-
-    inputs = tuple(extract_array(aryty, ary) for aryty, ary in zip(input_types, inputs))
-
     outputs = tuple(
         extract_array(aryty, ary) for aryty, ary in zip(output_types, outputs)
     )
@@ -221,13 +212,50 @@ def extract_array(aryty, obj):
 
     # Load values from input arrays
     input_vals = []
-    for array_info, bc in zip(inputs, input_bc):
-        idxs_bc = [zero if bc else idx for idx, bc in zip(idxs, bc)]
-        # ptr = cgutils.get_item_pointer2(context, builder, *array_info, idxs_bc, *safe)
-        val = loader.load(context, builder, inp.data, idxs[0] or zero)
-        # val = builder.load(ptr)
-        # val.set_metadata("alias.scope", input_scope_set)
-        # val.set_metadata("noalias", output_scope_set)
+    for input, input_type, bc in zip(inputs, input_types, input_bc):
+        core_ndim = input_type.ndim - len(bc)
+
+        idxs_bc = [zero if bc else idx for idx, bc in zip(idxs, bc)] + [
+            zero
+        ] * core_ndim
+        ptr = cgutils.get_item_pointer2(
+            context,
+            builder,
+            input.data,
+            cgutils.unpack_tuple(builder, input.shape),
+            cgutils.unpack_tuple(builder, input.strides),
+            input_type.layout,
+            idxs_bc,
+            *safe,
+        )
+        if core_ndim == 0:
+            # Retrieve scalar item at index
+            val = builder.load(ptr)
+            # val.set_metadata("alias.scope", input_scope_set)
+            # val.set_metadata("noalias", output_scope_set)
+        else:
+            # Retrieve array item at index
+            # This is a streamlined version of Numba's `GUArrayArg.load`
+            # TODO check layout arg!
+            core_arry_type = types.Array(
+                dtype=input_type.dtype, ndim=core_ndim, layout=input_type.layout
+            )
+            core_array = context.make_array(core_arry_type)(context, builder)
+            core_shape = cgutils.unpack_tuple(builder, input.shape)[-core_ndim:]
+            core_strides = cgutils.unpack_tuple(builder, input.strides)[-core_ndim:]
+            itemsize = context.get_abi_sizeof(context.get_data_type(input_type.dtype))
+            context.populate_array(
+                core_array,
+                # TODO why do we need to bitcast?
+                data=builder.bitcast(ptr, core_array.data.type),
+                shape=cgutils.pack_array(builder, core_shape),
+                strides=cgutils.pack_array(builder, core_strides),
+                itemsize=context.get_constant(types.intp, itemsize),
+                # TODO what is meminfo about?
+                meminfo=None,
+            )
+            val = core_array._getvalue()
+
         input_vals.append(val)
 
     inner_codegen = context.get_function(scalar_func, scalar_signature)
@@ -350,17 +378,30 @@ def _vectorized(
 
     batch_ndim = len(input_bc_patterns[0])
 
-    if not all(input.ndim >= batch_ndim for input in inputs):
-        raise TypingError("Vectorized inputs must have the same rank.")
+    if not all(
+        len(pattern) == batch_ndim for pattern in input_bc_patterns + output_bc_patterns
+    ):
+        raise TypingError(
+            "Vectorized broadcastable patterns must have the same length."
+        )
 
-    if not all(len(pattern) >= batch_ndim for pattern in output_bc_patterns):
-        raise TypingError("Invalid output broadcasting pattern.")
+    core_input_types = []
+    for input_type, bc_pattern in zip(inputs, input_bc_patterns):
+        core_ndim = input_type.ndim - len(bc_pattern)
+        # TODO: Reconsider this
+        if core_ndim == 0:
+            core_input_type = input_type.dtype
+        else:
+            core_input_type = types.Array(
+                dtype=input_type.dtype, ndim=core_ndim, layout=input_type.layout
+            )
+        core_input_types.append(core_input_type)
 
-    scalar_signature = typingctx.resolve_function_type(
+    core_signature = typingctx.resolve_function_type(
         scalar_func,
         [
             *constant_inputs,
-            *[in_type.dtype if in_type.ndim == 0 else in_type for in_type in inputs],
+            *core_input_types,
         ],
         {},
     )
@@ -415,7 +456,7 @@ def codegen(
             ctx,
             builder,
             scalar_func,
-            scalar_signature,
+            core_signature,
             iter_shape,
             constant_inputs,
             inputs,
diff --git a/tests/link/numba/test_random.py b/tests/link/numba/test_random.py
@@ -645,7 +645,30 @@ def test_rng_non_default_update():
 
 
 def test_categorical_rv():
-    x = pt.random.categorical(p=[[0.5, 0, 0, 0.5], [0, 0.5, 0.5, 0]], size=(2,))
+    p = np.array(
+        [
+            [
+                [1.0, 0, 0, 0],
+                [0.0, 1.0, 0, 0],
+                [0.0, 0, 1.0, 0],
+            ],
+            [
+                [0, 0, 0, 1.0],
+                [0, 0, 0, 1.0],
+                [0, 0, 0, 1.0],
+            ],
+        ]
+    )
+    x = pt.random.categorical(p=p, size=None)
     updates = {x.owner.inputs[0]: x.owner.outputs[0]}
     fn = function([], x, updates=updates, mode="NUMBA")
-    print([fn() for _ in range(50)])
+    res = fn()
+    assert np.all(np.argmax(p, axis=-1) == res)
+
+    # Batch size
+    x = pt.random.categorical(p=p[None], size=(3, *p.shape[:-1]))
+    fn = function([], x, updates=updates, mode="NUMBA")
+    new_res = fn()
+    assert new_res.shape == (3, *res.shape)
+    for new_res_row in new_res:
+        assert np.all(new_res_row == res)