Commit c8f5650

Add Numba implementation of Blockwise
1 parent fa0ab9d commit c8f5650
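
In practical terms, this commit lets graphs that contain Blockwise Ops (batched wrappers around core Ops such as Det, Cholesky, or SVD) compile through the Numba backend. A minimal usage sketch, adapted from the new test file added below; exactly which core Ops benefit depends on which of them already have Numba dispatchers:

import numpy as np
import pytensor
from pytensor.tensor import tensor
from pytensor.tensor.blockwise import Blockwise
from pytensor.tensor.nlinalg import Det

# Det is the core op; Blockwise maps it over the leading batch dimension
x = tensor("x", shape=(5, None, None))
out = Blockwise(core_op=Det())(x)

# With this commit, the graph compiles through the Numba linker
fn = pytensor.function([x], out, mode="NUMBA")
x_test = np.eye(3) * np.arange(1, 6)[:, None, None]
print(fn(x_test))  # determinants of the 5 stacked matrices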

File tree: 9 files changed, +261 -10 lines


pytensor/link/numba/dispatch/__init__.py

Lines changed: 5 additions & 4 deletions
@@ -2,15 +2,16 @@
 from pytensor.link.numba.dispatch.basic import numba_funcify, numba_typify
 
 # Load dispatch specializations
-import pytensor.link.numba.dispatch.scalar
-import pytensor.link.numba.dispatch.tensor_basic
+import pytensor.link.numba.dispatch.blockwise
+import pytensor.link.numba.dispatch.elemwise
 import pytensor.link.numba.dispatch.extra_ops
 import pytensor.link.numba.dispatch.nlinalg
 import pytensor.link.numba.dispatch.random
-import pytensor.link.numba.dispatch.elemwise
 import pytensor.link.numba.dispatch.scan
-import pytensor.link.numba.dispatch.sparse
+import pytensor.link.numba.dispatch.scalar
 import pytensor.link.numba.dispatch.slinalg
+import pytensor.link.numba.dispatch.sparse
 import pytensor.link.numba.dispatch.subtensor
+import pytensor.link.numba.dispatch.tensor_basic
 
 # isort: on

pytensor/link/numba/dispatch/blockwise.py

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
from numba.core.extending import overload
from numba.np.unsafe.ndarray import to_fixed_tuple

from pytensor.link.numba.dispatch.basic import numba_funcify
from pytensor.link.numba.dispatch.vectorize_codegen import (
    _jit_options,
    _vectorized,
    encode_literals,
    store_core_outputs,
)
from pytensor.tensor import get_vector_length
from pytensor.tensor.blockwise import Blockwise, BlockwiseWithCoreShape


@numba_funcify.register
def numba_funcify_Blockwise(op: BlockwiseWithCoreShape, node, **kwargs):
    [blockwise_node] = op.fgraph.apply_nodes
    blockwise_op: Blockwise = blockwise_node.op
    core_op = blockwise_op.core_op
    nin = len(blockwise_node.inputs)
    nout = len(blockwise_node.outputs)
    core_shapes_len = [get_vector_length(sh) for sh in node.inputs[nin:]]
    core_shape_0 = core_shapes_len[0] if nout > 0 else None
    core_shape_1 = core_shapes_len[1] if nout > 1 else None
    core_shape_2 = core_shapes_len[2] if nout > 2 else None
    if nout > 3:
        raise NotImplementedError(
            "Blockwise with more than 3 outputs not supported in Numba backend"
        )

    core_node = blockwise_op._create_dummy_core_node(blockwise_node.inputs)
    core_op_fn = numba_funcify(
        core_op,
        node=core_node,
        parent_node=node,
        fastmath=_jit_options["fastmath"],
        **kwargs,
    )
    core_op_fn = store_core_outputs(core_op_fn, nin=nin, nout=nout)

    batch_ndim = blockwise_op.batch_ndim(node)

    # numba doesn't support nested literals right now...
    input_bc_patterns = encode_literals(
        tuple(inp.type.broadcastable[:batch_ndim] for inp in node.inputs)
    )
    output_bc_patterns = encode_literals(
        tuple(out.type.broadcastable[:batch_ndim] for out in node.outputs)
    )
    output_dtypes = encode_literals(tuple(out.type.dtype for out in node.outputs))
    inplace_pattern = encode_literals(())
    # inplace = rv_op.inplace

    def blockwise_wrapper(*inputs_and_core_shapes):
        inputs, core_shapes = inputs_and_core_shapes[:nin], inputs_and_core_shapes[nin:]
        # Appease numba Gods :(
        # Secular solution welcomed
        if nout == 1:
            tuple_core_shapes = (to_fixed_tuple(core_shapes[0], core_shape_0),)
        elif nout == 2:
            tuple_core_shapes = (
                to_fixed_tuple(core_shapes[0], core_shape_0),
                to_fixed_tuple(core_shapes[1], core_shape_1),
            )
        else:
            tuple_core_shapes = (
                to_fixed_tuple(core_shapes[0], core_shape_0),
                to_fixed_tuple(core_shapes[1], core_shape_1),
                to_fixed_tuple(core_shapes[2], core_shape_2),
            )
        return _vectorized(
            core_op_fn,
            input_bc_patterns,
            output_bc_patterns,
            output_dtypes,
            inplace_pattern,
            (),  # constant_inputs
            inputs,
            tuple_core_shapes,
            None,  # size
        )

    def blockwise(*inputs_and_core_shapes):
        raise NotImplementedError("Non-jitted blockwise not implemented")

    @overload(blockwise, jit_options=_jit_options)
    def ov_blockwise(*inputs_and_core_shapes):
        return blockwise_wrapper

    return blockwise
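
The to_fixed_tuple gymnastics in blockwise_wrapper exist because Numba needs tuples whose length is known at compile time to type the core-shape values, which is also why the dispatcher hard-codes one branch per possible nout. A small standalone illustration of that Numba utility (independent of PyTensor; the function name here is made up):

import numpy as np
from numba import njit
from numba.np.unsafe.ndarray import to_fixed_tuple


@njit
def shape_array_to_tuple(shape_arr):
    # The length argument (2) must be a compile-time constant;
    # a value only known at runtime would not type.
    return to_fixed_tuple(shape_arr, 2)


print(shape_array_to_tuple(np.array([3, 4])))  # (3, 4)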

pytensor/link/numba/dispatch/elemwise.py

Lines changed: 1 addition & 1 deletion
@@ -508,7 +508,7 @@ def elemwise_wrapper(*inputs):
             inplace_pattern_enc,
             (),  # constant_inputs
             inputs,
-            core_output_shapes,  # core_shapes
+            core_output_shapes,
             None,  # size
         )

pytensor/link/numba/dispatch/random.py

Lines changed: 1 addition & 1 deletion
@@ -388,7 +388,7 @@ def random_wrapper(core_shape, rng, size, *dist_params):
         return rng, draws
 
     def random(core_shape, rng, size, *dist_params):
-        pass
+        raise NotImplementedError("Non-jitted random variable not implemented")
 
     @overload(random, jit_options=_jit_options)
     def ov_random(core_shape, rng, size, *dist_params):
pytensor/tensor/blockwise.py

Lines changed: 35 additions & 3 deletions
@@ -7,7 +7,8 @@
 from pytensor import config
 from pytensor.compile.builders import OpFromGraph
 from pytensor.gradient import DisconnectedType
-from pytensor.graph.basic import Apply, Constant
+from pytensor.graph import FunctionGraph
+from pytensor.graph.basic import Apply, Constant, ancestors
 from pytensor.graph.null_type import NullType
 from pytensor.graph.op import Op
 from pytensor.graph.replace import (

@@ -179,16 +180,39 @@ def infer_shape(
 
         batch_shape = broadcast_shape(*batch_shapes, arrays_are_shapes=True)
 
+        # Try to extract the core shapes from the core_op
+        if hasattr(self.core_op, "infer_shape"):
+            dummy_core_node = self._create_dummy_core_node(node.inputs)
+            dummy_core_inputs = dummy_core_node.inputs
+            dummy_fgraph = FunctionGraph(outputs=dummy_core_node.outputs, clone=False)
+            core_input_shapes = [
+                input_shape[batch_ndims:] for input_shape in input_shapes
+            ]
+            core_output_shapes = self.core_op.infer_shape(
+                dummy_fgraph, dummy_core_node, core_input_shapes
+            )
+
         out_shapes = []
-        for output, sig in zip(node.outputs, self.outputs_sig):
+        for o, (output, sig) in enumerate(zip(node.outputs, self.outputs_sig)):
             core_out_shape = []
             for i, dim_name in enumerate(sig):
                 # The output dim is the same as another input dim
                 if dim_name in core_dims:
                     core_out_shape.append(core_dims[dim_name])
                 else:
-                    # TODO: We could try to make use of infer_shape of core_op
+                    if hasattr(self.core_op, "infer_shape"):
+                        # If the input values are needed to compute the dimension length, we can't use the infer_shape
+                        # of the core_node as the value is not constant across batch dims of the Blockwise
+                        core_out_dim = core_output_shapes[o][i]
+                        if not (
+                            set(dummy_core_inputs) & set(ancestors([core_out_dim]))
+                        ):
+                            core_out_shape.append(core_out_dim)
+                            continue
+
+                    # Fallback shape requires evaluating the Blockwise Op
                     core_out_shape.append(Shape_i(batch_ndims + i)(output))
+
             out_shapes.append((*batch_shape, *core_out_shape))
 
         return out_shapes

@@ -379,3 +403,11 @@ def vectorize_node_fallback(op: Op, node: Apply, *bached_inputs) -> Apply:
 
 class OpWithCoreShape(OpFromGraph):
     """Generalizes an `Op` to include core shape as an additional input."""
+
+
+class BlockwiseWithCoreShape(OpWithCoreShape):
+    """Generalizes a Blockwise `Op` to include a core shape parameter."""
+
+    def __str__(self):
+        [blockwise_node] = self.fgraph.apply_nodes
+        return f"[{blockwise_node.op!s}]"

pytensor/tensor/rewriting/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 import pytensor.tensor.rewriting.jax
 import pytensor.tensor.rewriting.linalg
 import pytensor.tensor.rewriting.math
+import pytensor.tensor.rewriting.numba
 import pytensor.tensor.rewriting.ofg
 import pytensor.tensor.rewriting.shape
 import pytensor.tensor.rewriting.special

pytensor/tensor/rewriting/numba.py

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
from pytensor.compile import optdb
from pytensor.graph import node_rewriter
from pytensor.graph.basic import applys_between
from pytensor.graph.rewriting.basic import out2in
from pytensor.tensor.basic import as_tensor, constant
from pytensor.tensor.blockwise import Blockwise, BlockwiseWithCoreShape


@node_rewriter([Blockwise])
def introduce_explicit_core_shape_blockwise(fgraph, node):
    """Introduce the core shape of a Blockwise.

    We wrap Blockwise graphs into a BlockwiseWithCoreShape OpFromGraph
    that has an extra "non-functional" input that represents the core shape of the Blockwise variable.
    This core_shape is used by the numba backend to pre-allocate the output array.

    If available, the core shape is extracted from the shape feature of the graph,
    which has a higher change of having been simplified, optimized, constant-folded.
    If missing, we fall back to the op._supp_shape_from_params method.

    This rewrite is required for the numba backend implementation of Blockwise.

    Example
    -------

    .. code-block:: python

        import pytensor
        import pytensor.tensor as pt

        x = pt.random.dirichlet(alphas=[1, 2, 3], size=(5,))
        pytensor.dprint(x, print_type=True)
        # dirichlet_rv{"(a)->(a)"}.1 [id A] <Matrix(float64, shape=(5, 3))>
        #  ├─ RNG(<Generator(PCG64) at 0x7F09E59C18C0>) [id B] <RandomGeneratorType>
        #  ├─ [5] [id C] <Vector(int64, shape=(1,))>
        #  └─ ExpandDims{axis=0} [id D] <Matrix(int64, shape=(1, 3))>
        #     └─ [1 2 3] [id E] <Vector(int64, shape=(3,))>

        # After the rewrite, note the new core shape input [3] [id B]
        fn = pytensor.function([], x, mode="NUMBA")
        pytensor.dprint(fn.maker.fgraph)
        # [dirichlet_rv{"(a)->(a)"}].1 [id A] 0
        #  ├─ [3] [id B]
        #  ├─ RNG(<Generator(PCG64) at 0x7F15B8E844A0>) [id C]
        #  ├─ [5] [id D]
        #  └─ [[1 2 3]] [id E]
        # Inner graphs:
        # [dirichlet_rv{"(a)->(a)"}] [id A]
        #  ← dirichlet_rv{"(a)->(a)"}.0 [id F]
        #     ├─ *1-<RandomGeneratorType> [id G]
        #     ├─ *2-<Vector(int64, shape=(1,))> [id H]
        #     └─ *3-<Matrix(int64, shape=(1, 3))> [id I]
        #  ← dirichlet_rv{"(a)->(a)"}.1 [id F]
        #     └─ ···
    """
    op: Blockwise = node.op  # type: ignore[annotation-unchecked]
    batch_ndim = op.batch_ndim(node)

    shape_feature: ShapeFeature | None = getattr(fgraph, "shape_feature", None)  # type: ignore[annotation-unchecked]
    if shape_feature:
        core_shapes = [
            [shape_feature.get_shape(out, i) for i in range(batch_ndim, out.type.ndim)]
            for out in node.outputs
        ]
    else:
        raise ValueError
        core_shapes = op._supp_shape_from_params(op.dist_params(node))

    core_shapes = [
        as_tensor(core_shape) if len(core_shape) else constant([], dtype="int64")
        for core_shape in core_shapes
    ]

    if any(
        isinstance(node.op, Blockwise)
        for node in applys_between(node.inputs, core_shapes)
    ):
        # If Blockwise shows up in the shape graph we can't introduce the core shape
        return None

    return (
        BlockwiseWithCoreShape(
            [*node.inputs, *core_shapes],
            node.outputs,
            destroy_map=op.destroy_map,
        )
        .make_node(*node.inputs, *core_shapes)
        .outputs
    )


optdb.register(
    introduce_explicit_core_shape_blockwise.__name__,
    out2in(introduce_explicit_core_shape_blockwise),
    "numba",
    position=100,
)

tests/link/numba/test_basic.py

Lines changed: 1 addition & 1 deletion
@@ -242,7 +242,7 @@ def compare_numba_and_py(
     Parameters
     ----------
     fgraph
-        `FunctionGraph` or inputs to compare.
+        `FunctionGraph` or tuple(inputs, outputs) to compare.
     inputs
         Numeric inputs to be passed to the compiled graphs.
     assert_fn

tests/link/numba/test_blockwise.py

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
import numpy as np
import pytest
from link.numba.test_basic import compare_numba_and_py, numba_mode

from pytensor.tensor import tensor
from pytensor.tensor.blockwise import Blockwise
from pytensor.tensor.nlinalg import SVD, Det
from pytensor.tensor.slinalg import Cholesky


# Fails if object mode warning is issued
pytestmark = pytest.mark.filterwarnings("error")

# TODO: Test inplace
# TODO: Test non rectangular fails gracefully


@pytest.mark.parametrize("core_op", [Det(), Cholesky(), SVD(compute_uv=True)], ids=str)
def test_blockwise(core_op):
    x = tensor(shape=(5, None, None))
    outs = Blockwise(core_op=core_op)(x, return_list=True)

    x_test = np.eye(3) * np.arange(1, 6)[:, None, None]
    fn, _ = compare_numba_and_py(
        ([x], outs),
        [x_test],
        numba_mode=numba_mode.including("ShapeOpt"),
        eval_obj_mode=False,
    )
    fn.dprint(print_type=True)
