
Commit 941d4cf

Tune Fusion Optimizer constraints to backend
The previous approach was insufficient. For instance, `test_shape_i_const` manually included `fast_run`, which enables the fusion optimization, even though the test's default mode is `fast_compile`. This caused a problem because `fast_compile` mode prevents the creation of `c_thunks` even when a C compiler is available, forcing the use of the Python `perform` method, which is limited to 32 operands. The Fusion Optimizer only looked at the `cxx` flag and assumed the C limit (1024 operands) was in place; in this test, one of the fused `Composite`s surpassed the Python limit.
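For context, the 32-operand ceiling comes from NumPy's compile-time `NPY_MAXARGS` constant, which caps the total number of inputs plus outputs of any ufunc. A minimal sketch of hitting that cap (illustrative only, not part of this commit):

    import numpy as np

    # Illustrative only: NumPy refuses to build a ufunc with more than
    # 32 operands (inputs + outputs), so 33 inputs and 1 output fails.
    try:
        np.frompyfunc(lambda *args: sum(args), 33, 1)
    except ValueError as exc:
        print(exc)  # "Cannot construct ufunc with more than 32 operands ..."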
1 parent 25e98cd commit 941d4cf

File tree

4 files changed: +107 -43 lines


pytensor/compile/mode.py

+20 -5
@@ -441,19 +441,34 @@ def clone(self, link_kwargs=None, optimizer="", **kwargs):
 # FunctionMaker, the Mode will be taken from this dictionary using the
 # string as the key
 # Use VM_linker to allow lazy evaluation by default.
-FAST_COMPILE = Mode(VMLinker(use_cloop=False, c_thunks=False), "fast_compile")
+FAST_COMPILE = Mode(
+    VMLinker(use_cloop=False, c_thunks=False),
+    RewriteDatabaseQuery(include=["fast_compile"], exclude=["cxx_only"]),
+)
 if config.cxx:
-    FAST_RUN = Mode("cvm", "fast_run")
+    FAST_RUN = Mode(
+        "cvm",
+        RewriteDatabaseQuery(include=["fast_run"], exclude=["jax", "numba"]),
+    )
 else:
-    FAST_RUN = Mode("vm", "fast_run")
+    FAST_RUN = Mode(
+        "vm",
+        RewriteDatabaseQuery(
+            include=["fast_run"], exclude=["cxx_only", "jax", "numba"]
+        ),
+    )
 
 JAX = Mode(
     JAXLinker(),
-    RewriteDatabaseQuery(include=["fast_run", "jax"], exclude=["cxx_only", "BlasOpt"]),
+    RewriteDatabaseQuery(
+        include=["fast_run", "jax"], exclude=["cxx_only", "BlasOpt", "numba"]
+    ),
 )
 NUMBA = Mode(
     NumbaLinker(),
-    RewriteDatabaseQuery(include=["fast_run"], exclude=["cxx_only", "BlasOpt"]),
+    RewriteDatabaseQuery(
+        include=["fast_run", "numba"], exclude=["cxx_only", "BlasOpt", "jax"]
+    ),
 )
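The practical effect of this change is that every predefined mode now carries an explicit `RewriteDatabaseQuery`, so rewrites tagged for one backend are filtered out of the others. A hedged sketch of assembling a custom mode the same way (assuming the import paths used by `mode.py`; treat this as illustrative rather than documented API):

    from pytensor.compile.mode import Mode
    from pytensor.graph.rewriting.db import RewriteDatabaseQuery

    # Hypothetical example, not from the commit: a VM-based mode that applies
    # fast_run rewrites but skips anything tagged cxx_only, jax, or numba,
    # mirroring the FAST_RUN fallback above.
    custom_mode = Mode(
        "vm",
        RewriteDatabaseQuery(include=["fast_run"], exclude=["cxx_only", "jax", "numba"]),
    )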

pytensor/tensor/rewriting/elemwise.py

+85 -36
@@ -593,20 +593,13 @@ def local_add_mul_fusion(fgraph, node):
     return [output]
 
 
-def elemwise_max_operands_fct(node) -> int:
-    # `Elemwise.perform` uses NumPy ufuncs and they are limited to 32 operands (inputs and outputs)
-    if not config.cxx:
-        return 32
-    return 1024
-
-
 class FusionOptimizer(GraphRewriter):
     """Graph optimizer that fuses consecutive Elemwise operations."""
 
-    def __init__(self, local_optimizer=None):
-        # TODO: Figure out what to do with this
+    def __init__(self, backend):
         super().__init__()
-        self.optimizer = local_optimizer
+        assert backend in ("py", "c", "numba")
+        self.backend = backend
 
     def add_requirements(self, fgraph):
         fgraph.attach_feature(ReplaceValidate())
@@ -654,29 +647,29 @@ def elemwise_to_scalar(inputs, outputs):
         return scalar_inputs, scalar_outputs
 
     def apply(self, fgraph):
+        # Even though this rewrite is marked as `cxx_only`,
+        # it may sometimes be called when `cxx` is disabled -.-
+        if self.backend == "c" and not config.cxx:
+            return
+
         nb_replacement = 0
 
         if fgraph.profile:
             validate_before = fgraph.profile.validate_time
             callbacks_before = fgraph.execute_callbacks_times.copy()
             callback_before = fgraph.execute_callbacks_time
 
-        max_operands = elemwise_max_operands_fct(None)
+        # `Elemwise.perform` uses NumPy ufuncs, which are limited to 32 operands (inputs and outputs)
+        max_operands = 32 if self.backend == "py" else 1024
 
-        def find_next_fuseable_subgraph(
-            fg: FunctionGraph,
-        ) -> Generator[Tuple[List[Variable], List[Variable]], None, None]:
-            """Find all subgraphs in a FunctionGraph that can be fused together
-
-            Yields
-            -------
-            List of inputs and outputs that determine subgraphs which can be fused. This
-            method assumes that such replacement is done across iterations of the
-            generator.
-            """
+        if self.backend in ("py", "c"):
+            # Python mode is not really a backend, and it may or may not call C code.
+            # Rewrites don't have access to the linker to make this decision, so we
+            # assume we can only fuse Ops with a C implementation.
 
             @lru_cache(maxsize=None)
-            def elemwise_scalar_op_has_c_code(node: Apply) -> bool:
+            def elemwise_scalar_op_can_be_fused(node: Apply) -> bool:
                 if node.op.scalar_op.supports_c_code(node.inputs, node.outputs):
                     return True
                 else:
@@ -690,6 +683,24 @@ def elemwise_scalar_op_has_c_code(node: Apply) -> bool:
                     )
                 return False
 
+        elif self.backend == "numba":
+
+            def elemwise_scalar_op_can_be_fused(node: Apply) -> bool:
+                # Should we truncate at numba elemwise ops that need to run in object mode?
+                return True
+
+        def find_next_fuseable_subgraph(
+            fg: FunctionGraph,
+        ) -> Generator[Tuple[List[Variable], List[Variable]], None, None]:
+            """Find all subgraphs in a FunctionGraph that can be fused together
+
+            Yields
+            -------
+            List of inputs and outputs that determine subgraphs which can be fused. This
+            method assumes that such replacement is done across iterations of the
+            generator.
+            """
+
         # We start by creating two maps, 1) from each node to each potentially
         # fuseable client (both nodes must be single output Elemwise with same
         # broadcast type) and 2) from each node to each certainly unfuseable
@@ -702,7 +713,7 @@ def elemwise_scalar_op_has_c_code(node: Apply) -> bool:
                 and isinstance(out.owner.op, Elemwise)
                 # and not isinstance(out.owner.op.scalar_op, aes.Composite)
                 and len(out.owner.outputs) == 1
-                and elemwise_scalar_op_has_c_code(out.owner)
+                and elemwise_scalar_op_can_be_fused(out.owner)
             )
             for client, _ in clients:
                 if (
@@ -713,7 +724,7 @@ def elemwise_scalar_op_has_c_code(node: Apply) -> bool:
                     and len(client.outputs) == 1
                     and out.type.broadcastable
                     == client.outputs[0].type.broadcastable
-                    and elemwise_scalar_op_has_c_code(client)
+                    and elemwise_scalar_op_can_be_fused(client)
                 ):
                     if client not in fuseable_clients[out]:
                         fuseable_clients[out].append(client)
@@ -1001,7 +1012,7 @@ def elemwise_scalar_op_has_c_code(node: Apply) -> bool:
                 if (len(inputs) + len(outputs)) > max_operands:
                     warn(
                         "Loop fusion failed because the resulting node would exceed "
-                        "the kernel argument limit."
+                        "the backend limit for the number of operands."
                     )
                     break
 
@@ -1067,30 +1078,68 @@ def print_profile(stream, prof, level=0):
         print(blanc, " time_toposort", prof[7], file=stream)
 
 
-if config.tensor__local_elemwise_fusion:
-    # Must be after gpu(48.5) and before AddDestroyHandler(49.5)
-    fuse_seqopt = SequenceDB()
-    fuse_seqopt.register(
+fuse_opt_py = SequenceDB()
+fuse_opt_c = SequenceDB()
+fuse_opt_numba = SequenceDB()
+for fuse_opt in (fuse_opt_py, fuse_opt_c, fuse_opt_numba):
+    fuse_opt.register(
         "local_add_mul_fusion",
         EquilibriumGraphRewriter(rewriters=[local_add_mul_fusion], max_use_ratio=1000),
         "fast_run",
         "fusion",
         position=0,
     )
-    fuse_seqopt.register(
-        "composite_elemwise_fusion",
-        FusionOptimizer(),
+fuse_opt_py.register(
+    "composite_elemwise_fusion_py",
+    FusionOptimizer("py"),
+    "fast_run",
+    "fusion",
+    position=1,
+)
+fuse_opt_c.register(
+    "composite_elemwise_fusion_c",
+    FusionOptimizer("c"),
+    "fast_run",
+    "fusion",
+    position=1,
+)
+fuse_opt_numba.register(
+    "composite_elemwise_fusion_numba",
+    FusionOptimizer("numba"),
+    "fast_run",
+    "fusion",
+    position=1,
+)
+
+
+if config.tensor__local_elemwise_fusion:
+    # Must be after gpu(48.5) and before AddDestroyHandler(49.5)
+    compile.optdb.register(  # type: ignore
+        "elemwise_fusion_c",
+        fuse_opt_c,
         "fast_run",
         "fusion",
-        position=1,
+        "local_elemwise_fusion",
+        "FusionOptimizer",
+        "cxx_only",
+        position=49,
     )
+    # We allow the Python version to run afterwards,
+    # since there is no mode for Python only
     compile.optdb.register(  # type: ignore
-        "elemwise_fusion",
-        fuse_seqopt,
+        "elemwise_fusion_py",
+        fuse_opt_py,
        "fast_run",
        "fusion",
        "local_elemwise_fusion",
        "FusionOptimizer",
+        position=49.01,
+    )
+    # TODO: Not sure about this... Could rewrites receive info about the linker that is being used?
+    compile.optdb.register(  # type: ignore
+        "elemwise_fusion_numba",
+        fuse_opt_numba,
+        "numba",
         position=49,
     )
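To make the operand accounting concrete: fusion collapses a chain of `Elemwise` nodes into one `Composite` node whose inputs and outputs are the union of the chain's boundaries, and the rewriter stops growing a subgraph once the backend's cap would be exceeded. A minimal sketch of that check, with a hypothetical helper name simplified from the diff above (illustrative, not the exact code):

    def within_operand_limit(inputs: list, outputs: list, backend: str) -> bool:
        # Hypothetical helper, not part of the commit.
        # `Elemwise.perform` relies on NumPy ufuncs, which cap operands
        # (inputs + outputs) at 32; the C backend allows up to 1024.
        max_operands = 32 if backend == "py" else 1024
        return (len(inputs) + len(outputs)) <= max_operands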

tests/tensor/rewriting/test_elemwise.py

+1 -1
@@ -270,7 +270,7 @@ class TestFusion:
             "fusion",
             "inplace",
         ],
-        exclude=["cxx_only", "BlasOpt"],
+        exclude=["BlasOpt"],
     )
     mode = Mode(get_default_mode().linker, rewrites)
     _shared = staticmethod(shared)
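The `cxx_only` exclusion can be dropped here because `FusionOptimizer("c")` now returns early on its own when `config.cxx` is empty (see `apply` above), so running the fusion rewrites without a C compiler is safe.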

tests/tensor/test_subtensor.py

+1 -1
@@ -986,7 +986,7 @@ def test_adv_sub1_idx_broadcast(self):
     def test_shape_i_const(self):
         # Each axis is treated independently by shape_i/shape operators
 
-        mode_opt = self.mode.including("fast_run").excluding("fusion")
+        mode_opt = self.mode.including("fast_run")
         data = self.shared(np.array(np.arange(5), dtype=self.dtype))
         for start in [None] + [-8, -5, -1, 0, 1, 5, 8]:
             outs = []
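With the backend-aware operand limit in place, the fused `Composite` stays within the Python backend's 32-operand cap, so the test no longer needs to exclude the fusion rewrites as a workaround.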
