Cleanup Fusion rewrites

ricardoV94 · ricardoV94 · commit a2f101adbf86 · 2023-02-08T10:49:15.000+01:00
* Move local_add_mul_fusion to `rewriting/elemwise` and remove unused/duplicated TestAddMulFusion tests
* Use EquilibriumGraphRewriter for local_add_mul_fusion
* Do not register optional rewrites if tensor__local_elemwise_fusion flag is disabled
diff --git a/pytensor/graph/rewriting/db.py b/pytensor/graph/rewriting/db.py
@@ -427,7 +427,7 @@ def query(
                 position_cutoff = tags[0].position_cutoff
 
             # The RewriteDatabaseQuery instance might contain extra rewrites which need
-            # to be added the the sequence of rewrites (don't alter the
+            # to be added to the sequence of rewrites (don't alter the
             # original dictionary)
             if len(tags[0].extra_rewrites) > 0:
                 position_dict = position_dict.copy()
diff --git a/pytensor/tensor/rewriting/elemwise.py b/pytensor/tensor/rewriting/elemwise.py
@@ -13,6 +13,7 @@
 from pytensor.graph.features import ReplaceValidate
 from pytensor.graph.op import compute_test_value, get_test_value
 from pytensor.graph.rewriting.basic import (
+    EquilibriumGraphRewriter,
     GraphRewriter,
     copy_stack_trace,
     in2out,
@@ -529,6 +530,60 @@ def local_upcast_elemwise_constant_inputs(fgraph, node):
                 return rval
 
 
+@node_rewriter([Elemwise])
+def local_add_mul_fusion(fgraph, node):
+    """Fuse consecutive add or mul in one such node with more inputs.
+
+    It is better to fuse add/mul that way then in a Composite node as
+    this make the inner graph of the Composite smaller. This allows to
+    put more computation in a Composite before hitting the max
+    recursion limit when pickling Composite.
+
+    This rewrite is almost useless after the AlgebraicCanonizer is used,
+    but it catches a few edge cases that are not canonicalized by it
+    """
+    if not isinstance(node.op, Elemwise) or not isinstance(
+        node.op.scalar_op, (aes.Add, aes.Mul)
+    ):
+        return False
+
+    s_op = node.op.scalar_op.__class__
+    new_inp = []
+    fused = False
+    nb_inputs = len(node.inputs)
+    max_inputs = float("inf")
+    if hasattr(node.op, "max_inputs"):
+        max_inputs = node.op.max_inputs(node)
+    for inp in node.inputs:
+        if (
+            inp.owner
+            and isinstance(inp.owner.op, Elemwise)
+            and isinstance(inp.owner.op.scalar_op, s_op)
+            and
+            # Do not duplicate the operation.
+            len(fgraph.clients[inp]) == 1
+            and (nb_inputs + len(inp.owner.inputs) - 1) <= max_inputs
+        ):
+            new_inp.extend(inp.owner.inputs)
+            fused = True
+        else:
+            new_inp.append(inp)
+
+    # We can not compare the number of inputs as Mul and Add could have
+    # 0 or 1 inputs in some corner cases.
+    if fused:
+        output = node.op(*new_inp)
+        copy_stack_trace(node.outputs[0], output)
+
+        # Do the recursion here to help lower the number of
+        # FusionOptimizer iteration.
+        if output.owner:
+            output2 = local_add_mul_fusion.transform(fgraph, output.owner)
+            if output2:
+                return output2
+        return [output]
+
+
 def local_elemwise_fusion_op(op_class, max_input_fct=lambda node: 32, maker=None):
     r"""Create a recursive function that fuses `Elemwise` `Op`\s.
 
@@ -901,6 +956,13 @@ def print_profile(cls, stream, prof, level=0):
 if config.tensor__local_elemwise_fusion:
     # Must be after gpu(48.5) and before AddDestroyHandler(49.5)
     fuse_seqopt = SequenceDB()
+    fuse_seqopt.register(
+        "local_add_mul_fusion",
+        EquilibriumGraphRewriter(rewriters=[local_add_mul_fusion], max_use_ratio=1000),
+        "fast_run",
+        "fusion",
+        position=0,
+    )
     fuse_seqopt.register(
         "composite_elemwise_fusion",
         FusionOptimizer(local_elemwise_fusion),
@@ -917,15 +979,6 @@ def print_profile(cls, stream, prof, level=0):
         "FusionOptimizer",
         position=49,
     )
-else:
-    compile.optdb.register(  # type: ignore
-        "elemwise_fusion",
-        FusionOptimizer(local_elemwise_fusion),
-        "fusion",
-        "local_elemwise_fusion",
-        "FusionOptimizer",
-        position=49,
-    )
 
 
 @register_canonicalize
diff --git a/pytensor/tensor/rewriting/math.py b/pytensor/tensor/rewriting/math.py
@@ -92,7 +92,6 @@
     register_uncanonicalize,
     register_useless,
 )
-from pytensor.tensor.rewriting.elemwise import FusionOptimizer, fuse_seqopt
 from pytensor.tensor.shape import Shape, Shape_i
 from pytensor.tensor.subtensor import Subtensor
 from pytensor.tensor.type import (
@@ -2966,66 +2965,6 @@ def check_input(inputs):
     return [ret]
 
 
-def local_add_mul_fusion(fgraph, node):
-    """Fuse consecutive add or mul in one such node with more inputs.
-
-    It is better to fuse add/mul that way then in a Composite node as
-    this make the inner graph of the Composite smaller. This allow to
-    put more computation in a Composite before hitting the max
-    recursion limit when pickling Composite.
-
-    """
-    if not isinstance(node.op, Elemwise) or not isinstance(
-        node.op.scalar_op, (aes.Add, aes.Mul)
-    ):
-        return False
-
-    s_op = node.op.scalar_op.__class__
-    new_inp = []
-    fused = False
-    nb_inputs = len(node.inputs)
-    max_inputs = float("inf")
-    if hasattr(node.op, "max_inputs"):
-        max_inputs = node.op.max_inputs(node)
-    for inp in node.inputs:
-        if (
-            inp.owner
-            and isinstance(inp.owner.op, Elemwise)
-            and isinstance(inp.owner.op.scalar_op, s_op)
-            and
-            # Do not duplicate the operation.
-            len(fgraph.clients[inp]) == 1
-            and (nb_inputs + len(inp.owner.inputs) - 1) <= max_inputs
-        ):
-            new_inp.extend(inp.owner.inputs)
-            fused = True
-        else:
-            new_inp.append(inp)
-
-    # We can not compare the number of inputs as Mul and Add could have
-    # 0 or 1 inputs in some corner cases.
-    if fused:
-        output = node.op(*new_inp)
-        copy_stack_trace(node.outputs[0], output)
-
-        # Do the recursion here to help lower the number of
-        # FusionOptimizer iteration.
-        if output.owner:
-            output2 = local_add_mul_fusion(fgraph, output.owner)
-            if output2:
-                return output2
-        return [output]
-
-
-fuse_seqopt.register(
-    "local_add_mul_fusion",
-    FusionOptimizer(local_add_mul_fusion),
-    "fast_run",
-    "fusion",
-    position=0,
-)
-
-
 def _skip_mul_1(r):
     if r.owner and r.owner.op == mul:
         not_is_1 = [i for i in r.owner.inputs if not _is_1(i)]
diff --git a/tests/tensor/rewriting/test_elemwise.py b/tests/tensor/rewriting/test_elemwise.py
@@ -4,9 +4,9 @@
 import pytest
 
 import pytensor
-import pytensor.scalar as aes
-import pytensor.tensor as at
+from pytensor import scalar as aes
 from pytensor import shared
+from pytensor import tensor as at
 from pytensor.compile.function import function
 from pytensor.compile.mode import Mode, get_default_mode
 from pytensor.configdefaults import config
@@ -263,9 +263,8 @@ def test_local_useless_dimshuffle_in_reshape():
 class TestFusion:
     rewrites = RewriteDatabaseQuery(
         include=[
-            "local_elemwise_fusion",
-            "composite_elemwise_fusion",
             "canonicalize",
+            "fusion",
             "inplace",
         ],
         exclude=["cxx_only", "BlasOpt"],
@@ -1007,22 +1006,10 @@ def test_big_fusion(self):
         )
 
     def test_add_mul_fusion_inplace(self):
-
-        rewrites = RewriteDatabaseQuery(
-            include=[
-                "local_elemwise_fusion",
-                "composite_elemwise_fusion",
-                "canonicalize",
-                "inplace",
-            ],
-            exclude=["cxx_only", "BlasOpt"],
-        )
-
-        mode = Mode(self.mode.linker, rewrites)
-
         x, y, z = dmatrices("xyz")
         out = dot(x, y) + x + y + z
-        f = function([x, y, z], out, mode=mode)
+
+        f = function([x, y, z], out, mode=self.mode)
         topo = [n for n in f.maker.fgraph.toposort()]
         assert len(topo) == 2
         assert topo[-1].op.inplace_pattern
@@ -1050,8 +1037,7 @@ def impl(self, x):
 
         mode = Mode(linker="cvm")
         mode._optimizer = mode._optimizer.including(
-            "local_elemwise_fusion",
-            "composite_elemwise_fusion",
+            "fusion",
             "canonicalize",
             "inplace",
         )
@@ -1073,18 +1059,6 @@ def test_test_values(self, test_value):
         are checked.
 
         """
-
-        rewrites = RewriteDatabaseQuery(
-            include=[
-                "local_elemwise_fusion",
-                "composite_elemwise_fusion",
-                "canonicalize",
-            ],
-            exclude=["cxx_only", "BlasOpt"],
-        )
-
-        mode = Mode(self.mode.linker, rewrites)
-
         x, y, z = dmatrices("xyz")
 
         x.tag.test_value = test_value
@@ -1101,7 +1075,7 @@ def test_test_values(self, test_value):
         ):
             out = x * y + z
             with cm:
-                f = function([x, y, z], out, mode=mode)
+                f = function([x, y, z], out, mode=self.mode)
 
         if test_value.size != 0:
             # Confirm that the fusion happened
diff --git a/tests/tensor/rewriting/test_math.py b/tests/tensor/rewriting/test_math.py