
Commit 4b67bf5

Do more aggressive scan memory saves in JIT backends
1 parent e37a8c0 commit 4b67bf5

6 files changed: +74 -32 lines changed


pytensor/compile/mode.py

Lines changed: 15 additions & 7 deletions
@@ -454,6 +454,19 @@ def clone(self, link_kwargs=None, optimizer="", **kwargs):
     RewriteDatabaseQuery(include=["fast_run", "py_only"]),
 )
 
+NUMBA = Mode(
+    NumbaLinker(),
+    RewriteDatabaseQuery(
+        include=["fast_run", "numba"],
+        exclude=[
+            "cxx_only",
+            "BlasOpt",
+            "local_careduce_fusion",
+            "scan_save_mem_prealloc",
+        ],
+    ),
+)
+
 JAX = Mode(
     JAXLinker(),
     RewriteDatabaseQuery(
@@ -463,6 +476,7 @@ def clone(self, link_kwargs=None, optimizer="", **kwargs):
             "BlasOpt",
             "fusion",
             "inplace",
+            "scan_save_mem_prealloc",
         ],
     ),
 )
@@ -476,16 +490,10 @@ def clone(self, link_kwargs=None, optimizer="", **kwargs):
             "fusion",
             "inplace",
             "local_uint_constant_indices",
+            "scan_save_mem_prealloc",
         ],
     ),
 )
-NUMBA = Mode(
-    NumbaLinker(),
-    RewriteDatabaseQuery(
-        include=["fast_run", "numba"],
-        exclude=["cxx_only", "BlasOpt", "local_careduce_fusion"],
-    ),
-)
 
 
 predefined_modes = {
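
The practical effect of this change is that the NUMBA, JAX, and PYTORCH modes all exclude the "scan_save_mem_prealloc" rewrite, so the JIT backends pick up the more aggressive no-prealloc variant registered in pytensor/scan/rewriting.py below. A minimal sketch of the kind of graph that benefits (the doubling recurrence, the variable names, and the use of the "NUMBA" mode string are illustrative, not taken from the commit):

import pytensor
import pytensor.tensor as pt

# A 10-step recurrence where only the final state is read afterwards.
x0 = pt.dscalar("x0")
xs, _ = pytensor.scan(
    fn=lambda x_prev: 2 * x_prev,  # hypothetical inner step
    outputs_info=[x0],
    n_steps=10,
)

# Because only xs[-1] is used, the scan memory-save rewrite can shrink the
# 10-element output buffer; in JIT modes it can now also drop the extra
# pre-allocation slot.
fn = pytensor.function([x0], xs[-1], mode="NUMBA")
print(fn(1.0))  # 1024.0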

pytensor/configdefaults.py

Lines changed: 3 additions & 1 deletion
@@ -1085,7 +1085,9 @@ def add_scan_configvars():
         "scan__allow_output_prealloc",
         "Allow/disallow memory preallocation for outputs inside of scan "
         "(default: True)",
-        BoolParam(True),
+        # Non-mutable because ScanSaveMem rewrite checks it,
+        # and we can't have the rewrite and the implementation mismatch
+        BoolParam(True, mutable=False),
         in_c_key=False,
     )
 
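
Because the parameter is now created with mutable=False, it can no longer be toggled at runtime; it has to be fixed before PyTensor is imported, for example through PYTENSOR_FLAGS. A small sketch of what that looks like (assuming the usual section__option flag syntax; assigning to the option after import is expected to fail because the parameter is immutable):

import os

# Must happen before `import pytensor`.
os.environ["PYTENSOR_FLAGS"] = "scan__allow_output_prealloc=False"

import pytensor  # noqa: E402

print(pytensor.config.scan__allow_output_prealloc)  # False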

pytensor/scan/rewriting.py

Lines changed: 49 additions & 15 deletions
@@ -70,7 +70,7 @@
     get_slice_elements,
     set_subtensor,
 )
-from pytensor.tensor.variable import TensorConstant
+from pytensor.tensor.variable import TensorConstant, TensorVariable
 
 
 list_opt_slice = [
@@ -1182,8 +1182,7 @@ def while_scan_merge_subtensor_last_element(fgraph, scan_node):
     return subtensor_merge_replacements
 
 
-@node_rewriter([Scan])
-def scan_save_mem(fgraph, node):
+def scan_save_mem_rewrite(fgraph, node, backend_supports_output_pre_allocation: bool):
     r"""Graph optimizer that reduces scan memory consumption.
 
     This optimizations attempts to determine if a `Scan` node, during its execution,
@@ -1214,10 +1213,16 @@ def scan_save_mem(fgraph, node):
 
     The scan perform implementation takes the output sizes into consideration,
     saving the newest results over the oldest ones whenever the buffer is filled.
-    """
-    if not isinstance(node.op, Scan):
-        return False
 
+    Paramaters
+    ----------
+    backend_supports_output_pre_allocation: bool
+        When the backend supports output pre-allocation Scan must keep buffers
+        with a length of required_states + 1, because the inner function will
+        attempt to write the inner function outputs directly into the provided
+        position in the outer circular buffer. This would invalidate results,
+        if the input is still needed for some other output computation.
+    """
     if hasattr(fgraph, "shape_feature"):
         shape_of = fgraph.shape_feature.shape_of
     else:
@@ -1270,14 +1275,15 @@ def scan_save_mem(fgraph, node):
     # Note: For simplicity while Scans also have global_nsteps set to None.
     # All step optimizations require knowing the shape of the output, which
     # cannot be determined from the inputs alone.
+    global_nsteps: None | dict
     assert len(node.outputs) >= c_outs
     if len(node.outputs) == c_outs and not op.info.as_while:
         global_nsteps = {"real": -1, "sym": []}
     else:
         global_nsteps = None
 
     # Keeps track of the original slices that each client represent
-    slices = [None for o in node.outputs]
+    slices: list[None | list] = [None for o in node.outputs]
 
     # A list for each output indicating how many intermediate values
     # should be stored. If negative it means none of the intermediate
@@ -1294,7 +1300,7 @@ def scan_save_mem(fgraph, node):
     # or not
     flag_store = False
 
-    # 2.2 Loop over the clients
+    # 2.2 Loop over the clients to figure out how many steps we actually need to do in the Scan
     for i, out in enumerate(node.outputs[:c_outs]):
         # look at all its clients
         slices[i] = []
@@ -1337,7 +1343,7 @@ def scan_save_mem(fgraph, node):
                 except KeyError:
                     length = out.shape[0]
                 cf_slice = get_canonical_form_slice(this_slice[0], length)
-                slices[i] += [(cf_slice, this_slice)]
+                slices[i] += [(cf_slice, this_slice)]  # type: ignore
 
                 if isinstance(this_slice[0], slice) and this_slice[0].stop is None:
                     global_nsteps = None
@@ -1476,7 +1482,10 @@ def scan_save_mem(fgraph, node):
                 # for mitsots and sitsots (because mitmots are not
                 # currently supported by the mechanism) and only if
                 # the pre-allocation mechanism is activated.
-                prealloc_outs = config.scan__allow_output_prealloc
+                prealloc_outs = (
+                    backend_supports_output_pre_allocation
+                    and config.scan__allow_output_prealloc
+                )
 
                 first_mitsot_idx = op_info.n_mit_mot
                 last_sitsot_idx = (
@@ -1485,6 +1494,8 @@ def scan_save_mem(fgraph, node):
                 preallocable_output = first_mitsot_idx <= i <= last_sitsot_idx
 
                 if prealloc_outs and preallocable_output:
+                    # TODO: If there's only one output or other outputs do not depend
+                    # on the same input, we could reduce the buffer size to the minimum
                     pval = select_max(nw_steps - start + init_l[i], init_l[i] + 1)
                 else:
                     pval = select_max(nw_steps - start + init_l[i], init_l[i])
@@ -1651,7 +1662,7 @@ def scan_save_mem(fgraph, node):
             name=op.name,
             allow_gc=op.allow_gc,
         )
-        new_outs = new_op(*node_ins, return_list=True)
+        new_outs = cast(list[TensorVariable], new_op(*node_ins, return_list=True))
 
         old_new = []
         # 3.7 Get replace pairs for those outputs that do not change
@@ -1681,7 +1692,7 @@ def scan_save_mem(fgraph, node):
                     sl_ins = get_slice_elements(
                         nw_slice, lambda entry: isinstance(entry, Variable)
                     )
-                    new_o = subtens(new_outs[nw_pos], *sl_ins)
+                    new_o = cast(TensorVariable, subtens(new_outs[nw_pos], *sl_ins))
                     if new_o.ndim > 0:
                         new_o = new_o[:: cnf_slice[1]]
                     replaced_outs.append(idx)
@@ -1736,7 +1747,7 @@ def scan_save_mem(fgraph, node):
                     sl_ins = get_slice_elements(
                         nw_slice, lambda entry: isinstance(entry, Variable)
                     )
-                    new_o = subtens(new_outs[nw_pos], *sl_ins)
+                    new_o = cast(TensorVariable, subtens(new_outs[nw_pos], *sl_ins))
                     if new_o.ndim > 0:
                         new_o = new_o[:: cnf_slice[1]]
                     old_new += [(old, new_o)]
@@ -1767,6 +1778,20 @@ def scan_save_mem(fgraph, node):
     return False
 
 
+@node_rewriter([Scan])
+def scan_save_mem_prealloc(fgraph, node):
+    return scan_save_mem_rewrite(
+        fgraph, node, backend_supports_output_pre_allocation=True
+    )
+
+
+@node_rewriter([Scan])
+def scan_save_mem_no_prealloc(fgraph, node):
+    return scan_save_mem_rewrite(
+        fgraph, node, backend_supports_output_pre_allocation=False
+    )
+
+
 class ScanMerge(GraphRewriter):
     r"""Graph optimizer that merges different scan ops.
 
@@ -2494,12 +2519,21 @@ def scan_push_out_dot1(fgraph, node):
 optdb.register("scan_eqopt2", scan_eqopt2, "fast_run", "scan", position=1.6)
 # ScanSaveMem should execute only once per node.
 optdb.register(
-    "scan_save_mem",
-    in2out(scan_save_mem, ignore_newtrees=True),
+    "scan_save_mem_prealloc",
+    in2out(scan_save_mem_prealloc, ignore_newtrees=True),
     "fast_run",
     "scan",
     position=1.61,
 )
+optdb.register(
+    "scan_save_mem_no_prealloc",
+    in2out(scan_save_mem_no_prealloc, ignore_newtrees=True),
+    "numba",
+    "jax",
+    "pytorch",
+    use_db_name_as_tag=False,
+    position=1.61,
+)
 optdb.register(
     "scan_make_inplace",
     ScanInplaceOptimizer(),
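
The backend_supports_output_pre_allocation flag only changes the buffer-length floor used when trimming a recurring output: with pre-allocation the buffer keeps one slot more than the number of initial states (init_l[i] + 1), without it the initial states alone set the floor. A rough stand-alone illustration of the two `pval = select_max(...)` branches above (plain Python, not PyTensor internals; `needed` stands in for `nw_steps - start + init_l[i]`, whose exact value depends on how the scan output is sliced):

def buffer_len(needed: int, init_l: int, backend_preallocates: bool) -> int:
    # Mirrors select_max(nw_steps - start + init_l[i], init_l[i] + 1) when the
    # backend pre-allocates outputs, and select_max(..., init_l[i]) otherwise.
    floor = init_l + 1 if backend_preallocates else init_l
    return max(needed, floor)

# A sitsot with one initial state whose final value alone is read afterwards:
print(buffer_len(needed=1, init_l=1, backend_preallocates=True))   # 2 (C backend)
print(buffer_len(needed=1, init_l=1, backend_preallocates=False))  # 1 (JIT backends)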

pytensor/tensor/subtensor.py

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 import logging
 import sys
 import warnings
-from collections.abc import Callable, Iterable
+from collections.abc import Callable, Iterable, Sequence
 from itertools import chain, groupby
 from textwrap import dedent
 from typing import cast, overload
@@ -645,7 +645,7 @@ def indexed_result_shape(array_shape, indices, indices_are_shapes=False):
 
 
 def get_slice_elements(
-    idxs: list,
+    idxs: Sequence,
     cond: Callable = lambda x: isinstance(x, Variable),
 ) -> list:
     """Extract slice elements conditional on a given predicate function.

tests/link/numba/test_scan.py

Lines changed: 2 additions & 3 deletions
@@ -465,7 +465,7 @@ def buffer_tester(self, n_steps, op_size, buffer_size, benchmark=None):
         )
         if buffer_size == "unit":
             xs_kept = xs[-1]  # Only last state is used
-            expected_buffer_size = 2
+            expected_buffer_size = 1
         elif buffer_size == "aligned":
             xs_kept = xs[-2:]  # The buffer will be aligned at the end of the 9 steps
             expected_buffer_size = 2
@@ -555,8 +555,7 @@ def f_pow2(x_tm2, x_tm1):
             accept_inplace=True,
             on_unused_input="ignore",
         )
-        assert tuple(mitsot_buffer_shape) == (3,)
-
+        assert tuple(mitsot_buffer_shape) == (2,)
         if benchmark is not None:
             numba_fn.trust_input = True
             benchmark(numba_fn, *test_vals)
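
The updated expectations reflect the smaller buffers: one slot when only the last state of a single-tap recurrence is kept, and two slots for the two-tap f_pow2 recurrence. A stand-alone sketch of the two-tap case (the recurrence, names, and the "NUMBA" mode string are illustrative; inspecting the internal buffer shape, as the test does, needs access to the compiled scan node and is omitted here):

import pytensor
import pytensor.tensor as pt

x_init = pt.dvector("x_init")  # holds the two initial taps x[t-2], x[t-1]
xs, _ = pytensor.scan(
    fn=lambda x_tm2, x_tm1: x_tm2 + x_tm1,  # hypothetical two-tap step
    outputs_info=[{"initial": x_init, "taps": [-2, -1]}],
    n_steps=10,
)

# Only the final value is read, so a two-element rotating buffer suffices
# for the mit-sot state in the Numba backend after this commit.
fn = pytensor.function([x_init], xs[-1], mode="NUMBA")
print(fn([1.0, 1.0]))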

tests/scan/test_rewriting.py

Lines changed: 3 additions & 4 deletions
@@ -742,7 +742,7 @@ def rnn_step1(
         utt.assert_allclose(f_opt_output, f_no_opt_output)
 
     def test_non_zero_init(self):
-        """Test the case where the initial value for the nitsot output is non-zero."""
+        """Test the case where the initial value for the sitsot output is non-zero."""
 
         input1 = tensor3()
         input2 = tensor3()
@@ -759,8 +759,7 @@ def inner_fct(seq1, seq2, seq3, previous_output):
 
         init = pt.as_tensor_variable(np.random.normal(size=(3, 7)))
 
-        # Compile the function twice, once with the optimization and once
-        # without
+        # Compile the function twice, once with the optimization and once without
         opt_mode = mode.including("scan")
         h, _ = pytensor.scan(
             inner_fct,
@@ -792,7 +791,7 @@ def inner_fct(seq1, seq2, seq3, previous_output):
         output_opt = f_opt(input1_value, input2_value, input3_value)
         output_no_opt = f_no_opt(input1_value, input2_value, input3_value)
 
-        utt.assert_allclose(output_opt, output_no_opt)
+        np.testing.assert_allclose(output_opt, output_no_opt)
 
 
 class TestScanMerge:
