
Commit 9a7bad2

Rewrite dots as multiplication without summation
1 parent 92ebf60 commit 9a7bad2

File tree

8 files changed: +210 -36 lines changed


pytensor/tensor/math.py

+35-9
@@ -29,7 +29,7 @@
     stack,
     switch,
 )
-from pytensor.tensor.blockwise import Blockwise, vectorize_node_fallback
+from pytensor.tensor.blockwise import Blockwise
 from pytensor.tensor.elemwise import (
     CAReduce,
     Elemwise,
@@ -2220,7 +2220,7 @@ def outer(x, y):
         x = x.flatten()
     if y.ndim != 1:
         y = y.flatten()
-    return dot(x.dimshuffle(0, "x"), y.dimshuffle("x", 0))
+    return mul.outer(x, y)


 class All(FixedOpCAReduce):
@@ -2726,6 +2726,22 @@ def logsumexp(x, axis=None, keepdims=False):
     return log(sum(exp(x), axis=axis, keepdims=keepdims))


+# Predefine all batched variations of Dot
+_inner_prod = Blockwise(
+    _dot,
+    signature="(n),(n)->()",
+)
+
+_matrix_vec_prod = Blockwise(
+    _dot,
+    signature="(m,k),(k)->(m)",
+)
+
+_vec_matrix_prod = Blockwise(
+    _dot,
+    signature="(k),(k,n)->(n)",
+)
+
 _matrix_matrix_matmul = Blockwise(
     _dot,
     signature="(m,k),(k,n)->(m,n)",
@@ -2795,14 +2811,24 @@ def matmul(x1: "ArrayLike", x2: "ArrayLike", dtype: Optional["DTypeLike"] = None


 @_vectorize_node.register(Dot)
-def vectorize_node_dot_to_matmul(op, node, batched_x, batched_y):
+def vectorize_node_dot(op, node, batched_x, batched_y):
     old_x, old_y = node.inputs
-    if old_x.type.ndim == 2 and old_y.type.ndim == 2:
-        # If original input is equivalent to a matrix-matrix product,
-        # return specialized Matmul Op to avoid unnecessary new Ops.
-        return matmul(batched_x, batched_y).owner
-    else:
-        return vectorize_node_fallback(op, node, batched_x, batched_y)
+    old_x_ndim = old_x.type.ndim
+    old_y_ndim = old_y.type.ndim
+    match (old_x_ndim, old_y_ndim):
+        case (1, 1):
+            batch_op = _inner_prod
+        case (2, 1):
+            batch_op = _matrix_vec_prod
+        case (1, 2):
+            batch_op = _vec_matrix_prod
+        case (2, 2):
+            batch_op = _matrix_matrix_matmul
+        case _:
+            raise ValueError(
+                f"Core dot Op should have 1D or 2D inputs, got {old_x_ndim}D and {old_y_ndim}D."
+            )
+    return batch_op(batched_x, batched_y).owner


 def nan_to_num(x, nan=0.0, posinf=None, neginf=None):
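
For orientation (editor's note, not part of the diff): the four Blockwise wrappers above give the core _dot a gufunc-style signature, so any leading dimensions beyond the core 1D/2D shapes are treated as batch dimensions. A minimal NumPy sketch of the semantics those signatures describe, using einsum as the reference:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(7, 5, 3))     # batched vector, core shape (3,)
y = rng.normal(size=(7, 5, 3))
A = rng.normal(size=(7, 5, 4, 3))  # batched matrix, core shape (4, 3)
B = rng.normal(size=(7, 5, 3, 2))  # batched matrix, core shape (3, 2)

# "(n),(n)->()"        batched inner product, result shape (7, 5)
inner = np.einsum("...n,...n->...", x, y)
# "(m,k),(k)->(m)"     batched matrix-vector product, result shape (7, 5, 4)
mat_vec = np.einsum("...mk,...k->...m", A, x)
# "(k),(k,n)->(n)"     batched vector-matrix product, result shape (7, 5, 2)
vec_mat = np.einsum("...k,...kn->...n", x, B)
# "(m,k),(k,n)->(m,n)" batched matrix-matrix product, result shape (7, 5, 4, 2)
mat_mat = np.einsum("...mk,...kn->...mn", A, B)

assert inner.shape == (7, 5)
assert mat_vec.shape == (7, 5, 4)
assert vec_mat.shape == (7, 5, 2)
assert mat_mat.shape == (7, 5, 4, 2)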

pytensor/tensor/rewriting/math.py

+65
@@ -44,6 +44,11 @@
     Prod,
     Sum,
     _conj,
+    _dot,
+    _inner_prod,
+    _matrix_matrix_matmul,
+    _matrix_vec_prod,
+    _vec_matrix_prod,
     add,
     digamma,
     dot,
@@ -242,6 +247,66 @@ def local_batched_matmul_to_core_matmul(fgraph, node):
     return None


+@register_canonicalize
+@register_specialize
+@node_rewriter(
+    [_dot, _inner_prod, _matrix_vec_prod, _vec_matrix_prod, _matrix_matrix_matmul]
+)
+def local_dot_to_mul(fgraph, node):
+    """Rewrite dots that correspond to multiplication without summation."""
+    a, b = node.inputs
+    a_st_shape = a.type.shape
+    b_st_shape = b.type.shape
+    if isinstance(node.op, Dot):
+        core_a_ndim = a.type.ndim
+        core_b_ndim = b.type.ndim
+    else:
+        # Blockwise variants of Dot
+        core_a_ndim = len(node.op.inputs_sig[0])
+        core_b_ndim = len(node.op.inputs_sig[1])
+
+    if core_a_ndim > 2 or core_b_ndim > 2:
+        # Shouldn't happen, but here just in case
+        return None
+
+    if core_b_ndim == 1:
+        if a_st_shape[-1] == 1 or b_st_shape[-1] == 1:
+            if core_a_ndim == 1:
+                # inner product: (..., 1) * (..., 1) -> (...)
+                # just squeeze the last dimensions of a and b
+                new_a = a.squeeze(-1)
+                new_b = b.squeeze(-1)
+            else:
+                # matrix-vector product: (..., m, 1) * (..., 1) -> (..., m)
+                # the last dimension of b is already aligned for the elemwise multiplication
+                # after we squeeze the last dimension of a
+                new_a = a.squeeze(-1)
+                new_b = b
+        else:
+            return None
+
+    else:
+        if a_st_shape[-1] == 1 or b_st_shape[-2] == 1:
+            if core_a_ndim == 1:
+                # vector-matrix product: (..., 1) * (..., 1, n) -> (..., n)
+                # the last dimension of a is already aligned for the elemwise multiplication
+                # after we squeeze the second-to-last dimension of b
+                new_a = a
+                new_b = b.squeeze(-2)
+            else:
+                # matrix-matrix product: (..., m, 1) * (..., 1, n) -> (..., m, n)
+                # the dimensions of a and b are already aligned for the elemwise multiplication
+                new_a = a
+                new_b = b
+        else:
+            return None
+
+    new_a = copy_stack_trace(a, new_a)
+    new_b = copy_stack_trace(b, new_b)
+    new_out = copy_stack_trace(node.out, mul(new_a, new_b))
+    return [new_out]
+
+
 def is_inverse_pair(node_op, prev_op, inv_pair):
     """
     Given two consecutive operations, check if they are the
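
To make the rewrite's case analysis concrete, here is a small NumPy check (illustrative only, not part of the commit) that a dot whose contracted dimension has length 1 reduces to a broadcasted elementwise multiplication, mirroring the four branches above:

import numpy as np

rng = np.random.default_rng(0)
m, n = 4, 3
a_vec = rng.normal(size=(1,))    # 1D inputs whose only dimension is the contracted one
b_vec = rng.normal(size=(1,))
a_col = rng.normal(size=(m, 1))  # 2D input with a length-1 contracted (last) dimension
b_row = rng.normal(size=(1, n))  # 2D input with a length-1 contracted (first) dimension

# inner product (1),(1)->(): squeeze both inputs and multiply
np.testing.assert_allclose(np.dot(a_vec, b_vec), a_vec.squeeze(-1) * b_vec.squeeze(-1))
# matrix-vector (m,1),(1)->(m): squeeze the last dimension of a
np.testing.assert_allclose(np.dot(a_col, b_vec), a_col.squeeze(-1) * b_vec)
# vector-matrix (1),(1,n)->(n): squeeze the second-to-last dimension of b
np.testing.assert_allclose(np.dot(a_vec, b_row), a_vec * b_row.squeeze(-2))
# matrix-matrix (m,1),(1,n)->(m,n): shapes already broadcast to (m, n)
np.testing.assert_allclose(np.dot(a_col, b_row), a_col * b_row)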

tests/compile/test_profiling.py

+8-2
@@ -6,7 +6,7 @@
 import numpy as np

 import pytensor.tensor as pt
-from pytensor.compile import ProfileStats
+from pytensor.compile import ProfileStats, get_mode
 from pytensor.compile.function import function
 from pytensor.configdefaults import config
 from pytensor.ifelse import ifelse
@@ -28,7 +28,10 @@ def test_profiling(self):
         x = [fvector(f"val{i}") for i in range(3)]

         z = []
-        z += [pt.outer(x[i], x[i + 1]).sum(axis=1) for i in range(len(x) - 1)]
+        z += [
+            pt.dot(x[i][:, None], x[i + 1][None, :]).sum(axis=1)
+            for i in range(len(x) - 1)
+        ]
         z += [x[i] + x[i + 1] for i in range(len(x) - 1)]

         p = ProfileStats(False, gpu_checks=False)
@@ -38,6 +41,9 @@ def test_profiling(self):
         else:
             m = None

+        # This test requires an unoptimized outer mul written as a dot
+        m = get_mode(m).excluding("local_dot_to_mul")
+
         f = function(x, z, profile=p, name="test_profiling", mode=m)

         inp = [np.arange(1024, dtype="float32") + 1 for i in range(len(x))]

tests/tensor/rewriting/test_math.py

+50-1
@@ -16,7 +16,8 @@
 from pytensor.compile.mode import Mode, get_default_mode, get_mode
 from pytensor.compile.ops import DeepCopyOp, deep_copy_op
 from pytensor.configdefaults import config
-from pytensor.graph.basic import Apply, equal_computations
+from pytensor.graph import vectorize_graph
+from pytensor.graph.basic import Apply, ancestors, equal_computations
 from pytensor.graph.fg import FunctionGraph
 from pytensor.graph.rewriting.basic import (
     SequentialNodeRewriter,
@@ -4571,3 +4572,51 @@ def test_log_kv_stabilization():
         out.eval({x: 1000.0}, mode=mode),
         -1003.2180912984705,
     )
+
+
+@pytest.mark.parametrize(
+    "a_shape,b_shape",
+    [
+        ((1,), (1,)),
+        ((3, 1), (1,)),
+        ((1,), (1, 3)),
+        ((3, 1), (1, 3)),
+    ],
+)
+@pytest.mark.parametrize("batched", (False, True))
+def test_local_dot_to_mul(batched, a_shape, b_shape):
+    a = tensor("a", shape=a_shape)
+    b = tensor("b", shape=b_shape)
+
+    out = dot(a, b)
+    if batched:
+        batch_a = tensor("batch_a", shape=(1, 5, *a_shape))
+        batch_b = tensor("batch_b", shape=(7, 1, *b_shape))
+        out = vectorize_graph(out, {a: batch_a, b: batch_b})
+        a = batch_a
+        b = batch_b
+
+    assert (
+        sum(
+            isinstance(var.owner.op, (Blockwise | Dot))
+            for var in ancestors([out])
+            if var.owner
+        )
+        == 1
+    )
+
+    rewritten_out = rewrite_graph(out)
+    assert rewritten_out.type.shape == out.type.shape
+    assert not any(
+        isinstance(var.owner.op, (Blockwise | Dot))
+        for var in ancestors([rewritten_out])
+        if var.owner
+    )
+
+    a_test = np.random.normal(size=a.type.shape).astype(a.type.dtype)
+    b_test = np.random.normal(size=b.type.shape).astype(b.type.dtype)
+    test_mode = Mode(linker="py", optimizer=None)
+    np.testing.assert_allclose(
+        out.eval({a: a_test, b: b_test}, mode=test_mode),
+        rewritten_out.eval({a: a_test, b: b_test}, mode=test_mode),
+    )
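
An aside (editor's note, not part of the test file): in the batched variant the batch dimensions (1, 5) and (7, 1) also broadcast against each other. A NumPy analogue for the a_shape=(3, 1), b_shape=(1, 3) case, showing that the batched dot and the plain broadcasted product agree:

import numpy as np

rng = np.random.default_rng(0)
batch_a = rng.normal(size=(1, 5, 3, 1))
batch_b = rng.normal(size=(7, 1, 1, 3))

# Batched matrix-matrix product: batch dims broadcast to (7, 5), core dims give (3, 3)
batched_dot = np.matmul(batch_a, batch_b)  # shape (7, 5, 3, 3)
# The contracted dimension has length 1, so plain broadcasting gives the same values
batched_mul = batch_a * batch_b            # shape (7, 5, 3, 3)
np.testing.assert_allclose(batched_dot, batched_mul)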

tests/tensor/test_basic.py

+2-2
@@ -770,9 +770,9 @@ def test_alloc_constant_folding(self):
             self.allocs,
             [
                 # IncSubtensor1
-                (some_matrix[:60], 2),
+                (some_matrix[:60], 1),
                 # AdvancedIncSubtensor1
-                (some_matrix[arange(60)], 2),
+                (some_matrix[arange(60)], 1),
                 # AdvancedIncSubtensor
                 (some_matrix[idx, idx], 1),
             ],

tests/tensor/test_blas.py

+25-10
@@ -40,7 +40,7 @@
     ger,
     ger_destructive,
 )
-from pytensor.tensor.math import Dot, dot, mean, mul, outer, sigmoid
+from pytensor.tensor.math import Dot, dot, mean, mul, sigmoid
 from pytensor.tensor.rewriting.blas import local_dot22_to_dot22scalar, local_gemm_to_ger
 from pytensor.tensor.type import (
     cmatrix,
@@ -1721,9 +1721,12 @@ def clone(self, op):
 class TestGer(unittest_tools.OptimizationTestMixin):
     shared = staticmethod(shared)

+    def outer_via_dot(self, x, y):
+        return pt.dot(x[:, None], y[None, :])
+
     def setup_method(self):
         self.mode = pytensor.compile.get_default_mode().including("fast_run")
-        self.mode = self.mode.excluding("c_blas", "scipy_blas")
+        self.mode = self.mode.excluding("c_blas", "scipy_blas", "local_dot_to_mul")
         dtype = self.dtype = "float64"  # optimization isn't dtype-dependent
         self.A = tensor(dtype=dtype, shape=(None, None))
         self.a = tensor(dtype=dtype, shape=())
@@ -1795,7 +1798,7 @@ def test_b_nonconst_does_not_triggers_ger(self):

     def test_outer(self):
         rng = np.random.default_rng(unittest_tools.fetch_seed())
-        f = self.function([self.x, self.y], outer(self.x, self.y))
+        f = self.function([self.x, self.y], self.outer_via_dot(self.x, self.y))
         self.assertFunctionContains(f, self.ger_destructive)
         f(
             rng.random(5).astype(self.dtype),
@@ -1804,7 +1807,9 @@

     def test_A_plus_outer(self):
         rng = np.random.default_rng(unittest_tools.fetch_seed())
-        f = self.function([self.A, self.x, self.y], self.A + outer(self.x, self.y))
+        f = self.function(
+            [self.A, self.x, self.y], self.A + self.outer_via_dot(self.x, self.y)
+        )
         self.assertFunctionContains(f, self.ger)
         f(
             rng.random((5, 4)).astype(self.dtype),
@@ -1820,7 +1825,7 @@ def test_A_plus_outer(self):
     def test_A_plus_scaled_outer(self):
         rng = np.random.default_rng(unittest_tools.fetch_seed())
         f = self.function(
-            [self.A, self.x, self.y], self.A + 0.1 * outer(self.x, self.y)
+            [self.A, self.x, self.y], self.A + 0.1 * self.outer_via_dot(self.x, self.y)
         )
         self.assertFunctionContains(f, self.ger)
         f(
@@ -1839,7 +1844,7 @@ def test_scaled_A_plus_scaled_outer(self):
         f = self.function(
             [self.A, self.x, self.y],
             np.asarray(0.2, self.dtype) * self.A
-            + np.asarray(0.1, self.dtype) * outer(self.x, self.y),
+            + np.asarray(0.1, self.dtype) * self.outer_via_dot(self.x, self.y),
         )
         # Why gemm? This make the graph simpler did we test that it
         # make it faster?
@@ -1863,7 +1868,7 @@ def given_dtype(self, dtype, M, N, *, destructive=True):
         x = tensor(dtype=dtype, shape=(None,))
         y = tensor(dtype=dtype, shape=(None,))

-        f = self.function([A, x, y], A + 0.1 * outer(x, y))
+        f = self.function([A, x, y], A + 0.1 * self.outer_via_dot(x, y))
         self.assertFunctionContains(
             f, self.ger_destructive if destructive else self.ger
         )
@@ -1923,7 +1928,12 @@ def test_inplace(self):
             [self.x, self.y],
             [],
             updates=[
-                (A, A + pt.constant(0.1, dtype=self.dtype) * outer(self.x, self.y))
+                (
+                    A,
+                    A
+                    + pt.constant(0.1, dtype=self.dtype)
+                    * self.outer_via_dot(self.x, self.y),
+                )
             ],
         )
         self.assertFunctionContains(f, self.ger_destructive)
@@ -2264,10 +2274,15 @@ def cmp_ger(self, a_shp, b_shp, c_shp, rng):
         b_dev = b.get_value(borrow=False, return_internal_type=True)
         c_dev = c.get_value(borrow=False, return_internal_type=True)

-        f_n = function([], [], updates=[(a, (a + l * outer(b, c)))], mode=self.mode)
+        f_n = function(
+            [], [], updates=[(a, (a + l * self.outer_via_dot(b, c)))], mode=self.mode
+        )

         f_t = function(
-            [], [], updates=[(a_t, (a_t + l * outer(b, c).T))], mode=self.mode
+            [],
+            [],
+            updates=[(a_t, (a_t + l * self.outer_via_dot(b, c).T))],
+            mode=self.mode,
         )

         # Try with all stride patterns, and all transposed patterns
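
A side note on the outer_via_dot helper (the reasoning is implied by the diff rather than stated in it): since outer is now built from mul.outer and the new local_dot_to_mul rewrite would eliminate a dot whose contracted dimension has length 1, these GER/BLAS tests construct the outer product explicitly as a dot and exclude the rewrite, so the graph still contains the Dot node the BLAS optimizations look for. Numerically the helper is just an outer product, as this NumPy sketch shows:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=5)
y = rng.normal(size=4)

# A (5, 1) @ (1, 4) dot is the same outer product as np.outer(x, y)
np.testing.assert_allclose(np.dot(x[:, None], y[None, :]), np.outer(x, y))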
