Commit 5023274

Merge branch 'main' into numba-cholesky
2 parents: 2761406 + c5b96d9

25 files changed: +688 / -77 lines

pytensor/graph/basic.py

Lines changed: 4 additions & 3 deletions
@@ -1439,15 +1439,16 @@ def io_toposort(
         order = []
         while todo:
             cur = todo.pop()
-            # We suppose that all outputs are always computed
-            if cur.outputs[0] in computed:
+            if all(out in computed for out in cur.outputs):
                 continue
             if all(i in computed or i.owner is None for i in cur.inputs):
                 computed.update(cur.outputs)
                 order.append(cur)
             else:
                 todo.append(cur)
-                todo.extend(i.owner for i in cur.inputs if i.owner)
+                todo.extend(
+                    i.owner for i in cur.inputs if (i.owner and i not in computed)
+                )
         return order
 
     compute_deps = None

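A minimal usage sketch, assuming PyTensor's public io_toposort and nlinalg.svd APIs: the single Apply node that produces all three SVD outputs appears exactly once in the returned order, which is what the all-outputs check above is guarding.

    import pytensor.tensor as pt
    from pytensor.graph.basic import io_toposort
    from pytensor.tensor.nlinalg import svd

    A = pt.matrix("A")
    U, s, Vt = svd(A)                     # one Apply node with three outputs
    order = io_toposort([A], [U, s, Vt])
    assert len(order) == 1                # the multi-output node is scheduled once
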
pytensor/graph/replace.py

Lines changed: 5 additions & 0 deletions
@@ -306,6 +306,11 @@ def vectorize_graph(
         vect_inputs = [vect_vars.get(inp, inp) for inp in node.inputs]
         vect_node = vectorize_node(node, *vect_inputs)
         for output, vect_output in zip(node.outputs, vect_node.outputs):
+            if output in vect_vars:
+                # This can happen when some outputs of a multi-output node are given a replacement,
+                # while some of the remaining outputs are still needed in the graph.
+                # We make sure we don't overwrite the provided replacement with the newly vectorized output
+                continue
             vect_vars[output] = vect_output
 
     seq_vect_outputs = [vect_vars[out] for out in seq_outputs]

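For context, vectorize_graph is typically called with a graph output and a replace mapping from core inputs to batched inputs; the guarded branch above only matters when replacements are supplied for some outputs of a multi-output node. A minimal sketch, assuming the current pytensor.graph.replace API and an elemwise-only graph:

    import pytensor.tensor as pt
    from pytensor.graph.replace import vectorize_graph

    x = pt.vector("x")
    y = pt.exp(x) + 1

    batch_x = pt.matrix("batch_x")                      # batched stand-in for x
    batch_y = vectorize_graph(y, replace={x: batch_x})  # same graph, applied row-wise
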
pytensor/link/jax/dispatch/slinalg.py

Lines changed: 9 additions & 1 deletion
@@ -1,7 +1,7 @@
 import jax
 
 from pytensor.link.jax.dispatch.basic import jax_funcify
-from pytensor.tensor.slinalg import Cholesky, Solve, SolveTriangular
+from pytensor.tensor.slinalg import BlockDiagonal, Cholesky, Solve, SolveTriangular
 
 
 @jax_funcify.register(Cholesky)
@@ -45,3 +45,11 @@ def solve_triangular(A, b):
         )
 
     return solve_triangular
+
+
+@jax_funcify.register(BlockDiagonal)
+def jax_funcify_BlockDiagonalMatrix(op, **kwargs):
+    def block_diag(*inputs):
+        return jax.scipy.linalg.block_diag(*inputs)
+
+    return block_diag

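With this registration, a graph containing the new BlockDiagonal Op can be compiled for the JAX backend. A rough sketch, assuming the dense helper added elsewhere in this merge is exposed as pytensor.tensor.slinalg.block_diag and that JAX is installed:

    import numpy as np
    import pytensor
    import pytensor.tensor as pt
    from pytensor.tensor.slinalg import block_diag

    A = pt.matrix("A")
    B = pt.matrix("B")
    f = pytensor.function([A, B], block_diag(A, B), mode="JAX")
    print(f(np.eye(2), np.full((1, 3), 7.0)))  # a 3x5 block-diagonal result
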
pytensor/link/numba/dispatch/slinalg.py

Lines changed: 22 additions & 2 deletions
@@ -9,7 +9,7 @@
 
 from pytensor.link.numba.dispatch import basic as numba_basic
 from pytensor.link.numba.dispatch.basic import numba_funcify
-from pytensor.tensor.slinalg import Cholesky, SolveTriangular
+from pytensor.tensor.slinalg import Cholesky, BlockDiagonal, SolveTriangular
 
 
 _PTR = ctypes.POINTER
@@ -299,7 +299,6 @@ def solve_triangular(a, b):
 
     return solve_triangular
 
-
 def _cholesky(a, lower=False, overwrite_a=False, check_finite=True):
     return linalg.cholesky(
         a, lower=lower, overwrite_a=overwrite_a, check_finite=check_finite
@@ -357,3 +356,24 @@ def nb_cholesky(a):
         return res
 
     return nb_cholesky
+
+@numba_funcify.register(BlockDiagonal)
+def numba_funcify_BlockDiagonal(op, node, **kwargs):
+    dtype = node.outputs[0].dtype
+
+    # TODO: Why do we always inline all functions? It doesn't work with starred args, so can't use it in this case.
+    @numba_basic.numba_njit(inline="never")
+    def block_diag(*arrs):
+        shapes = np.array([a.shape for a in arrs], dtype="int")
+        out_shape = [int(s) for s in np.sum(shapes, axis=0)]
+        out = np.zeros((out_shape[0], out_shape[1]), dtype=dtype)
+
+        r, c = 0, 0
+        for arr, shape in zip(arrs, shapes):
+            rr, cc = shape
+            out[r : r + rr, c : c + cc] = arr
+            r += rr
+            c += cc
+        return out
+
+    return block_diag

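The Numba kernel is intended to mirror scipy.linalg.block_diag for 2-D inputs. A quick consistency check, again assuming block_diag is importable from pytensor.tensor.slinalg and Numba is installed:

    import numpy as np
    import scipy.linalg
    import pytensor
    import pytensor.tensor as pt
    from pytensor.tensor.slinalg import block_diag

    A, B = pt.matrix("A"), pt.matrix("B")
    f = pytensor.function([A, B], block_diag(A, B), mode="NUMBA")
    a, b = np.eye(2), np.arange(6.0).reshape(2, 3)
    np.testing.assert_allclose(f(a, b), scipy.linalg.block_diag(a, b))
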
pytensor/sparse/basic.py

Lines changed: 87 additions & 9 deletions
@@ -7,6 +7,7 @@
 TODO: Automatic methods for determining best sparse format?
 
 """
+from typing import Literal
 from warnings import warn
 
 import numpy as np
@@ -47,6 +48,7 @@
     trunc,
 )
 from pytensor.tensor.shape import shape, specify_broadcastable
+from pytensor.tensor.slinalg import BaseBlockDiagonal, _largest_common_dtype
 from pytensor.tensor.type import TensorType
 from pytensor.tensor.type import continuous_dtypes as tensor_continuous_dtypes
 from pytensor.tensor.type import discrete_dtypes as tensor_discrete_dtypes
@@ -60,7 +62,6 @@
 
 sparse_formats = ["csc", "csr"]
 
-
 """
 Types of sparse matrices to use for testing.
 
@@ -183,7 +184,6 @@ def as_sparse_variable(x, name=None, ndim=None, **kwargs):
 
 as_sparse = as_sparse_variable
 
-
 as_sparse_or_tensor_variable = as_symbolic
 
 
@@ -1800,7 +1800,7 @@ def infer_shape(self, fgraph, node, shapes):
         return r
 
     def __str__(self):
-        return f"{self.__class__.__name__ }{{axis={self.axis}}}"
+        return f"{self.__class__.__name__}{{axis={self.axis}}}"
 
 
 def sp_sum(x, axis=None, sparse_grad=False):
@@ -2775,19 +2775,14 @@ def comparison(self, x, y):
 
 greater_equal_s_d = GreaterEqualSD()
 
-
 eq = __ComparisonSwitch(equal_s_s, equal_s_d, equal_s_d)
 
-
 neq = __ComparisonSwitch(not_equal_s_s, not_equal_s_d, not_equal_s_d)
 
-
 lt = __ComparisonSwitch(less_than_s_s, less_than_s_d, greater_than_s_d)
 
-
 gt = __ComparisonSwitch(greater_than_s_s, greater_than_s_d, less_than_s_d)
 
-
 le = __ComparisonSwitch(less_equal_s_s, less_equal_s_d, greater_equal_s_d)
 
 ge = __ComparisonSwitch(greater_equal_s_s, greater_equal_s_d, less_equal_s_d)
@@ -2992,7 +2987,7 @@ def __str__(self):
         l = []
         if self.inplace:
             l.append("inplace")
-        return f"{self.__class__.__name__ }{{{', '.join(l)}}}"
+        return f"{self.__class__.__name__}{{{', '.join(l)}}}"
 
     def make_node(self, x):
         """
@@ -3291,6 +3286,7 @@ class TrueDot(Op):
     # Simplify code by splitting into DotSS and DotSD.
 
     __props__ = ()
+
    # The grad_preserves_dense attribute doesn't change the
    # execution behavior. To let the optimizer merge nodes with
    # different values of this attribute we shouldn't compare it
@@ -4260,3 +4256,85 @@ def grad(self, inputs, grads):
 
 
 construct_sparse_from_list = ConstructSparseFromList()
+
+
+class SparseBlockDiagonal(BaseBlockDiagonal):
+    __props__ = (
+        "n_inputs",
+        "format",
+    )
+
+    def __init__(self, n_inputs: int, format: Literal["csc", "csr"] = "csc"):
+        super().__init__(n_inputs)
+        self.format = format
+
+    def make_node(self, *matrices):
+        matrices = self._validate_and_prepare_inputs(
+            matrices, as_sparse_or_tensor_variable
+        )
+        dtype = _largest_common_dtype(matrices)
+        out_type = matrix(format=self.format, dtype=dtype)
+
+        return Apply(self, matrices, [out_type])
+
+    def perform(self, node, inputs, output_storage, params=None):
+        dtype = node.outputs[0].type.dtype
+        output_storage[0][0] = scipy.sparse.block_diag(
+            inputs, format=self.format
+        ).astype(dtype)
+
+
+def block_diag(*matrices: TensorVariable, format: Literal["csc", "csr"] = "csc"):
+    r"""
+    Construct a block diagonal matrix from a sequence of input matrices.
+
+    Given the inputs `A`, `B` and `C`, the output will have these arrays arranged on the diagonal:
+
+    [[A, 0, 0],
+     [0, B, 0],
+     [0, 0, C]]
+
+    Parameters
+    ----------
+    A, B, C ... : tensors
+        Input tensors to form the block diagonal matrix. The last two dimensions of the inputs will be used, and all
+        inputs should have at least 2 dimensions.
+
+        Note that the input matrices need not be sparse themselves, and will be automatically converted to the
+        requested format if they are not.
+
+    format: str, optional
+        The format of the output sparse matrix. One of 'csr' or 'csc'. Default is 'csc'.
+
+    Returns
+    -------
+    out: sparse matrix tensor
+        Symbolic sparse matrix in the specified format.
+
+    Examples
+    --------
+    Create a sparse block diagonal matrix from two sparse 2x2 matrices:
+
+    .. code-block:: python
+        import numpy as np
+        from pytensor.sparse import block_diag
+        from scipy.sparse import csr_matrix
+
+        A = csr_matrix([[1, 2], [3, 4]])
+        B = csr_matrix([[5, 6], [7, 8]])
+        result_sparse = block_diag(A, B, format='csr')
+
+        print(result_sparse)
+        >>> SparseVariable{csr,int32}
+
+        print(result_sparse.toarray().eval())
+        >>> array([[1, 2, 0, 0],
+        >>>        [3, 4, 0, 0],
+        >>>        [0, 0, 5, 6],
+        >>>        [0, 0, 7, 8]])
+    """
+    if len(matrices) == 1:
+        return matrices
+
+    _sparse_block_diagonal = SparseBlockDiagonal(n_inputs=len(matrices), format=format)
+    return _sparse_block_diagonal(*matrices)

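Beyond the docstring example, the note about non-sparse inputs can be exercised directly. A short sketch; that dense NumPy inputs are converted on the fly is an assumption about the conversion path, not something the docstring demonstrates:

    import numpy as np
    from scipy.sparse import csr_matrix
    from pytensor.sparse import block_diag

    A = csr_matrix([[1.0, 2.0], [3.0, 4.0]])  # sparse input
    B = np.array([[5.0]])                     # dense input, converted automatically
    X = block_diag(A, B, format="csc")
    print(X.eval().toarray())                 # 3x3 block-diagonal array
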
pytensor/tensor/basic.py

Lines changed: 35 additions & 6 deletions
@@ -43,7 +43,12 @@
     get_vector_length,
 )
 from pytensor.tensor.blockwise import Blockwise
-from pytensor.tensor.elemwise import DimShuffle, Elemwise, scalar_elemwise
+from pytensor.tensor.elemwise import (
+    DimShuffle,
+    Elemwise,
+    get_normalized_batch_axes,
+    scalar_elemwise,
+)
 from pytensor.tensor.exceptions import NotScalarConstantError
 from pytensor.tensor.shape import (
     Shape,
@@ -3614,13 +3619,18 @@ def diagonal(a, offset=0, axis1=0, axis2=1):
 
 
 @_vectorize_node.register(ExtractDiag)
-def vectorize_extract_diag(op: ExtractDiag, node, batched_x):
-    batched_ndims = batched_x.type.ndim - node.inputs[0].type.ndim
+def vectorize_extract_diag(op: ExtractDiag, node, batch_x):
+    core_ndim = node.inputs[0].type.ndim
+    batch_ndim = batch_x.type.ndim - core_ndim
+    batch_axis1, batch_axis2 = get_normalized_batch_axes(
+        (op.axis1, op.axis2), core_ndim, batch_ndim
+    )
+
     return diagonal(
-        batched_x,
+        batch_x,
         offset=op.offset,
-        axis1=op.axis1 + batched_ndims,
-        axis2=op.axis2 + batched_ndims,
+        axis1=batch_axis1,
+        axis2=batch_axis2,
     ).owner
 
 
@@ -4269,6 +4279,25 @@ def take_along_axis(arr, indices, axis=0):
     return arr[_make_along_axis_idx(arr.shape, indices, axis)]
 
 
+def ix_(*args):
+    """
+    PyTensor np.ix_ analog
+
+    See numpy.lib.index_tricks.ix_ for reference
+    """
+    out = []
+    nd = len(args)
+    for k, new in enumerate(args):
+        if new is None:
+            out.append(slice(None))
+        new = as_tensor(new)
+        if new.ndim != 1:
+            raise ValueError("Cross index must be 1 dimensional")
+        new = new.reshape((1,) * k + (new.size,) + (1,) * (nd - k - 1))
+        out.append(new)
+    return tuple(out)
+
+
 __all__ = [
     "take_along_axis",
     "expand_dims",

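The new ix_ helper mirrors numpy.ix_, building an open mesh from 1-D index tensors so advanced indexing selects a cross product of rows and columns. A small sketch, assuming ix_ is importable from pytensor.tensor.basic and that advanced integer indexing broadcasts as in NumPy:

    import numpy as np
    import pytensor.tensor as pt
    from pytensor.tensor.basic import ix_

    a = pt.matrix("a")
    rows = pt.as_tensor([0, 2])
    cols = pt.as_tensor([1, 3])
    sub = a[ix_(rows, cols)]   # 2x2 cross-section: rows {0, 2} x columns {1, 3}
    print(sub.eval({a: np.arange(16.0).reshape(4, 4)}))
    # [[ 1.  3.]
    #  [ 9. 11.]]
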
pytensor/tensor/blas.py

Lines changed: 9 additions & 9 deletions
@@ -144,20 +144,20 @@
 # If check_init_y() == True we need to initialize y when beta == 0.
 def check_init_y():
     if check_init_y._result is None:
-        if not have_fblas:
+        if not have_fblas:  # pragma: no cover
             check_init_y._result = False
-
-        y = float("NaN") * np.ones((2,))
-        x = np.ones((2,))
-        A = np.ones((2, 2))
-        gemv = _blas_gemv_fns[y.dtype]
-        gemv(1.0, A.T, x, 0.0, y, overwrite_y=True, trans=True)
-        check_init_y._result = np.isnan(y).any()
+        else:
+            y = float("NaN") * np.ones((2,))
+            x = np.ones((2,))
+            A = np.ones((2, 2))
+            gemv = _blas_gemv_fns[y.dtype]
+            gemv(1.0, A.T, x, 0.0, y, overwrite_y=True, trans=True)
+            check_init_y._result = np.isnan(y).any()
 
     return check_init_y._result
 
 
-check_init_y._result = None
+check_init_y._result = None  # type: ignore
 
 
 class Gemv(Op):

pytensor/tensor/blas_scipy.py

Lines changed: 1 addition & 5 deletions
@@ -19,17 +19,13 @@
 
 
 class ScipyGer(Ger):
-    def prepare_node(self, node, storage_map, compute_map, impl):
-        if impl == "py":
-            node.tag.local_ger = _blas_ger_fns[np.dtype(node.inputs[0].type.dtype)]
-
     def perform(self, node, inputs, output_storage):
         cA, calpha, cx, cy = inputs
         (cZ,) = output_storage
         # N.B. some versions of scipy (e.g. mine) don't actually work
         # in-place on a, even when I tell it to.
         A = cA
-        local_ger = node.tag.local_ger
+        local_ger = _blas_ger_fns[cA.dtype]
         if A.size == 0:
             # We don't have to compute anything, A is empty.
             # We need this special case because Numpy considers it