fixup! ENH: Native conversion from/to scipy.sparse matrix to SparseDataFrame

kernc · kernc · commit c2adfccb6383 · 2017-03-01T18:17:55.000+01:00
diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst
@@ -186,9 +186,32 @@ the correct dense result.
 Interaction with scipy.sparse
 -----------------------------
 
-Experimental api to transform between sparse pandas and scipy.sparse structures.
+.. versionadded:: 0.20.0
 
-A :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
+Pandas supports creating sparse dataframes directly from ``scipy.sparse`` matrices.
+
+.. ipython:: python
+
+   from scipy.sparse import csr_matrix
+
+   arr = np.random.random(size=(1000, 5))
+   arr[arr < .9] = 0
+
+   sp_arr = csr_matrix(arr)
+   sp_arr
+
+   sdf = pd.SparseDataFrame(sp_arr)
+   sdf
+
+To convert a ``SparseDataFrame`` back to a sparse SciPy matrix in COO format, you can use the :meth:`SparseDataFrame.to_coo` method:
+
+.. ipython:: python
+
+   sdf.to_coo()
+
+.. versionadded:: 0.16.0
+
+Additionally, an experimental :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
 
 The method requires a ``MultiIndex`` with two or more levels.
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -157,7 +157,7 @@ New Behavior:
 
 SciPy sparse matrix from/to SparseDataFrame
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Pandas now supports creating SparseDataFrames directly from ``scipy.sparse.spmatrix`` instances. E.g.
+Pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances (:issue:`4343`).
 
 .. ipython:: python
 
@@ -166,10 +166,10 @@ Pandas now supports creating SparseDataFrames directly from ``scipy.sparse.spmat
    arr[arr < .9] = 0
    sp_arr = csr_matrix(arr)
    sp_arr
-   sdf = pd.DataFrame(sp_arr)
+   sdf = pd.SparseDataFrame(sp_arr)
    sdf
 
-To convert a SparseDataFrame back to scipy sparse matrix in COO format, you can use:
+To convert a ``SparseDataFrame`` back to a sparse SciPy matrix in COO format, you can use:
 
 .. ipython:: python
 
diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py
@@ -12,7 +12,7 @@
 
 from pandas.types.missing import isnull, notnull
 from pandas.types.cast import _maybe_upcast, _find_common_type
-from pandas.types.common import _ensure_platform_int
+from pandas.types.common import _ensure_platform_int, is_scipy_sparse
 
 from pandas.core.common import _try_sort
 from pandas.compat.numpy import function as nv
@@ -29,10 +29,6 @@
 from pandas.util.decorators import Appender
 import pandas.core.ops as ops
 
-try:
-    from scipy.sparse import spmatrix  # noqa
-except ImportError:
-    spmatrix = type('mock spmatrix', (), {})
 
 _shared_doc_kwargs = dict(klass='SparseDataFrame')
 
@@ -102,7 +98,7 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
         elif isinstance(data, BlockManager):
             mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                                  dtype=dtype, copy=copy)
-        elif isinstance(data, spmatrix):
+        elif is_scipy_sparse(data):
             mgr = self._init_spmatrix(data, index, columns, dtype=dtype)
         elif data is None:
             data = DataFrame()
@@ -175,19 +171,21 @@ def _init_dict(self, data, index, columns, dtype=None):
         return to_manager(sdict, columns, index)
 
     def _init_matrix(self, data, index, columns, dtype=None):
+        """ Init self from ndarray or list of lists """
         data = _prep_ndarray(data, copy=False)
         index, columns = self._prep_index(data, index, columns)
         data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)])
         return self._init_dict(data, index, columns, dtype)
 
     def _init_spmatrix(self, data, index, columns, dtype=None):
+        """ Init self from scipy.sparse matrix """
         index, columns = self._prep_index(data, index, columns)
         data = data.tocoo(copy=False)
         N = len(index)
         bindex = np.arange(N, dtype=np.int32)
 
         sdict = {}
-        values = Series(data.data, index=data.row)
+        values = Series(data.data, index=data.row, copy=False)
         for col, rowvals in values.groupby(data.col):
             blocs, blens = get_blocks(bindex[rowvals.index])
             sdict[columns[col]] = SparseSeries(
@@ -217,49 +215,12 @@ def _prep_index(self, data, index, columns):
                              (len(index), N))
         return index, columns
 
-    def as_matrix(self, columns=None, sparse=False):
-        """
-        Convert the frame to its Numpy-array or SciPy sparse COO matrix
-        representation.
-
-        Parameters
-        ----------
-        columns : list, optional, default=None
-            If None, return all columns. Otherwise, returns specified columns.
-        sparse : bool, optional, default=True
-            If True, return an instance of scipy.sparse.coo_matrix instead
-            of ndarray. If False, the result values array will be DENSE.
-
-        Returns
-        -------
-        values : ndarray or scipy.sparse.spmatrix
-            If the caller is heterogeneous and contains booleans or objects,
-            the result will be of dtype=object. See Notes.
-
-        Notes
-        -----
-        The dtype will be the lowest-common-denominator type (implicit
-        upcasting); that is to say if the dtypes (even of numeric types)
-        are mixed, the one that accommodates all will be chosen.
-
-        e.g. If the dtypes are float16 and float32, dtype will be upcast to
-        float32. By numpy.find_common_type convention, mixing int64 and
-        and uint64 will result in a float64 dtype.
-
-        See Also
-        --------
-        pandas.SparseDataFrame.to_coo
-        """
-        if sparse:
-            subdf = self if columns is None else self[columns]
-            return subdf.to_coo()
-
-        return super(SparseDataFrame, self).as_matrix(columns=columns)
-
     def to_coo(self):
         """
         Convert the frame to its SciPy sparse COO matrix representation.
 
+        .. versionadded:: 0.20.0
+
         Returns
         -------
         coo_matrix : scipy.sparse.spmatrix
diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py
@@ -2,6 +2,8 @@
 
 import operator
 
+import pytest
+
 from numpy import nan
 import numpy as np
 import pandas as pd
@@ -1118,28 +1120,27 @@ def test_isnotnull(self):
                             'B': [True, False, True, True, False]})
         tm.assert_frame_equal(res.to_dense(), exp)
 
-    def test_from_to_scipy(self):
+    @pytest.mark.importorskip('scipy.sparse')
+    @pytest.mark.parameterize('index', [None, list('abc')])
+    @pytest.mark.parameterize('columns', [None, list('def')])
+    @pytest.mark.parameterize('fill_value', [None, 0, np.nan])
+    def test_from_to_scipy(self, index=None, columns=None, fill_value=None):
         # GH 4343
-        tm._skip_if_no_scipy()
         from scipy.sparse import csr_matrix
 
-        arr = np.array([[0, 1, 0],
-                        [0, 0, 1],
-                        [1, 1, 1.]])
-        spm = csr_matrix(arr)
-
-        for index, columns in ((list('abc'), list('def')),
-                               (None, None)):
-            sdf = pd.SparseDataFrame(spm, index=index, columns=columns)
+        arr = np.array([[np.nan, 1, np.nan],
+                        [np.nan, np.nan, 1],
+                        [1, 1, 1]])
+        spm = csr_matrix(np.nan_to_num(arr))
 
-            if index is not None:
-                tm.assert_index_equal(sdf.index, pd.Index(index))
-            if columns is not None:
-                tm.assert_index_equal(sdf.columns, pd.Index(columns))
+        sdf = pd.SparseDataFrame(spm, index=index, columns=columns,
+                                 default_fill_value=fill_value)
+        res = pd.SparseDataFrame(arr, index=index, columns=columns,
+                                 default_fill_value=fill_value)
 
-            tm.assert_numpy_array_equal(sdf.fillna(0).values, arr)
-            tm.assert_equal((sdf.to_coo() != spm).data.size, 0)
-            tm.assert_equal((sdf.as_matrix(sparse=True) != spm).data.size, 0)
+        tm.assert_sp_frame_equal(sdf, res)
+        tm.assert_frame_equal(sdf.to_dense(), res.to_dense())
+        tm.assert_equal((sdf.to_coo() != spm).nnz, 0)
 
 
 class TestSparseDataFrameArithmetic(tm.TestCase):
diff --git a/pandas/types/common.py b/pandas/types/common.py
@@ -59,6 +59,21 @@ def is_sparse(array):
     return isinstance(array, (ABCSparseArray, ABCSparseSeries))
 
 
+# scipy.sparse.spmatrix is imported lazily on first use to reduce import time
+_spmatrix = None
+
+
+def is_scipy_sparse(array):
+    """ return True if array is a scipy.sparse.spmatrix instance """
+    global _spmatrix
+    if _spmatrix is None:
+        try:
+            from scipy.sparse import spmatrix as _spmatrix
+        except ImportError:
+            _spmatrix = type('mock spmatrix', (), {})  # no scipy: stand-in
+    return isinstance(array, _spmatrix)
+
+
 def is_categorical(array):
     """ return if we are a categorical possibility """
     return isinstance(array, ABCCategorical) or is_categorical_dtype(array)
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -297,11 +297,6 @@ def _skip_if_no_scipy():
     except ImportError:
         import pytest
         pytest.skip('scipy.interpolate missing')
-    try:
-        import scipy.sparse
-    except ImportError:
-        import pytest
-        pytest.skip('scipy.sparse missing')
 
 
 def _skip_if_scipy_0_17():