Skip to content

Commit c2adfcc

Browse files
committed
fixup! ENH: Native conversion from/to scipy.sparse matrix to SparseDataFrame
1 parent e2de850 commit c2adfcc

File tree

6 files changed

+68
-73
lines changed

6 files changed

+68
-73
lines changed

doc/source/sparse.rst

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,32 @@ the correct dense result.
186186
Interaction with scipy.sparse
187187
-----------------------------
188188

189-
Experimental api to transform between sparse pandas and scipy.sparse structures.
189+
.. versionadded:: 0.20.0
190190

191-
A :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
191+
Pandas supports creating sparse dataframes directly from ``scipy.sparse`` matrices.
192+
193+
.. ipython:: python
194+
195+
from scipy.sparse import csr_matrix
196+
197+
arr = np.random.random(size=(1000, 5))
198+
arr[arr < .9] = 0
199+
200+
sp_arr = csr_matrix(arr)
201+
sp_arr
202+
203+
sdf = pd.SparseDataFrame(sp_arr)
204+
sdf
205+
206+
To convert a ``SparseDataFrame`` back to a sparse SciPy matrix in COO format, you can use the :meth:`SparseDataFrame.to_coo` method:
207+
208+
.. ipython:: python
209+
210+
sdf.to_coo()
211+
212+
.. versionadded:: 0.16.0
213+
214+
Additionally, an experimental :meth:`SparseSeries.to_coo` method is implemented for transforming a ``SparseSeries`` indexed by a ``MultiIndex`` to a ``scipy.sparse.coo_matrix``.
192215

193216
The method requires a ``MultiIndex`` with two or more levels.
194217

doc/source/whatsnew/v0.20.0.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ New Behavior:
157157

158158
SciPy sparse matrix from/to SparseDataFrame
159159
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
160-
Pandas now supports creating SparseDataFrames directly from ``scipy.sparse.spmatrix`` instances. E.g.
160+
Pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances (:issue:`4343`).
161161

162162
.. ipython:: python
163163

@@ -166,10 +166,10 @@ Pandas now supports creating SparseDataFrames directly from ``scipy.sparse.spmat
166166
arr[arr < .9] = 0
167167
sp_arr = csr_matrix(arr)
168168
sp_arr
169-
sdf = pd.DataFrame(sp_arr)
169+
sdf = pd.SparseDataFrame(sp_arr)
170170
sdf
171171

172-
To convert a SparseDataFrame back to scipy sparse matrix in COO format, you can use:
172+
To convert a ``SparseDataFrame`` back to a sparse SciPy matrix in COO format, you can use:
173173

174174
.. ipython:: python
175175

pandas/sparse/frame.py

Lines changed: 7 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from pandas.types.missing import isnull, notnull
1414
from pandas.types.cast import _maybe_upcast, _find_common_type
15-
from pandas.types.common import _ensure_platform_int
15+
from pandas.types.common import _ensure_platform_int, is_scipy_sparse
1616

1717
from pandas.core.common import _try_sort
1818
from pandas.compat.numpy import function as nv
@@ -29,10 +29,6 @@
2929
from pandas.util.decorators import Appender
3030
import pandas.core.ops as ops
3131

32-
try:
33-
from scipy.sparse import spmatrix # noqa
34-
except ImportError:
35-
spmatrix = type('mock spmatrix', (), {})
3632

3733
_shared_doc_kwargs = dict(klass='SparseDataFrame')
3834

@@ -102,7 +98,7 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
10298
elif isinstance(data, BlockManager):
10399
mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
104100
dtype=dtype, copy=copy)
105-
elif isinstance(data, spmatrix):
101+
elif is_scipy_sparse(data):
106102
mgr = self._init_spmatrix(data, index, columns, dtype=dtype)
107103
elif data is None:
108104
data = DataFrame()
@@ -175,19 +171,21 @@ def _init_dict(self, data, index, columns, dtype=None):
175171
return to_manager(sdict, columns, index)
176172

177173
def _init_matrix(self, data, index, columns, dtype=None):
174+
""" Init self from ndarray or list of lists """
178175
data = _prep_ndarray(data, copy=False)
179176
index, columns = self._prep_index(data, index, columns)
180177
data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)])
181178
return self._init_dict(data, index, columns, dtype)
182179

183180
def _init_spmatrix(self, data, index, columns, dtype=None):
181+
""" Init self from scipy.sparse matrix """
184182
index, columns = self._prep_index(data, index, columns)
185183
data = data.tocoo(copy=False)
186184
N = len(index)
187185
bindex = np.arange(N, dtype=np.int32)
188186

189187
sdict = {}
190-
values = Series(data.data, index=data.row)
188+
values = Series(data.data, index=data.row, copy=False)
191189
for col, rowvals in values.groupby(data.col):
192190
blocs, blens = get_blocks(bindex[rowvals.index])
193191
sdict[columns[col]] = SparseSeries(
@@ -217,49 +215,12 @@ def _prep_index(self, data, index, columns):
217215
(len(index), N))
218216
return index, columns
219217

220-
def as_matrix(self, columns=None, sparse=False):
221-
"""
222-
Convert the frame to its Numpy-array or SciPy sparse COO matrix
223-
representation.
224-
225-
Parameters
226-
----------
227-
columns : list, optional, default=None
228-
If None, return all columns. Otherwise, returns specified columns.
229-
sparse : bool, optional, default=True
230-
If True, return an instance of scipy.sparse.coo_matrix instead
231-
of ndarray. If False, the result values array will be DENSE.
232-
233-
Returns
234-
-------
235-
values : ndarray or scipy.sparse.spmatrix
236-
If the caller is heterogeneous and contains booleans or objects,
237-
the result will be of dtype=object. See Notes.
238-
239-
Notes
240-
-----
241-
The dtype will be the lowest-common-denominator type (implicit
242-
upcasting); that is to say if the dtypes (even of numeric types)
243-
are mixed, the one that accommodates all will be chosen.
244-
245-
e.g. If the dtypes are float16 and float32, dtype will be upcast to
246-
float32. By numpy.find_common_type convention, mixing int64 and
247-
and uint64 will result in a float64 dtype.
248-
249-
See Also
250-
--------
251-
pandas.SparseDataFrame.to_coo
252-
"""
253-
if sparse:
254-
subdf = self if columns is None else self[columns]
255-
return subdf.to_coo()
256-
257-
return super(SparseDataFrame, self).as_matrix(columns=columns)
258-
259218
def to_coo(self):
260219
"""
261220
Convert the frame to its SciPy sparse COO matrix representation.
262221
222+
.. versionadded:: 0.20.0
223+
263224
Returns
264225
-------
265226
coo_matrix : scipy.sparse.spmatrix

pandas/tests/sparse/test_frame.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import operator
44

5+
import pytest
6+
57
from numpy import nan
68
import numpy as np
79
import pandas as pd
@@ -1118,28 +1120,27 @@ def test_isnotnull(self):
11181120
'B': [True, False, True, True, False]})
11191121
tm.assert_frame_equal(res.to_dense(), exp)
11201122

1121-
def test_from_to_scipy(self):
1123+
@pytest.mark.importorskip('scipy.sparse')
1124+
@pytest.mark.parameterize('index', [None, list('abc')])
1125+
@pytest.mark.parameterize('columns', [None, list('def')])
1126+
@pytest.mark.parameterize('fill_value', [None, 0, np.nan])
1127+
def test_from_to_scipy(self, index=None, columns=None, fill_value=None):
11221128
# GH 4343
1123-
tm._skip_if_no_scipy()
11241129
from scipy.sparse import csr_matrix
11251130

1126-
arr = np.array([[0, 1, 0],
1127-
[0, 0, 1],
1128-
[1, 1, 1.]])
1129-
spm = csr_matrix(arr)
1130-
1131-
for index, columns in ((list('abc'), list('def')),
1132-
(None, None)):
1133-
sdf = pd.SparseDataFrame(spm, index=index, columns=columns)
1131+
arr = np.array([[np.nan, 1, np.nan],
1132+
[np.nan, np.nan, 1],
1133+
[1, 1, 1]])
1134+
spm = csr_matrix(np.nan_to_num(arr))
11341135

1135-
if index is not None:
1136-
tm.assert_index_equal(sdf.index, pd.Index(index))
1137-
if columns is not None:
1138-
tm.assert_index_equal(sdf.columns, pd.Index(columns))
1136+
sdf = pd.SparseDataFrame(spm, index=index, columns=columns,
1137+
default_fill_value=fill_value)
1138+
res = pd.SparseDataFrame(arr, index=index, columns=columns,
1139+
default_fill_value=fill_value)
11391140

1140-
tm.assert_numpy_array_equal(sdf.fillna(0).values, arr)
1141-
tm.assert_equal((sdf.to_coo() != spm).data.size, 0)
1142-
tm.assert_equal((sdf.as_matrix(sparse=True) != spm).data.size, 0)
1141+
tm.assert_sp_frame_equal(sdf, res)
1142+
tm.assert_frame_equal(sdf.to_dense(), res.to_dense())
1143+
tm.assert_equal((sdf.to_coo() != spm).nnz, 0)
11431144

11441145

11451146
class TestSparseDataFrameArithmetic(tm.TestCase):

pandas/types/common.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,21 @@ def is_sparse(array):
5959
return isinstance(array, (ABCSparseArray, ABCSparseSeries))
6060

6161

62+
# oh the troubles to reduce import time
63+
_spmatrix = None
64+
65+
66+
def is_scipy_sparse(array):
67+
""" return if we are a scipy.sparse.spmatrix """
68+
global _spmatrix
69+
if _spmatrix is None:
70+
try:
71+
from scipy.sparse import spmatrix as _spmatrix
72+
except ImportError:
73+
_spmatrix = type('mock spmatrix', (), {})
74+
return isinstance(array, _spmatrix)
75+
76+
6277
def is_categorical(array):
6378
""" return if we are a categorical possibility """
6479
return isinstance(array, ABCCategorical) or is_categorical_dtype(array)

pandas/util/testing.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -297,11 +297,6 @@ def _skip_if_no_scipy():
297297
except ImportError:
298298
import pytest
299299
pytest.skip('scipy.interpolate missing')
300-
try:
301-
import scipy.sparse
302-
except ImportError:
303-
import pytest
304-
pytest.skip('scipy.sparse missing')
305300

306301

307302
def _skip_if_scipy_0_17():

0 commit comments

Comments
 (0)