Open
Description
Code Sample, a copy-pastable example if possible
import pandas as pd
from scipy import sparse
X_sp = sparse.coo_matrix((2**30, 2**10))
X_pd = pd.DataFrame.sparse.from_spmatrix(X_sp)
X_sp.sum(axis=1)
X_sp.sum(axis=0)
X_pd.sum(axis=1)
X_pd.sum(axis=0)
Problem description
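Calling `sum` on a sparse DataFrame created with `pd.DataFrame.sparse.from_spmatrix` coerces the data to a dense array. With `axis=1` this raises a `MemoryError`; with `axis=0` the call hangs until it is interrupted. The equivalent scipy calls return immediately.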
>>> import pandas as pd
>>> from scipy import sparse
>>> X_sp = sparse.coo_matrix((2**30, 2**10))
>>> X_pd = pd.DataFrame.sparse.from_spmatrix(X_sp)
>>> X_sp.sum(axis=1)
matrix([[0.],
[0.],
[0.],
...,
[0.],
[0.],
[0.]])
>>> X_sp.sum(axis=0)
matrix([[0., 0., 0., ..., 0., 0., 0.]])
>>> X_pd.sum(axis=1)
Traceback (most recent call last):
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/frame.py", line 7908, in _reduce
values = self.values
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/generic.py", line 5443, in values
return self._data.as_array(transpose=self._AXIS_REVERSED)
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 822, in as_array
arr = mgr._interleave()
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 840, in _interleave
result = np.empty(self.shape, dtype=dtype)
numpy.core._exceptions.MemoryError: Unable to allocate array with shape (1024, 1073741824) and data type float64
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/generic.py", line 11585, in stat_func
min_count=min_count,
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/frame.py", line 7953, in _reduce
result = f(data.values)
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/generic.py", line 5443, in values
return self._data.as_array(transpose=self._AXIS_REVERSED)
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 822, in as_array
arr = mgr._interleave()
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 840, in _interleave
result = np.empty(self.shape, dtype=dtype)
numpy.core._exceptions.MemoryError: Unable to allocate array with shape (1024, 1073741824) and data type float64
>>> X_pd.sum(axis=0)
# hangs indefinitely; interrupted manually with Ctrl-C below
^C
Traceback (most recent call last):
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/frame.py", line 7908, in _reduce
values = self.values
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/generic.py", line 5443, in values
return self._data.as_array(transpose=self._AXIS_REVERSED)
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 822, in as_array
arr = mgr._interleave()
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/internals/managers.py", line 840, in _interleave
result = np.empty(self.shape, dtype=dtype)
numpy.core._exceptions.MemoryError: Unable to allocate array with shape (1024, 1073741824) and data type float64
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/generic.py", line 11585, in stat_func
min_count=min_count,
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/frame.py", line 7935, in _reduce
result = opa.get_result()
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/apply.py", line 186, in get_result
return self.apply_standard()
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/apply.py", line 292, in apply_standard
self.apply_series_generator()
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/apply.py", line 308, in apply_series_generator
results[i] = self.f(v)
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/frame.py", line 7893, in f
return op(x, axis=axis, skipna=skipna, **kwds)
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/nanops.py", line 70, in _f
return f(*args, **kwargs)
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/nanops.py", line 495, in nansum
values, skipna, fill_value=0, mask=mask
File "/home/scottgigante/sandbox/lib/python3.7/site-packages/pandas/core/nanops.py", line 309, in _get_values
values = values.copy()
KeyboardInterrupt
Expected Output
Both `X_pd.sum(axis=1)` and `X_pd.sum(axis=0)` should complete successfully without converting the sparse data to a dense array, as the equivalent scipy calls do.
Output of pd.show_versions()
INSTALLED VERSIONS
------------------
commit : None
python : 3.7.3.final.0
python-bits : 64
OS : Linux
OS-release : 5.0.10-arch1-1-ARCH
machine : x86_64
processor :
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 0.25.1
numpy : 1.17.0
pytz : 2019.2
dateutil : 2.8.0
pip : 19.2.3
setuptools : 41.0.1
Cython : None
pytest : 5.1.2
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 2.10.1
IPython : None
pandas_datareader: None
bs4 : None
bottleneck : None
fastparquet : None
gcsfs : None
lxml.etree : None
matplotlib : 3.1.1
numexpr : 2.7.0
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
s3fs : None
scipy : 1.3.1
sqlalchemy : None
tables : 3.5.2
xarray : None
xlrd : None
xlwt : None
xlsxwriter : None