Skip to content

Commit 71f92a9

Browse files
committed
PERF: sparse to_csv
Improves `to_csv` performance for sparse DataFrames by casting to dense before initializing `DataFrameFormatter`. This results in far fewer calls to `to_native_types`, which saves time.
1 parent 7aa391e commit 71f92a9

File tree

3 files changed

+23
-0
lines changed

3 files changed

+23
-0
lines changed

asv_bench/benchmarks/io/csv.py

+20
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,26 @@ def time_head_of_multiindex(self):
172172
self.df_custom_index_then_head.to_csv(self.fname)
173173

174174

175+
class ToCSVSparse(BaseIO):
    """Benchmark ``to_csv`` on a DataFrame built from a SciPy sparse matrix.

    Two variants are timed: writing the sparse-backed frame directly, and
    densifying it first via ``.sparse.to_dense()`` before writing.

    Both variants write to ``fname``, matching the other ``to_csv``
    benchmarks in this module.
    NOTE(review): presumably ``BaseIO`` removes ``fname`` on teardown, so
    writing anywhere else would leave a stray file behind -- confirm.
    """

    fname = "__test__.csv"

    def setup(self):
        """Create the 10000 x 1000 sparse-backed frame stored on ``self.data``."""
        # Local import keeps SciPy out of the module-level imports.
        from scipy import sparse as sc

        vals = np.random.randint(0, 10, size=(10000, 1000))
        # Values are drawn from 0..9; zeroing everything above 3 leaves only
        # 1, 2 and 3 as non-zero entries (about 70% of the cells end up zero).
        zero_mask = vals > 3
        vals[zero_mask] = 0
        sparse_mtx = sc.coo_matrix(vals)
        self.data = DataFrame.sparse.from_spmatrix(sparse_mtx)

    def time_sparse_to_csv(self):
        """Time writing the sparse-backed frame straight to CSV."""
        self.data.to_csv(self.fname)

    def time_sparse_to_dense_to_csv(self):
        """Time densifying the frame first and then writing it to CSV."""
        self.data.sparse.to_dense().to_csv(self.fname)
193+
194+
175195
class StringIORewind:
176196
def data(self, stringio_object):
177197
stringio_object.seek(0)

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ Performance improvements
158158
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
159159
- Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
160160
- Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
161+
- Performance improvement in :meth:`NDFrame.to_csv` when the DataFrame is sparse (:issue:`41023`)
161162

162163
.. ---------------------------------------------------------------------------
163164
.. _whatsnew_200.bug_fixes:

pandas/core/generic.py

+2
Original file line numberDiff line numberDiff line change
@@ -3717,6 +3717,8 @@ def to_csv(
37173717
>>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
37183718
"""
37193719
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
3720+
if hasattr(df, "sparse"):
3721+
df = df.sparse.to_dense() # fixes 41023
37203722

37213723
formatter = DataFrameFormatter(
37223724
frame=df,

0 commit comments

Comments
 (0)