Skip to content

BUG/REG: file-handle object handled incorrectly in to_csv #21478

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Jun 18, 2018
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ and bug fixes. We recommend that all users upgrade to this version.
Fixed Regressions
~~~~~~~~~~~~~~~~~

-
- Fixed Regression in :meth:`to_csv` when handling file-like object incorrectly (:issue:`21471`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: "Regression" --> "regression"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done.

-

.. _whatsnew_0232.performance:
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1690,7 +1690,8 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
compression : string, optional
A string representing the compression to use in the output file.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
used when the first argument is a filename.
line_terminator : string, default ``'\n'``
The newline character or character sequence to use in the output
file
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3790,7 +3790,8 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
non-ascii, for python versions prior to 3
compression : string, optional
A string representing the compression to use in the output file.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'.
Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
used when the first argument is a filename.
date_format: string, default None
Format string for datetime objects.
decimal: string, default '.'
Expand Down
7 changes: 7 additions & 0 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,13 @@ def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
def write(self, data):
super(BytesZipFile, self).writestr(self.filename, data)

@property
def closed(self):
if compat.PY2:
return self.fp is None
else:
return super(BytesZipFile, self).closed


class MMapWrapper(BaseIterator):
"""
Expand Down
60 changes: 40 additions & 20 deletions pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@

from __future__ import print_function

import warnings

import csv as csvlib
from zipfile import ZipFile
import numpy as np

from pandas.core.dtypes.missing import notna
from pandas.core.dtypes.inference import is_file_like
from pandas.core.index import Index, MultiIndex
from pandas import compat
from pandas.compat import (StringIO, range, zip)
Expand Down Expand Up @@ -128,19 +130,31 @@ def save(self):
else:
encoding = self.encoding

# PR 21300 uses string buffer to receive csv writing and dump into
# file-like output with compression as option. GH 21241, 21118
f = StringIO()
if not is_file_like(self.path_or_buf):
# path_or_buf is path
path_or_buf = self.path_or_buf
elif hasattr(self.path_or_buf, 'name'):
# path_or_buf is file handle
path_or_buf = self.path_or_buf.name
else:
# path_or_buf is file-like IO objects.
# GH 21227 internal compression is not used when file-like passed.
if self.compression and hasattr(self.path_or_buf, 'write'):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can uou add a comment here (sure the warning says it all, but helpful nonetheless), also an issue reference

msg = ("compression has no effect when passing file-like "
"object as input.")
warnings.warn(msg, RuntimeWarning, stacklevel=2)

# when zip compression is called.
is_zip = isinstance(self.path_or_buf, ZipFile) or (
not hasattr(self.path_or_buf, 'write')
and self.compression == 'zip')

if is_zip:
# zipfile doesn't support writing string to archive. uses string
# buffer to receive csv writing and dump into zip compression
# file handle. GH 21241, 21118
f = StringIO()
close = False
elif hasattr(self.path_or_buf, 'write'):
f = self.path_or_buf
path_or_buf = None
close = False
else:
f, handles = _get_handle(self.path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
close = True

try:
writer_kwargs = dict(lineterminator=self.line_terminator,
Expand All @@ -157,13 +171,19 @@ def save(self):
self._save()

finally:
# GH 17778 handles zip compression for byte strings separately.
buf = f.getvalue()
if path_or_buf:
f, handles = _get_handle(path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
f.write(buf)
if is_zip:
# GH 17778 handles zip compression separately.
buf = f.getvalue()
try:
self.path_or_buf.write(buf)
Copy link
Member

@gfyoung gfyoung Jun 16, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They say that it's sometimes easier to ask for forgiveness than it is for permission, hence the try/except block that you choose here. That being said, your treatment of self.path_or_buf.write is inconsistent here compared to:

https://github.com/pandas-dev/pandas/pull/21478/files#diff-eaa887c826bfb361d98db1ddb668c7deR150

Where you ask for permission instead of forgiveness. In addition, the logic seems somewhat similar. Can we potentially deduplicate it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see your point. changed to if else to make it consistent.

except AttributeError:
# when path_or_buf is not file-like, create handle.
f, handles = _get_handle(self.path_or_buf, self.mode,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a comment on when the except happens?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done.

encoding=encoding,
compression=self.compression)
f.write(buf)
close = True
if close:
f.close()
for _fh in handles:
_fh.close()
Expand Down
16 changes: 9 additions & 7 deletions pandas/tests/frame/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np

from pandas.compat import (lmap, range, lrange, StringIO, u)
from pandas.io.common import _get_handle
import pandas.core.common as com
from pandas.errors import ParserError
from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp,
Expand Down Expand Up @@ -935,18 +936,19 @@ def test_to_csv_compression(self, df, encoding, compression):
with ensure_clean() as filename:

df.to_csv(filename, compression=compression, encoding=encoding)

# test the round trip - to_csv -> read_csv
result = read_csv(filename, compression=compression,
index_col=0, encoding=encoding)
assert_frame_equal(df, result)

with open(filename, 'w') as fh:
df.to_csv(fh, compression=compression, encoding=encoding)

result_fh = read_csv(filename, compression=compression,
index_col=0, encoding=encoding)
# test the round trip using file handle - to_csv -> read_csv
f, _handles = _get_handle(filename, 'w', compression=compression,
encoding=encoding)
with f:
df.to_csv(f, encoding=encoding)
result = pd.read_csv(filename, compression=compression,
encoding=encoding, index_col=0, squeeze=True)
assert_frame_equal(df, result)
assert_frame_equal(df, result_fh)

# explicitly make sure file is compressed
with tm.decompress_file(filename, compression) as fh:
Expand Down
18 changes: 9 additions & 9 deletions pandas/tests/series/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pandas import Series, DataFrame

from pandas.compat import StringIO, u
from pandas.io.common import _get_handle
from pandas.util.testing import (assert_series_equal, assert_almost_equal,
assert_frame_equal, ensure_clean)
import pandas.util.testing as tm
Expand Down Expand Up @@ -151,20 +152,19 @@ def test_to_csv_compression(self, s, encoding, compression):

s.to_csv(filename, compression=compression, encoding=encoding,
header=True)

# test the round trip - to_csv -> read_csv
result = pd.read_csv(filename, compression=compression,
encoding=encoding, index_col=0, squeeze=True)
assert_series_equal(s, result)

with open(filename, 'w') as fh:
s.to_csv(fh, compression=compression, encoding=encoding,
header=True)

result_fh = pd.read_csv(filename, compression=compression,
encoding=encoding, index_col=0,
squeeze=True)
# test the round trip using file handle - to_csv -> read_csv
f, _handles = _get_handle(filename, 'w', compression=compression,
encoding=encoding)
with f:
s.to_csv(f, encoding=encoding, header=True)
result = pd.read_csv(filename, compression=compression,
encoding=encoding, index_col=0, squeeze=True)
assert_series_equal(s, result)
assert_series_equal(s, result_fh)

# explicitly ensure file was compressed
with tm.decompress_file(filename, compression) as fh:
Expand Down
32 changes: 23 additions & 9 deletions pandas/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pandas.compat import range, lmap
import pandas.core.common as com
from pandas.core import ops
from pandas.io.common import _get_handle
import pandas.util.testing as tm


Expand Down Expand Up @@ -246,19 +247,32 @@ def test_compression_size(obj, method, compression_only):
[12.32112, 123123.2, 321321.2]],
columns=['X', 'Y', 'Z']),
Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
@pytest.mark.parametrize('method', ['to_csv'])
@pytest.mark.parametrize('method', ['to_csv', 'to_json'])
def test_compression_size_fh(obj, method, compression_only):

with tm.ensure_clean() as filename:
with open(filename, 'w') as fh:
getattr(obj, method)(fh, compression=compression_only)
assert not fh.closed
assert fh.closed
f, _handles = _get_handle(filename, 'w', compression=compression_only)
with f:
getattr(obj, method)(f)
assert not f.closed
compressed = os.path.getsize(filename)
with tm.ensure_clean() as filename:
with open(filename, 'w') as fh:
getattr(obj, method)(fh, compression=None)
assert not fh.closed
assert fh.closed
f, _handles = _get_handle(filename, 'w', compression=None)
with f:
getattr(obj, method)(f)
assert not f.closed
uncompressed = os.path.getsize(filename)
assert uncompressed > compressed


# GH 21227
def test_compression_warning(compression_only):
df = DataFrame(100 * [[0.123456, 0.234567, 0.567567],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add the issue number

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added.

[12.32112, 123123.2, 321321.2]],
columns=['X', 'Y', 'Z'])
with tm.ensure_clean() as filename:
f, _handles = _get_handle(filename, 'w', compression=compression_only)
with tm.assert_produces_warning(RuntimeWarning,
check_stacklevel=False):
with f:
df.to_csv(f, compression=compression_only)