Skip to content

Commit 026ecc7

Browse files
author
maxim veksler
committed
FastParquet should fail when s3 write attempt detected.
1 parent 230c814 commit 026ecc7

File tree

5 files changed

+33
-24
lines changed

5 files changed

+33
-24
lines changed

pandas/io/common.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from pandas.io.formats.printing import pprint_thing
1212
from pandas.core.common import AbstractMethodError
1313
from pandas.core.dtypes.common import is_number, is_file_like
14+
from pandas.io.s3 import is_s3_url
1415

1516
# compat
1617
from pandas.errors import (ParserError, DtypeWarning, # noqa
@@ -91,14 +92,6 @@ def _is_url(url):
9192
return False
9293

9394

94-
def _is_s3_url(url):
95-
"""Check for an s3, s3n, or s3a url"""
96-
try:
97-
return parse_url(url).scheme in ['s3', 's3n', 's3a']
98-
except:
99-
return False
100-
101-
10295
def _expand_user(filepath_or_buffer):
10396
"""Return the argument with an initial component of ~ or ~user
10497
replaced by that user's home directory.
@@ -169,7 +162,7 @@ def _stringify_path(filepath_or_buffer):
169162

170163

171164
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
172-
compression=None, mode='rb'):
165+
compression=None, mode=None):
173166
"""
174167
If the filepath_or_buffer is a url, translate and return the buffer.
175168
Otherwise passthrough.
@@ -179,7 +172,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
179172
filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
180173
or buffer
181174
encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
182-
mode : {'rb', 'wb', 'ab'}
175+
mode : {'rb', 'wb', 'ab'} applies to S3 where a write mandates opening the
176+
file in 'wb' mode.
183177
184178
Returns
185179
-------
@@ -196,7 +190,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
196190
reader = BytesIO(req.read())
197191
return reader, encoding, compression
198192

199-
if _is_s3_url(filepath_or_buffer):
193+
if is_s3_url(filepath_or_buffer):
200194
from pandas.io import s3
201195
return s3.get_filepath_or_buffer(filepath_or_buffer,
202196
encoding=encoding,

pandas/io/parquet.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pandas.compat import string_types
77
from pandas.core.common import AbstractMethodError
88
from pandas.io.common import get_filepath_or_buffer
9-
9+
from pandas.io.s3 import is_s3_url
1010

1111
def get_engine(engine):
1212
""" return our implementation """
@@ -190,6 +190,10 @@ def __init__(self):
190190
self.api = fastparquet
191191

192192
def write(self, df, path, compression='snappy', **kwargs):
193+
if is_s3_url(path):
194+
raise NotImplementedError("fastparquet s3 write is not implemented."
195+
" Consider using pyarrow instead.")
196+
193197
self.validate_dataframe(df)
194198
# thriftpy/protocol/compact.py:339:
195199
# DeprecationWarning: tostring() is deprecated.

pandas/io/s3.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,20 @@ def _strip_schema(url):
1818
return result.netloc + result.path
1919

2020

21+
def is_s3_url(url):
22+
"""Check for an s3, s3n, or s3a url"""
23+
try:
24+
return parse_url(url).scheme in ['s3', 's3n', 's3a']
25+
except:
26+
return False
27+
28+
2129
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
22-
compression=None, mode='rb'):
30+
compression=None, mode=None):
31+
32+
if mode is None:
33+
mode = 'rb'
34+
2335
fs = s3fs.S3FileSystem(anon=False)
2436
try:
2537
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)

pandas/tests/io/test_parquet.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -236,12 +236,14 @@ def check_round_trip(self, df, engine, expected=None,
236236

237237
with tm.ensure_clean() as path:
238238
self.do_round_trip(df, path, engine, expected,
239-
write_kwargs=write_kwargs, read_kwargs=read_kwargs,
239+
write_kwargs=write_kwargs,
240+
read_kwargs=read_kwargs,
240241
check_names=check_names)
241242

242243
# repeat
243244
self.do_round_trip(df, path, engine, expected,
244-
write_kwargs=write_kwargs, read_kwargs=read_kwargs,
245+
write_kwargs=write_kwargs,
246+
read_kwargs=read_kwargs,
245247
check_names=check_names)
246248

247249

@@ -433,7 +435,7 @@ def test_categorical_unsupported(self, pa_lt_070):
433435

434436
def test_s3_roundtrip(self, df_compat, s3_resource, pa):
435437
# GH #19134
436-
self.do_round_trip(df_compat, 's3://pandas-test/test.parquet', pa)
438+
self.do_round_trip(df_compat, 's3://pandas-test/pyarrow.parquet', pa)
437439

438440

439441
class TestParquetFastParquet(Base):
@@ -495,9 +497,6 @@ def test_filter_row_groups(self, fp):
495497
assert len(result) == 1
496498

497499
def test_s3_roundtrip(self, df_compat, s3_resource, fp):
498-
print(s3_resource, fp)
499-
500500
# GH #19134
501-
with pytest.raises(TypeError):
502-
self.do_round_trip(df_compat, 's3://pandas-test/test.parquet', fp)
503-
501+
with pytest.raises(NotImplementedError):
502+
self.do_round_trip(df_compat, 's3://pandas-test/fastparquet.parquet', fp)

pandas/tests/io/test_s3.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
from pandas.io.common import _is_s3_url
1+
from pandas.io.s3 import is_s3_url
22

33

44
class TestS3URL(object):
55

66
def test_is_s3_url(self):
7-
assert _is_s3_url("s3://pandas/somethingelse.com")
8-
assert not _is_s3_url("s4://pandas/somethingelse.com")
7+
assert is_s3_url("s3://pandas/somethingelse.com")
8+
assert not is_s3_url("s4://pandas/somethingelse.com")
99

0 commit comments

Comments
 (0)