fastparquet should fail when an S3 write attempt is detected.

maxim veksler · maxim veksler · commit 026ecc7ebb98 · 2018-01-11T23:03:09.000+02:00
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -11,6 +11,7 @@
 from pandas.io.formats.printing import pprint_thing
 from pandas.core.common import AbstractMethodError
 from pandas.core.dtypes.common import is_number, is_file_like
+from pandas.io.s3 import is_s3_url
 
 # compat
 from pandas.errors import (ParserError, DtypeWarning,  # noqa
@@ -91,14 +92,6 @@ def _is_url(url):
         return False
 
 
-def _is_s3_url(url):
-    """Check for an s3, s3n, or s3a url"""
-    try:
-        return parse_url(url).scheme in ['s3', 's3n', 's3a']
-    except:
-        return False
-
-
 def _expand_user(filepath_or_buffer):
     """Return the argument with an initial component of ~ or ~user
        replaced by that user's home directory.
@@ -169,7 +162,7 @@ def _stringify_path(filepath_or_buffer):
 
 
 def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
-                           compression=None, mode='rb'):
+                           compression=None, mode=None):
     """
     If the filepath_or_buffer is a url, translate and return the buffer.
     Otherwise passthrough.
@@ -179,7 +172,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
     filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                          or buffer
     encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
-    mode : {'rb', 'wb', 'ab'}
+    mode : {'rb', 'wb', 'ab'} applies to S3 where a write mandates opening the
+            file in 'wb' mode.
 
     Returns
     -------
@@ -196,7 +190,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
         reader = BytesIO(req.read())
         return reader, encoding, compression
 
-    if _is_s3_url(filepath_or_buffer):
+    if is_s3_url(filepath_or_buffer):
         from pandas.io import s3
         return s3.get_filepath_or_buffer(filepath_or_buffer,
                                          encoding=encoding,
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
@@ -6,7 +6,7 @@
 from pandas.compat import string_types
 from pandas.core.common import AbstractMethodError
 from pandas.io.common import get_filepath_or_buffer
-
+from pandas.io.s3 import is_s3_url
 
 def get_engine(engine):
     """ return our implementation """
@@ -190,6 +190,10 @@ def __init__(self):
         self.api = fastparquet
 
     def write(self, df, path, compression='snappy', **kwargs):
+        if is_s3_url(path):
+            raise NotImplementedError("fastparquet s3 write is not implemented."
+                                      " Consider using pyarrow instead.")
+
         self.validate_dataframe(df)
         # thriftpy/protocol/compact.py:339:
         # DeprecationWarning: tostring() is deprecated.
diff --git a/pandas/io/s3.py b/pandas/io/s3.py
@@ -18,8 +18,20 @@ def _strip_schema(url):
     return result.netloc + result.path
 
 
+def is_s3_url(url):
+    """Return True if ``url`` parses with an s3, s3n, or s3a scheme."""
+    try:
+        return parse_url(url).scheme in ('s3', 's3n', 's3a')
+    except Exception:  # unparseable input is not an S3 url; don't trap Ctrl-C
+        return False
+
+
 def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
-                           compression=None, mode='rb'):
+                           compression=None, mode=None):
+
+    if mode is None:
+        mode = 'rb'
+
     fs = s3fs.S3FileSystem(anon=False)
     try:
         filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
@@ -236,12 +236,14 @@ def check_round_trip(self, df, engine, expected=None,
 
         with tm.ensure_clean() as path:
             self.do_round_trip(df, path, engine, expected,
-                               write_kwargs=write_kwargs, read_kwargs=read_kwargs,
+                               write_kwargs=write_kwargs,
+                               read_kwargs=read_kwargs,
                                check_names=check_names)
 
             # repeat
             self.do_round_trip(df, path, engine, expected,
-                               write_kwargs=write_kwargs, read_kwargs=read_kwargs,
+                               write_kwargs=write_kwargs,
+                               read_kwargs=read_kwargs,
                                check_names=check_names)
 
 
@@ -433,7 +435,7 @@ def test_categorical_unsupported(self, pa_lt_070):
 
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         # GH #19134
-        self.do_round_trip(df_compat, 's3://pandas-test/test.parquet', pa)
+        self.do_round_trip(df_compat, 's3://pandas-test/pyarrow.parquet', pa)
 
 
 class TestParquetFastParquet(Base):
@@ -495,9 +497,6 @@ def test_filter_row_groups(self, fp):
         assert len(result) == 1
 
     def test_s3_roundtrip(self, df_compat, s3_resource, fp):
-        print(s3_resource, fp)
-
         # GH #19134
-        with pytest.raises(TypeError):
-            self.do_round_trip(df_compat, 's3://pandas-test/test.parquet', fp)
-
+        with pytest.raises(NotImplementedError):
+            self.do_round_trip(df_compat, 's3://pandas-test/fastparquet.parquet', fp)
diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py
@@ -1,9 +1,9 @@
-from pandas.io.common import _is_s3_url
+from pandas.io.s3 import is_s3_url
 
 
 class TestS3URL(object):
 
     def test_is_s3_url(self):
-        assert _is_s3_url("s3://pandas/somethingelse.com")
-        assert not _is_s3_url("s4://pandas/somethingelse.com")
+        assert is_s3_url("s3://pandas/somethingelse.com")
+        assert not is_s3_url("s4://pandas/somethingelse.com")