Commit d3ec8b5 (1 parent: decffcf)

doc updates

6 files changed (+53, -35 lines)

ci/requirements-3.6_DOC.sh (+1, -1)

@@ -6,6 +6,6 @@ echo "[install DOC_BUILD deps]"
 
 pip install pandas-gbq
 
-conda install -n pandas -c conda-forge feather-format pyarrow nbsphinx pandoc
+conda install -n pandas -c conda-forge feather-format pyarrow nbsphinx pandoc fastparquet
 
 conda install -n pandas -c r r rpy2 --yes

doc/source/io.rst (+18, -18)

@@ -213,7 +213,7 @@ buffer_lines : int, default None
 .. deprecated:: 0.19.0
 
    Argument removed because its value is not respected by the parser
-
+
 compact_ints : boolean, default False
 
 .. deprecated:: 0.19.0

@@ -4093,7 +4093,7 @@ control compression: ``complevel`` and ``complib``.
 ``complevel`` specifies if and how hard data is to be compressed.
 ``complevel=0`` and ``complevel=None`` disables
 compression and ``0<complevel<10`` enables compression.
-
+
 ``complib`` specifies which compression library to use. If nothing is
 specified the default library ``zlib`` is used. A
 compression library usually optimizes for either good

@@ -4108,9 +4108,9 @@ control compression: ``complevel`` and ``complib``.
 - `blosc <http://www.blosc.org/>`_: Fast compression and decompression.
 
 .. versionadded:: 0.20.2
-
+
    Support for alternative blosc compressors:
-
+
    - `blosc:blosclz <http://www.blosc.org/>`_ This is the
      default compressor for ``blosc``
    - `blosc:lz4
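
The ``complevel``/``complib`` options this hunk documents are ordinary ``to_hdf``/``HDFStore`` keywords; a minimal usage sketch (file names illustrative, assumes PyTables is installed):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(10, 2), columns=list('AB'))

    # default zlib library at a moderate level
    df.to_hdf('store_zlib.h5', 'df', complevel=5, complib='zlib')

    # blosc, the fast compressor described above
    df.to_hdf('store_blosc.h5', 'df', complevel=9, complib='blosc')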
@@ -4559,28 +4559,30 @@ Parquet
 
 .. versionadded:: 0.21.0
 
-Parquet provides a sharded binary columnar serialization for data frames. It is designed to make reading and writing data
-frames efficient, and to make sharing data across data analysis languages easy. Parquet can use a
-variety of compression techniques to shrink the file size as much as possible while still maintaining good read performance.
+`Parquet <https://parquet.apache.org/>`__ provides a partitioned binary columnar serialization for data frames. It is designed to
+make reading and writing data frames efficient, and to make sharing data across data analysis
+languages easy. Parquet can use a variety of compression techniques to shrink the file size as much as possible
+while still maintaining good read performance.
 
-Parquet is designed to faithfully serialize and de-serialize DataFrames, supporting all of the pandas
-dtypes, including extension dtypes such as categorical and datetime with tz.
+Parquet is designed to faithfully serialize and de-serialize ``DataFrame`` s, supporting all of the pandas
+dtypes, including extension dtypes such as datetime with tz.
 
 Several caveats.
 
 - The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an
-  error if a non-default one is provided. You can simply ``.reset_index()`` in order to store the index.
+  error if a non-default one is provided. You can simply ``.reset_index(drop=True)`` in order to store the index.
 - Duplicate column names and non-string column names are not supported
+- Categorical dtypes are currently not supported (for ``pyarrow``).
 - Unsupported types include ``Period`` and actual python object types. These will raise a helpful error message
   on an attempt at serialization.
 
+You can specify an ``engine`` to direct the serialization, defaulting to ``pyarrow`` and controlled by the option ``io.parquet.engine``.
 See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__ and `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__
 
 .. note::
 
    These engines are very similar and should read/write nearly identical parquet format files.
    These libraries differ by having different underlying dependencies (``fastparquet`` by using ``numba``, while ``pyarrow`` uses a c-library).
-   TODO: differing options to write non-standard columns & null treatment
 
 .. ipython:: python
 
@@ -4589,10 +4591,9 @@ See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__ and
                      'c': np.arange(3, 6).astype('u1'),
                      'd': np.arange(4.0, 7.0, dtype='float64'),
                      'e': [True, False, True],
-                     'f': pd.Categorical(list('abc')),
-                     'g': pd.date_range('20130101', periods=3),
-                     'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
-                     'i': pd.date_range('20130101', periods=3, freq='ns')})
+                     'f': pd.date_range('20130101', periods=3),
+                     'g': pd.date_range('20130101', periods=3, tz='US/Eastern'),
+                     'h': pd.date_range('20130101', periods=3, freq='ns')})
 
    df
    df.dtypes

@@ -4608,10 +4609,9 @@ Read from a parquet file.
 
 .. ipython:: python
 
-   result = pd.read_parquet('example_pa.parquet')
-   result = pd.read_parquet('example_fp.parquet')
+   result = pd.read_parquet('example_pa.parquet', engine='pyarrow')
+   result = pd.read_parquet('example_fp.parquet', engine='fastparquet')
 
-   # we preserve dtypes
    result.dtypes
 
 .. ipython:: python
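
Pulling the documented snippets together, a minimal round-trip sketch (assumes pandas >= 0.21.0 with both engines installed; file names mirror the docs):

    import pandas as pd

    df = pd.DataFrame({'a': list('abc'),
                       'b': list(range(1, 4)),
                       'f': pd.date_range('20130101', periods=3)})

    df.to_parquet('example_pa.parquet', engine='pyarrow')
    df.to_parquet('example_fp.parquet', engine='fastparquet')

    # dtypes survive the round trip
    result = pd.read_parquet('example_pa.parquet', engine='pyarrow')
    result.dtypes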

pandas/core/frame.py (+5, -4)

@@ -1601,7 +1601,7 @@ def to_feather(self, fname):
     def to_parquet(self, fname, engine=None, compression='snappy',
                    **kwargs):
         """
-        write out the binary parquet for DataFrames
+        Write a DataFrame to the binary parquet format.
 
         .. versionadded:: 0.21.0
 

@@ -1611,11 +1611,12 @@ def to_parquet(self, fname, engine=None, compression='snappy',
             string file path
         engine : str, optional
             The parquet engine, one of {'pyarrow', 'fastparquet'}
-            if None, will use the option: `io.parquet.engine`
+            If None, will use the option: `io.parquet.engine`, which
+            defaults to 'pyarrow'
         compression : str, optional, default 'snappy'
             compression method, includes {'gzip', 'snappy', 'brotli'}
-        kwargs passed to the engine
-
+        kwargs
+            Additional keyword arguments passed to the engine
         """
         from pandas.io.parquet import to_parquet
         to_parquet(self, fname, engine,
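
A short usage sketch of the signature documented above (output path illustrative):

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3], 'B': 'foo'})

    # engine=None falls back to the io.parquet.engine option ('pyarrow' by default);
    # compression accepts 'snappy' (the default), 'gzip', or 'brotli'
    df.to_parquet('out.parquet', compression='gzip')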

pandas/io/feather_format.py (+2, -2)

@@ -19,7 +19,7 @@ def _try_import():
                           "you can install via conda\n"
                           "conda install feather-format -c conda-forge\n"
                           "or via pip\n"
-                          "pip install feather-format\n")
+                          "pip install -U feather-format\n")
 
     try:
         feather.__version__ >= LooseVersion('0.3.1')

@@ -29,7 +29,7 @@ def _try_import():
                           "you can install via conda\n"
                           "conda install feather-format -c conda-forge"
                           "or via pip\n"
-                          "pip install feather-format\n")
+                          "pip install -U feather-format\n")
 
     return feather
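
The guarded-import pattern in ``_try_import`` generalizes; a hedged sketch with a hypothetical helper (``_require`` is not a pandas function):

    import importlib
    from distutils.version import LooseVersion

    def _require(name, min_version):
        # hypothetical: import lazily, then enforce a minimum version
        try:
            mod = importlib.import_module(name)
        except ImportError:
            raise ImportError("%s is required\n\n"
                              "pip install -U %s\n" % (name, name))
        if LooseVersion(mod.__version__) < LooseVersion(min_version):
            raise ImportError("%s >= %s is required\n\n"
                              "pip install -U %s\n" % (name, min_version, name))
        return mod

    feather = _require('feather', '0.3.1')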

pandas/io/parquet.py (+8, -6)

@@ -36,15 +36,15 @@ def __init__(self):
                               "you can install via conda\n"
                               "conda install pyarrow -c conda-forge\n"
                               "\nor via pip\n"
-                              "pip install pyarrow\n")
+                              "pip install -U pyarrow\n")
 
         if LooseVersion(pyarrow.__version__) < '0.4.1':
             raise ImportError("pyarrow >= 0.4.1 is required for parquet"
                               "support\n\n"
                               "you can install via conda\n"
                               "conda install pyarrow -c conda-forge\n"
                               "\nor via pip\n"
-                              "pip install pyarrow\n")
+                              "pip install -U pyarrow\n")
 
         self.api = pyarrow
 

@@ -72,15 +72,15 @@ def __init__(self):
                               "you can install via conda\n"
                               "conda install fastparquet -c conda-forge\n"
                               "\nor via pip\n"
-                              "pip install fastparquet")
+                              "pip install -U fastparquet")
 
         if LooseVersion(fastparquet.__version__) < '0.1.0':
             raise ImportError("fastparquet >= 0.1.0 is required for parquet "
                               "support\n\n"
                               "you can install via conda\n"
                               "conda install fastparquet -c conda-forge\n"
                               "\nor via pip\n"
-                              "pip install fastparquet")
+                              "pip install -U fastparquet")
 
         self.api = fastparquet
 

@@ -109,10 +109,12 @@ def to_parquet(df, path, engine=None, compression='snappy', **kwargs):
         File path
     engine : str, optional
         The parquet engine, one of {'pyarrow', 'fastparquet'}
-        if None, will use the option: `io.parquet.engine`
+        If None, will use the option: `io.parquet.engine`, which
+        defaults to 'pyarrow'
     compression : str, optional, default 'snappy'
         compression method, includes {'gzip', 'snappy', 'brotli'}
-    kwargs are passed to the engine
+    kwargs
+        Additional keyword arguments passed to the engine
     """
 
     impl = get_engine(engine)
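
The option fallback documented here can be exercised directly; a small sketch (assumes fastparquet is installed and the ``io.parquet.engine`` option this change set documents):

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3]})

    # set the default engine once, then omit engine= in later calls
    pd.set_option('io.parquet.engine', 'fastparquet')
    df.to_parquet('out_fp.parquet')            # engine=None -> uses the option
    result = pd.read_parquet('out_fp.parquet')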

pandas/tests/io/test_parquet.py (+19, -4)

@@ -54,6 +54,20 @@ def df_compat():
     return pd.DataFrame({'A': [1, 2, 3], 'B': 'foo'})
 
 
+@pytest.fixture
+def df_cross_compat():
+    df = pd.DataFrame({'a': list('abc'),
+                       'b': list(range(1, 4)),
+                       'c': np.arange(3, 6).astype('u1'),
+                       'd': np.arange(4.0, 7.0, dtype='float64'),
+                       'e': [True, False, True],
+                       'f': pd.date_range('20130101', periods=3),
+                       'g': pd.date_range('20130101', periods=3,
+                                          tz='US/Eastern'),
+                       'h': pd.date_range('20130101', periods=3, freq='ns')})
+    return df
+
+
 def test_invalid_engine(df_compat):
 
     with pytest.raises(ValueError):

@@ -87,21 +101,22 @@ def test_options_fp(df_compat, fp):
 
 
 @pytest.mark.xfail(reason="fp does not ignore pa index __index_level_0__")
-def test_cross_engine_pa_fp(df_compat, pa, fp):
+def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
     # cross-compat with differing reading/writing engines
 
-    df = df_compat
+    df = df_cross_compat
     with tm.ensure_clean() as path:
         df.to_parquet(path, engine=pa, compression=None)
 
         result = read_parquet(path, engine=fp, compression=None)
         tm.assert_frame_equal(result, df)
 
 
-def test_cross_engine_fp_pa(df_compat, pa, fp):
+@pytest.mark.xfail(reason="pyarrow reading fp in some cases")
+def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
     # cross-compat with differing reading/writing engines
 
-    df = df_compat
+    df = df_cross_compat
     with tm.ensure_clean() as path:
         df.to_parquet(path, engine=fp, compression=None)
 
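For reference, the ``xfail`` markers added here record known cross-engine gaps as expected failures rather than hard errors; a minimal sketch of the mechanism:

    import pytest

    @pytest.mark.xfail(reason="illustrative: known interop gap")
    def test_known_gap():
        # reported as XFAIL when it fails, XPASS if it unexpectedly passes
        assert False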