ENH: add use_nullable_dtypes option in read_parquet #31242


Merged (9 commits) on Nov 29, 2020
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
@@ -241,6 +241,10 @@ Other enhancements
- Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`).
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use
  nullable dtypes that use ``pd.NA`` as the missing value indicator where
  possible in the resulting DataFrame (default is ``False``, and only
  applicable for ``engine="pyarrow"``) (:issue:`31242`)
- Added :meth:`.Rolling.sem` and :meth:`Expanding.sem` to compute the standard error of the mean (:issue:`26476`)
- :meth:`.Rolling.var` and :meth:`.Rolling.std` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`)
- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`)
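A minimal usage sketch of the option described in the whatsnew entry above. It mirrors the test added later in this PR; the file name is illustrative, and pyarrow >= 0.16 is assumed:

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    # Write an int64 column containing a null directly with pyarrow, so
    # no pandas metadata is stored alongside the data.
    table = pa.table({"a": pa.array([1, 2, None], "int64")})
    pq.write_table(table, "example.parquet")

    default = pd.read_parquet("example.parquet")
    nullable = pd.read_parquet("example.parquet", use_nullable_dtypes=True)
    print(default["a"].dtype)   # float64 (NaN as missing value indicator)
    print(nullable["a"].dtype)  # Int64 (pd.NA as missing value indicator)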
61 changes: 57 additions & 4 deletions pandas/io/parquet.py
@@ -1,5 +1,6 @@
""" parquet compat """

from distutils.version import LooseVersion
import io
import os
from typing import Any, AnyStr, Dict, List, Optional, Tuple
@@ -177,10 +178,39 @@ def write(
handles.close()

def read(
self, path, columns=None, storage_options: StorageOptions = None, **kwargs
self,
path,
columns=None,
use_nullable_dtypes=False,
storage_options: StorageOptions = None,
**kwargs,
):
kwargs["use_pandas_metadata"] = True

to_pandas_kwargs = {}
if use_nullable_dtypes:
if LooseVersion(self.api.__version__) >= "0.16":
import pandas as pd

mapping = {
[Review comment, Contributor] Can you instead import from the arrays locations?
[Reply, Member (Author)] We also import e.g. ``DataFrame`` from the main namespace in this file.
self.api.int8(): pd.Int8Dtype(),
self.api.int16(): pd.Int16Dtype(),
self.api.int32(): pd.Int32Dtype(),
self.api.int64(): pd.Int64Dtype(),
self.api.uint8(): pd.UInt8Dtype(),
self.api.uint16(): pd.UInt16Dtype(),
self.api.uint32(): pd.UInt32Dtype(),
self.api.uint64(): pd.UInt64Dtype(),
self.api.bool_(): pd.BooleanDtype(),
self.api.string(): pd.StringDtype(),
}
to_pandas_kwargs["types_mapper"] = mapping.get
else:
raise ValueError(
"'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
f"({self.api.__version__} is installed"
)

path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
path,
kwargs.pop("filesystem", None),
@@ -190,7 +220,7 @@ def read(
try:
return self.api.parquet.read_table(
path_or_handle, columns=columns, **kwargs
).to_pandas()
).to_pandas(**to_pandas_kwargs)
finally:
if handles is not None:
handles.close()
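The ``types_mapper`` hook used above is the pyarrow extension point this feature relies on (pyarrow >= 0.16): ``Table.to_pandas`` calls the mapper for each Arrow type and uses the returned pandas extension dtype, falling back to the default conversion when the mapper returns ``None``. That fallback is why a plain ``mapping.get`` works as the mapper. A standalone sketch with illustrative data:

    import pandas as pd
    import pyarrow as pa

    mapping = {pa.int64(): pd.Int64Dtype(), pa.string(): pd.StringDtype()}
    table = pa.table(
        {
            "x": pa.array([1, None], "int64"),
            "y": pa.array(["a", None]),
            # float64 is not in the mapping: .get returns None, so the
            # default conversion (NumPy float64) is used.
            "z": pa.array([1.5, None]),
        }
    )
    df = table.to_pandas(types_mapper=mapping.get)
    print(df.dtypes)  # x: Int64, y: string, z: float64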
@@ -258,6 +288,12 @@ def write(
def read(
self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
[Review comment, Contributor] We should have a global option to turn this on (please add an issue for this).
[Review comment, Contributor] I think this is generally worth it, if you can add an issue for this / PR welcome too! (not blocking for this PR)

if use_nullable_dtypes:
raise ValueError(
"The 'use_nullable_dtypes' argument is not supported for the "
"fastparquet engine"
)
path = stringify_path(path)
parquet_kwargs = {}
handles = None
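Since read_parquet now always forwards ``use_nullable_dtypes`` to the engine, the fastparquet implementation pops it from ``kwargs`` and rejects ``True`` explicitly instead of silently ignoring it. A sketch of the resulting behaviour, assuming fastparquet is installed and "example.parquet" exists:

    import pandas as pd

    try:
        pd.read_parquet(
            "example.parquet", engine="fastparquet", use_nullable_dtypes=True
        )
    except ValueError as err:
        print(err)  # The 'use_nullable_dtypes' argument is not supported ...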
@@ -368,7 +404,13 @@ def to_parquet(
return None


def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
def read_parquet(
path,
engine: str = "auto",
columns=None,
use_nullable_dtypes: bool = False,
**kwargs,
):
"""
Load a parquet object from the file path, returning a DataFrame.

@@ -397,6 +439,15 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
'pyarrow' is unavailable.
columns : list, default=None
If not None, only these columns will be read from the file.
use_nullable_dtypes : bool, default False
If True, use dtypes that use ``pd.NA`` as missing value indicator
for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
As new dtypes are added that support ``pd.NA`` in the future, the
output with this option will change to use those dtypes.
Note: this is an experimental option, and behaviour (e.g. additional
supported dtypes) may change without notice.

.. versionadded:: 1.2.0
**kwargs
Any additional kwargs are passed to the engine.

@@ -405,4 +456,6 @@ def read_parquet(
DataFrame
"""
impl = get_engine(engine)
return impl.read(path, columns=columns, **kwargs)
return impl.read(
path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs
)
37 changes: 37 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -828,6 +828,35 @@ def test_additional_extension_types(self, pa):
)
check_round_trip(df, pa)

@td.skip_if_no("pyarrow", min_version="0.16")
def test_use_nullable_dtypes(self, pa):
import pyarrow.parquet as pq

table = pyarrow.table(
{
"a": pyarrow.array([1, 2, 3, None], "int64"),
"b": pyarrow.array([1, 2, 3, None], "uint8"),
"c": pyarrow.array(["a", "b", "c", None]),
"d": pyarrow.array([True, False, True, None]),
}
)
with tm.ensure_clean() as path:
# write manually with pyarrow so integer columns with nulls stay integer typed
pq.write_table(table, path)
result1 = read_parquet(path)
result2 = read_parquet(path, use_nullable_dtypes=True)

assert result1["a"].dtype == np.dtype("float64")
expected = pd.DataFrame(
{
"a": pd.array([1, 2, 3, None], dtype="Int64"),
"b": pd.array([1, 2, 3, None], dtype="UInt8"),
"c": pd.array(["a", "b", "c", None], dtype="string"),
"d": pd.array([True, False, True, None], dtype="boolean"),
}
)
tm.assert_frame_equal(result2, expected)

@td.skip_if_no("pyarrow", min_version="0.14")
def test_timestamp_nanoseconds(self, pa):
# with version 2.0, pyarrow defaults to writing the nanoseconds, so
@@ -1001,3 +1030,11 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
expected = df.copy()
expected.index.name = "index"
check_round_trip(df, fp, expected=expected)

def test_use_nullable_dtypes_not_supported(self, fp):
df = pd.DataFrame({"a": [1, 2]})

with tm.ensure_clean() as path:
df.to_parquet(path)
with pytest.raises(ValueError, match="not supported for the fastparquet"):
read_parquet(path, engine="fastparquet", use_nullable_dtypes=True)