ENH: add use_nullable_dtypes option in read_parquet #31242

Merged: 9 commits, Nov 29, 2020
51 changes: 45 additions & 6 deletions pandas/io/parquet.py
@@ -1,5 +1,6 @@
 """ parquet compat """

+from distutils.version import LooseVersion
 from typing import Any, Dict, Optional
 from warnings import catch_warnings
@@ -116,13 +117,32 @@ def write(
             **kwargs,
         )

-    def read(self, path, columns=None, **kwargs):
+    def read(self, path, columns=None, use_nullable_dtypes=False, **kwargs):
Contributor: How about use_extension_dtypes as a more descriptive name?

Member Author:
> How about use_extension_dtypes as a more descriptive name?

It doesn't use extension dtypes in general, only those dtypes that use pd.NA.
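For readers following the thread, a quick illustration of the distinction being drawn (not part of the diff): nullable dtypes such as Int64 use pd.NA as the missing value indicator, while other extension dtypes, e.g. timezone-aware datetimes, do not.

```python
import pandas as pd

# Nullable ("masked") dtypes use pd.NA as their missing value indicator:
s = pd.Series([1, 2, None], dtype="Int64")
assert s[2] is pd.NA

# Other extension dtypes do not use pd.NA; tz-aware datetimes, for example,
# represent missing values as pd.NaT:
t = pd.Series(pd.to_datetime(["2020-01-01", None]).tz_localize("UTC"))
assert str(t.dtype) == "datetime64[ns, UTC]"
assert t[1] is pd.NaT
```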

Member Author: See also #29752 for some discussion about naming this.

Member: FWIW I would also prefer use_extension_dtypes.

         path, _, _, should_close = get_filepath_or_buffer(path)

         kwargs["use_pandas_metadata"] = True
-        result = self.api.parquet.read_table(
-            path, columns=columns, **kwargs
-        ).to_pandas()
+        to_pandas_kwargs = {}
+        if use_nullable_dtypes:
+            if LooseVersion(self.api.__version__) > "0.15.1.dev":
+                import pandas as pd
+
+                mapping = {
Contributor: Can you instead import these from the arrays locations?
Member Author: We also import e.g. DataFrame from the main namespace in this file.
+                    self.api.int8(): pd.Int8Dtype(),
+                    self.api.int16(): pd.Int16Dtype(),
+                    self.api.int32(): pd.Int32Dtype(),
+                    self.api.int64(): pd.Int64Dtype(),
+                    self.api.uint8(): pd.UInt8Dtype(),
+                    self.api.uint16(): pd.UInt16Dtype(),
+                    self.api.uint32(): pd.UInt32Dtype(),
+                    self.api.uint64(): pd.UInt64Dtype(),
+                    self.api.bool_(): pd.BooleanDtype(),
+                    self.api.string(): pd.StringDtype(),
+                }
+                to_pandas_kwargs["types_mapper"] = mapping.get
+
+        result = self.api.parquet.read_table(path, columns=columns, **kwargs).to_pandas(
+            **to_pandas_kwargs
+        )
         if should_close:
             path.close()
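As context for the types_mapper usage above: pyarrow's Table.to_pandas accepts a types_mapper callable that is consulted for each Arrow column type; returning a pandas extension dtype overrides the default conversion, and returning None falls back to it. A minimal standalone sketch of the mechanism (mirroring the names in the diff):

```python
import pyarrow as pa
import pandas as pd

table = pa.table({"a": pa.array([1, None, 3], type=pa.int64())})

# dict.get returns None for Arrow types not in the mapping, which tells
# to_pandas to fall back to its default conversion for those columns.
mapping = {pa.int64(): pd.Int64Dtype()}

df = table.to_pandas(types_mapper=mapping.get)
print(df["a"].dtype)  # Int64, with pd.NA for the missing value
```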

@@ -184,6 +204,12 @@ def write(
         )

     def read(self, path, columns=None, **kwargs):
+        use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
Contributor: We should have a global option to turn this on (please add an issue for this).

Contributor: I think this is generally worth it; if you can, add an issue for this (a PR is welcome too!). Not blocking for this PR.

+        if use_nullable_dtypes:
+            raise ValueError(
+                "The 'use_nullable_dtypes' argument is not supported for the "
+                "fastparquet engine"
+            )
         if is_s3_url(path):
             from pandas.io.s3 import get_file_and_filesystem
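On the reviewers' global-option suggestion: a hypothetical sketch of how a keyword could defer to such an option. The option name "mode.use_nullable_dtypes" and the helper are invented for illustration; no such option exists as of this PR.

```python
import pandas as pd

def _resolve_use_nullable_dtypes(explicit=None):
    # An explicitly passed keyword always wins over any global default.
    if explicit is not None:
        return explicit
    try:
        # Hypothetical option name; not registered in pandas at this point.
        return pd.get_option("mode.use_nullable_dtypes")
    except Exception:
        # Option not registered: fall back to the conservative default.
        return False
```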

@@ -263,7 +289,13 @@ def to_parquet(
     )


-def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
+def read_parquet(
+    path,
+    engine: str = "auto",
+    columns=None,
+    use_nullable_dtypes: bool = False,
+    **kwargs,
+):
"""
Load a parquet object from the file path, returning a DataFrame.

@@ -296,6 +328,11 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
         If not None, only these columns will be read from the file.

         .. versionadded:: 0.21.1
+    use_nullable_dtypes : bool, default False
+        If True, use dtypes that use ``pd.NA`` as the missing value indicator
+        for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
+        As new dtypes that support ``pd.NA`` are added in the future, the
+        output with this option enabled will change to use those dtypes.
     **kwargs
         Any additional kwargs are passed to the engine.
@@ -305,4 +342,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
"""

impl = get_engine(engine)
return impl.read(path, columns=columns, **kwargs)
return impl.read(
path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs
)
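End to end, the new keyword behaves as in the test below: write a file containing integers with nulls directly with pyarrow (so that no pandas metadata fixes the dtypes), then read it back both ways. A usage sketch (the file name is illustrative):

```python
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

table = pa.table(
    {
        "a": pa.array([1, 2, None], type=pa.int64()),
        "b": pa.array(["x", "y", None]),
    }
)
pq.write_table(table, "data.parquet")

# Default: the missing integer forces float64 (NaN); strings come back as object.
print(pd.read_parquet("data.parquet").dtypes)

# With the new keyword: nullable dtypes backed by pd.NA.
print(pd.read_parquet("data.parquet", use_nullable_dtypes=True).dtypes)
# a     Int64
# b    string
```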
25 changes: 25 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -564,6 +564,31 @@ def test_additional_extension_types(self, pa):
         )
         check_round_trip(df, pa)

+    @td.skip_if_no("pyarrow", min_version="0.15.1.dev")
Contributor: Same comment as above.

+    def test_use_nullable_dtypes(self, pa):
+        import pyarrow.parquet as pq
+
+        table = pyarrow.table(
+            {
+                "a": pyarrow.array([1, 2, 3, None], "int64"),
+                "b": pyarrow.array(["a", "b", "c", None]),
+            }
+        )
+        with tm.ensure_clean() as path:
+            # write manually with pyarrow to write integers
+            pq.write_table(table, path)
+            result1 = read_parquet(path)
+            result2 = read_parquet(path, use_nullable_dtypes=True)
+
+        assert result1["a"].dtype == np.dtype("float64")
+        expected = pd.DataFrame(
+            {
+                "a": pd.array([1, 2, 3, None], dtype="Int64"),
+                "b": pd.array(["a", "b", "c", None], dtype="string"),
+            }
+        )
+        tm.assert_frame_equal(result2, expected)


 class TestParquetFastParquet(Base):
     @td.skip_if_no("fastparquet", min_version="0.3.2")