-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: add use_nullable_dtypes option in read_parquet #31242
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b3053fd
434442a
f617a7e
af81de9
60a1c0e
18c93b5
0f691be
46932f4
1375bad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
""" parquet compat """ | ||
|
||
from distutils.version import LooseVersion | ||
import io | ||
import os | ||
from typing import Any, AnyStr, Dict, List, Optional, Tuple | ||
|
@@ -177,10 +178,39 @@ def write( | |
handles.close() | ||
|
||
def read( | ||
self, path, columns=None, storage_options: StorageOptions = None, **kwargs | ||
self, | ||
path, | ||
columns=None, | ||
use_nullable_dtypes=False, | ||
storage_options: StorageOptions = None, | ||
**kwargs, | ||
): | ||
kwargs["use_pandas_metadata"] = True | ||
|
||
to_pandas_kwargs = {} | ||
if use_nullable_dtypes: | ||
if LooseVersion(self.api.__version__) >= "0.16": | ||
import pandas as pd | ||
|
||
mapping = { | ||
self.api.int8(): pd.Int8Dtype(), | ||
self.api.int16(): pd.Int16Dtype(), | ||
self.api.int32(): pd.Int32Dtype(), | ||
self.api.int64(): pd.Int64Dtype(), | ||
self.api.uint8(): pd.UInt8Dtype(), | ||
self.api.uint16(): pd.UInt16Dtype(), | ||
self.api.uint32(): pd.UInt32Dtype(), | ||
self.api.uint64(): pd.UInt64Dtype(), | ||
self.api.bool_(): pd.BooleanDtype(), | ||
self.api.string(): pd.StringDtype(), | ||
} | ||
to_pandas_kwargs["types_mapper"] = mapping.get | ||
else: | ||
raise ValueError( | ||
"'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 " | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
f"({self.api.__version__} is installed" | ||
) | ||
|
||
path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle( | ||
path, | ||
kwargs.pop("filesystem", None), | ||
|
@@ -190,7 +220,7 @@ def read( | |
try: | ||
return self.api.parquet.read_table( | ||
path_or_handle, columns=columns, **kwargs | ||
).to_pandas() | ||
).to_pandas(**to_pandas_kwargs) | ||
finally: | ||
if handles is not None: | ||
handles.close() | ||
|
@@ -258,6 +288,12 @@ def write( | |
def read( | ||
self, path, columns=None, storage_options: StorageOptions = None, **kwargs | ||
): | ||
use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we should have a global option to turn this on (pls add an issue for this) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this generally worth it, if you can add an issue for this / PR welcome too! (bot blocking for this PR) |
||
if use_nullable_dtypes: | ||
raise ValueError( | ||
"The 'use_nullable_dtypes' argument is not supported for the " | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"fastparquet engine" | ||
) | ||
path = stringify_path(path) | ||
parquet_kwargs = {} | ||
handles = None | ||
|
@@ -368,7 +404,13 @@ def to_parquet( | |
return None | ||
|
||
|
||
def read_parquet(path, engine: str = "auto", columns=None, **kwargs): | ||
def read_parquet( | ||
path, | ||
engine: str = "auto", | ||
columns=None, | ||
use_nullable_dtypes: bool = False, | ||
**kwargs, | ||
): | ||
""" | ||
Load a parquet object from the file path, returning a DataFrame. | ||
|
||
|
@@ -397,6 +439,15 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): | |
'pyarrow' is unavailable. | ||
columns : list, default=None | ||
If not None, only these columns will be read from the file. | ||
use_nullable_dtypes : bool, default False | ||
If True, use dtypes that use ``pd.NA`` as missing value indicator | ||
for the resulting DataFrame (only applicable for ``engine="pyarrow"``). | ||
As new dtypes are added that support ``pd.NA`` in the future, the | ||
output with this option will change to use those dtypes. | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Note: this is an experimental option, and behaviour (e.g. additional | ||
support dtypes) may change without notice. | ||
|
||
.. versionadded:: 1.2.0 | ||
**kwargs | ||
Any additional kwargs are passed to the engine. | ||
|
||
|
@@ -405,4 +456,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): | |
DataFrame | ||
""" | ||
impl = get_engine(engine) | ||
return impl.read(path, columns=columns, **kwargs) | ||
return impl.read( | ||
path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you instead import from the arrays locations.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We also import eg DataFrame from the main namespace in this file