-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: add use_nullable_dtypes option in read_parquet #31242
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
b3053fd
434442a
f617a7e
af81de9
60a1c0e
18c93b5
0f691be
46932f4
1375bad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
""" parquet compat """ | ||
|
||
from distutils.version import LooseVersion | ||
from typing import Any, Dict, Optional | ||
from warnings import catch_warnings | ||
|
||
|
@@ -116,13 +117,32 @@ def write( | |
**kwargs, | ||
) | ||
|
||
def read(self, path, columns=None, **kwargs): | ||
def read(self, path, columns=None, use_nullable_dtypes=False, **kwargs): | ||
path, _, _, should_close = get_filepath_or_buffer(path) | ||
|
||
kwargs["use_pandas_metadata"] = True | ||
result = self.api.parquet.read_table( | ||
path, columns=columns, **kwargs | ||
).to_pandas() | ||
to_pandas_kwargs = {} | ||
if use_nullable_dtypes: | ||
if LooseVersion(self.api.__version__) > "0.15.1.dev": | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
import pandas as pd | ||
|
||
mapping = { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you instead import these from the arrays locations? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We also import, e.g., DataFrame from the main namespace in this file. |
||
self.api.int8(): pd.Int8Dtype(), | ||
self.api.int16(): pd.Int16Dtype(), | ||
self.api.int32(): pd.Int32Dtype(), | ||
self.api.int64(): pd.Int64Dtype(), | ||
self.api.uint8(): pd.UInt8Dtype(), | ||
self.api.uint16(): pd.UInt16Dtype(), | ||
self.api.uint32(): pd.UInt32Dtype(), | ||
self.api.uint64(): pd.UInt64Dtype(), | ||
self.api.bool_(): pd.BooleanDtype(), | ||
self.api.string(): pd.StringDtype(), | ||
} | ||
to_pandas_kwargs["types_mapper"] = mapping.get | ||
|
||
result = self.api.parquet.read_table(path, columns=columns, **kwargs).to_pandas( | ||
**to_pandas_kwargs | ||
) | ||
if should_close: | ||
path.close() | ||
|
||
|
@@ -184,6 +204,12 @@ def write( | |
) | ||
|
||
def read(self, path, columns=None, **kwargs): | ||
use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should have a global option to turn this on (please add an issue for this). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this is generally worth it, if you can add an issue for this / PR welcome too! (Not blocking for this PR.) |
||
if use_nullable_dtypes: | ||
raise ValueError( | ||
"The 'use_nullable_dtypes' argument is not supported for the " | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"fastparquet engine" | ||
) | ||
if is_s3_url(path): | ||
from pandas.io.s3 import get_file_and_filesystem | ||
|
||
|
@@ -263,7 +289,13 @@ def to_parquet( | |
) | ||
|
||
|
||
def read_parquet(path, engine: str = "auto", columns=None, **kwargs): | ||
def read_parquet( | ||
path, | ||
engine: str = "auto", | ||
columns=None, | ||
use_nullable_dtypes: bool = False, | ||
**kwargs, | ||
): | ||
""" | ||
Load a parquet object from the file path, returning a DataFrame. | ||
|
||
|
@@ -296,6 +328,11 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): | |
If not None, only these columns will be read from the file. | ||
|
||
.. versionadded:: 0.21.1 | ||
use_nullable_dtypes : bool, default False | ||
If True, use dtypes that use ``pd.NA`` as missing value indicator | ||
for the resulting DataFrame (only applicable for ``engine="pyarrow"``). | ||
As new dtypes are added that support ``pd.NA`` in the future, the | ||
output with this option will change to use those dtypes. | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
**kwargs | ||
Any additional kwargs are passed to the engine. | ||
|
||
|
@@ -305,4 +342,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): | |
""" | ||
|
||
impl = get_engine(engine) | ||
return impl.read(path, columns=columns, **kwargs) | ||
return impl.read( | ||
path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -564,6 +564,31 @@ def test_additional_extension_types(self, pa): | |
) | ||
check_round_trip(df, pa) | ||
|
||
@td.skip_if_no("pyarrow", min_version="0.15.1.dev") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same comment as above. |
||
def test_use_nullable_dtypes(self, pa): | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
import pyarrow.parquet as pq | ||
|
||
table = pyarrow.table( | ||
{ | ||
"a": pyarrow.array([1, 2, 3, None], "int64"), | ||
"b": pyarrow.array(["a", "b", "c", None]), | ||
} | ||
) | ||
with tm.ensure_clean() as path: | ||
# write manually with pyarrow to write integers | ||
pq.write_table(table, path) | ||
result1 = read_parquet(path) | ||
result2 = read_parquet(path, use_nullable_dtypes=True) | ||
|
||
assert result1["a"].dtype == np.dtype("float64") | ||
expected = pd.DataFrame( | ||
{ | ||
"a": pd.array([1, 2, 3, None], dtype="Int64"), | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"b": pd.array(["a", "b", "c", None], dtype="string"), | ||
} | ||
) | ||
tm.assert_frame_equal(result2, expected) | ||
|
||
|
||
class TestParquetFastParquet(Base): | ||
@td.skip_if_no("fastparquet", min_version="0.3.2") | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about `use_extension_dtypes` as a more descriptive name?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It doesn't use extension dtypes in general, only those types that use `pd.NA`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See also #29752 for some discussion about naming this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FWIW, I would also prefer `use_extension_dtypes`.