-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: general concat with ExtensionArrays through find_common_type #33607
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
3464e95
b1d9d68
bb398e7
83fdc91
7f2ac2a
d0f90de
2d5fcb0
a68206b
fc98b65
b072591
91c984a
2a2b9d5
8893165
e19e3ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2349,9 +2349,9 @@ def _can_hold_na(self): | |
|
||
@classmethod | ||
def _concat_same_type(self, to_concat): | ||
from pandas.core.dtypes.concat import concat_categorical | ||
from pandas.core.dtypes.concat import union_categoricals | ||
|
||
return concat_categorical(to_concat) | ||
return union_categoricals(to_concat) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIRC only a relatively small part of the logic of concat_categorical/union_categoricals is actually needed here. I'd prefer for that to live here and for union_categoricals to call it, rather than the other way around (since union_categoricals handles a lot of cases). Could be considered orthogonally to this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, that's indeed a good idea (union_categoricals does much more than is needed here) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are you planning to update this, or is that a topic for a separate PR? |
||
|
||
def isin(self, values): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,11 @@ | ||
import numbers | ||
from typing import TYPE_CHECKING, Tuple, Type, Union | ||
from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import lib, missing as libmissing | ||
from pandas._typing import ArrayLike | ||
from pandas._typing import ArrayLike, DtypeObj | ||
from pandas.compat import set_function_name | ||
from pandas.util._decorators import cache_readonly | ||
|
||
|
@@ -95,6 +95,15 @@ def construct_array_type(cls) -> Type["IntegerArray"]: | |
""" | ||
return IntegerArray | ||
|
||
def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this be common_type or common_dtype? we've been loose about this distinction so far and i think it has caused ambiguity There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't care that much. I mainly used "type", because it is meant to be used in (that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Renamed to "common_dtype" instead of "common_type". The internal function that uses this is still There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for indulging me on this nitpick There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this a private method on the Dtype? get_common_type (or get_common_dtype) seems fine |
||
# for now only handle other integer types | ||
if not all(isinstance(t, _IntegerDtype) for t in dtypes): | ||
return None | ||
np_dtype = np.find_common_type([t.numpy_dtype for t in dtypes], []) | ||
if np.issubdtype(np_dtype, np.integer): | ||
return _dtypes[str(np_dtype)] | ||
return None | ||
TomAugspurger marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def __from_arrow__( | ||
self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] | ||
) -> "IntegerArray": | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
|
||
import numpy as np | ||
|
||
from pandas._typing import DtypeObj | ||
from pandas.errors import AbstractMethodError | ||
|
||
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries | ||
|
@@ -322,3 +323,34 @@ def _is_boolean(self) -> bool: | |
bool | ||
""" | ||
return False | ||
|
||
def _get_common_type(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mmm can we keep the return type as Oh... I suppose tz-naive DatetimeArray might break this, since it wants to return a NumPy dtype... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, that was my first thought as well. But, right now, eg Categorical can end up with any kind of numpy dtype (depending on the dtype of its categories). As long as not yet all dtypes have a EA version, I don't think it is feasible to require ExtensionDtype here |
||
""" | ||
Return the common dtype, if one exists. | ||
|
||
Used in `find_common_type` implementation. This is for example used | ||
to determine the resulting dtype in a concat operation. | ||
|
||
If no common dtype exists, return None. If all dtypes in the list | ||
will return None, then the common dtype will be "object" dtype. | ||
|
||
Parameters | ||
---------- | ||
dtypes : list of dtypes | ||
The dtypes for which to determine a common dtype. This is a list | ||
of np.dtype or ExtensionDtype instances. | ||
|
||
Returns | ||
------- | ||
Common dtype (np.dtype or ExtensionDtype) or None | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
# QUESTIONS: | ||
# - do we guarantee that `dtypes` is already deduplicated? (list of uniques) | ||
# - do we call this method if `len(dtypes) == 1`, or does this method | ||
# need to handle that case | ||
# - does this method need to handle "non-fully-initialized" dtypes? | ||
if len(set(dtypes)) == 1: | ||
# only itself | ||
return self | ||
else: | ||
return None |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,9 @@ | |
|
||
import numpy as np | ||
|
||
from pandas._typing import ArrayLike, DtypeObj | ||
|
||
from pandas.core.dtypes.cast import find_common_type | ||
from pandas.core.dtypes.common import ( | ||
is_bool_dtype, | ||
is_categorical_dtype, | ||
|
@@ -17,6 +20,9 @@ | |
) | ||
from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries | ||
|
||
from pandas.core.arrays import ExtensionArray | ||
from pandas.core.construction import array | ||
|
||
|
||
def get_dtype_kinds(l): | ||
""" | ||
|
@@ -58,6 +64,40 @@ def get_dtype_kinds(l): | |
return typs | ||
|
||
|
||
def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this private? everything else is not, so this seems odd There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is an internal helper function for this file, which we indicate by using a leading underscore (the other functions here are used elsewhere in the pandas codebase) |
||
""" | ||
Helper function for `arr.astype(common_type)` but handling all special | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
cases. | ||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do you have a plan in mind to make this function unnecessary? it is special-casing pandas-internal EAs which i think we want to avoid (Not a blocker for this PR because AFAICT this is necessary and we do this in the status quo anyway, just curious) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, so this is indeed only needed to preserve value-based specific behaviour, not because I like this :) Ideally we get rid of this, but I am not sure it is entirely possible with the numpy integer dtypes, since it will always depend on the presence of NaNs whether it can preserve int or not. The way to go here is probably the nullable integer dtypes to have consistent behaviour. |
||
if ( | ||
is_categorical_dtype(arr.dtype) | ||
and isinstance(dtype, np.dtype) | ||
and np.issubdtype(dtype, np.integer) | ||
): | ||
# problem case: categorical of int -> gives int as result dtype, | ||
# but categorical can contain NAs -> fall back to object dtype | ||
try: | ||
return arr.astype(dtype, copy=False) | ||
except ValueError: | ||
return arr.astype(object, copy=False) | ||
|
||
if ( | ||
isinstance(arr, np.ndarray) | ||
and arr.dtype.kind in ["m", "M"] | ||
and dtype is np.dtype("object") | ||
): | ||
# wrap datetime-likes in EA to ensure astype(object) gives Timestamp/Timedelta | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 quality comment |
||
# this can happen when concat_compat is called directly on arrays (when arrays | ||
# are not coming from Index/Series._values), eg in BlockManager.quantile | ||
arr = array(arr) | ||
|
||
if is_extension_array_dtype(dtype): | ||
if isinstance(arr, np.ndarray): | ||
# numpy's astype cannot handle ExtensionDtypes | ||
return array(arr, dtype=dtype, copy=False) | ||
return arr.astype(dtype, copy=False) | ||
|
||
|
||
def concat_compat(to_concat, axis: int = 0): | ||
""" | ||
provide concatenation of an array of arrays each of which is a single | ||
|
@@ -93,28 +133,25 @@ def is_nonempty(x) -> bool: | |
|
||
typs = get_dtype_kinds(to_concat) | ||
_contains_datetime = any(typ.startswith("datetime") for typ in typs) | ||
_contains_period = any(typ.startswith("period") for typ in typs) | ||
|
||
all_empty = not len(non_empties) | ||
single_dtype = len({x.dtype for x in to_concat}) == 1 | ||
any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) | ||
|
||
if any_ea and single_dtype and axis == 0: | ||
cls = type(to_concat[0]) | ||
return cls._concat_same_type(to_concat) | ||
if any_ea and axis == 0: | ||
if not single_dtype: | ||
target_dtype = find_common_type([x.dtype for x in to_concat]) | ||
to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] | ||
|
||
elif "category" in typs: | ||
# this must be prior to concat_datetime, | ||
# to support Categorical + datetime-like | ||
return concat_categorical(to_concat, axis=axis) | ||
if isinstance(to_concat[0], ExtensionArray): | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
cls = type(to_concat[0]) | ||
return cls._concat_same_type(to_concat) | ||
else: | ||
return np.concatenate(to_concat) | ||
|
||
elif _contains_datetime or "timedelta" in typs or _contains_period: | ||
elif _contains_datetime or "timedelta" in typs: | ||
return concat_datetime(to_concat, axis=axis, typs=typs) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if we do the DTA/TDA casting above, and do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so, because they are not using ExtensionDtype. |
||
|
||
# these are mandated to handle empties as well | ||
elif "sparse" in typs: | ||
return _concat_sparse(to_concat, axis=axis, typs=typs) | ||
|
||
elif any_ea and axis == 1: | ||
to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] | ||
return np.concatenate(to_concat, axis=axis) | ||
|
@@ -136,52 +173,6 @@ def is_nonempty(x) -> bool: | |
return np.concatenate(to_concat, axis=axis) | ||
|
||
|
||
def concat_categorical(to_concat, axis: int = 0): | ||
""" | ||
Concatenate an object/categorical array of arrays, each of which is a | ||
single dtype | ||
|
||
Parameters | ||
---------- | ||
to_concat : array of arrays | ||
axis : int | ||
Axis to provide concatenation in the current implementation this is | ||
always 0, e.g. we only have 1D categoricals | ||
|
||
Returns | ||
------- | ||
Categorical | ||
A single array, preserving the combined dtypes | ||
""" | ||
# we could have object blocks and categoricals here | ||
# if we only have a single categoricals then combine everything | ||
# else its a non-compat categorical | ||
categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] | ||
|
||
# validate the categories | ||
if len(categoricals) != len(to_concat): | ||
pass | ||
else: | ||
# when all categories are identical | ||
first = to_concat[0] | ||
if all(first.is_dtype_equal(other) for other in to_concat[1:]): | ||
return union_categoricals(categoricals) | ||
|
||
# extract the categoricals & coerce to object if needed | ||
to_concat = [ | ||
x._internal_get_values() | ||
if is_categorical_dtype(x.dtype) | ||
else np.asarray(x).ravel() | ||
if not is_datetime64tz_dtype(x) | ||
else np.asarray(x.astype(object)) | ||
for x in to_concat | ||
] | ||
result = concat_compat(to_concat) | ||
if axis == 1: | ||
result = result.reshape(1, len(result)) | ||
return result | ||
|
||
|
||
def union_categoricals( | ||
to_union, sort_categories: bool = False, ignore_order: bool = False | ||
): | ||
|
@@ -414,34 +405,3 @@ def _wrap_datetimelike(arr): | |
if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]: | ||
arr = pd_array(arr) | ||
return arr | ||
|
||
|
||
def _concat_sparse(to_concat, axis=0, typs=None): | ||
""" | ||
provide concatenation of an sparse/dense array of arrays each of which is a | ||
single dtype | ||
|
||
Parameters | ||
---------- | ||
to_concat : array of arrays | ||
axis : axis to provide concatenation | ||
typs : set of to_concat dtypes | ||
|
||
Returns | ||
------- | ||
a single array, preserving the combined dtypes | ||
""" | ||
from pandas.core.arrays import SparseArray | ||
|
||
fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)] | ||
fill_value = fill_values[0] | ||
|
||
# TODO: Fix join unit generation so we aren't passed this. | ||
to_concat = [ | ||
x | ||
if isinstance(x, SparseArray) | ||
else SparseArray(x.squeeze(), fill_value=fill_value) | ||
for x in to_concat | ||
] | ||
|
||
return SparseArray._concat_same_type(to_concat) |
Uh oh!
There was an error while loading. Please reload this page.