-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: DataFrame reductions dtypes on object input #51335
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
d0bc834
873c309
c10c6b3
115b5c4
be8c27a
a93adf0
9471c13
ecee6cc
c9eaf90
acdaf4c
baff37e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -141,7 +141,6 @@ | |||||||
is_integer_dtype, | ||||||||
is_iterator, | ||||||||
is_list_like, | ||||||||
is_object_dtype, | ||||||||
is_scalar, | ||||||||
is_sequence, | ||||||||
needs_i8_conversion, | ||||||||
|
@@ -10461,54 +10460,44 @@ def _get_data() -> DataFrame: | |||||||
data = self._get_bool_data() | ||||||||
return data | ||||||||
|
||||||||
if numeric_only or axis == 0: | ||||||||
# For numeric_only non-None and axis non-None, we know | ||||||||
# which blocks to use and no try/except is needed. | ||||||||
# For numeric_only=None only the case with axis==0 and no object | ||||||||
# dtypes are unambiguous can be handled with BlockManager.reduce | ||||||||
# Case with EAs see GH#35881 | ||||||||
df = self | ||||||||
if numeric_only: | ||||||||
df = _get_data() | ||||||||
if axis == 1: | ||||||||
df = df.T | ||||||||
axis = 0 | ||||||||
|
||||||||
# After possibly _get_data and transposing, we are now in the | ||||||||
# simple case where we can use BlockManager.reduce | ||||||||
res = df._mgr.reduce(blk_func) | ||||||||
out = df._constructor(res).iloc[0] | ||||||||
if out_dtype is not None: | ||||||||
out = out.astype(out_dtype) | ||||||||
if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: | ||||||||
# Even if we are object dtype, follow numpy and return | ||||||||
# float64, see test_apply_funcs_over_empty | ||||||||
out = out.astype(np.float64) | ||||||||
|
||||||||
return out | ||||||||
|
||||||||
assert not numeric_only and axis in (1, None) | ||||||||
|
||||||||
data = self | ||||||||
values = data.values | ||||||||
result = func(values) | ||||||||
|
||||||||
if hasattr(result, "dtype"): | ||||||||
if filter_type == "bool" and notna(result).all(): | ||||||||
result = result.astype(np.bool_) | ||||||||
elif filter_type is None and is_object_dtype(result.dtype): | ||||||||
try: | ||||||||
result = result.astype(np.float64) | ||||||||
except (ValueError, TypeError): | ||||||||
# try to coerce to the original dtypes item by item if we can | ||||||||
pass | ||||||||
|
||||||||
# Case with EAs see GH#35881 | ||||||||
df = self | ||||||||
if numeric_only: | ||||||||
df = _get_data() | ||||||||
if axis is None: | ||||||||
return result | ||||||||
return func(df.values) | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, `.values` can be expensive; might it be better to reduce twice? (... which can also be expensive. Darn.) Is punting on this viable? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using Lines 10494 to 10496 in c7fa611
In fact, main is currently broken when
Assuming this is okay for now, I can add a test for when There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @jbrockmendel - friendly ping |
||||||||
elif axis == 1: | ||||||||
if len(df.index) == 0: | ||||||||
# Taking a transpose would result in no columns, losing the dtype. | ||||||||
# In the empty case, reducing along axis 0 or 1 gives the same | ||||||||
# result dtype, so reduce with axis=0 and ignore values | ||||||||
result = df._reduce( | ||||||||
op, | ||||||||
name, | ||||||||
axis=0, | ||||||||
skipna=skipna, | ||||||||
numeric_only=False, | ||||||||
filter_type=filter_type, | ||||||||
**kwds, | ||||||||
).iloc[:0] | ||||||||
result.index = df.index | ||||||||
return result | ||||||||
df = df.T | ||||||||
|
||||||||
# After possibly _get_data and transposing, we are now in the | ||||||||
# simple case where we can use BlockManager.reduce | ||||||||
res = df._mgr.reduce(blk_func) | ||||||||
out = df._constructor(res).iloc[0] | ||||||||
if out_dtype is not None: | ||||||||
out = out.astype(out_dtype) | ||||||||
elif (df._mgr.get_dtypes() == object).any(): | ||||||||
out = out.astype(object) | ||||||||
elif len(self) == 0 and name in ("sum", "prod"): | ||||||||
# Even if we are object dtype, follow numpy and return | ||||||||
# float64, see test_apply_funcs_over_empty | ||||||||
out = out.astype(np.float64) | ||||||||
|
||||||||
labels = self._get_agg_axis(axis) | ||||||||
result = self._constructor_sliced(result, index=labels) | ||||||||
return result | ||||||||
return out | ||||||||
|
||||||||
def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: | ||||||||
""" | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1533,7 +1533,15 @@ def _maybe_null_out( | |
result[null_mask] = None | ||
elif result is not NaT: | ||
if check_below_min_count(shape, mask, min_count): | ||
result = np.nan | ||
result_dtype = getattr(result, "dtype", None) | ||
if is_float_dtype(result_dtype): | ||
# Preserve dtype when possible | ||
# mypy doesn't infer result_dtype is not None | ||
result = getattr( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
np, f"float{8 * result_dtype.itemsize}" # type: ignore[union-attr] | ||
)("nan") | ||
else: | ||
result = np.nan | ||
|
||
return result | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does NA here refer to `pd.NaT`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks - will fix.