Skip to content

CLN: avoid values_from_object in Series #32426

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Mar 11, 2020
12 changes: 10 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
ensure_platform_int,
infer_dtype_from_object,
is_bool_dtype,
is_datetime64_any_dtype,
is_dict_like,
is_dtype_equal,
is_extension_array_dtype,
Expand All @@ -88,6 +89,7 @@
is_list_like,
is_named_tuple,
is_object_dtype,
is_period_dtype,
is_scalar,
is_sequence,
needs_i8_conversion,
Expand Down Expand Up @@ -7789,11 +7791,13 @@ def _reduce(
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
):

dtype_is_dt = self.dtypes.apply(lambda x: x.kind == "M")
dtype_is_dt = self.dtypes.apply(
lambda x: is_datetime64_any_dtype(x) or is_period_dtype(x)
)
if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any():
warnings.warn(
"DataFrame.mean and DataFrame.median with numeric_only=None "
"will include datetime64 and datetime64tz columns in a "
"will include datetime64, datetime64tz, and PeriodDtype columns in a "
"future version.",
FutureWarning,
stacklevel=3,
Expand Down Expand Up @@ -7854,6 +7858,10 @@ def blk_func(values):
assert len(res) == max(list(res.keys())) + 1, res.keys()
out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype)
out.index = df.columns
if axis == 0 and df.dtypes.apply(needs_i8_conversion).any():
# FIXME: needs_i8_conversion check is kludge, not sure
# why it is necessary in this case and this case alone
out[:] = coerce_to_dtypes(out.values, df.dtypes)
return out

if numeric_only is None:
Expand Down
49 changes: 26 additions & 23 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from pandas._config import get_option

from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib
from pandas._libs import NaT, Period, Timedelta, Timestamp, iNaT, lib
from pandas._typing import Dtype, Scalar
from pandas.compat._optional import import_optional_dependency

Expand All @@ -17,9 +17,7 @@
is_any_int_dtype,
is_bool_dtype,
is_complex,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_datetime_or_timedelta_dtype,
is_datetime64_any_dtype,
is_float,
is_float_dtype,
is_integer,
Expand All @@ -28,8 +26,10 @@
is_object_dtype,
is_scalar,
is_timedelta64_dtype,
needs_i8_conversion,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import PeriodDtype
from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna

from pandas.core.construction import extract_array
Expand Down Expand Up @@ -134,10 +134,8 @@ def f(


def _bn_ok_dtype(dtype: Dtype, name: str) -> bool:
# Bottleneck chokes on datetime64
if not is_object_dtype(dtype) and not (
is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype)
):
    # Bottleneck chokes on datetime64, PeriodDtype (or any EA)
if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):

# GH 15507
# bottleneck does not properly upcast during the sum
Expand Down Expand Up @@ -283,17 +281,16 @@ def _get_values(
# with scalar fill_value. This guarantee is important for the
# maybe_upcast_putmask call below
assert is_scalar(fill_value)
values = extract_array(values, extract_numpy=True)

mask = _maybe_get_mask(values, skipna, mask)

values = extract_array(values, extract_numpy=True)
dtype = values.dtype

if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values):
if needs_i8_conversion(values):
# changing timedelta64/datetime64 to int64 needs to happen after
# finding `mask` above
values = getattr(values, "asi8", values)
values = values.view(np.int64)
values = np.asarray(values.view("i8"))

dtype_ok = _na_ok_dtype(dtype)

Expand All @@ -307,7 +304,8 @@ def _get_values(

if skipna and copy:
values = values.copy()
if dtype_ok:
assert mask is not None # for mypy
if dtype_ok and mask.any():
np.putmask(values, mask, fill_value)

# promote if needed
Expand All @@ -325,13 +323,14 @@ def _get_values(


def _na_ok_dtype(dtype) -> bool:
# TODO: what about datetime64tz? PeriodDtype?
return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64))
if needs_i8_conversion(dtype):
return False
return not issubclass(dtype.type, np.integer)


def _wrap_results(result, dtype: Dtype, fill_value=None):
""" wrap our results if needed """
if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
if is_datetime64_any_dtype(dtype):
if fill_value is None:
# GH#24293
fill_value = iNaT
Expand All @@ -342,7 +341,8 @@ def _wrap_results(result, dtype: Dtype, fill_value=None):
result = np.nan
result = Timestamp(result, tz=tz)
else:
result = result.view(dtype)
# If we have float dtype, taking a view will give the wrong result
result = result.astype(dtype)
elif is_timedelta64_dtype(dtype):
if not isinstance(result, np.ndarray):
if result == fill_value:
Expand All @@ -356,6 +356,14 @@ def _wrap_results(result, dtype: Dtype, fill_value=None):
else:
result = result.astype("m8[ns]").view(dtype)

elif isinstance(dtype, PeriodDtype):
if is_float(result) and result.is_integer():
result = int(result)
if is_integer(result):
result = Period._from_ordinal(result, freq=dtype.freq)
else:
raise NotImplementedError(type(result), result)

return result


Expand Down Expand Up @@ -542,12 +550,7 @@ def nanmean(values, axis=None, skipna=True, mask=None):
)
dtype_sum = dtype_max
dtype_count = np.float64
if (
is_integer_dtype(dtype)
or is_timedelta64_dtype(dtype)
or is_datetime64_dtype(dtype)
or is_datetime64tz_dtype(dtype)
):
if is_integer_dtype(dtype) or needs_i8_conversion(dtype):
dtype_sum = np.float64
elif is_float_dtype(dtype):
dtype_sum = dtype
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1984,7 +1984,7 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs):
nan
"""
skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs)
i = nanops.nanargmin(com.values_from_object(self), skipna=skipna)
i = nanops.nanargmin(self._values, skipna=skipna)
if i == -1:
return np.nan
return self.index[i]
Expand Down Expand Up @@ -2055,7 +2055,7 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs):
nan
"""
skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs)
i = nanops.nanargmax(com.values_from_object(self), skipna=skipna)
i = nanops.nanargmax(self._values, skipna=skipna)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nanargmax/nanargmin expect to get an ndarray. Due to this change, it is no longer guaranteed to be an ndarray. Reported this as #32749

So those lines should either be reverted, or another "convert to ndarray" function should be used (or nanargmax/nanargmin could be rewritten to support EAs, but personally I think it is much cleaner to keep those algos based on numpy arrays)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel can you respond to this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(or nanargmax/nanargmin could be rewritten to support EAs, but personally I think it is much cleaner to keep those algos based on numpy arrays)

I'd be fine with either of these options. Probably prefer both actually: a EA-supporting public method and an ndarray-only private method for each of the relevant nanops funcs.

if i == -1:
return np.nan
return self.index[i]
Expand Down Expand Up @@ -2093,7 +2093,7 @@ def round(self, decimals=0, *args, **kwargs) -> "Series":
dtype: float64
"""
nv.validate_round(args, kwargs)
result = com.values_from_object(self).round(decimals)
result = self._values.round(decimals)
result = self._constructor(result, index=self.index).__finalize__(self)

return result
Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -875,11 +875,6 @@ def test_mean_datetimelike(self):
expected = pd.Series({"A": 1.0, "C": df.loc[1, "C"]})
tm.assert_series_equal(result, expected)

@pytest.mark.xfail(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess you should technically have a whatsnew note as this 'bug' is fixed (do in a follow-on)

reason="casts to object-dtype and then tries to add timestamps",
raises=TypeError,
strict=True,
)
def test_mean_datetimelike_numeric_only_false(self):
df = pd.DataFrame(
{
Expand Down