Skip to content

DEPR/ENH: support axis=None in min/max #45072

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Dec 28, 2021
10 changes: 6 additions & 4 deletions pandas/core/arraylike.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,10 +521,12 @@ def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwar

if "axis" not in kwargs:
# For DataFrame reductions we don't want the default axis=0
# FIXME: DataFrame.min ignores axis=None
# FIXME: np.minimum.reduce(df) gets here bc axis is not in kwargs,
# but np.minimum.reduce(df.values) behaves as if axis=0
kwargs["axis"] = None
# Note: np.min is not a ufunc, but uses array_function_dispatch,
# so calls DataFrame.min (without ever getting here) with the np.min
# default of axis=None, which DataFrame.min catches and changes to axis=0.
# np.minimum.reduce(df) gets here bc axis is not in kwargs,
# so we set axis=0 to match the behavior of np.minimum.reduce(df.values)
kwargs["axis"] = 0

# By default, numpy's reductions do not skip NaNs, so we have to
# pass skipna=False
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,14 @@ def require_length_match(data, index: Index):
)


_builtin_table = {builtins.sum: np.sum, builtins.max: np.max, builtins.min: np.min}
# the ufuncs np.maximum.reduce and np.minimum.reduce default to axis=0,
# whereas np.min and np.max (which directly call obj.min and obj.max)
# default to axis=None.
_builtin_table = {
builtins.sum: np.sum,
builtins.max: np.maximum.reduce,
builtins.min: np.minimum.reduce,
}

_cython_table = {
builtins.sum: "sum",
Expand Down
103 changes: 86 additions & 17 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10556,7 +10556,7 @@ def _stat_function(
self,
name: str,
func,
axis: Axis | None = None,
axis: Axis | None | lib.NoDefault = None,
skipna: bool_t = True,
level: Level | None = None,
numeric_only: bool_t | None = None,
Expand All @@ -10569,8 +10569,22 @@ def _stat_function(

validate_bool_kwarg(skipna, "skipna", none_allowed=False)

if axis is None and level is None and self.ndim > 1:
# user must have explicitly passed axis=None
# GH#21597
warnings.warn(
f"In a future version, DataFrame.{name}(axis=None) will return a "
f"scalar {name} over the entire DataFrame. To retain the old "
f"behavior, use 'frame.{name}(axis=0)' or just 'frame.{name}()'",
FutureWarning,
stacklevel=find_stack_level(),
)
if axis is lib.no_default:
axis = None

if axis is None:
axis = self._stat_axis_number
axis = cast(Axis, axis)
if level is not None:
warnings.warn(
"Using the level keyword in DataFrame and Series aggregations is "
Expand All @@ -10588,31 +10602,43 @@ def _stat_function(

def min(
self,
axis: Axis | None = None,
axis: Axis | None | lib.NoDefault = lib.no_default,
skipna: bool_t = True,
level: Level | None = None,
numeric_only: bool_t | None = None,
**kwargs,
):
return self._stat_function(
"min", nanops.nanmin, axis, skipna, level, numeric_only, **kwargs
"min",
nanops.nanmin,
axis,
skipna,
level,
numeric_only,
**kwargs,
)

def max(
self,
axis: Axis | None = None,
axis: Axis | None | lib.NoDefault = lib.no_default,
skipna: bool_t = True,
level: Level | None = None,
numeric_only: bool_t | None = None,
**kwargs,
):
return self._stat_function(
"max", nanops.nanmax, axis, skipna, level, numeric_only, **kwargs
"max",
nanops.nanmax,
axis,
skipna,
level,
numeric_only,
**kwargs,
)

def mean(
self,
axis: Axis | None = None,
axis: Axis | None | lib.NoDefault = lib.no_default,
skipna: bool_t = True,
level: Level | None = None,
numeric_only: bool_t | None = None,
Expand All @@ -10624,7 +10650,7 @@ def mean(

def median(
self,
axis: Axis | None = None,
axis: Axis | None | lib.NoDefault = lib.no_default,
skipna: bool_t = True,
level: Level | None = None,
numeric_only: bool_t | None = None,
Expand All @@ -10636,7 +10662,7 @@ def median(

def skew(
self,
axis: Axis | None = None,
axis: Axis | None | lib.NoDefault = lib.no_default,
skipna: bool_t = True,
level: Level | None = None,
numeric_only: bool_t | None = None,
Expand All @@ -10648,7 +10674,7 @@ def skew(

def kurt(
self,
axis: Axis | None = None,
axis: Axis | None | lib.NoDefault = lib.no_default,
skipna: bool_t = True,
level: Level | None = None,
numeric_only: bool_t | None = None,
Expand Down Expand Up @@ -10699,6 +10725,7 @@ def _min_count_stat_function(
min_count=min_count,
numeric_only=numeric_only,
)

return self._reduce(
func,
name=name,
Expand Down Expand Up @@ -11039,7 +11066,14 @@ def prod(
see_also="",
examples="",
)
def mean(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
def mean(
self,
axis: int | None | lib.NoDefault = lib.no_default,
skipna=True,
level=None,
numeric_only=None,
**kwargs,
):
return NDFrame.mean(self, axis, skipna, level, numeric_only, **kwargs)

setattr(cls, "mean", mean)
Expand All @@ -11054,7 +11088,14 @@ def mean(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
see_also="",
examples="",
)
def skew(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
def skew(
self,
axis: int | None | lib.NoDefault = lib.no_default,
skipna=True,
level=None,
numeric_only=None,
**kwargs,
):
return NDFrame.skew(self, axis, skipna, level, numeric_only, **kwargs)

setattr(cls, "skew", skew)
Expand All @@ -11072,7 +11113,14 @@ def skew(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
see_also="",
examples="",
)
def kurt(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
def kurt(
self,
axis: Axis | None | lib.NoDefault = lib.no_default,
skipna=True,
level=None,
numeric_only=None,
**kwargs,
):
return NDFrame.kurt(self, axis, skipna, level, numeric_only, **kwargs)

setattr(cls, "kurt", kurt)
Expand All @@ -11089,13 +11137,19 @@ def kurt(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
examples="",
)
def median(
self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
self,
axis: int | None | lib.NoDefault = lib.no_default,
skipna=True,
level=None,
numeric_only=None,
**kwargs,
):
return NDFrame.median(self, axis, skipna, level, numeric_only, **kwargs)

setattr(cls, "median", median)

@doc(
# error: Untyped decorator makes function "max" untyped
@doc( # type: ignore[misc]
_num_doc,
desc="Return the maximum of the values over the requested axis.\n\n"
"If you want the *index* of the maximum, use ``idxmax``. This is "
Expand All @@ -11107,12 +11161,20 @@ def median(
see_also=_stat_func_see_also,
examples=_max_examples,
)
def max(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
def max(
self,
axis: int | None | lib.NoDefault = lib.no_default,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@simonjayhawkins is this going to render in the docs in a way we don't want?

(similar question bugs me in a WIP branch that changes the default from method="pad" in .replace)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can be a followup

skipna=True,
level=None,
numeric_only=None,
**kwargs,
):
return NDFrame.max(self, axis, skipna, level, numeric_only, **kwargs)

setattr(cls, "max", max)

@doc(
# error: Untyped decorator makes function "min" untyped
@doc( # type: ignore[misc]
_num_doc,
desc="Return the minimum of the values over the requested axis.\n\n"
"If you want the *index* of the minimum, use ``idxmin``. This is "
Expand All @@ -11124,7 +11186,14 @@ def max(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
see_also=_stat_func_see_also,
examples=_min_examples,
)
def min(self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs):
def min(
self,
axis: int | None | lib.NoDefault = lib.no_default,
skipna=True,
level=None,
numeric_only=None,
**kwargs,
):
return NDFrame.min(self, axis, skipna, level, numeric_only, **kwargs)

setattr(cls, "min", min)
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1765,3 +1765,20 @@ def test_prod_sum_min_count_mixed_object():
msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'")
with pytest.raises(TypeError, match=msg):
df.sum(axis=0, min_count=1, numeric_only=False)


@pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"])
def test_reduction_axis_none_deprecation(method):
# GH#21597 deprecate axis=None defaulting to axis=0 so that we can change it
# to reducing over all axes.

df = DataFrame(np.random.randn(4, 4))
meth = getattr(df, method)

msg = f"scalar {method} over the entire DataFrame"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = meth(axis=None)
with tm.assert_produces_warning(None):
expected = meth()
tm.assert_series_equal(res, expected)
tm.assert_series_equal(res, meth(axis=0))
23 changes: 19 additions & 4 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def get_stats(group):
assert result.index.names[0] == "C"


def test_basic():
def test_basic(): # TODO: split this test

cats = Categorical(
["a", "a", "a", "b", "b", "b", "c", "c", "c"],
Expand Down Expand Up @@ -142,9 +142,24 @@ def f(x):
df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]
)
tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]])
tm.assert_frame_equal(
df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), df[["a"]]
)

gbc = df.groupby(c, observed=False)
with tm.assert_produces_warning(
FutureWarning, match="scalar max", check_stacklevel=False
):
# stacklevel is thrown off (I think) because the stack goes through numpy C code
result = gbc.transform(lambda xs: np.max(xs))
tm.assert_frame_equal(result, df[["a"]])

with tm.assert_produces_warning(None):
result2 = gbc.transform(lambda xs: np.max(xs, axis=0))
result3 = gbc.transform(max)
result4 = gbc.transform(np.maximum.reduce)
result5 = gbc.transform(lambda xs: np.maximum.reduce(xs))
tm.assert_frame_equal(result2, df[["a"]], check_dtype=False)
tm.assert_frame_equal(result3, df[["a"]], check_dtype=False)
tm.assert_frame_equal(result4, df[["a"]])
tm.assert_frame_equal(result5, df[["a"]])

# Filter
tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"])
Expand Down
25 changes: 19 additions & 6 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,20 +69,33 @@ def test_builtins_apply(keys, f):
df = DataFrame(np.random.randint(1, 50, (1000, 2)), columns=["jim", "joe"])
df["jolie"] = np.random.randn(1000)

gb = df.groupby(keys)

fname = f.__name__
result = df.groupby(keys).apply(f)
result = gb.apply(f)
ngroups = len(df.drop_duplicates(subset=keys))

assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
assert result.shape == (ngroups, 3), assert_msg

tm.assert_frame_equal(
result, # numpy's equivalent function
df.groupby(keys).apply(getattr(np, fname)),
)
npfunc = getattr(np, fname) # numpy's equivalent function
if f in [max, min]:
warn = FutureWarning
else:
warn = None
msg = "scalar (max|min) over the entire DataFrame"
with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
# stacklevel can be thrown off because (I think) the stack
# goes through some of numpy's C code.
expected = gb.apply(npfunc)
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(None):
expected2 = gb.apply(lambda x: npfunc(x, axis=0))
tm.assert_frame_equal(result, expected2)

if f != sum:
expected = df.groupby(keys).agg(fname).reset_index()
expected = gb.agg(fname).reset_index()
expected.set_index(keys, inplace=True, drop=False)
tm.assert_frame_equal(result, expected, check_dtype=False)

Expand Down
9 changes: 8 additions & 1 deletion pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,9 +483,16 @@ def test_transform_coercion():
g = df.groupby("A")

expected = g.transform(np.mean)
result = g.transform(lambda x: np.mean(x))

msg = "will return a scalar mean"
with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
result = g.transform(lambda x: np.mean(x))
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(None):
result2 = g.transform(lambda x: np.mean(x, axis=0))
tm.assert_frame_equal(result2, expected)


def test_groupby_transform_with_int():

Expand Down
Loading