Skip to content

Commit 624cd16

Browse files
jbrockmendelJulianWgs
authored andcommitted
BUG/API: SeriesGroupBy reduction with numeric_only=True (pandas-dev#41342)
1 parent 6ced67d commit 624cd16

File tree

6 files changed

+110
-27
lines changed

6 files changed

+110
-27
lines changed

doc/source/whatsnew/v1.3.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1038,6 +1038,7 @@ Groupby/resample/rolling
10381038
- Bug in :meth:`DataFrameGroupBy.__getitem__` with non-unique columns incorrectly returning a malformed :class:`SeriesGroupBy` instead of :class:`DataFrameGroupBy` (:issue:`41427`)
10391039
- Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`)
10401040
- Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`)
1041+
- Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype, e.g. ``.prod`` with ``datetime64[ns]`` dtype (:issue:`41342`)
10411042
- Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`)
10421043
- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`)
10431044

pandas/core/groupby/generic.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -323,15 +323,18 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame:
323323
return output
324324

325325
def _cython_agg_general(
326-
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
326+
self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
327327
):
328328

329329
obj = self._selected_obj
330330
objvals = obj._values
331331
data = obj._mgr
332332

333333
if numeric_only and not is_numeric_dtype(obj.dtype):
334-
raise DataError("No numeric types to aggregate")
334+
# GH#41291 match Series behavior
335+
raise NotImplementedError(
336+
f"{type(self).__name__}.{how} does not implement numeric_only."
337+
)
335338

336339
# This is overkill because it is only called once, but is here to
337340
# mirror the array_func used in DataFrameGroupBy._cython_agg_general
@@ -1056,7 +1059,7 @@ def _iterate_slices(self) -> Iterable[Series]:
10561059
yield values
10571060

10581061
def _cython_agg_general(
1059-
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
1062+
self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
10601063
) -> DataFrame:
10611064
# Note: we never get here with how="ohlc"; that goes through SeriesGroupBy
10621065

pandas/core/groupby/groupby.py

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,6 +1106,34 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]):
11061106
def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False):
11071107
raise AbstractMethodError(self)
11081108

1109+
def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool:
1110+
"""
1111+
Determine subclass-specific default value for 'numeric_only'.
1112+
1113+
For SeriesGroupBy we want the default to be False (to match Series behavior).
1114+
For DataFrameGroupBy we want it to be True (for backwards-compat).
1115+
1116+
Parameters
1117+
----------
1118+
numeric_only : bool or lib.no_default
1119+
1120+
Returns
1121+
-------
1122+
bool
1123+
"""
1124+
# GH#41291
1125+
if numeric_only is lib.no_default:
1126+
# i.e. not explicitly passed by user
1127+
if self.obj.ndim == 2:
1128+
# i.e. DataFrameGroupBy
1129+
numeric_only = True
1130+
else:
1131+
numeric_only = False
1132+
1133+
# error: Incompatible return value type (got "Union[bool, NoDefault]",
1134+
# expected "bool")
1135+
return numeric_only # type: ignore[return-value]
1136+
11091137
# -----------------------------------------------------------------
11101138
# numba
11111139

@@ -1313,6 +1341,7 @@ def _agg_general(
13131341
alias: str,
13141342
npfunc: Callable,
13151343
):
1344+
13161345
with group_selection_context(self):
13171346
# try a cython aggregation if we can
13181347
result = None
@@ -1372,7 +1401,7 @@ def _agg_py_fallback(
13721401
return ensure_block_shape(res_values, ndim=ndim)
13731402

13741403
def _cython_agg_general(
1375-
self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
1404+
self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1
13761405
):
13771406
raise AbstractMethodError(self)
13781407

@@ -1592,7 +1621,7 @@ def count(self):
15921621
@final
15931622
@Substitution(name="groupby")
15941623
@Substitution(see_also=_common_see_also)
1595-
def mean(self, numeric_only: bool = True):
1624+
def mean(self, numeric_only: bool | lib.NoDefault = lib.no_default):
15961625
"""
15971626
Compute mean of groups, excluding missing values.
15981627
@@ -1640,6 +1669,8 @@ def mean(self, numeric_only: bool = True):
16401669
2 4.0
16411670
Name: B, dtype: float64
16421671
"""
1672+
numeric_only = self._resolve_numeric_only(numeric_only)
1673+
16431674
result = self._cython_agg_general(
16441675
"mean",
16451676
alt=lambda x: Series(x).mean(numeric_only=numeric_only),
@@ -1650,7 +1681,7 @@ def mean(self, numeric_only: bool = True):
16501681
@final
16511682
@Substitution(name="groupby")
16521683
@Appender(_common_see_also)
1653-
def median(self, numeric_only=True):
1684+
def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
16541685
"""
16551686
Compute median of groups, excluding missing values.
16561687
@@ -1667,6 +1698,8 @@ def median(self, numeric_only=True):
16671698
Series or DataFrame
16681699
Median of values within each group.
16691700
"""
1701+
numeric_only = self._resolve_numeric_only(numeric_only)
1702+
16701703
result = self._cython_agg_general(
16711704
"median",
16721705
alt=lambda x: Series(x).median(numeric_only=numeric_only),
@@ -1724,8 +1757,9 @@ def var(self, ddof: int = 1):
17241757
Variance of values within each group.
17251758
"""
17261759
if ddof == 1:
1760+
numeric_only = self._resolve_numeric_only(lib.no_default)
17271761
return self._cython_agg_general(
1728-
"var", alt=lambda x: Series(x).var(ddof=ddof)
1762+
"var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only
17291763
)
17301764
else:
17311765
func = lambda x: x.var(ddof=ddof)
@@ -1790,7 +1824,10 @@ def size(self) -> FrameOrSeriesUnion:
17901824

17911825
@final
17921826
@doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
1793-
def sum(self, numeric_only: bool = True, min_count: int = 0):
1827+
def sum(
1828+
self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
1829+
):
1830+
numeric_only = self._resolve_numeric_only(numeric_only)
17941831

17951832
# If we are grouping on categoricals we want unobserved categories to
17961833
# return zero, rather than the default of NaN which the reindexing in
@@ -1807,7 +1844,11 @@ def sum(self, numeric_only: bool = True, min_count: int = 0):
18071844

18081845
@final
18091846
@doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
1810-
def prod(self, numeric_only: bool = True, min_count: int = 0):
1847+
def prod(
1848+
self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0
1849+
):
1850+
numeric_only = self._resolve_numeric_only(numeric_only)
1851+
18111852
return self._agg_general(
18121853
numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
18131854
)
@@ -2736,7 +2777,7 @@ def _get_cythonized_result(
27362777
how: str,
27372778
cython_dtype: np.dtype,
27382779
aggregate: bool = False,
2739-
numeric_only: bool = True,
2780+
numeric_only: bool | lib.NoDefault = lib.no_default,
27402781
needs_counts: bool = False,
27412782
needs_values: bool = False,
27422783
needs_2d: bool = False,
@@ -2804,6 +2845,8 @@ def _get_cythonized_result(
28042845
-------
28052846
`Series` or `DataFrame` with filled values
28062847
"""
2848+
numeric_only = self._resolve_numeric_only(numeric_only)
2849+
28072850
if result_is_index and aggregate:
28082851
raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
28092852
if post_processing and not callable(post_processing):

pandas/tests/groupby/aggregate/test_cython.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,16 @@ def test_cython_agg_boolean():
8989

9090
def test_cython_agg_nothing_to_agg():
9191
frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
92-
msg = "No numeric types to aggregate"
9392

94-
with pytest.raises(DataError, match=msg):
93+
with pytest.raises(NotImplementedError, match="does not implement"):
94+
frame.groupby("a")["b"].mean(numeric_only=True)
95+
96+
with pytest.raises(TypeError, match="Could not convert (foo|bar)*"):
9597
frame.groupby("a")["b"].mean()
9698

9799
frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
100+
101+
msg = "No numeric types to aggregate"
98102
with pytest.raises(DataError, match=msg):
99103
frame[["b"]].groupby(frame["a"]).mean()
100104

@@ -107,9 +111,8 @@ def test_cython_agg_nothing_to_agg_with_dates():
107111
"dates": pd.date_range("now", periods=50, freq="T"),
108112
}
109113
)
110-
msg = "No numeric types to aggregate"
111-
with pytest.raises(DataError, match=msg):
112-
frame.groupby("b").dates.mean()
114+
with pytest.raises(NotImplementedError, match="does not implement"):
115+
frame.groupby("b").dates.mean(numeric_only=True)
113116

114117

115118
def test_cython_agg_frame_columns():
@@ -170,7 +173,7 @@ def test__cython_agg_general(op, targop):
170173
df = DataFrame(np.random.randn(1000))
171174
labels = np.random.randint(0, 50, size=1000).astype(float)
172175

173-
result = df.groupby(labels)._cython_agg_general(op)
176+
result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)
174177
expected = df.groupby(labels).agg(targop)
175178
tm.assert_frame_equal(result, expected)
176179

@@ -192,7 +195,7 @@ def test_cython_agg_empty_buckets(op, targop, observed):
192195
# calling _cython_agg_general directly, instead of via the user API
193196
# which sets different values for min_count, so do that here.
194197
g = df.groupby(pd.cut(df[0], grps), observed=observed)
195-
result = g._cython_agg_general(op)
198+
result = g._cython_agg_general(op, alt=None, numeric_only=True)
196199

197200
g = df.groupby(pd.cut(df[0], grps), observed=observed)
198201
expected = g.agg(lambda x: targop(x))
@@ -206,7 +209,7 @@ def test_cython_agg_empty_buckets_nanops(observed):
206209
grps = range(0, 25, 5)
207210
# add / sum
208211
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
209-
"add"
212+
"add", alt=None, numeric_only=True
210213
)
211214
intervals = pd.interval_range(0, 20, freq=5)
212215
expected = DataFrame(
@@ -220,7 +223,7 @@ def test_cython_agg_empty_buckets_nanops(observed):
220223

221224
# prod
222225
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
223-
"prod"
226+
"prod", alt=None, numeric_only=True
224227
)
225228
expected = DataFrame(
226229
{"a": [1, 1, 1716, 1]},

pandas/tests/groupby/test_groupby.py

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1764,7 +1764,15 @@ def test_empty_groupby(columns, keys, values, method, op, request):
17641764
# GH8093 & GH26411
17651765
override_dtype = None
17661766

1767-
if isinstance(values, Categorical) and len(keys) == 1 and method == "apply":
1767+
if (
1768+
isinstance(values, Categorical)
1769+
and not isinstance(columns, list)
1770+
and op in ["sum", "prod"]
1771+
and method != "apply"
1772+
):
1773+
# handled below GH#41291
1774+
pass
1775+
elif isinstance(values, Categorical) and len(keys) == 1 and method == "apply":
17681776
mark = pytest.mark.xfail(raises=TypeError, match="'str' object is not callable")
17691777
request.node.add_marker(mark)
17701778
elif (
@@ -1825,11 +1833,36 @@ def test_empty_groupby(columns, keys, values, method, op, request):
18251833
df = df.iloc[:0]
18261834

18271835
gb = df.groupby(keys)[columns]
1828-
if method == "attr":
1829-
result = getattr(gb, op)()
1830-
else:
1831-
result = getattr(gb, method)(op)
18321836

1837+
def get_result():
1838+
if method == "attr":
1839+
return getattr(gb, op)()
1840+
else:
1841+
return getattr(gb, method)(op)
1842+
1843+
if columns == "C":
1844+
# i.e. SeriesGroupBy
1845+
if op in ["prod", "sum"]:
1846+
# ops that require more than just ordered-ness
1847+
if method != "apply":
1848+
# FIXME: apply goes through different code path
1849+
if df.dtypes[0].kind == "M":
1850+
# GH#41291
1851+
# datetime64 -> prod and sum are invalid
1852+
msg = "datetime64 type does not support"
1853+
with pytest.raises(TypeError, match=msg):
1854+
get_result()
1855+
1856+
return
1857+
elif isinstance(values, Categorical):
1858+
# GH#41291
1859+
msg = "category type does not support"
1860+
with pytest.raises(TypeError, match=msg):
1861+
get_result()
1862+
1863+
return
1864+
1865+
result = get_result()
18331866
expected = df.set_index(keys)[columns]
18341867
if override_dtype is not None:
18351868
expected = expected.astype(override_dtype)

pandas/tests/resample/test_datetime_index.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,15 @@ def test_custom_grouper(index):
6161
g.ohlc() # doesn't use _cython_agg_general
6262
funcs = ["add", "mean", "prod", "min", "max", "var"]
6363
for f in funcs:
64-
g._cython_agg_general(f)
64+
g._cython_agg_general(f, alt=None, numeric_only=True)
6565

6666
b = Grouper(freq=Minute(5), closed="right", label="right")
6767
g = s.groupby(b)
6868
# check all cython functions work
6969
g.ohlc() # doesn't use _cython_agg_general
7070
funcs = ["add", "mean", "prod", "min", "max", "var"]
7171
for f in funcs:
72-
g._cython_agg_general(f)
72+
g._cython_agg_general(f, alt=None, numeric_only=True)
7373

7474
assert g.ngroups == 2593
7575
assert notna(g.mean()).all()
@@ -417,7 +417,7 @@ def test_resample_frame_basic():
417417
# check all cython functions work
418418
funcs = ["add", "mean", "prod", "min", "max", "var"]
419419
for f in funcs:
420-
g._cython_agg_general(f)
420+
g._cython_agg_general(f, alt=None, numeric_only=True)
421421

422422
result = df.resample("A").mean()
423423
tm.assert_series_equal(result["A"], df["A"].resample("A").mean())

0 commit comments

Comments
 (0)