Skip to content

Commit e58a193

Browse files
REGR: fix case all-NaN/numeric object column in groupby (#39655)
1 parent d4eee37 commit e58a193

File tree

4 files changed

+65
-4
lines changed

4 files changed

+65
-4
lines changed

doc/source/whatsnew/v1.2.2.rst

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,8 @@ Fixed regressions
2323
- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
2424
- Fixed regression in :meth:`~DataFrame.to_excel` creating corrupt files when appending (``mode="a"``) to an existing file (:issue:`39576`)
2525
- Fixed regression in :meth:`DataFrame.transform` failing in case of an empty DataFrame or Series (:issue:`39636`)
26-
- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
26+
- Fixed regression in :meth:`~DataFrame.groupby` or :meth:`~DataFrame.resample` when aggregating an all-NaN or numeric object dtype column (:issue:`39329`)
27+
- Fixed regression in :meth:`.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
2728
- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
2829

2930
.. ---------------------------------------------------------------------------

pandas/core/groupby/generic.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1102,11 +1102,16 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike:
11021102
assert isinstance(result, (Series, DataFrame)) # for mypy
11031103
mgr = result._mgr
11041104
assert isinstance(mgr, BlockManager)
1105-
assert len(mgr.blocks) == 1
11061105

11071106
# unwrap DataFrame to get array
1108-
result = mgr.blocks[0].values
1109-
return result
1107+
if len(mgr.blocks) != 1:
1108+
# We've split an object block! Everything we've assumed
1109+
# about a single block input returning a single block output
1110+
# is a lie. See eg GH-39329
1111+
return mgr.as_array()
1112+
else:
1113+
result = mgr.blocks[0].values
1114+
return result
11101115

11111116
def blk_func(bvalues: ArrayLike) -> ArrayLike:
11121117

pandas/tests/groupby/aggregate/test_aggregate.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1187,3 +1187,27 @@ def test_aggregate_datetime_objects():
11871187
result = df.groupby("A").B.max()
11881188
expected = df.set_index("A")["B"]
11891189
tm.assert_series_equal(result, expected)
1190+
1191+
1192+
def test_aggregate_numeric_object_dtype():
    """Check ``groupby("key").min()`` on frames cast entirely to object dtype.

    Two scenarios are exercised, both with a string column ``col1`` next to
    a second object column ``col2``:

    * ``col2`` holds only NaN (per the upstream issue, the all-NaN column
      is inferred as float and the object block gets split);
    * ``col2`` holds plain integers.
    """
    # https://github.com/pandas-dev/pandas/issues/39329
    scenarios = [
        # simplified case: multiple object columns where one is all-NaN
        # -> gets split as the all-NaN is inferred as float
        ([np.nan] * 4, [np.nan, np.nan]),
        # same but with numbers
        (range(4), [0, 2]),
    ]

    for col2_values, col2_minima in scenarios:
        frame = DataFrame(
            {
                "key": ["A", "A", "B", "B"],
                "col1": list("abcd"),
                "col2": col2_values,
            },
        )
        frame = frame.astype(object)

        result = frame.groupby("key").min()

        expected = DataFrame(
            {"key": ["A", "B"], "col1": ["a", "c"], "col2": col2_minima}
        )
        expected = expected.set_index("key")
        tm.assert_frame_equal(result, expected)

pandas/tests/resample/test_resampler_grouper.py

Lines changed: 31 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -392,3 +392,34 @@ def test_resample_groupby_agg():
392392
result = resampled.agg({"num": "sum"})
393393

394394
tm.assert_frame_equal(result, expected)
395+
396+
397+
@pytest.mark.parametrize("consolidate", [True, False])
def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
    """Check weekly ``min`` of a grouped resample with a partly missing object column.

    Group "A" carries a ``col_object`` column filled with ``"val"``; group
    "B" is built without that column, so its expected minimum there is NaN.
    The ``consolidate`` parameter runs the same check with and without
    calling ``_consolidate()`` on the concatenated frame first.
    """
    # https://github.com/pandas-dev/pandas/issues/39329

    days = pd.date_range("2020-01-01", periods=15, freq="D")
    with_object = DataFrame(
        {"key": "A", "date": days, "col1": range(15), "col_object": "val"}
    )
    without_object = DataFrame({"key": "B", "date": days, "col1": range(15)})
    frame = pd.concat([with_object, without_object], ignore_index=True)
    if consolidate:
        frame = frame._consolidate()

    result = frame.groupby(["key"]).resample("W", on="date").min()

    # Three weekly bins per group: bin labels for the index, and the
    # earliest date falling inside each bin for the "date" column.
    bin_labels = pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2)
    earliest_dates = pd.to_datetime(["2020-01-01", "2020-01-06", "2020-01-13"] * 2)
    expected_index = pd.MultiIndex.from_arrays(
        [["A"] * 3 + ["B"] * 3, bin_labels],
        names=["key", "date"],
    )
    expected_columns = {
        "key": ["A"] * 3 + ["B"] * 3,
        "date": earliest_dates,
        "col1": [0, 5, 12] * 2,
        "col_object": ["val"] * 3 + [np.nan] * 3,
    }
    expected = DataFrame(expected_columns, index=expected_index)
    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)