Skip to content

Commit 8111099

Browse files
authored
BUG: agg with dictlike and non-unique col will return wrong type (#52115)
* BUG: Agg in non-unique col * what is new * Fix bug but add more codes * Fix mypy and improve what's new * Improve performance * Improve performance * Improve what's new and comment
1 parent 8a1a3f1 commit 8111099

File tree

3 files changed

+51
-12
lines changed

3 files changed

+51
-12
lines changed

doc/source/whatsnew/v2.1.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ Groupby/resample/rolling
364364
Reshaping
365365
^^^^^^^^^
366366
- Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`)
367+
- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns returning an incorrect type when a dict-like argument is passed (:issue:`51099`)
367368
- Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
368369
- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
369370
-

pandas/core/apply.py

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -412,29 +412,55 @@ def agg_dict_like(self) -> DataFrame | Series:
412412
context_manager = com.temp_setattr(obj, "as_index", True)
413413
else:
414414
context_manager = nullcontext()
415+
416+
is_non_unique_col = (
417+
selected_obj.ndim == 2
418+
and selected_obj.columns.nunique() < len(selected_obj.columns)
419+
)
420+
415421
with context_manager:
416422
if selected_obj.ndim == 1:
417423
# key only used for output
418424
colg = obj._gotitem(selection, ndim=1)
419-
results = {key: colg.agg(how) for key, how in arg.items()}
425+
result_data = [colg.agg(how) for _, how in arg.items()]
426+
result_index = list(arg.keys())
427+
elif is_non_unique_col:
428+
# key used for column selection and output
429+
# GH#51099
430+
result_data = []
431+
result_index = []
432+
for key, how in arg.items():
433+
indices = selected_obj.columns.get_indexer_for([key])
434+
labels = selected_obj.columns.take(indices)
435+
label_to_indices = defaultdict(list)
436+
for index, label in zip(indices, labels):
437+
label_to_indices[label].append(index)
438+
439+
key_data = [
440+
selected_obj._ixs(indice, axis=1).agg(how)
441+
for label, indices in label_to_indices.items()
442+
for indice in indices
443+
]
444+
445+
result_index += [key] * len(key_data)
446+
result_data += key_data
420447
else:
421448
# key used for column selection and output
422-
results = {
423-
key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
424-
}
425-
426-
# set the final keys
427-
keys = list(arg.keys())
449+
result_data = [
450+
obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
451+
]
452+
result_index = list(arg.keys())
428453

429454
# Avoid making two isinstance calls in all and any below
430-
is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()]
455+
is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data]
431456

432457
# combine results
433458
if all(is_ndframe):
459+
results = dict(zip(result_index, result_data))
434460
keys_to_use: Iterable[Hashable]
435-
keys_to_use = [k for k in keys if not results[k].empty]
461+
keys_to_use = [k for k in result_index if not results[k].empty]
436462
# Have to check, if at least one DataFrame is not empty.
437-
keys_to_use = keys_to_use if keys_to_use != [] else keys
463+
keys_to_use = keys_to_use if keys_to_use != [] else result_index
438464
if selected_obj.ndim == 2:
439465
# keys are columns, so we can preserve names
440466
ktu = Index(keys_to_use)
@@ -457,15 +483,15 @@ def agg_dict_like(self) -> DataFrame | Series:
457483
else:
458484
from pandas import Series
459485

460-
# we have a dict of scalars
486+
# we have a list of scalars
461487
# GH 36212 use name only if obj is a series
462488
if obj.ndim == 1:
463489
obj = cast("Series", obj)
464490
name = obj.name
465491
else:
466492
name = None
467493

468-
result = Series(results, name=name)
494+
result = Series(result_data, index=result_index, name=name)
469495

470496
return result
471497

pandas/tests/apply/test_frame_apply.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1496,3 +1496,15 @@ def test_agg_std():
14961496
result = df.agg([np.std])
14971497
expected = DataFrame({"A": 2.0, "B": 2.0}, index=["std"])
14981498
tm.assert_frame_equal(result, expected)
1499+
1500+
1501+
def test_agg_dist_like_and_nonunique_columns():
1502+
# GH#51099
1503+
df = DataFrame(
1504+
{"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]}
1505+
)
1506+
df.columns = ["A", "A", "C"]
1507+
1508+
result = df.agg({"A": "count"})
1509+
expected = df["A"].count()
1510+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)