BUG: agg with dictlike and non-unique col will return wrong type (#52115)

luke396 · web-flow · commit 8111099370fd · 2023-04-11T09:25:45.000-07:00
* BUG: Agg in non-unique col

* what is new

* Fix bug but add more code

* Fix mypy and improve what's new

* Improve performance

* Improve performance

* Improve what's new and comment
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -364,6 +364,7 @@ Groupby/resample/rolling
 Reshaping
 ^^^^^^^^^
 - Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`)
+- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns returning an incorrect type when a dict-like argument is passed in (:issue:`51099`)
 - Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
 - Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
 -
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -412,29 +412,55 @@ def agg_dict_like(self) -> DataFrame | Series:
             context_manager = com.temp_setattr(obj, "as_index", True)
         else:
             context_manager = nullcontext()
+
+        is_non_unique_col = (
+            selected_obj.ndim == 2
+            and selected_obj.columns.nunique() < len(selected_obj.columns)
+        )
+
         with context_manager:
             if selected_obj.ndim == 1:
                 # key only used for output
                 colg = obj._gotitem(selection, ndim=1)
-                results = {key: colg.agg(how) for key, how in arg.items()}
+                result_data = [colg.agg(how) for _, how in arg.items()]
+                result_index = list(arg.keys())
+            elif is_non_unique_col:
+                # key used for column selection and output
+                # GH#51099
+                result_data = []
+                result_index = []
+                for key, how in arg.items():
+                    indices = selected_obj.columns.get_indexer_for([key])
+                    labels = selected_obj.columns.take(indices)
+                    label_to_indices = defaultdict(list)
+                    for index, label in zip(indices, labels):
+                        label_to_indices[label].append(index)
+
+                    key_data = [
+                        selected_obj._ixs(indice, axis=1).agg(how)
+                        for label, indices in label_to_indices.items()
+                        for indice in indices
+                    ]
+
+                    result_index += [key] * len(key_data)
+                    result_data += key_data
             else:
                 # key used for column selection and output
-                results = {
-                    key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
-                }
-
-        # set the final keys
-        keys = list(arg.keys())
+                result_data = [
+                    obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
+                ]
+                result_index = list(arg.keys())
 
         # Avoid making two isinstance calls in all and any below
-        is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()]
+        is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data]
 
         # combine results
         if all(is_ndframe):
+            results = dict(zip(result_index, result_data))
             keys_to_use: Iterable[Hashable]
-            keys_to_use = [k for k in keys if not results[k].empty]
+            keys_to_use = [k for k in result_index if not results[k].empty]
             # Have to check, if at least one DataFrame is not empty.
-            keys_to_use = keys_to_use if keys_to_use != [] else keys
+            keys_to_use = keys_to_use if keys_to_use != [] else result_index
             if selected_obj.ndim == 2:
                 # keys are columns, so we can preserve names
                 ktu = Index(keys_to_use)
@@ -457,15 +483,15 @@ def agg_dict_like(self) -> DataFrame | Series:
         else:
             from pandas import Series
 
-            # we have a dict of scalars
+            # we have a list of scalars
             # GH 36212 use name only if obj is a series
             if obj.ndim == 1:
                 obj = cast("Series", obj)
                 name = obj.name
             else:
                 name = None
 
-            result = Series(results, name=name)
+            result = Series(result_data, index=result_index, name=name)
 
         return result
 
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
@@ -1496,3 +1496,15 @@ def test_agg_std():
     result = df.agg([np.std])
     expected = DataFrame({"A": 2.0, "B": 2.0}, index=["std"])
     tm.assert_frame_equal(result, expected)
+
+
+def test_agg_dist_like_and_nonunique_columns():
+    # GH#51099
+    df = DataFrame(
+        {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]}
+    )
+    df.columns = ["A", "A", "C"]
+
+    result = df.agg({"A": "count"})
+    expected = df["A"].count()
+    tm.assert_series_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -364,6 +364,7 @@ Groupby/resample/rolling`
`364`	`364`	`Reshaping`
`365`	`365`	`^^^^^^^^^`
`366`	`366`	- Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`)
	`367`	+- Bug in :meth:`DataFrame.agg` and :meth:`Series.agg` on non-unique columns returning an incorrect type when a dict-like argument is passed in (:issue:`51099`)
`367`	`368`	- Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
`368`	`369`	- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
`369`	`370`	`-`