Skip to content

ENH: Implement nlargest and nsmallest for DataFrameGroupBy #46986

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ Other enhancements
- Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`)
- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`)
- Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`)
- Implemented :meth:`nlargest` and :meth:`nsmallest` methods for :class:`DataFrameGroupBy` (:issue:`46924`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.notable_bug_fixes:
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/groupby/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,16 @@ class OutputKey:
"corr",
"cov",
"diff",
"nlargest",
"nsmallest",
]
)
| plotting_methods
)

series_apply_allowlist: frozenset[str] = (
common_apply_allowlist
| frozenset(
{"nlargest", "nsmallest", "is_monotonic_increasing", "is_monotonic_decreasing"}
)
| frozenset({"is_monotonic_increasing", "is_monotonic_decreasing"})
) | frozenset(["dtype", "unique"])

dataframe_apply_allowlist: frozenset[str] = common_apply_allowlist | frozenset(
Expand Down Expand Up @@ -155,6 +155,8 @@ def maybe_normalize_deprecated_kernels(kernel):
"transform",
"sample",
"value_counts",
"nlargest",
"nsmallest",
]
)
# Valid values of `name` for `groupby.transform(name)`
Expand Down
18 changes: 18 additions & 0 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1812,6 +1812,24 @@ def value_counts(
result = result_frame
return result.__finalize__(self.obj, method="value_counts")

@doc(DataFrame.nlargest)
def nlargest(self, n, columns, keep: str = "first"):
    # Reduce each group with DataFrame.nlargest; the sorting arguments are
    # frozen up front so the identical callable is applied to every group.
    per_group = partial(DataFrame.nlargest, n=n, columns=columns, keep=keep)
    # not_indexed_same=True: even for a group that is already ordered with
    # n >= its size (so its rows come back unchanged), the result index must
    # still be rebuilt under the group keys — never pass through unchanged.
    return self._python_apply_general(
        per_group, self._obj_with_exclusions, not_indexed_same=True
    )

@doc(DataFrame.nsmallest)
def nsmallest(self, n, columns, keep: str = "first"):
    # Select the n smallest rows of every group, ranked by ``columns``.
    data = self._obj_with_exclusions
    func = partial(DataFrame.nsmallest, n=n, columns=columns, keep=keep)
    # Always stack the per-group results under the group keys; do not
    # special-case groups whose result index would coincide with the
    # original (i.e. already ordered and n >= the group size).
    return self._python_apply_general(func, data, not_indexed_same=True)


def _wrap_transform_general_frame(
obj: DataFrame, group: DataFrame, res: DataFrame | Series
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/groupby/test_allowlist.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@
"corr",
"cov",
"diff",
"nlargest",
"nsmallest",
]


Expand Down Expand Up @@ -322,6 +324,8 @@ def test_tab_completion(mframe):
"sample",
"ewm",
"value_counts",
"nlargest",
"nsmallest",
}
assert results == expected

Expand Down
67 changes: 67 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2721,3 +2721,70 @@ def test_by_column_values_with_same_starting_value():
).set_index("Name")

tm.assert_frame_equal(result, expected_result)


@pytest.mark.parametrize(
    "function, keep, indices, name, data",
    [
        (
            "nlargest",
            "first",
            [("bar", 1), ("bar", 2), ("foo", 5), ("foo", 3)],
            ["b2", "b3", "f3", "f1"],
            [3, 3, 3, 1],
        ),
        (
            "nlargest",
            "last",
            [("bar", 2), ("bar", 1), ("foo", 5), ("foo", 4)],
            ["b3", "b2", "f3", "f2"],
            [3, 3, 3, 1],
        ),
        (
            "nlargest",
            "all",
            [("bar", 1), ("bar", 2), ("foo", 5), ("foo", 3), ("foo", 4)],
            ["b2", "b3", "f3", "f1", "f2"],
            [3, 3, 3, 1, 1],
        ),
        (
            "nsmallest",
            "first",
            [("bar", 0), ("bar", 1), ("foo", 3), ("foo", 4)],
            ["b1", "b2", "f1", "f2"],
            [1, 3, 1, 1],
        ),
        (
            "nsmallest",
            "last",
            [("bar", 0), ("bar", 2), ("foo", 4), ("foo", 3)],
            ["b1", "b3", "f2", "f1"],
            [1, 3, 1, 1],
        ),
        (
            "nsmallest",
            "all",
            [("bar", 0), ("bar", 1), ("bar", 2), ("foo", 3), ("foo", 4)],
            ["b1", "b2", "b3", "f1", "f2"],
            [1, 3, 3, 1, 1],
        ),
    ],
)
def test_nlargest_nsmallest(function, keep, indices, name, data):
    # GH46924: DataFrameGroupBy.nlargest/nsmallest should reduce every group
    # independently and stack the per-group rows under the group keys.
    df = DataFrame(
        {
            "group": ["bar", "bar", "bar", "foo", "foo", "foo"],
            "name": ["b1", "b2", "b3", "f1", "f2", "f3"],
            "data": [1, 3, 3, 1, 1, 3],
        }
    )
    # Dispatch on the method name so one test body covers both kernels.
    result = getattr(df.groupby("group"), function)(n=2, keep=keep, columns="data")

    expected = DataFrame(
        {"name": name, "data": data},
        index=MultiIndex.from_tuples(indices, names=["group", None]),
    )
    tm.assert_frame_equal(result, expected)