BUG: Fixed groupby quantile for listlike q #27827

Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.1.rst
@@ -118,6 +118,7 @@ Plotting
Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^

- Fixed regression in :meth:`pands.core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`)
Contributor Author
Typo: pandas

Contributor Author
Multiple -> list-like

- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`)
- Bug in windowing over read-only arrays (:issue:`27766`)
- Fixed segfault in `pandas.core.groupby.DataFrameGroupBy.quantile` when an invalid quantile was passed (:issue:`27470`)
60 changes: 48 additions & 12 deletions pandas/core/groupby/groupby.py
@@ -1870,6 +1870,7 @@ def quantile(self, q=0.5, interpolation="linear"):
a 2.0
b 3.0
"""
from pandas import concat

def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]:
if is_object_dtype(vals):
@@ -1897,18 +1898,53 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:

return vals

return self._get_cythonized_result(
"group_quantile",
self.grouper,
aggregate=True,
needs_values=True,
needs_mask=True,
cython_dtype=np.float64,
pre_processing=pre_processor,
post_processing=post_processor,
q=q,
interpolation=interpolation,
)
if is_scalar(q):
return self._get_cythonized_result(
"group_quantile",
self.grouper,
aggregate=True,
needs_values=True,
needs_mask=True,
cython_dtype=np.float64,
pre_processing=pre_processor,
post_processing=post_processor,
q=q,
interpolation=interpolation,
)
else:
results = [
self._get_cythonized_result(
"group_quantile",
self.grouper,
aggregate=True,
needs_values=True,
needs_mask=True,
cython_dtype=np.float64,
pre_processing=pre_processor,
post_processing=post_processor,
q=qi,
interpolation=interpolation,
)
for qi in q
]
# fix levels to place quantiles on the inside
Contributor Author
Would welcome improvements to this. Essentially, we have two things to do after concatenating the results (see the sketch after this list):

  1. Reorder the levels. By default concat places the keys on the outermost level; we need them on the innermost.
  2. Re-sort things. Initially we're sorted by quantile (the q-th quantile for every group is together, then the next quantile). We need to be grouped by group key, then quantiles. We can't just use sort_index, since we need to preserve the user-provided group key order with sort=False and the order of the quantile list.
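A minimal, self-contained sketch of those two steps, using toy per-quantile results rather than the real `_get_cythonized_result` output (the group keys "g1"/"g2" and the values are invented for illustration):

```python
import numpy as np
import pandas as pd

q = [0.25, 0.75]
ngroups = 2

# Stand-ins for the per-quantile results: one Series per qi, indexed by
# the group keys in user-provided order.
results = [
    pd.Series([1.0, 2.0], index=["g1", "g2"]),  # 0.25 quantile of each group
    pd.Series([3.0, 4.0], index=["g1", "g2"]),  # 0.75 quantile of each group
]

# Step 1: concat puts the quantile keys on the outermost level; roll the
# levels so the quantiles end up innermost.
result = pd.concat(results, axis=0, keys=q)
order = np.roll(list(range(result.index.nlevels)), -1)
result = result.reorder_levels(order)

# Step 2: rows are still quantile-major; build positional indices that
# interleave them group-major, without sorting either level.
indices = np.concatenate(
    [np.arange(0, len(q) * ngroups, ngroups) + i for i in range(ngroups)]
)
result = result.take(indices)
print(result)
# g1  0.25    1.0
#     0.75    3.0
# g2  0.25    2.0
#     0.75    4.0
# dtype: float64
```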

Member
Yeah, this isn't fun. Just checking: can we have mixed dtypes here? If we had a single dtype then stack/unstack would be cheap and there might be a prettier workaround.

Member
concat(results, axis=1, keys=q).stack(0) gives the same answer for at least one of the test cases

Contributor Author
Unfortunately that sorts the axis. In theory, we should be able to follow it up with a loc to restore the correct order on that level:

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index c00bd8398..617466b8d 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1928,23 +1928,10 @@ class GroupBy(_GroupBy):
                 for qi in q
             ]
             # fix levels to place quantiles on the inside
-            result = concat(results, axis=0, keys=q)
-            order = np.roll(list(range(result.index.nlevels)), -1)
-            result = result.reorder_levels(order)
-            result = result.reindex(q, level=-1)
-
-            # fix order.
-            hi = len(q) * self.ngroups
-            arr = np.arange(0, hi, self.ngroups)
-            arrays = []
-
-            for i in range(self.ngroups):
-                arr = arr + i
-                arrays.append(arr)
-
-            indices = np.concatenate(arrays)
-            assert len(indices) == len(result)
-            return result.take(indices)
+            result = concat(results, axis=1, keys=q).stack(0)
+            slices = [slice(None)] * result.index.ndim
+            result = result.loc[tuple(slices), :]
+            return result
 
     @Substitution(name="groupby")
     def ngroup(self, ascending=True):

But that runs into the (I think known) issue with loc & a MultiIndex

In [21]: s = pd.Series(list(range(12)), index=pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2, 3, 4]]))

In [22]: s.loc[pd.IndexSlice[:, [3, 1, 2, 4]]]
Out[22]:
a  1     0
   2     1
   3     2
   4     3
b  1     4
   2     5
   3     6
   4     7
c  1     8
   2     9
   3    10
   4    11
dtype: int64

Member
Darn. Can you add a my-idea-doesn't-work case to the tests?

result = concat(results, axis=0, keys=q)
order = np.roll(list(range(result.index.nlevels)), -1)
result = result.reorder_levels(order)
result = result.reindex(q, level=-1)

# fix order.
hi = len(q) * self.ngroups
arr = np.arange(0, hi, self.ngroups)
arrays = []

for i in range(self.ngroups):
arr = arr + i
arrays.append(arr)

indices = np.concatenate(arrays)
assert len(indices) == len(result)
return result.take(indices)

@Substitution(name="groupby")
def ngroup(self, ascending=True):
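For reference, a quick usage sketch of the behaviour this branch is aiming for, mirroring test_quantile_array_no_sort below: group keys stay on the outer index level in user order (sort=False), and the quantiles sit on the inner level in the order they were passed. The printed frame is the test's expected result, not an independently captured run.

```python
import pandas as pd

df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
result = df.groupby([1, 0, 1], sort=False).quantile([0.75, 0.25])
print(result)
# Expected (per test_quantile_array_no_sort):
#           A    B
# 1 0.75  1.5  4.5
#   0.25  0.5  3.5
# 0 0.75  1.0  4.0
#   0.25  1.0  4.0
```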
51 changes: 51 additions & 0 deletions pandas/tests/groupby/test_function.py
@@ -1238,6 +1238,57 @@ def test_quantile(interpolation, a_vals, b_vals, q):
tm.assert_frame_equal(result, expected)


def test_quantile_array():
# https://github.com/pandas-dev/pandas/issues/27526
df = pd.DataFrame({"A": [0, 1, 2, 3, 4]})
result = df.groupby([0, 0, 1, 1, 1]).quantile([0.25])

index = pd.MultiIndex.from_product([[0, 1], [0.25]])
expected = pd.DataFrame({"A": [0.25, 2.50]}, index=index)
tm.assert_frame_equal(result, expected)

df = pd.DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])

result = df.groupby([0, 0, 1, 1]).quantile([0.25, 0.75])
expected = pd.DataFrame(
{"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
)
tm.assert_frame_equal(result, expected)


def test_quantile_array_no_sort():
df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75])
expected = pd.DataFrame(
{"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
)
tm.assert_frame_equal(result, expected)

result = df.groupby([1, 0, 1], sort=False).quantile([0.75, 0.25])
expected = pd.DataFrame(
{"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
)
tm.assert_frame_equal(result, expected)


def test_quantile_array_multiple_levels():
df = pd.DataFrame(
{"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
)
result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
index = pd.MultiIndex.from_tuples(
[("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
names=["c", "d", None],
)
expected = pd.DataFrame(
{"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
)
tm.assert_frame_equal(result, expected)


def test_quantile_raises():
df = pd.DataFrame(
[["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]