Skip to content

REF: make Grouping less stateful #41529

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
May 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 33 additions & 34 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy as np

from pandas._typing import (
ArrayLike,
FrameOrSeries,
final,
)
Expand Down Expand Up @@ -587,20 +588,23 @@ def indices(self):

@property
def codes(self) -> np.ndarray:
if self._passed_categorical:
# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
cat = self.grouper
return cat.codes
if self._codes is not None:
# _codes is set in __init__ for MultiIndex cases
return self._codes

if self._codes is None:
self._make_codes()
# error: Incompatible return value type (got "Optional[ndarray]",
# expected "ndarray")
return self._codes # type: ignore[return-value]
return self._codes_and_uniques[0]

@cache_readonly
def group_arraylike(self) -> ArrayLike:
"""
Analogous to result_index, but holding an ArrayLike to ensure
we can retain ExtensionDtypes.
"""
return self._codes_and_uniques[1]

@cache_readonly
def result_index(self) -> Index:
# TODO: what's the difference between result_index vs group_index?
if self.all_grouper is not None:
group_idx = self.group_index
assert isinstance(group_idx, CategoricalIndex)
Expand All @@ -609,40 +613,37 @@ def result_index(self) -> Index:

@cache_readonly
def group_index(self) -> Index:
if self._group_index is not None:
# _group_index is set in __init__ for MultiIndex cases
return self._group_index
uniques = self.group_arraylike
return Index(uniques, name=self.name)

@cache_readonly
def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:
if self._passed_categorical:
# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
cat = self.grouper
categories = cat.categories

if self.observed:
codes = algorithms.unique1d(cat.codes)
codes = codes[codes != -1]
ucodes = algorithms.unique1d(cat.codes)
ucodes = ucodes[ucodes != -1]
if self.sort or cat.ordered:
codes = np.sort(codes)
ucodes = np.sort(ucodes)
else:
codes = np.arange(len(categories))
ucodes = np.arange(len(categories))

return CategoricalIndex(
Categorical.from_codes(
codes=codes, categories=categories, ordered=cat.ordered
),
name=self.name,
uniques = Categorical.from_codes(
codes=ucodes, categories=categories, ordered=cat.ordered
)
return cat.codes, uniques

if self._group_index is None:
self._make_codes()
assert self._group_index is not None
return self._group_index

def _make_codes(self) -> None:
if self._codes is not None and self._group_index is not None:
return

# we have a list of groupers
if isinstance(self.grouper, ops.BaseGrouper):
elif isinstance(self.grouper, ops.BaseGrouper):
# we have a list of groupers
codes = self.grouper.codes_info
uniques = self.grouper.result_index
uniques = self.grouper.result_arraylike
else:
# GH35667, replace dropna=False with na_sentinel=None
if not self.dropna:
Expand All @@ -652,9 +653,7 @@ def _make_codes(self) -> None:
codes, uniques = algorithms.factorize(
self.grouper, sort=self.sort, na_sentinel=na_sentinel
)
uniques = Index(uniques, name=self.name)
self._codes = codes
self._group_index = uniques
return codes, uniques

@cache_readonly
def groups(self) -> dict[Hashable, np.ndarray]:
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,19 @@ def reconstructed_codes(self) -> list[np.ndarray]:
ids, obs_ids, _ = self.group_info
return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)

@cache_readonly
def result_arraylike(self) -> ArrayLike:
"""
Analogous to result_index, but returning an ndarray/ExtensionArray
allowing us to retain ExtensionDtypes not supported by Index.
"""
# TODO: once Index supports arbitrary EAs, this can be removed in favor
# of result_index
if len(self.groupings) == 1:
return self.groupings[0].group_arraylike

return self.result_index._values

@cache_readonly
def result_index(self) -> Index:
if len(self.groupings) == 1:
Expand All @@ -919,7 +932,7 @@ def result_index(self) -> Index:
)

@final
def get_group_levels(self) -> list[Index]:
def get_group_levels(self) -> list[ArrayLike]:
# Note: only called from _insert_inaxis_grouper_inplace, which
# is only called for BaseGrouper, never for BinGrouper
if len(self.groupings) == 1:
Expand Down