-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
CLN: Unify signatures in _libs.groupby #34372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
53ae9d6
7985efb
98f1c0d
5e21c72
4d62493
f1c868f
4d2d332
33bf96a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1277,6 +1277,7 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: | |
return self._get_cythonized_result( | ||
"group_any_all", | ||
aggregate=True, | ||
numeric_only=False, | ||
cython_dtype=np.dtype(np.uint8), | ||
needs_values=True, | ||
needs_mask=True, | ||
|
@@ -1433,18 +1434,16 @@ def std(self, ddof: int = 1): | |
Series or DataFrame | ||
Standard deviation of values within each group. | ||
""" | ||
result = self.var(ddof=ddof) | ||
if result.ndim == 1: | ||
result = np.sqrt(result) | ||
else: | ||
cols = result.columns.get_indexer_for( | ||
result.columns.difference(self.exclusions).unique() | ||
) | ||
# TODO(GH-22046) - setting with iloc broken if labels are not unique | ||
# .values to remove labels | ||
result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values | ||
|
||
return result | ||
return self._get_cythonized_result( | ||
"group_var_float64", | ||
aggregate=True, | ||
needs_counts=True, | ||
needs_values=True, | ||
needs_2d=True, | ||
cython_dtype=np.dtype(np.float64), | ||
post_processing=lambda vals, inference: np.sqrt(vals), | ||
ddof=ddof, | ||
) | ||
|
||
@Substitution(name="groupby") | ||
@Appender(_common_see_also) | ||
|
@@ -1778,6 +1777,7 @@ def _fill(self, direction, limit=None): | |
|
||
return self._get_cythonized_result( | ||
"group_fillna_indexer", | ||
numeric_only=False, | ||
needs_mask=True, | ||
cython_dtype=np.dtype(np.int64), | ||
result_is_index=True, | ||
|
@@ -2078,6 +2078,7 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: | |
return self._get_cythonized_result( | ||
"group_quantile", | ||
aggregate=True, | ||
numeric_only=False, | ||
needs_values=True, | ||
needs_mask=True, | ||
cython_dtype=np.dtype(np.float64), | ||
|
@@ -2367,7 +2368,11 @@ def _get_cythonized_result( | |
how: str, | ||
cython_dtype: np.dtype, | ||
aggregate: bool = False, | ||
numeric_only: bool = True, | ||
needs_counts: bool = False, | ||
needs_values: bool = False, | ||
needs_2d: bool = False, | ||
min_count: Optional[int] = None, | ||
needs_mask: bool = False, | ||
needs_ngroups: bool = False, | ||
result_is_index: bool = False, | ||
|
@@ -2386,9 +2391,18 @@ def _get_cythonized_result( | |
aggregate : bool, default False | ||
Whether the result should be aggregated to match the number of | ||
groups | ||
numeric_only : bool, default True | ||
Whether only numeric datatypes should be computed | ||
needs_counts : bool, default False | ||
Whether the counts should be a part of the Cython call | ||
needs_values : bool, default False | ||
Whether the values should be a part of the Cython call | ||
signature | ||
needs_2d : bool, default False | ||
Whether the values and result of the Cython call signature | ||
are at least 2-dimensional. | ||
min_count : int, default None | ||
When not None, min_count for the Cython call | ||
needs_mask : bool, default False | ||
Whether boolean mask needs to be part of the Cython call | ||
signature | ||
|
@@ -2418,7 +2432,7 @@ def _get_cythonized_result( | |
if result_is_index and aggregate: | ||
raise ValueError("'result_is_index' and 'aggregate' cannot both be True!") | ||
if post_processing: | ||
if not callable(pre_processing): | ||
if not callable(post_processing): | ||
raise ValueError("'post_processing' must be a callable!") | ||
if pre_processing: | ||
if not callable(pre_processing): | ||
|
@@ -2438,21 +2452,39 @@ def _get_cythonized_result( | |
name = obj.name | ||
values = obj._values | ||
|
||
if numeric_only and not is_numeric_dtype(values): | ||
continue | ||
|
||
if aggregate: | ||
result_sz = ngroups | ||
else: | ||
result_sz = len(values) | ||
|
||
result = np.zeros(result_sz, dtype=cython_dtype) | ||
func = partial(base_func, result, labels) | ||
if needs_2d: | ||
Review comment: can you use at_least2d or just reshape here?
Review comment: can you do this here
Review comment: I think you're asking to replace [code snippet lost in extraction] with [code snippet lost in extraction]. I think the reshape version is less performant when [remainder of sentence lost in extraction]
Review comment: I don't believe at_least2d is applicable; it will turn a 1d array into a single row (1xn), whereas we need a column (nx1).
Review comment: yes this would be an improvement |
||
result = result.reshape((-1, 1)) | ||
func = partial(base_func, result) | ||
|
||
inferences = None | ||
|
||
if needs_counts: | ||
counts = np.zeros(self.ngroups, dtype=np.int64) | ||
func = partial(func, counts) | ||
|
||
if needs_values: | ||
vals = values | ||
if pre_processing: | ||
vals, inferences = pre_processing(vals) | ||
if needs_2d: | ||
vals = vals.reshape((-1, 1)) | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
vals = vals.astype(cython_dtype, copy=False) | ||
func = partial(func, vals) | ||
|
||
func = partial(func, labels) | ||
|
||
if min_count is not None: | ||
func = partial(func, min_count) | ||
|
||
if needs_mask: | ||
mask = isna(values).view(np.uint8) | ||
func = partial(func, mask) | ||
|
@@ -2462,6 +2494,9 @@ def _get_cythonized_result( | |
|
||
func(**kwargs) # Call func to modify indexer values in place | ||
|
||
if needs_2d: | ||
result = result.reshape(-1) | ||
|
||
if result_is_index: | ||
result = algorithms.take_nd(values, result) | ||
|
||
|
@@ -2512,6 +2547,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): | |
|
||
return self._get_cythonized_result( | ||
"group_shift_indexer", | ||
numeric_only=False, | ||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
cython_dtype=np.dtype(np.int64), | ||
needs_ngroups=True, | ||
result_is_index=True, | ||
|
Uh oh!
There was an error while loading. Please reload this page.