Skip to content

BUG: make dense ranks results scale to 100 percent #21203

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 31, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ Groupby/Resample/Rolling

- Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`)
- Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`)
- Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True``

Strings
^^^^^^^
Expand Down
18 changes: 12 additions & 6 deletions pandas/_libs/groupby_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
bint is_datetimelike, object ties_method,
bint ascending, bint pct, object na_option):
"""
Provides the rank of values within each group.
Provides the rank of values within each group.

Parameters
----------
Expand Down Expand Up @@ -451,8 +451,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
"""
cdef:
TiebreakEnumType tiebreak
Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0
Py_ssize_t grp_vals_seen=1, grp_na_count=0
Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0
Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0
ndarray[int64_t] _as
ndarray[float64_t, ndim=2] grp_sizes
ndarray[{{c_type}}] masked_vals
Expand Down Expand Up @@ -563,6 +563,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
dups = sum_ranks = 0
val_start = i
grp_vals_seen += 1
grp_tie_count +=1

# Similar to the previous conditional, check now if we are moving
# to a new group. If so, keep track of the index where the new
Expand All @@ -571,11 +572,16 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
# (used by pct calculations later). also be sure to reset any of
# the items helping to calculate dups
if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]:
for j in range(grp_start, i + 1):
grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
if tiebreak != TIEBREAK_DENSE:
for j in range(grp_start, i + 1):
grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count
else:
for j in range(grp_start, i + 1):
grp_sizes[_as[j], 0] = (grp_tie_count -
(grp_na_count > 0))
dups = sum_ranks = 0
grp_na_count = 0
val_start = i + 1
grp_tie_count = 0
grp_start = i + 1
grp_vals_seen = 1

Expand Down
14 changes: 7 additions & 7 deletions pandas/tests/groupby/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ def test_rank_apply():
('first', False, False, [3., 4., 1., 5., 2.]),
('first', False, True, [.6, .8, .2, 1., .4]),
('dense', True, False, [1., 1., 3., 1., 2.]),
('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]),
('dense', True, True, [1. / 3., 1. / 3., 3. / 3., 1. / 3., 2. / 3.]),
('dense', False, False, [3., 3., 1., 3., 2.]),
('dense', False, True, [.6, .6, .2, .6, .4]),
('dense', False, True, [3. / 3., 3. / 3., 1. / 3., 3. / 3., 2. / 3.]),
])
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
key = np.repeat(grps, len(vals))
Expand Down Expand Up @@ -126,7 +126,7 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
@pytest.mark.parametrize("grps", [
['qux'], ['qux', 'quux']])
@pytest.mark.parametrize("vals", [
[2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats
[2, 2, np.nan, 8, 2, 6, np.nan, np.nan],
[pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan,
pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'),
pd.Timestamp('2018-01-06'), np.nan, np.nan]
Expand Down Expand Up @@ -167,11 +167,11 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
('dense', True, 'keep', False,
[1., 1., np.nan, 3., 1., 2., np.nan, np.nan]),
('dense', True, 'keep', True,
[0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]),
[1. / 3., 1. / 3., np.nan, 3. / 3., 1. / 3., 2. / 3., np.nan, np.nan]),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use literals instead of expressions here?

Copy link
Contributor Author

@peterpanmj peterpanmj May 27, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@that will be 0.3333333333333333. A little hard to read. Still use literal ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm yea don’t do that. If it’s minimal effort to replace the data with something more easily divisible please do so. Otherwise then I’m ok with it as is

('dense', False, 'keep', False,
[3., 3., np.nan, 1., 3., 2., np.nan, np.nan]),
('dense', False, 'keep', True,
[.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
[3. / 3., 3. / 3., np.nan, 1. / 3., 3. / 3., 2. / 3., np.nan, np.nan]),
('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]),
('average', True, 'no_na', True,
[0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]),
Expand All @@ -198,10 +198,10 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
[0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]),
('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]),
('dense', True, 'no_na', True,
[0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]),
[0.25, 0.25, 1., 0.75, 0.25, 0.5, 1., 1.]),
('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]),
('dense', False, 'no_na', True,
[0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5])
[0.75, 0.75, 1., 0.25, 0.75, 0.5, 1., 1.])
])
def test_rank_args_missing(grps, vals, ties_method, ascending,
na_option, pct, exp):
Expand Down