-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: df.sort_values() not respecting na_position with categoricals #22556 #22640
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 17 commits
d119a00
f36346b
1e26989
dc4b4c4
a4ee75f
2b6f0b0
229fff5
fbc8a4e
41b29fb
1999d58
76da923
8f81649
8943280
1709c91
8983dd9
7f6c68a
9f0343b
9e7afb1
8d4aedc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -241,7 +241,19 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'): | |
|
||
# specially handle Categorical | ||
if is_categorical_dtype(items): | ||
return items.argsort(ascending=ascending, kind=kind) | ||
if na_position not in {'first', 'last'}: | ||
raise ValueError('invalid na_position: {!r}'.format(na_position)) | ||
|
||
mask = isna(items) | ||
cnt_null = mask.sum() | ||
sorted_idx = items.argsort(ascending=ascending, kind=kind) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we not have a .sort_values on Categorical that accepts na_position already? if we don’t all of this code should live there anyhow There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the way I see it, |
||
if ascending and na_position == 'last': | ||
# NaN is coded as -1 and is listed in front after sorting | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this duplicates some code in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. great suggestion. updated. |
||
sorted_idx = np.roll(sorted_idx, -cnt_null) | ||
elif not ascending and na_position == 'first': | ||
# NaN is coded as -1 and is listed in the end after sorting | ||
sorted_idx = np.roll(sorted_idx, cnt_null) | ||
return sorted_idx | ||
|
||
items = np.asanyarray(items) | ||
idx = np.arange(len(items)) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,7 @@ | |
from pandas.compat import lrange | ||
from pandas.api.types import CategoricalDtype | ||
from pandas import (DataFrame, Series, MultiIndex, Timestamp, | ||
date_range, NaT, IntervalIndex) | ||
date_range, NaT, IntervalIndex, Categorical) | ||
|
||
from pandas.util.testing import assert_series_equal, assert_frame_equal | ||
|
||
|
@@ -161,7 +161,7 @@ def test_sort_nan(self): | |
'B': [5, 9, 2, nan, 5, 5, 4]}, | ||
index=[2, 0, 3, 1, 6, 4, 5]) | ||
sorted_df = df.sort_values(['A', 'B'], ascending=[ | ||
1, 0], na_position='first') | ||
1, 0], na_position='first') | ||
assert_frame_equal(sorted_df, expected) | ||
|
||
# na_position='last', not order | ||
|
@@ -170,7 +170,7 @@ def test_sort_nan(self): | |
'B': [4, 5, 5, nan, 2, 9, 5]}, | ||
index=[5, 4, 6, 1, 3, 0, 2]) | ||
sorted_df = df.sort_values(['A', 'B'], ascending=[ | ||
0, 1], na_position='last') | ||
0, 1], na_position='last') | ||
assert_frame_equal(sorted_df, expected) | ||
|
||
# Test DataFrame with nan label | ||
|
@@ -514,7 +514,7 @@ def test_sort_index_categorical_index(self): | |
|
||
df = (DataFrame({'A': np.arange(6, dtype='int64'), | ||
'B': Series(list('aabbca')) | ||
.astype(CategoricalDtype(list('cab')))}) | ||
.astype(CategoricalDtype(list('cab')))}) | ||
.set_index('B')) | ||
|
||
result = df.sort_index() | ||
|
@@ -598,3 +598,86 @@ def test_sort_index_intervalindex(self): | |
closed='right') | ||
result = result.columns.levels[1].categories | ||
tm.assert_index_equal(result, expected) | ||
|
||
@pytest.mark.parametrize("categories", [['A', 'B', 'C']]) | ||
@pytest.mark.parametrize("category_indices", [[0, 2, 4]]) | ||
@pytest.mark.parametrize("list_of_nans", [[np.nan, np.nan]]) | ||
@pytest.mark.parametrize("na_indices", [[1, 3]]) | ||
@pytest.mark.parametrize("na_position_first", ['first']) | ||
@pytest.mark.parametrize("na_position_last", ['last']) | ||
@pytest.mark.parametrize("column_name", ['c']) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Still not quite it as this will generate the Cartesian product of all of the items here. Giving it a second look, this might be more complicated than I originally thought with parametrization to the point that it might overcomplicated things. I'd be OK with reverting to what you had before my request, and this could maybe be a separate PR if it even makes sense. Sorry for the back and forth There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no problem. I have reverted the parametrization. Thanks for the careful checks. |
||
def test_sort_index_na_position_with_categories(self, categories, | ||
category_indices, | ||
na_indices, | ||
list_of_nans, | ||
na_position_first, | ||
na_position_last, | ||
column_name): | ||
# GH 22556 | ||
# Positioning missing value properly when column is Categorical. | ||
reversed_categories = sorted(categories, reverse=True) | ||
reversed_category_indices = sorted(category_indices, reverse=True) | ||
reversed_na_indices = sorted(na_indices, reverse=True) | ||
|
||
df = pd.DataFrame({ | ||
column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], | ||
categories=categories, | ||
ordered=True)}) | ||
# sort ascending with na first | ||
result = df.sort_values(by=column_name, | ||
ascending=True, | ||
na_position=na_position_first) | ||
expected = DataFrame({ | ||
column_name: Categorical(list_of_nans + categories, | ||
categories=categories, | ||
ordered=True) | ||
}, index=na_indices + category_indices) | ||
|
||
assert_frame_equal(result, expected) | ||
|
||
# sort ascending with na last | ||
result = df.sort_values(by=column_name, | ||
ascending=True, | ||
na_position=na_position_last) | ||
expected = DataFrame({ | ||
column_name: Categorical(categories + list_of_nans, | ||
categories=categories, | ||
ordered=True) | ||
}, index=category_indices + na_indices) | ||
|
||
assert_frame_equal(result, expected) | ||
|
||
# sort descending with na first | ||
result = df.sort_values(by=column_name, | ||
ascending=False, | ||
na_position=na_position_first) | ||
expected = DataFrame({ | ||
column_name: Categorical(list_of_nans + reversed_categories, | ||
categories=categories, | ||
ordered=True) | ||
}, index=reversed_na_indices + reversed_category_indices) | ||
|
||
assert_frame_equal(result, expected) | ||
|
||
# sort descending with na last | ||
result = df.sort_values(by=column_name, | ||
ascending=False, | ||
na_position=na_position_last) | ||
expected = DataFrame({ | ||
column_name: Categorical(reversed_categories + list_of_nans, | ||
categories=categories, | ||
ordered=True) | ||
}, index=reversed_category_indices + reversed_na_indices) | ||
|
||
assert_frame_equal(result, expected) | ||
|
||
def test_sort_index_na_position_with_categories_raises(self): | ||
df = pd.DataFrame({ | ||
'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'], | ||
categories=['A', 'B', 'C'], | ||
ordered=True)}) | ||
|
||
with pytest.raises(ValueError): | ||
staftermath marked this conversation as resolved.
Show resolved
Hide resolved
|
||
df.sort_values(by='c', | ||
ascending=False, | ||
na_position='bad_position') |
Uh oh!
There was an error while loading. Please reload this page.