Skip to content

BUG: df.sort_values() not respecting na_position with categoricals #22556 #22640

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Oct 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,7 @@ Categorical
^^^^^^^^^^^

- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``.
- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`).
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).

Expand Down
33 changes: 9 additions & 24 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@

import pandas.core.algorithms as algorithms

from pandas.core.sorting import nargsort

from pandas.io.formats import console
from pandas.io.formats.terminal import get_terminal_size
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
Expand Down Expand Up @@ -1605,32 +1607,15 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
msg = 'invalid na_position: {na_position!r}'
raise ValueError(msg.format(na_position=na_position))

codes = np.sort(self._codes)
if not ascending:
codes = codes[::-1]

# NaN handling
na_mask = (codes == -1)
if na_mask.any():
n_nans = len(codes[na_mask])
if na_position == "first":
# in this case sort to the front
new_codes = codes.copy()
new_codes[0:n_nans] = -1
new_codes[n_nans:] = codes[~na_mask]
codes = new_codes
elif na_position == "last":
# ... and to the end
new_codes = codes.copy()
pos = len(codes) - n_nans
new_codes[0:pos] = codes[~na_mask]
new_codes[pos:] = -1
codes = new_codes
sorted_idx = nargsort(self,
ascending=ascending,
na_position=na_position)

if inplace:
self._codes = codes
return
self._codes = self._codes[sorted_idx]
else:
return self._constructor(values=codes, dtype=self.dtype,
return self._constructor(values=self._codes[sorted_idx],
dtype=self.dtype,
fastpath=True)

def _values_for_rank(self):
Expand Down
14 changes: 13 additions & 1 deletion pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,19 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'):

# specially handle Categorical
if is_categorical_dtype(items):
return items.argsort(ascending=ascending, kind=kind)
if na_position not in {'first', 'last'}:
raise ValueError('invalid na_position: {!r}'.format(na_position))

mask = isna(items)
cnt_null = mask.sum()
sorted_idx = items.argsort(ascending=ascending, kind=kind)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we not have a .sort_values on Categorical that accepts na_position already?

if we don’t all of this code should live there anyhow

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the way I see it, df.sort_values is not calling sort_values method from column objects, it's calling nargsort. Since pd.Series also doesn't have nargsort, I feel it may fit better in here rather than Categorical.nargsort

if ascending and na_position == 'last':
# NaN is coded as -1 and is listed in front after sorting
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this duplicates some code in Categorical.sort_values is it not possible to refactor this to a function, and then call this from Categorical.sort_values (as this works with the raw indexer), while Categorical.sort_values reconstructs a Categorical?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

great suggestion. updated.

sorted_idx = np.roll(sorted_idx, -cnt_null)
elif not ascending and na_position == 'first':
# NaN is coded as -1 and is listed in the end after sorting
sorted_idx = np.roll(sorted_idx, cnt_null)
return sorted_idx

items = np.asanyarray(items)
idx = np.arange(len(items))
Expand Down
86 changes: 82 additions & 4 deletions pandas/tests/frame/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pandas.compat import lrange
from pandas.api.types import CategoricalDtype
from pandas import (DataFrame, Series, MultiIndex, Timestamp,
date_range, NaT, IntervalIndex)
date_range, NaT, IntervalIndex, Categorical)

from pandas.util.testing import assert_series_equal, assert_frame_equal

Expand Down Expand Up @@ -161,7 +161,7 @@ def test_sort_nan(self):
'B': [5, 9, 2, nan, 5, 5, 4]},
index=[2, 0, 3, 1, 6, 4, 5])
sorted_df = df.sort_values(['A', 'B'], ascending=[
1, 0], na_position='first')
1, 0], na_position='first')
assert_frame_equal(sorted_df, expected)

# na_position='last', not order
Expand All @@ -170,7 +170,7 @@ def test_sort_nan(self):
'B': [4, 5, 5, nan, 2, 9, 5]},
index=[5, 4, 6, 1, 3, 0, 2])
sorted_df = df.sort_values(['A', 'B'], ascending=[
0, 1], na_position='last')
0, 1], na_position='last')
assert_frame_equal(sorted_df, expected)

# Test DataFrame with nan label
Expand Down Expand Up @@ -514,7 +514,7 @@ def test_sort_index_categorical_index(self):

df = (DataFrame({'A': np.arange(6, dtype='int64'),
'B': Series(list('aabbca'))
.astype(CategoricalDtype(list('cab')))})
.astype(CategoricalDtype(list('cab')))})
.set_index('B'))

result = df.sort_index()
Expand Down Expand Up @@ -598,3 +598,81 @@ def test_sort_index_intervalindex(self):
closed='right')
result = result.columns.levels[1].categories
tm.assert_index_equal(result, expected)

def test_sort_index_na_position_with_categories(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you parametrize this test? Believe you can use values, na_position and index as parametrized arguments

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WillAyd parametrized a bit. I am pretty new at this. Thanks for all the suggestions!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No worries - appreciate the help and we hope you learn as you go through this process.

FWIW this unfortunately not the parametrization we are looking for. There should be a decorator for the function which performs that actual task. What you want is the pytest.mark.parametrize decorator; you'll see this on one other function in the module though if you poke around other modules you'll see more complex applications which can help you out as well

# GH 22556
# Positioning missing value properly when column is Categorical.
categories = ['A', 'B', 'C']
category_indices = [0, 2, 4]
list_of_nans = [np.nan, np.nan]
na_indices = [1, 3]
na_position_first = 'first'
na_position_last = 'last'
column_name = 'c'

reversed_categories = sorted(categories, reverse=True)
reversed_category_indices = sorted(category_indices, reverse=True)
reversed_na_indices = sorted(na_indices, reverse=True)

df = pd.DataFrame({
column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
categories=categories,
ordered=True)})
# sort ascending with na first
result = df.sort_values(by=column_name,
ascending=True,
na_position=na_position_first)
expected = DataFrame({
column_name: Categorical(list_of_nans + categories,
categories=categories,
ordered=True)
}, index=na_indices + category_indices)

assert_frame_equal(result, expected)

# sort ascending with na last
result = df.sort_values(by=column_name,
ascending=True,
na_position=na_position_last)
expected = DataFrame({
column_name: Categorical(categories + list_of_nans,
categories=categories,
ordered=True)
}, index=category_indices + na_indices)

assert_frame_equal(result, expected)

# sort descending with na first
result = df.sort_values(by=column_name,
ascending=False,
na_position=na_position_first)
expected = DataFrame({
column_name: Categorical(list_of_nans + reversed_categories,
categories=categories,
ordered=True)
}, index=reversed_na_indices + reversed_category_indices)

assert_frame_equal(result, expected)

# sort descending with na last
result = df.sort_values(by=column_name,
ascending=False,
na_position=na_position_last)
expected = DataFrame({
column_name: Categorical(reversed_categories + list_of_nans,
categories=categories,
ordered=True)
}, index=reversed_category_indices + reversed_na_indices)

assert_frame_equal(result, expected)

def test_sort_index_na_position_with_categories_raises(self):
df = pd.DataFrame({
'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
categories=['A', 'B', 'C'],
ordered=True)})

with pytest.raises(ValueError):
df.sort_values(by='c',
ascending=False,
na_position='bad_position')
2 changes: 1 addition & 1 deletion pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,7 +995,7 @@ def test_categorical_sorting(self, file):
parsed = read_stata(getattr(self, file))

# Sort based on codes, not strings
parsed = parsed.sort_values("srh")
parsed = parsed.sort_values("srh", na_position='first')

# Don't sort index
parsed.index = np.arange(parsed.shape[0])
Expand Down