Skip to content

BUG: Fix #61221: Exception with unstack(sort=False) and NA in index. #61226

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 24 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
7503537
BUG: Fix #61221: Exception with unstack(sort=False) and NA in index.
Apr 3, 2025
c0a7c80
BUG: Fix #61221: Exception with unstack(sort=False) and NA in index.
Apr 3, 2025
7a8fddb
fixed formatting
Apr 3, 2025
a397466
fixed issue with unsorted unstack, should now work
Apr 3, 2025
eb2fb7a
Merge branch 'main' into fix-issue-61221
gsmll Apr 4, 2025
3539ac6
Instead of creating variable self.na, constructed na index locally
Apr 4, 2025
64f5173
fixed issues with local variable
Apr 4, 2025
e2b38b1
fixed the fix -oops
Apr 4, 2025
31d7b33
fixed up tests
Apr 9, 2025
05c8a02
Merge branch 'main' into fix-issue-61221
gsmll Apr 9, 2025
cc1deb6
Merge branch 'main' into fix-issue-61221
gsmll Apr 11, 2025
84d6bd3
Add Pandas Cookbook to Book Recommendations (#61271)
WillAyd Apr 11, 2025
f7e910e
shortened factorize
Apr 11, 2025
a147c6d
optimized shortened factorized
Apr 11, 2025
1abbc73
Merge branch 'main' into fix-issue-61221
gsmll Apr 11, 2025
c35e8cc
fixed typing issue
Apr 11, 2025
8afbad2
Merge branch 'main' into fix-issue-61221
gsmll Apr 13, 2025
8c19221
Merge branch 'main' into fix-issue-61221
gsmll Apr 14, 2025
555bad9
Merge branch 'main' into fix-issue-61221
gsmll Apr 16, 2025
cfb5e92
Merge branch 'main' into fix-issue-61221
gsmll Apr 22, 2025
a677315
Merge branch 'main' into fix-issue-61221
gsmll May 12, 2025
1f9cd53
Merge branch 'main' into fix-issue-61221
gsmll May 15, 2025
f72d1df
Merge branch 'main' into fix-issue-61221
gsmll May 15, 2025
c514d68
Merge branch 'main' into fix-issue-61221
gsmll May 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,7 @@ Reshaping
- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
- Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`)
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)
- Bug in :meth:`DataFrame.unstack` raising an error or returning columns in the wrong order when ``sort=False`` and the index level being unstacked contains NA values (:issue:`61221`)
- Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`)

Sparse
Expand Down
40 changes: 33 additions & 7 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ def __init__(
self.removed_level_full = index.levels[self.level]
if not self.sort:
unique_codes = unique(self.index.codes[self.level])
# GH 61221: drop the -1 code (NA) from the unsorted unique codes, since
# passing it to ``take`` below causes errors. The position of the NA
# entry is recomputed in ``_repeater`` when it is needed.
unique_codes = unique_codes[unique_codes != -1]
self.removed_level = self.removed_level.take(unique_codes)
self.removed_level_full = self.removed_level_full.take(unique_codes)

Expand Down Expand Up @@ -170,7 +174,13 @@ def _indexer_and_to_sort(
codes = list(self.index.codes)
if not self.sort:
# Create new codes considering that labels are already sorted
codes = [factorize(code)[0] for code in codes]
# Make sure to preserve the -1 values before factorizing
codes = []
for code in self.index.codes:
mask = code != -1
factorized = np.full_like(code, -1)
factorized[mask] = factorize(code[mask])[0]
codes.append(factorized)
levs = list(self.index.levels)
to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
Expand All @@ -189,9 +199,15 @@ def sorted_labels(self) -> list[np.ndarray]:
return to_sort

def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
    """
    Reorder ``values`` along axis 0 so its rows line up for reshaping.

    Parameters
    ----------
    values : np.ndarray
        Array whose first axis is reordered.

    Returns
    -------
    np.ndarray
        ``values`` with its rows reordered.

    Notes
    -----
    When ``self.sort`` is true the rows are taken in the order given by the
    indexer from ``self._indexer_and_to_sort``. Otherwise the rows are
    ordered by a group id built from every entry of ``self.sorted_labels``
    except the last one.
    """
    if not self.sort:
        sizes = tuple(len(lvl) for lvl in self.new_index_levels)
        # The last entry of ``sorted_labels`` is left out of the grouping, so
        # rows are grouped by the remaining labels only.
        remaining_codes = self.sorted_labels[:-1]
        row_groups = get_group_index(
            remaining_codes, sizes, sort=False, xnull=False
        )
        # "mergesort" is a stable sort: rows sharing a group id keep the
        # relative order they had in ``values``.
        row_order = np.argsort(row_groups, kind="mergesort")
        return values[row_order]

    row_order, _ = self._indexer_and_to_sort
    return algos.take_nd(values, row_order, axis=0)

def _make_selectors(self) -> None:
new_levels = self.new_index_levels
Expand Down Expand Up @@ -381,11 +397,22 @@ def _repeater(self) -> np.ndarray:
# In this case, we remap the new codes to the original level:
repeater = self.removed_level_full.get_indexer(self.removed_level)
if self.lift:
repeater = np.insert(repeater, 0, -1)
if not self.sort:
na_index = (self.index.codes[self.level] == -1).nonzero()[0][0]
repeater = np.insert(repeater, na_index, -1)
else:
repeater = np.insert(repeater, 0, -1)
else:
# Otherwise, we just use each level item exactly once:
stride = len(self.removed_level) + self.lift
repeater = np.arange(stride) - self.lift
if self.sort or not self.lift:
repeater = np.arange(stride) - self.lift
else:
na_index = (self.index.codes[self.level] == -1).nonzero()[0][0]
repeater = np.arange(stride) - self.lift
if na_index:
repeater[na_index] = -1
repeater[:na_index] += 1

return repeater

Expand Down Expand Up @@ -565,7 +592,6 @@ def _unstack_frame(
unstacker = _Unstacker(
obj.index, level=level, constructor=obj._constructor, sort=sort
)

if not obj._can_fast_transpose:
mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
return obj._constructor_from_mgr(mgr, axes=mgr.axes)
Expand Down
100 changes: 100 additions & 0 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1605,6 +1605,106 @@ def test_stack_sort_false(future_stack):
tm.assert_frame_equal(result, expected)


def test_unstack_sort_false_na1():
# GH 61221
# unstack(sort=False) when the missing label is the LAST entry of the
# unstacked level: the NA column must stay last and the rows must keep
# their original "b", "a" order.

# "b" precedes "a", so a result that sorted the rows would not match
# ``expected`` below.
levels1 = ["b", "a"]
# The trailing None is the missing label under test.
levels2 = Index([1, 2, 3, None])
index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"])
# Values 0..7 follow the product order, so every cell identifies its source row.
df = DataFrame({"value": range(len(index))}, index=index)
result = df.unstack(level="level2", sort=False)
# Column keys are spelled as floats (1.0, ...) and the missing label as pd.NA;
# presumably the level is upcast to float because of the None entry - confirm.
expected = DataFrame(
{
("value", 1.0): [0, 4],
("value", 2.0): [1, 5],
("value", 3.0): [2, 6],
("value", pd.NA): [3, 7],
},
index=Index(["b", "a"], name="level1"),
columns=MultiIndex.from_tuples(
[("value", 1.0), ("value", 2.0), ("value", 3.0), ("value", pd.NA)],
names=[None, "level2"],
),
)
tm.assert_frame_equal(result, expected)


def test_unstack_sort_false_na2():
# GH 61221
# unstack(sort=False) when the missing label is the FIRST entry of the
# unstacked level: the NA column must stay first and the rows must keep
# their original "b", "a" order.

# "b" precedes "a", so a result that sorted the rows would not match
# ``expected`` below.
levels1 = ["b", "a"]
# The leading None is the missing label under test.
levels2 = Index([None, 1, 2, 3])
index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"])
# Values 0..7 follow the product order, so every cell identifies its source row.
df = DataFrame({"value": range(len(index))}, index=index)
result = df.unstack(level="level2", sort=False)
# Column keys are spelled as floats (1.0, ...) and the missing label as pd.NA;
# presumably the level is upcast to float because of the None entry - confirm.
expected = DataFrame(
{
("value", pd.NA): [0, 4],
("value", 1.0): [1, 5],
("value", 2.0): [2, 6],
("value", 3.0): [3, 7],
},
index=Index(["b", "a"], name="level1"),
columns=MultiIndex.from_tuples(
[("value", pd.NA), ("value", 1.0), ("value", 2.0), ("value", 3.0)],
names=[None, "level2"],
),
)
tm.assert_frame_equal(result, expected)


def test_unstack_sort_false_na3():
# GH 61221
# unstack(sort=False) when the missing label sits in the MIDDLE of the
# unstacked level: the NA column must stay in second position and the rows
# must keep their original "b", "a" order.

# "b" precedes "a", so a result that sorted the rows would not match
# ``expected`` below.
levels1 = ["b", "a"]
# The None in second position is the missing label under test.
levels2 = Index([1, None, 2, 3])
index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"])
# Values 0..7 follow the product order, so every cell identifies its source row.
df = DataFrame({"value": range(len(index))}, index=index)
result = df.unstack(level="level2", sort=False)
# Column keys are spelled as floats (1.0, ...) and the missing label as pd.NA;
# presumably the level is upcast to float because of the None entry - confirm.
expected = DataFrame(
{
("value", 1.0): [0, 4],
("value", pd.NA): [1, 5],
("value", 2.0): [2, 6],
("value", 3.0): [3, 7],
},
index=Index(["b", "a"], name="level1"),
columns=MultiIndex.from_tuples(
[("value", 1.0), ("value", pd.NA), ("value", 2.0), ("value", 3.0)],
names=[None, "level2"],
),
)
tm.assert_frame_equal(result, expected)


def test_unstack_sort_false_na_mixed():
# GH 61221
# unstack(sort=False) when the unstacked level is itself out of order
# (3, NA, 1, 2): the columns must come back in exactly that order and the
# rows must keep their original "b", "a" order.

# "b" precedes "a", so a result that sorted the rows would not match
# ``expected`` below.
levels1 = ["b", "a"]
# Non-monotonic labels with the missing label in second position; a result
# that sorted the columns would yield 1, 2, 3 instead.
levels2 = Index([3, None, 1, 2])
index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"])
# Values 0..7 follow the product order, so every cell identifies its source row.
df = DataFrame({"value": range(len(index))}, index=index)
result = df.unstack(level="level2", sort=False)
# Column keys are spelled as floats (3.0, ...) and the missing label as pd.NA;
# presumably the level is upcast to float because of the None entry - confirm.
expected = DataFrame(
{
("value", 3.0): [0, 4],
("value", pd.NA): [1, 5],
("value", 1.0): [2, 6],
("value", 2.0): [3, 7],
},
index=Index(["b", "a"], name="level1"),
columns=MultiIndex.from_tuples(
[("value", 3.0), ("value", pd.NA), ("value", 1.0), ("value", 2.0)],
names=[None, "level2"],
),
)
tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
def test_stack_sort_false_multi_level(future_stack):
# GH 15105
Expand Down
Loading