Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix #61221: Exception with unstack(sort=False) and NA in index. #61226

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,7 @@ Reshaping
- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
- Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`)
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)
- Bug in :meth:`DataFrame.unstack` raising an error or returning columns in the wrong order when ``sort=False`` and the unstacked index level contains NA values (:issue:`61221`)
- Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`)

Sparse
Expand Down
41 changes: 34 additions & 7 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ def __init__(
self.removed_level_full = index.levels[self.level]
if not self.sort:
unique_codes = unique(self.index.codes[self.level])
# GH 61221: -1 is the NA sentinel in the codes, not a position in the
# level, so drop it before calling take(); the NA slot is re-inserted
# later when the repeater is built.
unique_codes = unique_codes[unique_codes != -1]
self.removed_level = self.removed_level.take(unique_codes)
self.removed_level_full = self.removed_level_full.take(unique_codes)

Expand Down Expand Up @@ -170,7 +174,14 @@ def _indexer_and_to_sort(
codes = list(self.index.codes)
if not self.sort:
# Create new codes considering that labels are already sorted
codes = [factorize(code)[0] for code in codes]
# Replace -1 (NA) codes with NaN before factorizing so that NA keeps
# the -1 code instead of being assigned a regular position
if self.lift:
codes = [
factorize(np.where(code == -1, np.nan, code))[0] for code in codes
]
else:
codes = [factorize(code)[0] for code in codes]

levs = list(self.index.levels)
to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
Expand All @@ -189,9 +200,15 @@ def sorted_labels(self) -> list[np.ndarray]:
return to_sort

def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
    """
    Reorder ``values`` along axis 0 for unstacking.

    Parameters
    ----------
    values : np.ndarray
        Array to reorder along its first axis.

    Returns
    -------
    np.ndarray
        ``values`` with its rows permuted.
    """
    if self.sort:
        # Sorted unstack: apply the indexer from _indexer_and_to_sort.
        indexer, _ = self._indexer_and_to_sort
        return algos.take_nd(values, indexer, axis=0)

    # sort=False: build group ids from every entry of sorted_labels except
    # the last one (the level being unstacked, which _indexer_and_to_sort
    # places last). The stable argsort then makes rows of the same group
    # adjacent while keeping their original relative order.
    level_sizes = tuple(len(level) for level in self.new_index_levels)
    group_ids = get_group_index(
        self.sorted_labels[:-1], level_sizes, sort=False, xnull=False
    )
    return values[np.argsort(group_ids, kind="mergesort")]

def _make_selectors(self) -> None:
new_levels = self.new_index_levels
Expand Down Expand Up @@ -381,11 +398,22 @@ def _repeater(self) -> np.ndarray:
# In this case, we remap the new codes to the original level:
repeater = self.removed_level_full.get_indexer(self.removed_level)
if self.lift:
repeater = np.insert(repeater, 0, -1)
if not self.sort:
na_index = (self.index.codes[self.level] == -1).nonzero()[0][0]
repeater = np.insert(repeater, na_index, -1)
else:
repeater = np.insert(repeater, 0, -1)
else:
# Otherwise, we just use each level item exactly once:
stride = len(self.removed_level) + self.lift
repeater = np.arange(stride) - self.lift
if self.sort or not self.lift:
repeater = np.arange(stride) - self.lift
else:
na_index = (self.index.codes[self.level] == -1).nonzero()[0][0]
repeater = np.arange(stride) - self.lift
if na_index:
repeater[na_index] = -1
repeater[:na_index] += 1

return repeater

Expand Down Expand Up @@ -565,7 +593,6 @@ def _unstack_frame(
unstacker = _Unstacker(
obj.index, level=level, constructor=obj._constructor, sort=sort
)

if not obj._can_fast_transpose:
mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
return obj._constructor_from_mgr(mgr, axes=mgr.axes)
Expand Down
100 changes: 100 additions & 0 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1605,6 +1605,106 @@ def test_stack_sort_false(future_stack):
tm.assert_frame_equal(result, expected)


def test_unstack_sort_false_na1():
    # GH 61221
    # unstack(sort=False) where NA is the last value of the unstacked level
    outer = ["b", "a"]
    inner = Index([1, 2, 3, None])
    mi = MultiIndex.from_product([outer, inner], names=["level1", "level2"])
    frame = DataFrame({"value": range(len(mi))}, index=mi)

    # Expected columns follow the order of appearance in the index.
    column_keys = [("value", 1.0), ("value", 2.0), ("value", 3.0), ("value", pd.NA)]
    column_data = ([0, 4], [1, 5], [2, 6], [3, 7])
    expected = DataFrame(
        dict(zip(column_keys, column_data)),
        index=Index(["b", "a"], name="level1"),
        columns=MultiIndex.from_tuples(column_keys, names=[None, "level2"]),
    )

    result = frame.unstack(level="level2", sort=False)
    tm.assert_frame_equal(result, expected)


def test_unstack_sort_false_na2():
    # GH 61221
    # unstack(sort=False) where NA is the first value of the unstacked level
    outer = ["b", "a"]
    inner = Index([None, 1, 2, 3])
    mi = MultiIndex.from_product([outer, inner], names=["level1", "level2"])
    frame = DataFrame({"value": range(len(mi))}, index=mi)

    # Expected columns follow the order of appearance in the index.
    column_keys = [("value", pd.NA), ("value", 1.0), ("value", 2.0), ("value", 3.0)]
    column_data = ([0, 4], [1, 5], [2, 6], [3, 7])
    expected = DataFrame(
        dict(zip(column_keys, column_data)),
        index=Index(["b", "a"], name="level1"),
        columns=MultiIndex.from_tuples(column_keys, names=[None, "level2"]),
    )

    result = frame.unstack(level="level2", sort=False)
    tm.assert_frame_equal(result, expected)


def test_unstack_sort_false_na3():
    # GH 61221
    # unstack(sort=False) where NA sits in the middle of the unstacked level
    outer = ["b", "a"]
    inner = Index([1, None, 2, 3])
    mi = MultiIndex.from_product([outer, inner], names=["level1", "level2"])
    frame = DataFrame({"value": range(len(mi))}, index=mi)

    # Expected columns follow the order of appearance in the index.
    column_keys = [("value", 1.0), ("value", pd.NA), ("value", 2.0), ("value", 3.0)]
    column_data = ([0, 4], [1, 5], [2, 6], [3, 7])
    expected = DataFrame(
        dict(zip(column_keys, column_data)),
        index=Index(["b", "a"], name="level1"),
        columns=MultiIndex.from_tuples(column_keys, names=[None, "level2"]),
    )

    result = frame.unstack(level="level2", sort=False)
    tm.assert_frame_equal(result, expected)


def test_unstack_sort_false_na_mixed():
    # GH 61221
    # unstack(sort=False) keeps the original, non-monotonic order with NA
    outer = ["b", "a"]
    inner = Index([3, None, 1, 2])
    mi = MultiIndex.from_product([outer, inner], names=["level1", "level2"])
    frame = DataFrame({"value": range(len(mi))}, index=mi)

    # Expected columns follow the order of appearance in the index.
    column_keys = [("value", 3.0), ("value", pd.NA), ("value", 1.0), ("value", 2.0)]
    column_data = ([0, 4], [1, 5], [2, 6], [3, 7])
    expected = DataFrame(
        dict(zip(column_keys, column_data)),
        index=Index(["b", "a"], name="level1"),
        columns=MultiIndex.from_tuples(column_keys, names=[None, "level2"]),
    )

    result = frame.unstack(level="level2", sort=False)
    tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
def test_stack_sort_false_multi_level(future_stack):
# GH 15105
Expand Down