Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug: Save original index and remap after function completes #61116

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ Other enhancements
- :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :meth:`Series.nlargest` uses a 'stable' sort internally and will preserve original ordering.
- :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
Expand Down Expand Up @@ -591,6 +592,7 @@ Performance improvements
- :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`)
- :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
- :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`)
- :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)
- :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`)
- Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`)
Expand Down
43 changes: 32 additions & 11 deletions pandas/core/methods/selectn.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from typing import (
TYPE_CHECKING,
Generic,
Literal,
cast,
final,
)
Expand Down Expand Up @@ -54,7 +55,9 @@


class SelectN(Generic[NDFrameT]):
def __init__(self, obj: NDFrameT, n: int, keep: str) -> None:
def __init__(
self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"]
) -> None:
self.obj = obj
self.n = n
self.keep = keep
Expand Down Expand Up @@ -111,15 +114,25 @@ def compute(self, method: str) -> Series:
if n <= 0:
return self.obj[[]]

dropped = self.obj.dropna()
nan_index = self.obj.drop(dropped.index)
# Save index and reset to default index to avoid performance impact
# from when index contains duplicates
original_index: Index = self.obj.index
default_index = self.obj.reset_index(drop=True)

# slow method
if n >= len(self.obj):
# Slower method used when taking the full length of the series
# In this case, it is equivalent to a sort.
if n >= len(default_index):
ascending = method == "nsmallest"
return self.obj.sort_values(ascending=ascending).head(n)
result = default_index.sort_values(ascending=ascending, kind="stable").head(
n
)
result.index = original_index.take(result.index)
return result

# Fast method used in the general case
dropped = default_index.dropna()
nan_index = default_index.drop(dropped.index)

# fast method
new_dtype = dropped.dtype

# Similar to algorithms._ensure_data
Expand Down Expand Up @@ -158,7 +171,7 @@ def compute(self, method: str) -> Series:
else:
kth_val = np.nan
(ns,) = np.nonzero(arr <= kth_val)
inds = ns[arr[ns].argsort(kind="mergesort")]
inds = ns[arr[ns].argsort(kind="stable")]

if self.keep != "all":
inds = inds[:n]
Expand All @@ -173,7 +186,9 @@ def compute(self, method: str) -> Series:
# reverse indices
inds = narr - 1 - inds

return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
result = concat([dropped.iloc[inds], nan_index]).iloc[:findex]
result.index = original_index.take(result.index)
return result


class SelectNFrame(SelectN[DataFrame]):
Expand All @@ -192,7 +207,13 @@ class SelectNFrame(SelectN[DataFrame]):
nordered : DataFrame
"""

def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None:
def __init__(
self,
obj: DataFrame,
n: int,
keep: Literal["first", "last", "all"],
columns: IndexLabel,
) -> None:
super().__init__(obj, n, keep)
if not is_list_like(columns) or isinstance(columns, tuple):
columns = [columns]
Expand Down Expand Up @@ -277,4 +298,4 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index:

ascending = method == "nsmallest"

return frame.sort_values(columns, ascending=ascending, kind="mergesort")
return frame.sort_values(columns, ascending=ascending, kind="stable")
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_nlargest.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,11 @@ def test_nlargest_n_duplicate_index(self, n, order, request):
index=[0, 0, 1, 1, 1],
)
result = df.nsmallest(n, order)
expected = df.sort_values(order).head(n)
expected = df.sort_values(order, kind="stable").head(n)
tm.assert_frame_equal(result, expected)

result = df.nlargest(n, order)
expected = df.sort_values(order, ascending=False).head(n)
expected = df.sort_values(order, ascending=False, kind="stable").head(n)
if Version(np.__version__) >= Version("1.25") and (
(order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5)
):
Expand Down