diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3294af742fae0..2d74be6f503a2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -61,6 +61,7 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :meth:`Series.nlargest` uses a 'stable' sort internally and will preserve original ordering. - :class:`ArrowDtype` now supports ``pyarrow.JsonType`` (:issue:`60958`) - :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`) - :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`) @@ -593,6 +594,7 @@ Performance improvements - :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`) - :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`) - :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`) +- :meth:`Series.nlargest` has improved performance when there are duplicate values in the index (:issue:`55767`) - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`) - :meth:`Series.str.partition` with :class:`ArrowDtype` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57768`) - Performance improvement in :class:`DataFrame` when ``data`` is a ``dict`` and ``columns`` is specified (:issue:`24368`) diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 02e7445f1d275..59516b16905dc 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -11,6 +11,7 @@ from typing import ( TYPE_CHECKING, Generic, + Literal, cast, final, ) @@ -54,7 +55,9 @@ class SelectN(Generic[NDFrameT]): - def __init__(self, obj: NDFrameT, n: int, keep: str) -> None: + def __init__( + self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"] + ) -> None: self.obj = obj self.n = n self.keep = keep @@ -111,15 +114,25 @@ def compute(self, method: str) -> Series: if n <= 0: return self.obj[[]] - dropped = self.obj.dropna() - nan_index = self.obj.drop(dropped.index) + # Save index and reset to default index to avoid performance impact + # from when index contains duplicates + original_index: Index = self.obj.index + default_index = self.obj.reset_index(drop=True) - # slow method - if n >= len(self.obj): + # Slower method used when taking the full length of the series + # In this case, it is equivalent to a sort. + if n >= len(default_index): ascending = method == "nsmallest" - return self.obj.sort_values(ascending=ascending).head(n) + result = default_index.sort_values(ascending=ascending, kind="stable").head( + n + ) + result.index = original_index.take(result.index) + return result + + # Fast method used in the general case + dropped = default_index.dropna() + nan_index = default_index.drop(dropped.index) - # fast method new_dtype = dropped.dtype # Similar to algorithms._ensure_data @@ -158,7 +171,7 @@ def compute(self, method: str) -> Series: else: kth_val = np.nan (ns,) = np.nonzero(arr <= kth_val) - inds = ns[arr[ns].argsort(kind="mergesort")] + inds = ns[arr[ns].argsort(kind="stable")] if self.keep != "all": inds = inds[:n] @@ -173,7 +186,9 @@ def compute(self, method: str) -> Series: # reverse indices inds = narr - 1 - inds - return concat([dropped.iloc[inds], nan_index]).iloc[:findex] + result = concat([dropped.iloc[inds], nan_index]).iloc[:findex] + result.index = original_index.take(result.index) + return result class SelectNFrame(SelectN[DataFrame]): @@ -192,7 +207,13 @@ class SelectNFrame(SelectN[DataFrame]): nordered : DataFrame """ - def __init__(self, obj: DataFrame, n: int, keep: str, columns: IndexLabel) -> None: + def __init__( + self, + obj: DataFrame, + n: int, + keep: Literal["first", "last", "all"], + columns: IndexLabel, + ) -> None: super().__init__(obj, n, keep) if not is_list_like(columns) or isinstance(columns, tuple): columns = [columns] @@ -277,4 +298,4 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index: ascending = method == "nsmallest" - return frame.sort_values(columns, ascending=ascending, kind="mergesort") + return frame.sort_values(columns, ascending=ascending, kind="stable") diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index c6e5304ae3cb4..08b7128e6ec11 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -153,11 +153,11 @@ def test_nlargest_n_duplicate_index(self, n, order, request): index=[0, 0, 1, 1, 1], ) result = df.nsmallest(n, order) - expected = df.sort_values(order).head(n) + expected = df.sort_values(order, kind="stable").head(n) tm.assert_frame_equal(result, expected) result = df.nlargest(n, order) - expected = df.sort_values(order, ascending=False).head(n) + expected = df.sort_values(order, ascending=False, kind="stable").head(n) if Version(np.__version__) >= Version("1.25") and ( (order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5) ):