From a3aed2fd8db9b3ecb97081384cce3e6934437c32 Mon Sep 17 00:00:00 2001 From: MartinBraquet Date: Mon, 19 May 2025 13:35:49 +0530 Subject: [PATCH 1/7] Rename tests to nsorted and rename order to columns --- pandas/tests/frame/methods/test_nlargest.py | 46 +++++++++++---------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 08b7128e6ec11..9b45c92fd37d4 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -1,6 +1,7 @@ """ -Note: for naming purposes, most tests are title with as e.g. "test_nlargest_foo" -but are implicitly also testing nsmallest_foo. +Note: for naming purposes, most test method titles include "nsorted" +(e.g., "test_nsorted_foo") but are implicitly also testing "nsmallest" and +"nlargest". """ from string import ascii_lowercase @@ -41,11 +42,11 @@ def df_main_dtypes(): ) -class TestNLargestNSmallest: +class TestNSorted: # ---------------------------------------------------------------------- # Top / bottom @pytest.mark.parametrize( - "order", + "columns", [ ["a"], ["c"], @@ -63,7 +64,7 @@ class TestNLargestNSmallest: ], ) @pytest.mark.parametrize("n", range(1, 11)) - def test_nlargest_n(self, nselect_method, n, order): + def test_nsorted_n(self, nselect_method, n: int, columns): # GH#10393 df = pd.DataFrame( { @@ -72,24 +73,24 @@ def test_nlargest_n(self, nselect_method, n, order): "c": np.random.default_rng(2).permutation(10).astype("float64"), } ) - if "b" in order: + if "b" in columns: error_msg = ( f"Column 'b' has dtype (object|str), " f"cannot use method '{nselect_method}' with this dtype" ) with pytest.raises(TypeError, match=error_msg): - getattr(df, nselect_method)(n, order) + getattr(df, nselect_method)(n, columns) else: ascending = nselect_method == "nsmallest" - result = getattr(df, nselect_method)(n, order) + result = getattr(df, nselect_method)(n, columns) result.index = 
pd.Index(list(result.index)) - expected = df.sort_values(order, ascending=ascending).head(n) + expected = df.sort_values(columns, ascending=ascending).head(n) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "columns", [["group", "category_string"], ["group", "string"]] ) - def test_nlargest_error(self, df_main_dtypes, nselect_method, columns): + def test_nsorted_error(self, df_main_dtypes, nselect_method, columns): df = df_main_dtypes col = columns[1] error_msg = ( @@ -106,12 +107,12 @@ def test_nlargest_error(self, df_main_dtypes, nselect_method, columns): with pytest.raises(TypeError, match=error_msg): getattr(df, nselect_method)(2, columns) - def test_nlargest_all_dtypes(self, df_main_dtypes): + def test_nsorted_all_dtypes(self, df_main_dtypes): df = df_main_dtypes df.nsmallest(2, list(set(df) - {"category_string", "string"})) df.nlargest(2, list(set(df) - {"category_string", "string"})) - def test_nlargest_duplicates_on_starter_columns(self): + def test_nsorted_duplicates_on_starter_columns(self): # regression test for GH#22752 df = pd.DataFrame({"a": [2, 2, 2, 1, 1, 1], "b": [1, 2, 3, 3, 2, 1]}) @@ -128,7 +129,7 @@ def test_nlargest_duplicates_on_starter_columns(self): ) tm.assert_frame_equal(result, expected) - def test_nlargest_n_identical_values(self): + def test_nsorted_n_identical_values(self): # GH#15297 df = pd.DataFrame({"a": [1] * 5, "b": [1, 2, 3, 4, 5]}) @@ -141,25 +142,26 @@ def test_nlargest_n_identical_values(self): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( - "order", + "columns", [["a", "b", "c"], ["c", "b", "a"], ["a"], ["b"], ["a", "b"], ["c", "b"]], ) @pytest.mark.parametrize("n", range(1, 6)) - def test_nlargest_n_duplicate_index(self, n, order, request): + def test_nsorted_n_duplicate_index(self, n: int, columns, request): # GH#13412 df = pd.DataFrame( {"a": [1, 2, 3, 4, 4], "b": [1, 1, 1, 1, 1], "c": [0, 1, 2, 5, 4]}, index=[0, 0, 1, 1, 1], ) - result = df.nsmallest(n, order) - expected = 
df.sort_values(order, kind="stable").head(n) + result = df.nsmallest(n, columns) + expected = df.sort_values(columns, kind="stable").head(n) tm.assert_frame_equal(result, expected) - result = df.nlargest(n, order) - expected = df.sort_values(order, ascending=False, kind="stable").head(n) + result = df.nlargest(n, columns) + expected = df.sort_values(columns, ascending=False, kind="stable").head(n) if Version(np.__version__) >= Version("1.25") and ( - (order == ["a"] and n in (1, 2, 3, 4)) or ((order == ["a", "b"]) and n == 5) + (columns == ["a"] and n in (1, 2, 3, 4)) + or ((columns == ["a", "b"]) and n == 5) ): request.applymarker( pytest.mark.xfail( @@ -172,7 +174,7 @@ def test_nlargest_n_duplicate_index(self, n, order, request): ) tm.assert_frame_equal(result, expected) - def test_nlargest_duplicate_keep_all_ties(self): + def test_nsorted_duplicate_keep_all_ties(self): # GH#16818 df = pd.DataFrame( {"a": [5, 4, 4, 2, 3, 3, 3, 3], "b": [10, 9, 8, 7, 5, 50, 10, 20]} @@ -197,7 +199,7 @@ def test_nlargest_duplicate_keep_all_ties(self): ) tm.assert_frame_equal(result, expected) - def test_nlargest_multiindex_column_lookup(self): + def test_nsorted_multiindex_column_lookup(self): # Check whether tuples are correctly treated as multi-level lookups. 
# GH#23033 df = pd.DataFrame( From e476e18c2981d66ad1268157b62c7965f3f9b7a8 Mon Sep 17 00:00:00 2001 From: MartinBraquet Date: Mon, 19 May 2025 13:36:11 +0530 Subject: [PATCH 2/7] Add time_nsorted to benchmark --- asv_bench/benchmarks/frame_methods.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index cd7851acae3f2..a8af978998c2a 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -758,6 +758,9 @@ class NSort: params = ["first", "last", "all"] param_names = ["keep"] + def __init__(self): + self.df = None + def setup(self, keep): self.df = DataFrame(np.random.randn(100000, 3), columns=list("ABC")) @@ -773,6 +776,12 @@ def time_nsmallest_one_column(self, keep): def time_nsmallest_two_columns(self, keep): self.df.nsmallest(100, ["A", "B"], keep=keep) + def time_nsorted_one_column(self, keep): + self.df.nsorted(100, "A", keep=keep, ascending=True) + + def time_nsorted_two_columns(self, keep): + self.df.nsorted(100, ["A", "B"], keep=keep, ascending=[True, False]) + class Describe: def setup(self): From 92d53daedec8f88667f0ae153e2454e5658c6e34 Mon Sep 17 00:00:00 2001 From: MartinBraquet Date: Mon, 19 May 2025 16:32:44 +0530 Subject: [PATCH 3/7] Add nsorted method for dataframe and series --- pandas/_typing.py | 2 +- pandas/core/frame.py | 170 ++++++++++++++++++++++++++++++++- pandas/core/methods/selectn.py | 98 +++++++++++-------- pandas/core/series.py | 19 +++- 4 files changed, 238 insertions(+), 51 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 4365ee85f72e3..49ce35158bd0a 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -429,7 +429,7 @@ def closed(self) -> bool: SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"] NaPosition = Literal["first", "last"] -# Arguments for nsmallest and nlargest +# Arguments for nsorted, nsmallest and nlargest NsmallestNlargestKeep = Literal["first", "last", 
"all"] # quantile interpolation diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6158e19737185..8156234a243f4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7447,6 +7447,160 @@ def value_counts( return counts + def nsorted( + self, + n: int, + columns: IndexLabel, + ascending: bool | Sequence[bool], + keep: NsmallestNlargestKeep = "first", + ) -> DataFrame: + """ + Return the first `n` rows ordered by `columns` in the order defined by + `ascending`. + + The columns that are not specified are returned as + well, but not used for ordering. + + This method is equivalent to + ``df.sort_values(columns, ascending=ascending).head(n)``, but more + performant. + + Parameters + ---------- + n : int + Number of rows to return. + columns : label or list of labels + Column label(s) to order by. + ascending : bool or list of bools + Whether to sort in ascending or descending order. + If a list, must be the same length as `columns`. + keep : {'first', 'last', 'all'}, default 'first' + Where there are duplicate values: + + - ``first`` : prioritize the first occurrence(s) + - ``last`` : prioritize the last occurrence(s) + - ``all`` : keep all the ties of the last item even if it means + selecting more than ``n`` items. + + Returns + ------- + DataFrame + The first `n` rows ordered by the given columns in the order given + in `ascending`. + + See Also + -------- + DataFrame.nlargest : Return the first `n` rows ordered by `columns` in + descending order. + DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in + ascending order. + DataFrame.sort_values : Sort DataFrame by the values. + DataFrame.head : Return the first `n` rows without re-ordering. + + Notes + ----- + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "population": [ + ... 59000000, + ... 
65000000, + ... 434000, + ... 434000, + ... 434000, + ... 337000, + ... 11300, + ... 11300, + ... 11300, + ... ], + ... "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], + ... "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"], + ... }, + ... index=[ + ... "Italy", + ... "France", + ... "Malta", + ... "Maldives", + ... "Brunei", + ... "Iceland", + ... "Nauru", + ... "Tuvalu", + ... "Anguilla", + ... ], + ... ) + >>> df + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + + In the following example, we will use ``nsorted`` to select the three + rows having the largest values in column "population". + + >>> df.nsorted(3, "population", ascending=False) + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + + When using ``keep='last'``, ties are resolved in reverse order: + + >>> df.nsorted(3, "population", ascending=False, keep="last") + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN + + When using ``keep='all'``, the number of elements kept can go beyond ``n`` + if there are duplicate values for the smallest element. 
All the + ties are kept: + + >>> df.nsorted(3, "population", ascending=False, keep="all") + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + + However, ``nsorted`` does not keep ``n`` distinct largest elements: + + >>> df.nsorted(5, "population", ascending=False, keep="all") + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + + To order by the largest values in column "population" and break ties + according to the smallest values in column "GDP", we can specify + multiple columns and ascending orders like in the next example. + + >>> df.nsorted(3, ["population", "GDP"], ascending=[False, True]) + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Maldives 434000 4520 MV + """ + return selectn.SelectNFrame( + self, + n=n, + keep=keep, + columns=columns, + ).nsorted(ascending=ascending) + def nlargest( self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first" ) -> DataFrame: @@ -7457,6 +7611,9 @@ def nlargest( descending order. The columns that are not specified are returned as well, but not used for ordering. + This method is equivalent to + ``df.nsorted(n, columns, ascending=False)``. + This method is equivalent to ``df.sort_values(columns, ascending=False).head(n)``, but more performant. @@ -7485,6 +7642,8 @@ def nlargest( -------- DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in ascending order. + DataFrame.nsorted : Return the first `n` rows ordered by `columns` in + the order given in `ascending`. DataFrame.sort_values : Sort DataFrame by the values. DataFrame.head : Return the first `n` rows without re-ordering. 
@@ -7553,7 +7712,7 @@ def nlargest( Italy 59000000 1937894 IT Brunei 434000 12128 BN - When using ``keep='all'``, the number of element kept can go beyond ``n`` + When using ``keep='all'``, the number of elements kept can go beyond ``n`` if there are duplicate values for the smallest element, all the ties are kept: @@ -7584,7 +7743,7 @@ def nlargest( Italy 59000000 1937894 IT Brunei 434000 12128 BN """ - return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() + return self.nsorted(n=n, columns=columns, ascending=False, keep=keep) def nsmallest( self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first" @@ -7596,6 +7755,9 @@ def nsmallest( ascending order. The columns that are not specified are returned as well, but not used for ordering. + This method is equivalent to + ``df.nsorted(n, columns, ascending=True)``. + This method is equivalent to ``df.sort_values(columns, ascending=True).head(n)``, but more performant. @@ -7623,6 +7785,8 @@ def nsmallest( -------- DataFrame.nlargest : Return the first `n` rows ordered by `columns` in descending order. + DataFrame.nsorted : Return the first `n` rows ordered by `columns` in + the order given in `ascending`. DataFrame.sort_values : Sort DataFrame by the values. DataFrame.head : Return the first `n` rows without re-ordering. 
@@ -7715,7 +7879,7 @@ def nsmallest( Anguilla 11300 311 AI Nauru 337000 182 NR """ - return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest() + return self.nsorted(n=n, columns=columns, ascending=True, keep=keep) def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: """ diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index 59516b16905dc..ee0ea5bf414d1 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -11,7 +11,6 @@ from typing import ( TYPE_CHECKING, Generic, - Literal, cast, final, ) @@ -37,6 +36,7 @@ DtypeObj, IndexLabel, NDFrameT, + NsmallestNlargestKeep, ) from pandas import ( @@ -55,9 +55,7 @@ class SelectN(Generic[NDFrameT]): - def __init__( - self, obj: NDFrameT, n: int, keep: Literal["first", "last", "all"] - ) -> None: + def __init__(self, obj: NDFrameT, n: int, keep: NsmallestNlargestKeep) -> None: self.obj = obj self.n = n self.keep = keep @@ -65,16 +63,20 @@ def __init__( if self.keep not in ("first", "last", "all"): raise ValueError('keep must be either "first", "last" or "all"') - def compute(self, method: str) -> NDFrameT: + def compute(self, ascending: bool | Sequence[bool]) -> NDFrameT: raise NotImplementedError + @final + def nsorted(self, ascending: bool | Sequence[bool]) -> NDFrameT: + return self.compute(ascending=ascending) + @final def nlargest(self) -> NDFrameT: - return self.compute("nlargest") + return self.nsorted(ascending=False) @final def nsmallest(self) -> NDFrameT: - return self.compute("nsmallest") + return self.nsorted(ascending=True) @final @staticmethod @@ -90,39 +92,40 @@ def is_valid_dtype_n_method(dtype: DtypeObj) -> bool: class SelectNSeries(SelectN[Series]): """ - Implement n largest/smallest for Series + Implement n-sorting for Series Parameters ---------- obj : Series n : int - keep : {'first', 'last'}, default 'first' + keep : {'first', 'last', 'all'}, default 'first' Returns ------- nordered : Series """ - def 
compute(self, method: str) -> Series: + def compute(self, ascending: bool) -> Series: from pandas.core.reshape.concat import concat + assert isinstance(ascending, bool) + n = self.n dtype = self.obj.dtype if not self.is_valid_dtype_n_method(dtype): - raise TypeError(f"Cannot use method '{method}' with dtype {dtype}") + raise TypeError(f"Cannot use n-sorting with dtype {dtype}") if n <= 0: return self.obj[[]] - # Save index and reset to default index to avoid performance impact - # from when index contains duplicates + # Save index and reset to the default index to avoid performance impact + # from when the index contains duplicates original_index: Index = self.obj.index default_index = self.obj.reset_index(drop=True) - # Slower method used when taking the full length of the series + # Slower method used when taking the full length of the series. # In this case, it is equivalent to a sort. if n >= len(default_index): - ascending = method == "nsmallest" result = default_index.sort_values(ascending=ascending, kind="stable").head( n ) @@ -146,7 +149,7 @@ def compute(self, method: str) -> Series: if arr.dtype.kind == "b": arr = arr.view(np.uint8) - if method == "nlargest": + if not ascending: arr = -arr if is_integer_dtype(new_dtype): # GH 21426: ensure reverse ordering at boundaries @@ -164,8 +167,8 @@ def compute(self, method: str) -> Series: n = min(n, narr) # arr passed into kth_smallest must be contiguous. We copy - # here because kth_smallest will modify its input - # avoid OOB access with kth_smallest_c when n <= 0 + # here because kth_smallest will modify its input. 
+ # Avoid OOB access with kth_smallest_c when n <= 0 if len(arr) > 0: kth_val = libalgos.kth_smallest(arr.copy(order="C"), n - 1) else: @@ -211,7 +214,7 @@ def __init__( self, obj: DataFrame, n: int, - keep: Literal["first", "last", "all"], + keep: NsmallestNlargestKeep, columns: IndexLabel, ) -> None: super().__init__(obj, n, keep) @@ -222,7 +225,7 @@ def __init__( columns = list(columns) self.columns = columns - def compute(self, method: str) -> DataFrame: + def compute(self, ascending: bool | Sequence[bool]) -> DataFrame: n = self.n frame = self.obj columns = self.columns @@ -232,58 +235,71 @@ def compute(self, method: str) -> DataFrame: if not self.is_valid_dtype_n_method(dtype): raise TypeError( f"Column {column!r} has dtype {dtype}, " - f"cannot use method {method!r} with this dtype" + f"cannot use n-sorting with this dtype" ) - def get_indexer(current_indexer: Index, other_indexer: Index) -> Index: + if isinstance(ascending, bool): + ascending = [ascending] * len(columns) + + if len(ascending) != len(columns): + raise ValueError( + f"`ascending` must have the same length as columns" + f", {len(ascending)} != {len(columns)}" + ) + + def get_indexer( + current_indexer: Index, + other_indexer: Index, + asc: bool, + ) -> Index: """ Helper function to concat `current_indexer` and `other_indexer` - depending on `method` + depending on the sorting order `asc` (ascending or descending). """ - if method == "nsmallest": + if asc: return current_indexer.append(other_indexer) else: return other_indexer.append(current_indexer) - # Below we save and reset the index in case index contains duplicates + # Below we save and reset the index in case the index contains + # duplicates original_index = frame.index cur_frame = frame = frame.reset_index(drop=True) cur_n = n indexer: Index = default_index(0) for i, column in enumerate(columns): - # For each column we apply method to cur_frame[column]. + # For each column we apply n-sorting to cur_frame[column]. 
# If it's the last column or if we have the number of - # results desired we are done. - # Otherwise there are duplicates of the largest/smallest - # value and we need to look at the rest of the columns + # results desired, we are done. + # Otherwise, there are duplicates of the largest/smallest + # value, and we need to look at the rest of the columns # to determine which of the rows with the largest/smallest # value in the column to keep. series = cur_frame[column] is_last_column = len(columns) - 1 == i - values = getattr(series, method)( - cur_n, keep=self.keep if is_last_column else "all" + values = series.nsorted( + cur_n, + ascending=ascending[i], + keep=self.keep if is_last_column else "all", ) if is_last_column or len(values) <= cur_n: - indexer = get_indexer(indexer, values.index) + indexer = get_indexer(indexer, values.index, asc=ascending[i]) break - # Now find all values which are equal to - # the (nsmallest: largest)/(nlargest: smallest) - # from our series. + # Now find all the values equal to the largest (if ascending, else + # smallest) value in the current series. border_value = values == values[values.index[-1]] - # Some of these values are among the top-n - # some aren't. + # Some of these values are among the top-n, some aren't. unsafe_values = values[border_value] # These values are definitely among the top-n safe_values = values[~border_value] - indexer = get_indexer(indexer, safe_values.index) + indexer = get_indexer(indexer, safe_values.index, asc=ascending[i]) - # Go on and separate the unsafe_values on the remaining - # columns. + # Go on and separate the unsafe_values on the remaining columns. 
cur_frame = cur_frame.loc[unsafe_values.index] cur_n = n - len(indexer) @@ -296,6 +312,4 @@ def get_indexer(current_indexer: Index, other_indexer: Index) -> Index: if len(columns) == 1: return frame - ascending = method == "nsmallest" - return frame.sort_values(columns, ascending=ascending, kind="stable") diff --git a/pandas/core/series.py b/pandas/core/series.py index 5ed094349caaa..6aa054553b212 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -185,6 +185,7 @@ ListLike, MutableMappingT, NaPosition, + NsmallestNlargestKeep, NumpySorter, NumpyValueArrayLike, QuantileInterpolation, @@ -3832,9 +3833,19 @@ def argsort( ) return res.__finalize__(self, method="argsort") - def nlargest( - self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + def nsorted( + self, + n: int, + ascending: bool, + keep: NsmallestNlargestKeep = "first", ) -> Series: + return selectn.SelectNSeries( + self, + n=n, + keep=keep, + ).nsorted(ascending=ascending) + + def nlargest(self, n: int = 5, keep: NsmallestNlargestKeep = "first") -> Series: """ Return the largest `n` elements. @@ -3939,9 +3950,7 @@ def nlargest( """ return selectn.SelectNSeries(self, n=n, keep=keep).nlargest() - def nsmallest( - self, n: int = 5, keep: Literal["first", "last", "all"] = "first" - ) -> Series: + def nsmallest(self, n: int = 5, keep: NsmallestNlargestKeep = "first") -> Series: """ Return the smallest `n` elements. 
From d771487642830aed62f40e0818885ec2f0ae1a26 Mon Sep 17 00:00:00 2001 From: MartinBraquet Date: Mon, 19 May 2025 16:33:31 +0530 Subject: [PATCH 4/7] Add test for nsorted method --- pandas/tests/frame/methods/test_nlargest.py | 29 ++++++++++++++++++-- pandas/tests/series/methods/test_nlargest.py | 7 +++-- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 9b45c92fd37d4..e99dc8395494c 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -75,8 +75,8 @@ def test_nsorted_n(self, nselect_method, n: int, columns): ) if "b" in columns: error_msg = ( - f"Column 'b' has dtype (object|str), " - f"cannot use method '{nselect_method}' with this dtype" + "Column 'b' has dtype (object|str), " + "cannot use n-sorting with this dtype" ) with pytest.raises(TypeError, match=error_msg): getattr(df, nselect_method)(n, columns) @@ -87,6 +87,29 @@ def test_nsorted_n(self, nselect_method, n: int, columns): expected = df.sort_values(columns, ascending=ascending).head(n) tm.assert_frame_equal(result, expected) + def test_nsorted(self): + df = pd.DataFrame( + { + "x": [2, 2, 1], + "y": [3, 2, 1], + }, + index=["a", "b", "c"], + ) + cols = ["x", "y"] + ascending = [True, False] + n = 2 + df_sort_values = df.sort_values(cols, ascending=ascending).head(n) + result = df.nsorted(n, cols, ascending=ascending) + tm.assert_frame_equal(result, df_sort_values) + expected = pd.DataFrame( + { + "x": [1, 2], + "y": [1, 3], + }, + index=["c", "a"], + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "columns", [["group", "category_string"], ["group", "string"]] ) @@ -95,7 +118,7 @@ def test_nsorted_error(self, df_main_dtypes, nselect_method, columns): col = columns[1] error_msg = ( f"Column '{col}' has dtype {df[col].dtype}, " - f"cannot use method '{nselect_method}' with this dtype" + f"cannot use n-sorting with this 
dtype" ) # escape some characters that may be in the repr error_msg = ( diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index 67ba1d7ca51b7..a1fb7e6c5e01b 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -20,7 +20,7 @@ def assert_check_nselect_boundary(vals, dtype, method): tm.assert_series_equal(result, expected) -class TestSeriesNLargestNSmallest: +class TestSeriesNSorted: @pytest.mark.parametrize( "r", [ @@ -37,7 +37,7 @@ class TestSeriesNLargestNSmallest: @pytest.mark.parametrize("arg", [2, 5, 0, -1]) def test_nlargest_error(self, r, method, arg): dt = r.dtype - msg = f"Cannot use method 'n(largest|smallest)' with dtype {dt}" + msg = f"Cannot use n-sorting with dtype {dt}" with pytest.raises(TypeError, match=msg): getattr(r, method)(arg) @@ -78,6 +78,9 @@ def test_nsmallest_nlargest(self, data): tm.assert_series_equal(ser.nlargest(len(ser)), ser.iloc[[4, 0, 1, 3, 2]]) tm.assert_series_equal(ser.nlargest(len(ser) + 1), ser.iloc[[4, 0, 1, 3, 2]]) + tm.assert_series_equal(ser.nsorted(2, True), ser.nsmallest(2)) + tm.assert_series_equal(ser.nsorted(2, False), ser.nlargest(2)) + def test_nlargest_misc(self): ser = Series([3.0, np.nan, 1, 2, 5]) result = ser.nlargest() From 89a656c1ae244037416e1f465acf7df5e1d28a32 Mon Sep 17 00:00:00 2001 From: MartinBraquet Date: Mon, 19 May 2025 16:38:53 +0530 Subject: [PATCH 5/7] Add whatsnew for nsorted --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6642f5855f4fe..c2f88948e6615 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -77,6 +77,7 @@ Other enhancements - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :py:class:`frozenset` elements in pandas objects are now natively printed 
(:issue:`60690`) - Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`). +- Added :meth:`DataFrame.nsorted` to select the first ``n`` rows ordered by the given columns, with an ascending or descending order specified per column (:issue:`61166`) - Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`) - Added support to read from Apache Iceberg tables with the new :func:`read_iceberg` function (:issue:`61383`) - Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`) From 72f7f1331cbfc9fa276f0cfbaca28ac9ac692ef1 Mon Sep 17 00:00:00 2001 From: MartinBraquet Date: Mon, 19 May 2025 17:19:37 +0530 Subject: [PATCH 6/7] Fix index compute type --- pandas/core/methods/selectn.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/methods/selectn.py b/pandas/core/methods/selectn.py index ee0ea5bf414d1..da12d16cff13c 100644 --- a/pandas/core/methods/selectn.py +++ b/pandas/core/methods/selectn.py @@ -105,10 +105,12 @@ class SelectNSeries(SelectN[Series]): nordered : Series """ - def compute(self, ascending: bool) -> Series: + def compute(self, ascending: bool | Sequence[bool]) -> Series: from pandas.core.reshape.concat import concat - assert isinstance(ascending, bool) + if isinstance(ascending, Sequence): + assert len(ascending) == 1 + ascending = ascending[0] n = self.n dtype = self.obj.dtype From 0e79e2b7777bcdd92c24e305536db56527234d3c Mon Sep 17 00:00:00 2001 From: Martin Braquet Date: Mon, 19 May 2025 19:13:11 +0530 Subject: [PATCH 7/7] Re-Trigger PR checks --- pandas/_typing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 49ce35158bd0a..72aa77aee23e9 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -429,7 +429,7 @@ 
def closed(self) -> bool: SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"] NaPosition = Literal["first", "last"] -# Arguments for nsorted, nsmallest and nlargest +# Arguments for nsorted, nsmallest and nlargest. NsmallestNlargestKeep = Literal["first", "last", "all"] # quantile interpolation