Skip to content

Commit 47ffcd6

Browse files
makbigc authored and jreback committed
API: ExtensionArray.argsort places the missing value at the end (#27137)
1 parent 903a09c commit 47ffcd6

File tree

7 files changed

+72
-28
lines changed

7 files changed

+72
-28
lines changed

asv_bench/benchmarks/algorithms.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,4 +141,16 @@ def time_quantile(self, quantile, interpolation, dtype):
141141
self.idx.quantile(quantile, interpolation=interpolation)
142142

143143

144+
class SortIntegerArray:
145+
params = [10**3, 10**5]
146+
147+
def setup(self, N):
148+
data = np.arange(N, dtype=float)
149+
data[40] = np.nan
150+
self.array = pd.array(data, dtype='Int64')
151+
152+
def time_argsort(self, N):
153+
self.array.argsort()
154+
155+
144156
from .pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v0.25.0.rst

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -661,7 +661,7 @@ when both are :class:`Series` (:issue:`23293`).
661661
662662
*Previous behavior*
663663
664-
.. code-block:: python
664+
.. code-block:: ipython
665665
666666
In [5]: np.power(s1, s2)
667667
Out[5]:
@@ -684,6 +684,36 @@ applying the ufunc.
684684
685685
np.power(s1, s2.array)
686686
687+
Categorical.argsort now places missing values at the end
688+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
689+
690+
:meth:`Categorical.argsort` now places missing values at the end of the array, making it
691+
consistent with NumPy and the rest of pandas (:issue:`21801`).
692+
693+
.. ipython:: python
694+
695+
cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True)
696+
697+
*Previous behavior*
698+
699+
.. code-block:: ipython
700+
701+
In [2]: cat = pd.Categorical(['b', None, 'a'], categories=['a', 'b'], ordered=True)
702+
703+
In [3]: cat.argsort()
704+
Out[3]: array([1, 2, 0])
705+
706+
In [4]: cat[cat.argsort()]
707+
Out[4]:
708+
[NaN, a, b]
709+
categories (2, object): [a < b]
710+
711+
*New behavior*
712+
713+
.. ipython:: python
714+
715+
cat.argsort()
716+
cat[cat.argsort()]
687717
688718
.. _whatsnew_0250.api_breaking.deps:
689719
@@ -767,6 +797,7 @@ Other API changes
767797
- Removed support of gtk package for clipboards (:issue:`26563`)
768798
- Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`)
769799
- :meth:`Series.to_excel` and :meth:`DataFrame.to_excel` will now raise a ``ValueError`` when saving timezone aware data. (:issue:`27008`, :issue:`7056`)
800+
- :meth:`ExtensionArray.argsort` places NA values at the end of the sorted array. (:issue:`21801`)
770801
- :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` will now raise a ``NotImplementedError`` when saving a :class:`MultiIndex` with extension data types for a ``fixed`` format. (:issue:`7775`)
771802
- Passing duplicate ``names`` in :meth:`read_csv` will now raise a ``ValueError`` (:issue:`17346`)
772803

pandas/core/arrays/base.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
from pandas._typing import ArrayLike
2525
from pandas.core import ops
26+
from pandas.core.sorting import nargsort
2627

2728
_not_implemented_message = "{} does not implement {}."
2829

@@ -409,7 +410,8 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
409410
Returns
410411
-------
411412
index_array : ndarray
412-
Array of indices that sort ``self``.
413+
Array of indices that sort ``self``. Missing values, if any,
414+
are placed at the end.
413415
414416
See Also
415417
--------
@@ -420,10 +422,9 @@ def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
420422
# 1. _values_for_argsort : construct the values passed to np.argsort
421423
# 2. argsort : total control over sorting.
422424
ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)
423-
values = self._values_for_argsort()
424-
result = np.argsort(values, kind=kind, **kwargs)
425-
if not ascending:
426-
result = result[::-1]
425+
426+
result = nargsort(self, kind=kind, ascending=ascending,
427+
na_position='last')
427428
return result
428429

429430
def fillna(self, value=None, method=None, limit=None):

pandas/core/arrays/categorical.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1531,13 +1531,14 @@ def check_for_ordered(self, op):
15311531
def _values_for_argsort(self):
15321532
return self._codes.copy()
15331533

1534-
def argsort(self, *args, **kwargs):
1535-
# TODO(PY2): use correct signature
1536-
# We have to do *args, **kwargs to avoid a a py2-only signature
1537-
# issue since np.argsort differs from argsort.
1534+
def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
15381535
"""
15391536
Return the indices that would sort the Categorical.
15401537
1538+
.. versionchanged:: 0.25.0
1539+
1540+
Changed to sort missing values at the end.
1541+
15411542
Parameters
15421543
----------
15431544
ascending : bool, default True
@@ -1574,9 +1575,14 @@ def argsort(self, *args, **kwargs):
15741575
... ordered=True)
15751576
>>> cat.argsort()
15761577
array([3, 0, 1, 2])
1578+
1579+
Missing values are placed at the end
1580+
1581+
>>> cat = pd.Categorical([2, None, 1])
1582+
>>> cat.argsort()
1583+
array([2, 0, 1])
15771584
"""
1578-
# Keep the implementation here just for the docstring.
1579-
return super().argsort(*args, **kwargs)
1585+
return super().argsort(ascending=ascending, kind=kind, *args, **kwargs)
15801586

15811587
def sort_values(self, inplace=False, ascending=True, na_position='last'):
15821588
"""

pandas/core/sorting.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -240,20 +240,6 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
240240

241241
items = extract_array(items)
242242
mask = np.asarray(isna(items))
243-
# specially handle Categorical
244-
if is_categorical_dtype(items):
245-
if na_position not in {'first', 'last'}:
246-
raise ValueError('invalid na_position: {!r}'.format(na_position))
247-
248-
cnt_null = mask.sum()
249-
sorted_idx = items.argsort(ascending=ascending, kind=kind)
250-
if ascending and na_position == 'last':
251-
# NaN is coded as -1 and is listed in front after sorting
252-
sorted_idx = np.roll(sorted_idx, -cnt_null)
253-
elif not ascending and na_position == 'first':
254-
# NaN is coded as -1 and is listed in the end after sorting
255-
sorted_idx = np.roll(sorted_idx, cnt_null)
256-
return sorted_idx
257243

258244
if is_extension_array_dtype(items):
259245
items = items._values_for_argsort()

pandas/tests/extension/base/methods.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,14 @@ def test_argsort(self, data_for_sorting):
4747
expected = pd.Series(np.array([2, 0, 1], dtype=np.int64))
4848
self.assert_series_equal(result, expected)
4949

50+
def test_argsort_missing_array(self, data_missing_for_sorting):
51+
result = data_missing_for_sorting.argsort()
52+
expected = np.array([2, 0, 1], dtype=np.dtype("int"))
53+
# we don't care whether it's int32 or int64
54+
result = result.astype("int64", casting="safe")
55+
expected = expected.astype("int64", casting="safe")
56+
tm.assert_numpy_array_equal(result, expected)
57+
5058
def test_argsort_missing(self, data_missing_for_sorting):
5159
result = pd.Series(data_missing_for_sorting).argsort()
5260
expected = pd.Series(np.array([1, -1, 0], dtype=np.int64))

pandas/tests/frame/test_sorting.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ def test_sort_index_categorical_index(self):
539539
assert_frame_equal(result, expected)
540540

541541
result = df.sort_index(ascending=False)
542-
expected = df.iloc[[3, 2, 5, 1, 0, 4]]
542+
expected = df.iloc[[2, 3, 0, 1, 5, 4]]
543543
assert_frame_equal(result, expected)
544544

545545
def test_sort_index(self):
@@ -629,7 +629,7 @@ def test_sort_index_na_position_with_categories(self):
629629

630630
reversed_categories = sorted(categories, reverse=True)
631631
reversed_category_indices = sorted(category_indices, reverse=True)
632-
reversed_na_indices = sorted(na_indices, reverse=True)
632+
reversed_na_indices = sorted(na_indices)
633633

634634
df = pd.DataFrame({
635635
column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],

0 commit comments

Comments
 (0)