Skip to content

WIP/API: Implemented NDFrame.argsort() and NDFrame.ordering(). #12707

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,9 @@ Reshaping, sorting
.. autosummary::
:toctree: generated/

Series.argsort
Series.reorder_levels
Series.argsort
Series.ordering
Series.sort_values
Series.sort_index
Series.sortlevel
Expand Down Expand Up @@ -909,6 +910,8 @@ Reshaping, sorting, transposing

DataFrame.pivot
DataFrame.reorder_levels
DataFrame.argsort
DataFrame.ordering
DataFrame.sort_values
DataFrame.sort_index
DataFrame.sortlevel
Expand Down Expand Up @@ -1181,6 +1184,9 @@ Reshaping, sorting, transposing
.. autosummary::
:toctree: generated/

Panel.argsort
Panel.ordering
Panel.sort_values
Panel.sort_index
Panel.swaplevel
Panel.transpose
Expand Down Expand Up @@ -1271,6 +1277,15 @@ Conversion
Panel4D.isnull
Panel4D.notnull

Reshaping, sorting, transposing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: generated/

Panel4D.argsort
Panel4D.ordering
Panel4D.sort_values

.. _api.index:

Index
Expand Down
64 changes: 64 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,8 @@ def _get_axis_number(self, axis):
if com.is_integer(axis):
if axis in self._AXIS_NAMES:
return axis
elif self.ndim + axis in self._AXIS_NAMES:
return self.ndim + axis
else:
try:
return self._AXIS_NUMBERS[axis]
Expand Down Expand Up @@ -931,6 +933,68 @@ def to_dense(self):
# compat
return self

# ----------------------------------------------------------------------
# sorting

# Shared docstring template for ``argsort``. The ``%(klass)s`` and
# ``%(argsort_args)s`` placeholders are filled in by %-formatting the
# template with ``_shared_doc_kwargs`` at the point where the method is
# decorated.
_shared_docs['argsort'] = """
Returns the indices that would sort the %(klass)s.
Equivalent to ``self.values.argsort(axis, kind, order)``.

Parameters
----------
%(argsort_args)s

Returns
-------
index_array : numpy.ndarray
Array of indices that sort the %(klass)s along the specified axis.

See also
--------
numpy.ndarray.argsort
"""

# Parameter descriptions substituted for ``%(argsort_args)s``; the
# placeholder appears in both the ``argsort`` and ``ordering`` docstring
# templates, so the two methods document these arguments identically.
_shared_doc_kwargs['argsort_args'] = """
axis : int or axis name, default -1
Axis along which to sort.
kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
Sorting algorithm. See np.sort for more information.
'mergesort' is the only stable algorithm.
order : ignored
"""

@Appender(_shared_docs['argsort'] % _shared_doc_kwargs)
def argsort(self, axis=-1, kind='quicksort', order=None):
    # The user-facing docstring comes from the shared template attached by
    # the decorator above, so implementation notes are kept as comments.
    #
    # ``order`` is accepted only for signature compatibility with
    # ``numpy.ndarray.argsort`` and is documented as ignored. It is
    # deliberately not forwarded: numpy rejects a non-None ``order`` for
    # arrays without named fields, so passing it through would turn an
    # "ignored" argument into a ValueError.
    axis_number = self._get_axis_number(axis)  # axis name / negative int
    return self.values.argsort(axis_number, kind)

# Shared docstring template for ``ordering``. It reuses the
# ``%(argsort_args)s`` parameter text and adds ``fill_value``; placeholders
# are filled in by %-formatting with ``_shared_doc_kwargs`` where the
# method is decorated.
_shared_docs['ordering'] = """
Returns the order of each entry in the %(klass)s along the specified axis.

Parameters
----------
%(argsort_args)s
fill_value : default -1
Value to place in locations of NA/null values.

Returns
-------
ordering : %(klass)s
%(klass)s with the same shape and axes, with values equal to
the order of each entry along the specified axis.

See also
--------
%(klass)s.argsort
"""

@Appender(_shared_docs['ordering'] % _shared_doc_kwargs)
def ordering(self, axis=-1, kind='quicksort', order=None, fill_value=-1):
    # The user-facing docstring comes from the shared template attached by
    # the decorator above, so implementation notes are kept as comments.
    axis = self._get_axis_number(axis)

    # Argsorting the argsort result maps every entry to its position in the
    # sorted order along ``axis``.
    sort_positions = self.argsort(axis, kind, order)
    # ``sort_positions`` is a plain integer array; ``order`` (documented as
    # ignored) is not forwarded because numpy rejects a non-None ``order``
    # for arrays without named fields.
    ranks = sort_positions.argsort(axis, kind)

    # Rebuild an object of the same type on the same axes, and carry over
    # metadata (e.g. a Series name) from ``self`` instead of dropping it.
    result = self._constructor(ranks, *self.axes).__finalize__(self)

    # NA/null entries have no meaningful rank; overwrite them.
    result[self.isnull()] = fill_value
    return result

# ----------------------------------------------------------------------
# Picklability

Expand Down
36 changes: 0 additions & 36 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1802,42 +1802,6 @@ def order(self, na_last=None, ascending=True, kind='quicksort',
return self.sort_values(ascending=ascending, kind=kind,
na_position=na_position, inplace=inplace)

def argsort(self, axis=0, kind='quicksort', order=None):
    """
    Argsort the non-NA values of the Series, leaving -1 at NA positions.

    Overrides ndarray.argsort. Only the non-NA values are argsorted; the
    resulting indices are written back into the slots of those non-NA
    values, and every NA/null slot holds -1.

    Parameters
    ----------
    axis : int (can only be zero)
        Accepted for signature compatibility; not used by the body.
    kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort'
        Choice of sorting algorithm. See np.sort for more
        information. 'mergesort' is the only stable algorithm
    order : ignored

    Returns
    -------
    argsorted : Series, with -1 indicated where nan values are present

    See also
    --------
    numpy.ndarray.argsort
    """
    data = self._values
    na_mask = isnull(data)

    # Fast path: nothing is missing, so argsort the raw values directly.
    if not na_mask.any():
        positions = np.argsort(data, kind=kind)
        plain = self._constructor(positions, index=self.index,
                                  dtype='int64')
        return plain.__finalize__(self)

    # Start from an all -1 Series and fill in only the non-NA slots.
    filled = Series(-1, index=self.index, name=self.name,
                    dtype='int64')
    valid = ~na_mask
    filled[valid] = np.argsort(data[valid], kind=kind)
    wrapped = self._constructor(filled, index=self.index)
    return wrapped.__finalize__(self)

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
False: 'first'})
def nlargest(self, n=5, keep='first'):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def test_sort_index(self):

# by column
sorted_df = frame.sort_values(by='A')
indexer = frame['A'].argsort().values
indexer = frame['A'].argsort()
expected = frame.ix[frame.index[indexer]]
assert_frame_equal(sorted_df, expected)

Expand Down
70 changes: 53 additions & 17 deletions pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
import numpy as np
import pandas as pd

from pandas import (Index, Series, DataFrame, isnull, notnull, bdate_range,
date_range, _np_version_under1p9)
from pandas import (Index, Series, DataFrame, Panel, Panel4D, isnull, notnull,
bdate_range, date_range, _np_version_under1p9)
from pandas.core.index import MultiIndex
from pandas.tseries.index import Timestamp
from pandas.tseries.tdi import Timedelta
Expand Down Expand Up @@ -262,11 +262,7 @@ def test_kurt(self):
self.assertEqual(0, s.kurt())
self.assertTrue((df.kurt() == 0).all())

def test_argsort(self):
self._check_accum_op('argsort')
argsorted = self.ts.argsort()
self.assertTrue(issubclass(argsorted.dtype.type, np.integer))

def test_argsort_timestamps(self):
# GH 2967 (introduced bug in 0.11-dev I think)
s = Series([Timestamp('201301%02d' % (i + 1)) for i in range(5)])
self.assertEqual(s.dtype, 'datetime64[ns]')
Expand All @@ -275,24 +271,64 @@ def test_argsort(self):
self.assertTrue(isnull(shifted[4]))

result = s.argsort()
expected = Series(lrange(5), dtype='int64')
expected = np.arange(5, dtype=np.int64)
assert_series_equal(result, expected)

result = shifted.argsort()
expected = Series(lrange(4) + [-1], dtype='int64')
expected = np.arange(5, dtype=np.int64)
assert_series_equal(result, expected)

def test_argsort_stable(self):
def test_argsort_and_ordering(self):
argsorted = self.ts.argsort()
self.assertTrue(issubclass(argsorted.dtype.type, np.integer))

s = Series(np.random.randint(0, 100, size=10000))
mindexer = s.argsort(kind='mergesort')
qindexer = s.argsort()
s[::21] = nan
df = DataFrame(s.values.reshape(100, 100))
p = Panel(s.values.reshape(100, 10, 10))
p4d = Panel4D(s.values.reshape(10, 10, 10, 10))

for x in [s, df, p, p4d]:
for axis in [-1] + list(x._AXIS_NAMES.keys()) + \
list(x._AXIS_NUMBERS.keys()):
for kind in ['quicksort', 'mergesort', 'heapsort']:

result = x.argsort(axis=axis, kind=kind)
expected = x.values.argsort(axis=x._get_axis_number(axis),
kind=kind)
self.assert_numpy_array_equal(result, expected)

result = x.ordering(axis=axis, kind=kind)
expected = x._constructor(
expected.argsort(axis=x._get_axis_number(axis),
kind=kind),
*x.axes)
expected[x.isnull()] = -1
self.assertEqual(result, expected)

s = Series([1, 5, nan, 0, 4], index=list('abcde'))
result = s.argsort()
expected = np.array([3, 0, 4, 1, 2], dtype=np.int64)
self.assert_numpy_array_equal(result, expected)

mexpected = np.argsort(s.values, kind='mergesort')
qexpected = np.argsort(s.values, kind='quicksort')
result = s.ordering()
expected = Series([1, 3, -1, 0, 2], index=list('abcde'))
self.assert_series_equal(result, expected)

df = DataFrame([[1, 5, nan, 0, 4],
[8, 2, 6, 9, 7]],
index=list('xy'), columns=list('abcde'))
result = df.argsort()
expected = np.array([[3, 0, 4, 1, 2],
[1, 2, 4, 0, 3]],
dtype=np.int64)
self.assert_numpy_array_equal(result, expected)

self.assert_numpy_array_equal(mindexer, mexpected)
self.assert_numpy_array_equal(qindexer, qexpected)
self.assertFalse(np.array_equal(qindexer, mindexer))
result = df.ordering()
expected = DataFrame([[1, 3, -1, 0, 2],
[3, 0, 1, 4, 2]],
index=list('xy'), columns=list('abcde'))
self.assert_frame_equal(result, expected)

def test_cumsum(self):
self._check_accum_op('cumsum')
Expand Down