
Commit 529288e

Merge pull request #10236 from sinhrks/duplicated

ENH: duplicated and drop_duplicates now accept keep kw

2 parents 0259ace + 1b913ba · commit 529288e

14 files changed, +448 −90 lines

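In short, the boolean ``take_last`` keyword is replaced by a three-valued ``keep`` keyword ('first', 'last', or False) on ``duplicated`` and ``drop_duplicates`` for Series, DataFrame, and Index, with ``take_last`` kept as a deprecated alias. A small sketch of the new semantics, reusing the Series from the whatsnew entry below (assumes pandas >= 0.17.0):

    import pandas as pd

    s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])

    # keep='first' (default): keep the first occurrence, drop later ones
    s.drop_duplicates()              # A, B, C, D   (rows 0, 1, 2, 5)

    # keep='last': keep the last occurrence, drop earlier ones
    s.drop_duplicates(keep='last')   # C, A, B, D   (rows 2, 3, 4, 5)

    # keep=False: drop every value that appears more than once
    s.drop_duplicates(keep=False)    # C, D         (rows 2, 5)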
Diff for: doc/source/indexing.rst

+6 −4

@@ -1178,17 +1178,19 @@ takes as an argument the columns to use to identify duplicated rows.
 - ``drop_duplicates`` removes duplicate rows.
 
 By default, the first observed row of a duplicate set is considered unique, but
-each method has a ``take_last`` parameter that indicates the last observed row
-should be taken instead.
+each method has a ``keep`` parameter to specify targets to be kept.
 
 .. ipython:: python
 
    df2 = pd.DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                        'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
                        'c' : np.random.randn(7)})
    df2.duplicated(['a','b'])
+   df2.duplicated(['a','b'], keep='last')
+   df2.duplicated(['a','b'], keep=False)
    df2.drop_duplicates(['a','b'])
-   df2.drop_duplicates(['a','b'], take_last=True)
+   df2.drop_duplicates(['a','b'], keep='last')
+   df2.drop_duplicates(['a','b'], keep=False)
 
 An alternative way to drop duplicates on the index is ``.groupby(level=0)`` combined with ``first()`` or ``last()``.
 
@@ -1199,7 +1201,7 @@ An alternative way to drop duplicates on the index is ``.groupby(level=0)`` comb
    df3.groupby(level=0).first()
 
    # a bit more verbose
-   df3.reset_index().drop_duplicates(subset='b', take_last=False).set_index('b')
+   df3.reset_index().drop_duplicates(subset='b', keep='first').set_index('b')
 
 .. _indexing.dictionarylike:
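For intuition, the masks produced by the documented ``df2`` example are deterministic (the random ``c`` column plays no part in duplicate detection on ``['a','b']``); a quick sketch of the expected results:

    import numpy as np
    import pandas as pd

    df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                        'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
                        'c': np.random.randn(7)})

    # duplicate ('a', 'b') pairs: ('one', 'x') at rows 0 and 5, ('two', 'y') at rows 2 and 4
    df2.duplicated(['a', 'b'])               # True at rows 4, 5  (later occurrences)
    df2.duplicated(['a', 'b'], keep='last')  # True at rows 0, 2  (earlier occurrences)
    df2.duplicated(['a', 'b'], keep=False)   # True at rows 0, 2, 4, 5 (every duplicated row)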
Diff for: doc/source/whatsnew/v0.17.0.txt

+10 −0

@@ -142,6 +142,15 @@ Other enhancements
 - ``pd.merge`` will now allow duplicate column names if they are not merged upon (:issue:`10639`).
 
 - ``pd.pivot`` will now allow passing index as ``None`` (:issue:`3962`).
+- ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. ``take_last`` keyword is deprecated, see :ref:`deprecations <whatsnew_0170.deprecations>` (:issue:`6511`, :issue:`8505`)
+
+  .. ipython :: python
+
+    s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
+    s.drop_duplicates()
+    s.drop_duplicates(keep='last')
+    s.drop_duplicates(keep=False)
+
 
 .. _whatsnew_0170.api:
 
@@ -520,6 +529,7 @@ Deprecations
 ===================== =================================
 
 - ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
+- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was removed in favor of ``keep``. (:issue:`6511`, :issue:`8505`)
 
 .. _whatsnew_0170.prior_deprecations:
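Because the methods below gain a ``deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})`` decorator, code that still passes ``take_last`` keeps working during the deprecation window and is remapped onto ``keep``. A rough sketch of what callers can expect (the exact warning category comes from ``pandas.util.decorators``; ``FutureWarning`` is an assumption here):

    import warnings
    import pandas as pd

    s = pd.Series(['A', 'B', 'A'])

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        old_style = s.drop_duplicates(take_last=True)   # deprecated spelling, warns

    new_style = s.drop_duplicates(keep='last')
    assert old_style.equals(new_style)   # take_last=True is remapped to keep='last'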
Diff for: pandas/core/base.py

+18 −9

@@ -6,7 +6,7 @@
 from pandas.core import common as com
 import pandas.core.nanops as nanops
 import pandas.lib as lib
-from pandas.util.decorators import Appender, cache_readonly
+from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
 from pandas.core.strings import StringMethods
 from pandas.core.common import AbstractMethodError
 
@@ -543,18 +543,23 @@ def _dir_deletions(self):
 
         Parameters
         ----------
-        take_last : boolean, default False
-            Take the last observed index in a group. Default first
+
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Drop duplicates except for the first occurrence.
+            - ``last`` : Drop duplicates except for the last occurrence.
+            - False : Drop all duplicates.
+        take_last : deprecated
         %(inplace)s
 
         Returns
         -------
         deduplicated : %(klass)s
         """)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
-    def drop_duplicates(self, take_last=False, inplace=False):
-        duplicated = self.duplicated(take_last=take_last)
+    def drop_duplicates(self, keep='first', inplace=False):
+        duplicated = self.duplicated(keep=keep)
         result = self[np.logical_not(duplicated)]
         if inplace:
             return self._update_inplace(result)
@@ -566,18 +571,22 @@ def drop_duplicates(self, take_last=False, inplace=False):
 
         Parameters
         ----------
-        take_last : boolean, default False
-            Take the last observed index in a group. Default first
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
+            - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
+            - False : Mark all duplicates as ``True``.
+        take_last : deprecated
 
         Returns
         -------
         duplicated : %(duplicated)s
         """)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
-    def duplicated(self, take_last=False):
+    def duplicated(self, keep='first'):
         keys = com._ensure_object(self.values)
-        duplicated = lib.duplicated(keys, take_last=take_last)
+        duplicated = lib.duplicated(keys, keep=keep)
         try:
             return self._constructor(duplicated,
                                      index=self.index).__finalize__(self)
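The mechanics of that ``deprecate_kwarg`` decorator are what keep ``take_last`` working: the old keyword is intercepted, translated through the ``mapping``, and forwarded as ``keep``. A simplified, illustrative re-implementation (not the actual ``pandas.util.decorators`` code):

    import warnings
    from functools import wraps

    def deprecate_kwarg(old_arg_name, new_arg_name, mapping=None):
        # Illustrative stand-in for pandas.util.decorators.deprecate_kwarg
        def _decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                if old_arg_name in kwargs:
                    old_value = kwargs.pop(old_arg_name)
                    # e.g. take_last=True  ->  keep='last'
                    new_value = mapping.get(old_value, old_value) if mapping else old_value
                    warnings.warn("the '%s' keyword is deprecated, use '%s' instead"
                                  % (old_arg_name, new_arg_name), FutureWarning)
                    kwargs[new_arg_name] = new_value
                return func(*args, **kwargs)
            return wrapper
        return _decorator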
Diff for: pandas/core/frame.py

+18 −9

@@ -2866,8 +2866,9 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
         else:
             return result
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
-    def drop_duplicates(self, subset=None, take_last=False, inplace=False):
+    def drop_duplicates(self, subset=None, keep='first', inplace=False):
         """
         Return DataFrame with duplicate rows removed, optionally only
         considering certain columns
@@ -2877,8 +2878,11 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
         subset : column label or sequence of labels, optional
             Only consider certain columns for identifying duplicates, by
             default use all of the columns
-        take_last : boolean, default False
-            Take the last observed row in a row. Defaults to the first row
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Drop duplicates except for the first occurrence.
+            - ``last`` : Drop duplicates except for the last occurrence.
+            - False : Drop all duplicates.
+        take_last : deprecated
         inplace : boolean, default False
             Whether to drop duplicates in place or to return a copy
         cols : kwargs only argument of subset [deprecated]
@@ -2887,7 +2891,7 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
         -------
         deduplicated : DataFrame
         """
-        duplicated = self.duplicated(subset, take_last=take_last)
+        duplicated = self.duplicated(subset, keep=keep)
 
         if inplace:
             inds, = (-duplicated).nonzero()
@@ -2896,8 +2900,9 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
         else:
             return self[-duplicated]
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
-    def duplicated(self, subset=None, take_last=False):
+    def duplicated(self, subset=None, keep='first'):
         """
         Return boolean Series denoting duplicate rows, optionally only
         considering certain columns
@@ -2907,9 +2912,13 @@ def duplicated(self, subset=None, take_last=False):
         subset : column label or sequence of labels, optional
             Only consider certain columns for identifying duplicates, by
             default use all of the columns
-        take_last : boolean, default False
-            For a set of distinct duplicate rows, flag all but the last row as
-            duplicated. Default is for all but the first row to be flagged
+        keep : {'first', 'last', False}, default 'first'
+            - ``first`` : Mark duplicates as ``True`` except for the
+              first occurrence.
+            - ``last`` : Mark duplicates as ``True`` except for the
+              last occurrence.
+            - False : Mark all duplicates as ``True``.
+        take_last : deprecated
         cols : kwargs only argument of subset [deprecated]
 
         Returns
@@ -2935,7 +2944,7 @@ def f(vals):
         labels, shape = map(list, zip( * map(f, vals)))
 
         ids = get_group_index(labels, shape, sort=False, xnull=False)
-        return Series(duplicated_int64(ids, take_last), index=self.index)
+        return Series(duplicated_int64(ids, keep), index=self.index)
 
     #----------------------------------------------------------------------
     # Sorting
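``DataFrame.duplicated`` reduces the ``subset`` columns to a single integer id per row (factorized labels combined through ``get_group_index``) and hands those ids, together with ``keep``, to ``duplicated_int64``. A slow but readable approximation of that idea; the helper name below is hypothetical:

    import pandas as pd

    def duplicated_by_columns(df, subset, keep='first'):
        # Rough sketch: collapse the subset columns into one hashable key per row,
        # then mark duplicate keys the same way Series.duplicated does.
        codes = [pd.factorize(df[col])[0] for col in subset]
        ids = pd.Series(list(zip(*codes)), index=df.index)
        return ids.duplicated(keep=keep)

    df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                        'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x']})
    assert duplicated_by_columns(df2, ['a', 'b']).equals(df2.duplicated(['a', 'b']))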
Diff for: pandas/core/index.py

+13 −9

@@ -16,7 +16,7 @@
 from pandas.lib import Timestamp, Timedelta, is_datetime_array
 from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
 from pandas.util.decorators import (Appender, Substitution, cache_readonly,
-                                    deprecate)
+                                    deprecate, deprecate_kwarg)
 import pandas.core.common as com
 from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype,
                                 _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype,
@@ -2628,13 +2628,15 @@ def drop(self, labels, errors='raise'):
         indexer = indexer[~mask]
         return self.delete(indexer)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
-    def drop_duplicates(self, take_last=False):
-        return super(Index, self).drop_duplicates(take_last=take_last)
+    def drop_duplicates(self, keep='first'):
+        return super(Index, self).drop_duplicates(keep=keep)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-    def duplicated(self, take_last=False):
-        return super(Index, self).duplicated(take_last=take_last)
+    def duplicated(self, keep='first'):
+        return super(Index, self).duplicated(keep=keep)
 
     def _evaluate_with_timedelta_like(self, other, op, opstr):
         raise TypeError("can only perform ops with timedelta like values")
@@ -3065,10 +3067,11 @@ def _engine(self):
     def is_unique(self):
         return not self.duplicated().any()
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-    def duplicated(self, take_last=False):
+    def duplicated(self, keep='first'):
         from pandas.hashtable import duplicated_int64
-        return duplicated_int64(self.codes.astype('i8'), take_last)
+        return duplicated_int64(self.codes.astype('i8'), keep)
 
     def get_loc(self, key, method=None):
         """
@@ -4228,15 +4231,16 @@ def _has_complex_internals(self):
     def is_unique(self):
         return not self.duplicated().any()
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-    def duplicated(self, take_last=False):
+    def duplicated(self, keep='first'):
         from pandas.core.groupby import get_group_index
         from pandas.hashtable import duplicated_int64
 
         shape = map(len, self.levels)
         ids = get_group_index(self.labels, shape, sort=False, xnull=False)
 
-        return duplicated_int64(ids, take_last)
+        return duplicated_int64(ids, keep)
 
     def get_value(self, series, key):
         # somewhat broken encapsulation
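The same ``keep`` semantics apply to ``Index``; the last two hunks make the code-based (``self.codes``) and MultiIndex (``self.labels``/``self.levels``) implementations pass ``keep`` through to ``duplicated_int64`` as well. A quick sketch of the expected results on a plain ``Index``:

    import pandas as pd

    idx = pd.Index(['a', 'b', 'a', 'c', 'b'])

    idx.duplicated()                   # array([False, False,  True, False,  True])
    idx.duplicated(keep='last')        # array([ True,  True, False, False, False])
    idx.duplicated(keep=False)         # array([ True,  True,  True, False,  True])
    idx.drop_duplicates(keep=False)    # Index(['c'], dtype='object')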
Diff for: pandas/core/series.py

+7 −6

@@ -46,7 +46,7 @@
 import pandas.core.datetools as datetools
 import pandas.core.format as fmt
 import pandas.core.nanops as nanops
-from pandas.util.decorators import Appender, cache_readonly
+from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
 
 import pandas.lib as lib
 import pandas.tslib as tslib
@@ -1155,14 +1155,15 @@ def mode(self):
         from pandas.core.algorithms import mode
         return mode(self)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
-    def drop_duplicates(self, take_last=False, inplace=False):
-        return super(Series, self).drop_duplicates(take_last=take_last,
-                                                   inplace=inplace)
+    def drop_duplicates(self, keep='first', inplace=False):
+        return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
     @Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs)
-    def duplicated(self, take_last=False):
-        return super(Series, self).duplicated(take_last=take_last)
+    def duplicated(self, keep='first'):
+        return super(Series, self).duplicated(keep=keep)
 
     def idxmin(self, axis=None, out=None, skipna=True):
         """
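``Series.duplicated`` still returns a boolean Series aligned with the index, so the new ``keep=False`` mode doubles as a convenient mask for selecting or excluding every repeated value; a small usage sketch:

    import pandas as pd

    s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
    mask = s.duplicated(keep=False)   # True wherever the value occurs more than once

    s[mask]    # rows 0, 1, 3, 4  ->  A, B, A, B
    s[~mask]   # rows 2, 5        ->  C, D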
Diff for: pandas/hashtable.pyx

+22 −6

@@ -1026,25 +1026,41 @@ def mode_int64(int64_t[:] values):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
+def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
     cdef:
-        int ret = 0
+        int ret = 0, value, k
         Py_ssize_t i, n = len(values)
         kh_int64_t * table = kh_init_int64()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
 
     kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))
 
-    with nogil:
-        if take_last:
+    if keep not in ('last', 'first', False):
+        raise ValueError('keep must be either "first", "last" or False')
+
+    if keep == 'last':
+        with nogil:
             for i from n > i >=0:
                 kh_put_int64(table, values[i], &ret)
                 out[i] = ret == 0
-        else:
+    elif keep == 'first':
+        with nogil:
             for i from 0 <= i < n:
                 kh_put_int64(table, values[i], &ret)
                 out[i] = ret == 0
-
+    else:
+        with nogil:
+            for i from 0 <= i < n:
+                value = values[i]
+                k = kh_get_int64(table, value)
+                if k != table.n_buckets:
+                    out[table.vals[k]] = 1
+                    out[i] = 1
+                else:
+                    k = kh_put_int64(table, value, &ret)
+                    table.keys[k] = value
+                    table.vals[k] = i
+                    out[i] = 0
     kh_destroy_int64(table)
     return out
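The new ``else`` branch is the interesting part: for ``keep=False`` the hash table stores the position of each value's first occurrence (``table.vals[k] = i``), so when the value shows up again both the remembered position and the current one are flagged. A pure-Python sketch of that pass:

    def mark_all_duplicates(values):
        # Sketch of the keep=False branch of duplicated_int64: remember where each
        # key was first seen and, on a repeat, flag that position and the current one.
        first_pos = {}
        out = [False] * len(values)
        for i, value in enumerate(values):
            if value in first_pos:
                out[first_pos[value]] = True
                out[i] = True
            else:
                first_pos[value] = i
        return out

    mark_all_duplicates([1, 2, 1, 3, 2])   # [True, True, True, False, True]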
Diff for: pandas/lib.pyx

+19 −7

@@ -1348,35 +1348,47 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
 
     return result
 
-def duplicated(ndarray[object] values, take_last=False):
+
+def duplicated(ndarray[object] values, object keep='first'):
     cdef:
         Py_ssize_t i, n
-        set seen = set()
+        dict seen = dict()
         object row
 
     n = len(values)
     cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)
 
-    if take_last:
+    if keep == 'last':
         for i from n > i >= 0:
             row = values[i]
-
             if row in seen:
                 result[i] = 1
             else:
-                seen.add(row)
+                seen[row] = i
                 result[i] = 0
-    else:
+    elif keep == 'first':
         for i from 0 <= i < n:
             row = values[i]
             if row in seen:
                 result[i] = 1
             else:
-                seen.add(row)
+                seen[row] = i
                 result[i] = 0
+    elif keep is False:
+        for i from 0 <= i < n:
+            row = values[i]
+            if row in seen:
+                result[i] = 1
+                result[seen[row]] = 1
+            else:
+                seen[row] = i
+                result[i] = 0
+    else:
+        raise ValueError('keep must be either "first", "last" or False')
 
     return result.view(np.bool_)
 
+
 def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
     cdef:
         Py_ssize_t i, group_size, n, start
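Switching ``seen`` from a ``set`` to a ``dict`` is what lets the object-dtype path support ``keep=False``: the dict remembers each row's first position so it can be flagged retroactively, and any other ``keep`` value is rejected. A sketch of the observable behavior through ``Series.duplicated`` (object dtype goes through ``lib.duplicated``):

    import pandas as pd

    s = pd.Series(list('abab'))

    s.duplicated(keep='first').tolist()   # [False, False, True, True]
    s.duplicated(keep='last').tolist()    # [True, True, False, False]
    s.duplicated(keep=False).tolist()     # [True, True, True, True]

    try:
        s.duplicated(keep='every')        # anything else raises
    except ValueError as err:
        print(err)                        # keep must be either "first", "last" or False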