
Drop duplicate indices #59133


Closed · wants to merge 4 commits
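
This PR adds an ``index`` keyword to ``DataFrame.drop_duplicates`` so duplicates can be dropped by index value rather than by column values. A minimal usage sketch, based on the docstring examples added below (``index=True`` is the behavior proposed by this PR, not part of released pandas):

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1])

# Default keep="first": the first row for each duplicated label survives
df.drop_duplicates(index=True)               # rows labeled 0 and the first 1

# keep="last": the last occurrence of each label survives instead
df.drop_duplicates(index=True, keep="last")  # rows labeled 0 and the last 1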
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -41,6 +41,7 @@ Other enhancements
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as an optional argument, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
- :meth:`DataFrame.drop_duplicates` now supports a new ``index`` parameter to drop rows with duplicate index values (:issue:`58648`)
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
47 changes: 38 additions & 9 deletions pandas/core/frame.py
@@ -46,7 +46,6 @@
lib,
properties,
)
from pandas._libs.hashtable import duplicated
from pandas._libs.lib import is_range_indexer
from pandas.compat import PYPY
from pandas.compat._constants import REF_COUNT
@@ -175,7 +174,6 @@
treat_as_nested,
)
from pandas.core.methods import selectn
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import (
@@ -1386,7 +1384,9 @@ def style(self) -> Styler:

return Styler(self)

_shared_docs["items"] = r"""
_shared_docs[
"items"
] = r"""
Iterate over (column name, Series) pairs.

Iterates over the DataFrame columns, returning a tuple with
@@ -6553,6 +6553,7 @@ def drop_duplicates(
keep: DropKeep = "first",
inplace: bool = False,
ignore_index: bool = False,
index: bool = False,
) -> DataFrame | None:
"""
Return DataFrame with duplicate rows removed.
@@ -6577,6 +6578,9 @@
ignore_index : bool, default ``False``
If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.

index : bool, default ``False``
If ``True``, drop duplicates based on the index instead of the columns.

Returns
-------
DataFrame or None
@@ -6633,14 +6637,33 @@
1 Yum Yum cup 4.0
2 Indomie cup 3.5
4 Indomie pack 5.0

To remove duplicates based on the index, use ``index=True``.

>>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1])
>>> df.drop_duplicates(index=True)
A
0 1
1 2

To remove duplicates based on the index and keep the last occurrence of each label, combine ``keep='last'`` with ``index=True``.

>>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1])
>>> df.drop_duplicates(index=True, keep="last")
A
0 1
1 3
"""
if self.empty:
return self.copy(deep=False)

inplace = validate_bool_kwarg(inplace, "inplace")
ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")

result = self[-self.duplicated(subset, keep=keep)]
if index:
subset = self.index.names

result = self[-self.duplicated(subset=subset, keep=keep)]
if ignore_index:
result.index = default_index(len(result))

@@ -9076,7 +9099,9 @@ def groupby(
dropna=dropna,
)

_shared_docs["pivot"] = """
_shared_docs[
"pivot"
] = """
Return reshaped DataFrame organized by given index / column values.

Reshape data (produce a "pivot" table) based on column values. Uses
@@ -9220,7 +9245,9 @@ def pivot(

return pivot(self, index=index, columns=columns, values=values)

_shared_docs["pivot_table"] = """
_shared_docs[
"pivot_table"
] = """
Create a spreadsheet-style pivot table as a DataFrame.

The levels in the pivot table will be stored in MultiIndex objects
@@ -10497,9 +10524,11 @@ def _append(

index = Index(
[other.name],
name=self.index.names
if isinstance(self.index, MultiIndex)
else self.index.name,
name=(
self.index.names
if isinstance(self.index, MultiIndex)
else self.index.name
),
)
row_df = other.to_frame().T
# infer_objects is needed for
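For comparison, the result the new parameter targets is already expressible in released pandas via the long-standing ``Index.duplicated`` idiom; a short sketch using only existing public API:

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1])

# Index.duplicated returns a boolean array marking repeated index labels;
# keep="first" leaves the first occurrence unmarked, mirroring the
# drop_duplicates default.
result = df[~df.index.duplicated(keep="first")]  # same rows as drop_duplicates(index=True)
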
20 changes: 20 additions & 0 deletions pandas/tests/frame/methods/test_drop_duplicates.py
@@ -441,6 +441,26 @@ def test_drop_duplicates_null_in_object_column(nulls_fixture):
tm.assert_frame_equal(result, df)


def test_drop_duplicates_index():
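# GH#58648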
# Example 1: integer index with a duplicated label; first occurrence is kept
df = DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1])
result = df.drop_duplicates(index=True)
expected = DataFrame({"A": [1, 2]}, index=[0, 1])
tm.assert_frame_equal(result, expected)

# Example 2: unique string index; the frame is unchanged
df2 = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"])
result2 = df2.drop_duplicates(index=True)
expected2 = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"])
tm.assert_frame_equal(result2, expected2)

# Example 3: duplicated string label is dropped; remaining labels are preserved (index is not reset)
df3 = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "a"])
result3 = df3.drop_duplicates(index=True)
expected3 = DataFrame({"A": [1, 2]}, index=["a", "b"])
tm.assert_frame_equal(result3, expected3)


def test_drop_duplicates_series_vs_dataframe(keep):
# GH#14192
df = DataFrame(
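The new test can be run on its own with pytest's ``-k`` name filter (standard pytest usage, independent of this PR):

pytest pandas/tests/frame/methods/test_drop_duplicates.py -k drop_duplicates_index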