diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f7039021ff276..c1df34c917fea 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -41,6 +41,7 @@ Other enhancements - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) +- :meth:`DataFrame.drop_duplicates` now supports a new parameter ``index`` to drop rows with duplicate index labels (:issue:`58648`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5b156cd75e373..ed75a9aae538c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -46,7 +46,6 @@ lib, properties, ) -from pandas._libs.hashtable import duplicated from pandas._libs.lib import is_range_indexer from pandas.compat import PYPY from pandas.compat._constants import REF_COUNT @@ -175,7 +174,6 @@ treat_as_nested, ) from pandas.core.methods import selectn -from pandas.core.reshape.melt import melt from pandas.core.series import Series from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( @@ -1386,7 +1384,9 @@ def style(self) -> Styler: return Styler(self) - _shared_docs["items"] = r""" + _shared_docs[ + "items" + ] = r""" Iterate over (column name, Series) pairs. 
Iterates over the DataFrame columns, returning a tuple with @@ -6553,6 +6553,7 @@ def drop_duplicates( keep: DropKeep = "first", inplace: bool = False, ignore_index: bool = False, + index: bool = False, ) -> DataFrame | None: """ Return DataFrame with duplicate rows removed. @@ -6577,6 +6578,9 @@ def drop_duplicates( ignore_index : bool, default ``False`` If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. + index : bool, default ``False`` + If ``True``, drop duplicates based on the index instead of columns. + Returns ------- DataFrame or None @@ -6633,6 +6637,22 @@ def drop_duplicates( 1 Yum Yum cup 4.0 2 Indomie cup 3.5 4 Indomie pack 5.0 + + To remove duplicates based on index, use ``index=True``. + + >>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1]) + >>> df.drop_duplicates(index=True) + A + 0 1 + 1 2 + + To remove duplicates based on index and keep last occurrences, use ``keep='last'`` with ``index=True``. + + >>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1]) + >>> df.drop_duplicates(index=True, keep="last") + A + 0 1 + 1 3 """ if self.empty: return self.copy(deep=False) @@ -6640,7 +6660,10 @@ def drop_duplicates( inplace = validate_bool_kwarg(inplace, "inplace") ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") - result = self[-self.duplicated(subset, keep=keep)] + if index: + subset = self.index.names + + result = self[-self.duplicated(subset=subset, keep=keep)] if ignore_index: result.index = default_index(len(result)) @@ -9076,7 +9099,9 @@ def groupby( dropna=dropna, ) - _shared_docs["pivot"] = """ + _shared_docs[ + "pivot" + ] = """ Return reshaped DataFrame organized by given index / column values. Reshape data (produce a "pivot" table) based on column values. Uses @@ -9220,7 +9245,9 @@ def pivot( return pivot(self, index=index, columns=columns, values=values) - _shared_docs["pivot_table"] = """ + _shared_docs[ + "pivot_table" + ] = """ Create a spreadsheet-style pivot table as a DataFrame. 
The levels in the pivot table will be stored in MultiIndex objects @@ -10497,9 +10524,11 @@ def _append( index = Index( [other.name], - name=self.index.names - if isinstance(self.index, MultiIndex) - else self.index.name, + name=( + self.index.names + if isinstance(self.index, MultiIndex) + else self.index.name + ), ) row_df = other.to_frame().T # infer_objects is needed for diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index 6bea97b2cf189..89e4e7fec4243 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -441,6 +441,26 @@ def test_drop_duplicates_null_in_object_column(nulls_fixture): tm.assert_frame_equal(result, df) +def test_drop_duplicates_index(): + # Example 1: Basic usage with integer index and duplicate rows + df = DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1]) + result = df.drop_duplicates(index=True) + expected = DataFrame({"A": [1, 2]}, index=[0, 1]) + tm.assert_frame_equal(result, expected) + + # Example 2: Using strings as index + df2 = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"]) + result2 = df2.drop_duplicates(index=True) + expected2 = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"]) + tm.assert_frame_equal(result2, expected2) + + # Example 3: Index is not reset after dropping duplicates + df3 = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "a"]) + result3 = df3.drop_duplicates(index=True) + expected3 = DataFrame({"A": [1, 2]}, index=["a", "b"]) + tm.assert_frame_equal(result3, expected3) + + def test_drop_duplicates_series_vs_dataframe(keep): # GH#14192 df = DataFrame(