From 9974ac9d0176fce2c018a477ae431c4ea8ccbe9b Mon Sep 17 00:00:00 2001 From: Yousinator Date: Tue, 25 Jun 2024 07:39:51 +0300 Subject: [PATCH 1/4] Main logic is done --- pandas/core/frame.py | 202 ++++++++++++++++++++++++------------------- 1 file changed, 113 insertions(+), 89 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5b156cd75e373..6f521c8ed1049 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6546,109 +6546,133 @@ def drop_duplicates( ignore_index: bool = ..., ) -> DataFrame | None: ... - def drop_duplicates( - self, - subset: Hashable | Sequence[Hashable] | None = None, - *, - keep: DropKeep = "first", - inplace: bool = False, - ignore_index: bool = False, - ) -> DataFrame | None: - """ - Return DataFrame with duplicate rows removed. - - Considering certain columns is optional. Indexes, including time indexes - are ignored. - - Parameters - ---------- - subset : column label or sequence of labels, optional - Only consider certain columns for identifying duplicates, by - default use all of the columns. - keep : {'first', 'last', ``False``}, default 'first' - Determines which duplicates (if any) to keep. - - - 'first' : Drop duplicates except for the first occurrence. - - 'last' : Drop duplicates except for the last occurrence. - - ``False`` : Drop all duplicates. - - inplace : bool, default ``False`` - Whether to modify the DataFrame rather than creating a new one. - ignore_index : bool, default ``False`` - If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. +def drop_duplicates( + self, + subset: Hashable | Sequence[Hashable] | None = None, + *, + keep: DropKeep = "first", + inplace: bool = False, + ignore_index: bool = False, + index: bool = False, +) -> DataFrame | None: + """ + Return DataFrame with duplicate rows removed. - Returns - ------- - DataFrame or None - DataFrame with duplicates removed or None if ``inplace=True``. + Considering certain columns is optional. 
Indexes, including time indexes + are ignored. - See Also - -------- - DataFrame.value_counts: Count unique combinations of columns. + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', ``False``}, default 'first' + Determines which duplicates (if any) to keep. - Notes - ----- - This method requires columns specified by ``subset`` to be of hashable type. - Passing unhashable columns will raise a ``TypeError``. + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. - Examples - -------- - Consider dataset containing ramen rating. + inplace : bool, default ``False`` + Whether to modify the DataFrame rather than creating a new one. + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. - >>> df = pd.DataFrame( - ... { - ... "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], - ... "style": ["cup", "cup", "cup", "pack", "pack"], - ... "rating": [4, 4, 3.5, 15, 5], - ... } - ... ) - >>> df - brand style rating - 0 Yum Yum cup 4.0 - 1 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 3 Indomie pack 15.0 - 4 Indomie pack 5.0 + index : bool, default ``False`` + If ``True``, drop duplicates based on the index instead of columns. - By default, it removes duplicate rows based on all columns. + Returns + ------- + DataFrame or None + DataFrame with duplicates removed or None if ``inplace=True``. - >>> df.drop_duplicates() - brand style rating - 0 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 3 Indomie pack 15.0 - 4 Indomie pack 5.0 + See Also + -------- + DataFrame.value_counts: Count unique combinations of columns. - To remove duplicates on specific column(s), use ``subset``. + Notes + ----- + This method requires columns specified by ``subset`` to be of hashable type. 
+ Passing unhashable columns will raise a ``TypeError``. - >>> df.drop_duplicates(subset=["brand"]) - brand style rating - 0 Yum Yum cup 4.0 - 2 Indomie cup 3.5 + Examples + -------- + Consider dataset containing ramen rating. + + >>> df = pd.DataFrame( + ... { + ... "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], + ... "style": ["cup", "cup", "cup", "pack", "pack"], + ... "rating": [4, 4, 3.5, 15, 5], + ... } + ... ) + >>> df + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + By default, it removes duplicate rows based on all columns. + + >>> df.drop_duplicates() + brand style rating + 0 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + To remove duplicates on specific column(s), use ``subset``. + + >>> df.drop_duplicates(subset=["brand"]) + brand style rating + 0 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + + To remove duplicates and keep last occurrences, use ``keep``. + + >>> df.drop_duplicates(subset=["brand", "style"], keep="last") + brand style rating + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 4 Indomie pack 5.0 + + To remove duplicates based on index, use ``index=True``. + + >>> df = pd.DataFrame({'A': [1, 2, 3]}, index=[0, 1, 1]) + >>> df.drop_duplicates(index=True) + A + 0 1 + 1 2 + + To remove duplicates based on index and keep last occurrences, use ``keep='last'`` with ``index=True``. + + >>> df = pd.DataFrame({'A': [1, 2, 3]}, index=[0, 1, 1]) + >>> df.drop_duplicates(index=True, keep='last') + A + 0 1 + 1 3 + """ + if self.empty: + return self.copy(deep=False) - To remove duplicates and keep last occurrences, use ``keep``. 
+ inplace = validate_bool_kwarg(inplace, "inplace") + ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") - >>> df.drop_duplicates(subset=["brand", "style"], keep="last") - brand style rating - 1 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 4 Indomie pack 5.0 - """ - if self.empty: - return self.copy(deep=False) + if index: + subset = self.index.names - inplace = validate_bool_kwarg(inplace, "inplace") - ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") + result = self[-self.duplicated(subset=subset, keep=keep)] + if ignore_index: + result.index = default_index(len(result)) - result = self[-self.duplicated(subset, keep=keep)] - if ignore_index: - result.index = default_index(len(result)) + if inplace: + self._update_inplace(result) + return None + else: + return result - if inplace: - self._update_inplace(result) - return None - else: - return result def duplicated( self, From 48cbca4b2ddf8c582dae26cffa99583a7b40262b Mon Sep 17 00:00:00 2001 From: Yousinator Date: Tue, 25 Jun 2024 07:46:27 +0300 Subject: [PATCH 2/4] Done adding index parameter to drop_duplicates method --- .../frame/methods/test_drop_duplicates.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index 6bea97b2cf189..a68110cade714 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -441,6 +441,29 @@ def test_drop_duplicates_null_in_object_column(nulls_fixture): tm.assert_frame_equal(result, df) +import pandas as pd + + +def test_drop_duplicates_index(): + # Example 1: Basic usage with integer index and duplicate rows + df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1]) + result = df.drop_duplicates(index=True) + expected = pd.DataFrame({"A": [1, 2]}, index=[0, 1]) + pd.testing.assert_frame_equal(result, expected) + + # Example 2: Using strings as index + df2 = pd.DataFrame({"A": [1, 
2, 3]}, index=["a", "b", "c"]) + result2 = df2.drop_duplicates(index=True) + expected2 = pd.DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"]) + pd.testing.assert_frame_equal(result2, expected2) + + # Example 3: Index is not reset after dropping duplicates + df3 = pd.DataFrame({"A": [1, 2, 3]}, index=["a", "b", "a"]) + result3 = df3.drop_duplicates(index=True) + expected3 = pd.DataFrame({"A": [1, 2]}, index=["a", "b"]) + pd.testing.assert_frame_equal(result3, expected3) + + def test_drop_duplicates_series_vs_dataframe(keep): # GH#14192 df = DataFrame( From 424d7c72913deb0fc8cae84d8169fb6fb4565640 Mon Sep 17 00:00:00 2001 From: Yousinator Date: Fri, 28 Jun 2024 08:37:29 +0300 Subject: [PATCH 3/4] Done adding index parameter to drop_duplicates method --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 10 ++++----- .../frame/methods/test_drop_duplicates.py | 21 ++++++++----------- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f7039021ff276..c1df34c917fea 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -41,6 +41,7 @@ Other enhancements - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) +- :meth:`DataFrame.drop_duplicates` now supports a new parameter ``index`` to drop duplicate indices. 
(:issue:`58648`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6f521c8ed1049..f2752ec4f594a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -46,7 +46,6 @@ lib, properties, ) -from pandas._libs.hashtable import duplicated from pandas._libs.lib import is_range_indexer from pandas.compat import PYPY from pandas.compat._constants import REF_COUNT @@ -175,7 +174,6 @@ treat_as_nested, ) from pandas.core.methods import selectn -from pandas.core.reshape.melt import melt from pandas.core.series import Series from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( @@ -6546,6 +6544,7 @@ def drop_duplicates( ignore_index: bool = ..., ) -> DataFrame | None: ... + def drop_duplicates( self, subset: Hashable | Sequence[Hashable] | None = None, @@ -6640,7 +6639,7 @@ def drop_duplicates( To remove duplicates based on index, use ``index=True``. - >>> df = pd.DataFrame({'A': [1, 2, 3]}, index=[0, 1, 1]) + >>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1]) >>> df.drop_duplicates(index=True) A 0 1 @@ -6648,8 +6647,8 @@ def drop_duplicates( To remove duplicates based on index and keep last occurrences, use ``keep='last'`` with ``index=True``. 
- >>> df = pd.DataFrame({'A': [1, 2, 3]}, index=[0, 1, 1]) - >>> df.drop_duplicates(index=True, keep='last') + >>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1]) + >>> df.drop_duplicates(index=True, keep="last") A 0 1 1 3 @@ -6673,7 +6672,6 @@ def drop_duplicates( else: return result - def duplicated( self, subset: Hashable | Sequence[Hashable] | None = None, diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index a68110cade714..89e4e7fec4243 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -441,27 +441,24 @@ def test_drop_duplicates_null_in_object_column(nulls_fixture): tm.assert_frame_equal(result, df) -import pandas as pd - - def test_drop_duplicates_index(): # Example 1: Basic usage with integer index and duplicate rows - df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1]) + df = DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1]) result = df.drop_duplicates(index=True) - expected = pd.DataFrame({"A": [1, 2]}, index=[0, 1]) - pd.testing.assert_frame_equal(result, expected) + expected = DataFrame({"A": [1, 2]}, index=[0, 1]) + tm.assert_frame_equal(result, expected) # Example 2: Using strings as index - df2 = pd.DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"]) + df2 = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"]) result2 = df2.drop_duplicates(index=True) - expected2 = pd.DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"]) - pd.testing.assert_frame_equal(result2, expected2) + expected2 = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "c"]) + tm.assert_frame_equal(result2, expected2) # Example 3: Index is not reset after dropping duplicates - df3 = pd.DataFrame({"A": [1, 2, 3]}, index=["a", "b", "a"]) + df3 = DataFrame({"A": [1, 2, 3]}, index=["a", "b", "a"]) result3 = df3.drop_duplicates(index=True) - expected3 = pd.DataFrame({"A": [1, 2]}, index=["a", "b"]) - pd.testing.assert_frame_equal(result3, expected3) + 
expected3 = DataFrame({"A": [1, 2]}, index=["a", "b"]) + tm.assert_frame_equal(result3, expected3) def test_drop_duplicates_series_vs_dataframe(keep): From 02d655402a9c179eeb68fee62f1f11d9d63c0cc8 Mon Sep 17 00:00:00 2001 From: Yousinator Date: Sat, 29 Jun 2024 11:29:01 +0300 Subject: [PATCH 4/4] Fixed Indentation --- pandas/core/frame.py | 245 ++++++++++++++++++++++--------------------- 1 file changed, 126 insertions(+), 119 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f2752ec4f594a..ed75a9aae538c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1384,7 +1384,9 @@ def style(self) -> Styler: return Styler(self) - _shared_docs["items"] = r""" + _shared_docs[ + "items" + ] = r""" Iterate over (column name, Series) pairs. Iterates over the DataFrame columns, returning a tuple with @@ -6544,133 +6546,132 @@ def drop_duplicates( ignore_index: bool = ..., ) -> DataFrame | None: ... + def drop_duplicates( + self, + subset: Hashable | Sequence[Hashable] | None = None, + *, + keep: DropKeep = "first", + inplace: bool = False, + ignore_index: bool = False, + index: bool = False, + ) -> DataFrame | None: + """ + Return DataFrame with duplicate rows removed. -def drop_duplicates( - self, - subset: Hashable | Sequence[Hashable] | None = None, - *, - keep: DropKeep = "first", - inplace: bool = False, - ignore_index: bool = False, - index: bool = False, -) -> DataFrame | None: - """ - Return DataFrame with duplicate rows removed. + Considering certain columns is optional. Indexes, including time indexes + are ignored. - Considering certain columns is optional. Indexes, including time indexes - are ignored. + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', ``False``}, default 'first' + Determines which duplicates (if any) to keep. 
- Parameters - ---------- - subset : column label or sequence of labels, optional - Only consider certain columns for identifying duplicates, by - default use all of the columns. - keep : {'first', 'last', ``False``}, default 'first' - Determines which duplicates (if any) to keep. + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. - - 'first' : Drop duplicates except for the first occurrence. - - 'last' : Drop duplicates except for the last occurrence. - - ``False`` : Drop all duplicates. + inplace : bool, default ``False`` + Whether to modify the DataFrame rather than creating a new one. + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. - inplace : bool, default ``False`` - Whether to modify the DataFrame rather than creating a new one. - ignore_index : bool, default ``False`` - If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. + index : bool, default ``False`` + If ``True``, drop duplicates based on the index instead of columns. - index : bool, default ``False`` - If ``True``, drop duplicates based on the index instead of columns. + Returns + ------- + DataFrame or None + DataFrame with duplicates removed or None if ``inplace=True``. - Returns - ------- - DataFrame or None - DataFrame with duplicates removed or None if ``inplace=True``. + See Also + -------- + DataFrame.value_counts: Count unique combinations of columns. - See Also - -------- - DataFrame.value_counts: Count unique combinations of columns. + Notes + ----- + This method requires columns specified by ``subset`` to be of hashable type. + Passing unhashable columns will raise a ``TypeError``. - Notes - ----- - This method requires columns specified by ``subset`` to be of hashable type. - Passing unhashable columns will raise a ``TypeError``. + Examples + -------- + Consider dataset containing ramen rating. 
- Examples - -------- - Consider dataset containing ramen rating. - - >>> df = pd.DataFrame( - ... { - ... "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], - ... "style": ["cup", "cup", "cup", "pack", "pack"], - ... "rating": [4, 4, 3.5, 15, 5], - ... } - ... ) - >>> df - brand style rating - 0 Yum Yum cup 4.0 - 1 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 3 Indomie pack 15.0 - 4 Indomie pack 5.0 - - By default, it removes duplicate rows based on all columns. - - >>> df.drop_duplicates() - brand style rating - 0 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 3 Indomie pack 15.0 - 4 Indomie pack 5.0 - - To remove duplicates on specific column(s), use ``subset``. - - >>> df.drop_duplicates(subset=["brand"]) - brand style rating - 0 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - - To remove duplicates and keep last occurrences, use ``keep``. - - >>> df.drop_duplicates(subset=["brand", "style"], keep="last") - brand style rating - 1 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 4 Indomie pack 5.0 - - To remove duplicates based on index, use ``index=True``. - - >>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1]) - >>> df.drop_duplicates(index=True) - A - 0 1 - 1 2 - - To remove duplicates based on index and keep last occurrences, use ``keep='last'`` with ``index=True``. - - >>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1]) - >>> df.drop_duplicates(index=True, keep="last") - A - 0 1 - 1 3 - """ - if self.empty: - return self.copy(deep=False) + >>> df = pd.DataFrame( + ... { + ... "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], + ... "style": ["cup", "cup", "cup", "pack", "pack"], + ... "rating": [4, 4, 3.5, 15, 5], + ... } + ... ) + >>> df + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 - inplace = validate_bool_kwarg(inplace, "inplace") - ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") + By default, it removes duplicate rows based on all columns. 
- if index: - subset = self.index.names + >>> df.drop_duplicates() + brand style rating + 0 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 - result = self[-self.duplicated(subset=subset, keep=keep)] - if ignore_index: - result.index = default_index(len(result)) + To remove duplicates on specific column(s), use ``subset``. - if inplace: - self._update_inplace(result) - return None - else: - return result + >>> df.drop_duplicates(subset=["brand"]) + brand style rating + 0 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + + To remove duplicates and keep last occurrences, use ``keep``. + + >>> df.drop_duplicates(subset=["brand", "style"], keep="last") + brand style rating + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 4 Indomie pack 5.0 + + To remove duplicates based on index, use ``index=True``. + + >>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1]) + >>> df.drop_duplicates(index=True) + A + 0 1 + 1 2 + + To remove duplicates based on index and keep last occurrences, use ``keep='last'`` with ``index=True``. + + >>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 1]) + >>> df.drop_duplicates(index=True, keep="last") + A + 0 1 + 1 3 + """ + if self.empty: + return self.copy(deep=False) + + inplace = validate_bool_kwarg(inplace, "inplace") + ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") + + if index: + subset = self.index.names + + result = self[-self.duplicated(subset=subset, keep=keep)] + if ignore_index: + result.index = default_index(len(result)) + + if inplace: + self._update_inplace(result) + return None + else: + return result def duplicated( self, @@ -9098,7 +9099,9 @@ def groupby( dropna=dropna, ) - _shared_docs["pivot"] = """ + _shared_docs[ + "pivot" + ] = """ Return reshaped DataFrame organized by given index / column values. Reshape data (produce a "pivot" table) based on column values. 
Uses @@ -9242,7 +9245,9 @@ def pivot( return pivot(self, index=index, columns=columns, values=values) - _shared_docs["pivot_table"] = """ + _shared_docs[ + "pivot_table" + ] = """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects @@ -10519,9 +10524,11 @@ def _append( index = Index( [other.name], - name=self.index.names - if isinstance(self.index, MultiIndex) - else self.index.name, + name=( + self.index.names + if isinstance(self.index, MultiIndex) + else self.index.name + ), ) row_df = other.to_frame().T # infer_objects is needed for