From 388637676eb15abf495a172596f1e58ed566e1cd Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 14 Jun 2022 11:55:10 +0200 Subject: [PATCH 1/4] BUG: Series.setitem losing precision when enlarging --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/indexing.py | 19 ++++++++++++++++--- pandas/tests/series/indexing/test_setitem.py | 14 ++++++++++++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 5891eeea98cbb..81af4fe117dc6 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -823,6 +823,7 @@ Indexing - Bug in :meth:`Series.__setitem__` where setting :attr:`NA` into a numeric-dtpye :class:`Series` would incorrectly upcast to object-dtype rather than treating the value as ``np.nan`` (:issue:`44199`) - Bug in :meth:`Series.__setitem__` with ``datetime64[ns]`` dtype, an all-``False`` boolean mask, and an incompatible value incorrectly casting to ``object`` instead of retaining ``datetime64[ns]`` dtype (:issue:`45967`) - Bug in :meth:`Index.__getitem__` raising ``ValueError`` when indexer is from boolean dtype with ``NA`` (:issue:`45806`) +- Bug in :meth:`Series.__setitem__` losing precision when enlarging :class:`Series` with scalar (:issue:`32346`) - Bug in :meth:`Series.mask` with ``inplace=True`` or setting values with a boolean mask with small integer dtypes incorrectly raising (:issue:`45750`) - Bug in :meth:`DataFrame.mask` with ``inplace=True`` and ``ExtensionDtype`` columns incorrectly raising (:issue:`45577`) - Bug in getting a column from a DataFrame with an object-dtype row index with datetime-like values: the resulting Series now preserves the exact object-dtype Index from the parent DataFrame (:issue:`42950`) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 20ac0fedc28d1..06d6a1fb01d9b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -21,7 +21,10 @@ from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.cast import can_hold_element +from pandas.core.dtypes.cast import ( + can_hold_element, + maybe_promote, +) from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, @@ -2083,8 +2086,18 @@ def _setitem_with_indexer_missing(self, indexer, value): # We get only here with loc, so can hard code return self._setitem_with_indexer(new_indexer, value, "loc") - # this preserves dtype of the value - new_values = Series([value])._values + # this preserves dtype of the value and of the object + if isna(value): + new_dtype = self.obj.dtype + elif not self.obj.empty and not is_object_dtype(self.obj.dtype): + # We should not cast, if we have object dtype because we can + # set timedeltas into object series + curr_dtype = self.obj.dtype + curr_dtype = getattr(curr_dtype, "numpy_dtype", curr_dtype) + new_dtype = maybe_promote(curr_dtype, value)[0] + else: + new_dtype = None + new_values = Series([value], dtype=new_dtype)._values if len(self.obj._values): # GH#22717 handle casting compatibility that np.concatenate # does incorrectly diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index e2a5517066ad9..738bd414649f4 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -534,6 +534,20 @@ def test_setitem_not_contained(self, string_series): expected = concat([string_series, app]) tm.assert_series_equal(ser, expected) + def test_setitem_keep_precision(self, any_numeric_ea_dtype): + # GH#32346 + ser = Series([1, 2], dtype=any_numeric_ea_dtype) + ser[2] = 10 + expected = Series([1, 2, 10], dtype=any_numeric_ea_dtype) + tm.assert_series_equal(ser, expected) + + def test_setitem_enlarge_with_na(self): + # GH#32346 + ser = Series([1, 2], dtype="Int64") + ser[2] = NA + expected = Series([1, 2, NA], dtype="Int64") + tm.assert_series_equal(ser, expected) + def test_setitem_scalar_into_readonly_backing_data(): # GH#14359: test that you cannot mutate a read only buffer From f986959680529dc663d3701fe088682a4dd554e7 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 14 Jun 2022 21:32:29 +0200 Subject: [PATCH 2/4] Fix nan case and add test --- pandas/core/indexing.py | 2 +- pandas/tests/series/indexing/test_setitem.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 06d6a1fb01d9b..a96814df9ffa2 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2087,7 +2087,7 @@ def _setitem_with_indexer_missing(self, indexer, value): return self._setitem_with_indexer(new_indexer, value, "loc") # this preserves dtype of the value and of the object - if isna(value): + if isna(value == value): new_dtype = self.obj.dtype elif not self.obj.empty and not is_object_dtype(self.obj.dtype): # We should not cast, if we have object dtype because we can diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 738bd414649f4..4710eae0968ed 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -548,6 +548,13 @@ def test_setitem_enlarge_with_na(self): expected = Series([1, 2, NA], dtype="Int64") tm.assert_series_equal(ser, expected) + def test_setitem_enlarge_with_nan(self): + # GH#32346 + ser = Series([1, 2]) + ser[2] = np.nan + expected = Series([1, 2, np.nan]) + tm.assert_series_equal(ser, expected) + def test_setitem_scalar_into_readonly_backing_data(): # GH#14359: test that you cannot mutate a read only buffer From 1c5c0d4369cc3a1bbd076080dcc0c527c9da281b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 24 Jun 2022 13:21:19 +0200 Subject: [PATCH 3/4] Handle multiple na cases --- pandas/core/indexing.py | 11 +++++++++-- pandas/tests/series/indexing/test_setitem.py | 20 ++++++++++++++++---- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index a96814df9ffa2..a8988b159f9f8 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -44,7 +44,9 @@ ) from pandas.core.dtypes.missing import ( infer_fill_value, + is_valid_na_for_dtype, isna, + na_value_for_dtype, ) from pandas.core import algorithms as algos @@ -2087,8 +2089,11 @@ def _setitem_with_indexer_missing(self, indexer, value): return self._setitem_with_indexer(new_indexer, value, "loc") # this preserves dtype of the value and of the object - if isna(value == value): - new_dtype = self.obj.dtype + if is_valid_na_for_dtype(value, self.obj.dtype): + value = na_value_for_dtype(self.obj.dtype, compat=False) + new_dtype = maybe_promote(self.obj.dtype, value)[0] + elif not is_valid_na_for_dtype(value, self.obj.dtype): + new_dtype = None elif not self.obj.empty and not is_object_dtype(self.obj.dtype): # We should not cast, if we have object dtype because we can # set timedeltas into object series @@ -2097,7 +2102,9 @@ def _setitem_with_indexer_missing(self, indexer, value): new_dtype = maybe_promote(curr_dtype, value)[0] else: new_dtype = None + new_values = Series([value], dtype=new_dtype)._values + if len(self.obj._values): # GH#22717 handle casting compatibility that np.concatenate # does incorrectly diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 4710eae0968ed..65a0ef1ab2e79 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -541,11 +541,23 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype): expected = Series([1, 2, 10], dtype=any_numeric_ea_dtype) tm.assert_series_equal(ser, expected) - def test_setitem_enlarge_with_na(self): + @pytest.mark.parametrize("indexer", [1, 2]) + @pytest.mark.parametrize( + "na, target_na, dtype, target_dtype", + [ + (NA, NA, "Int64", "Int64"), + (NA, np.nan, "int64", "float64"), + (NaT, NaT, "int64", "object"), + (np.nan, NA, "Int64", "Int64"), + (np.nan, NA, "Float64", "Float64"), + ], + ) + def test_setitem_enlarge_with_na(self, na, target_na, dtype, target_dtype, indexer): # GH#32346 - ser = Series([1, 2], dtype="Int64") - ser[2] = NA - expected = Series([1, 2, NA], dtype="Int64") + ser = Series([1, 2], dtype=dtype) + ser[indexer] = na + expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na] + expected = Series(expected_values, dtype=target_dtype) tm.assert_series_equal(ser, expected) def test_setitem_enlarge_with_nan(self): From d83391a77771d6e30dc424a527b394b2cf2a6a47 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Fri, 24 Jun 2022 13:31:35 +0200 Subject: [PATCH 4/4] Remove test --- pandas/core/indexing.py | 2 +- pandas/tests/series/indexing/test_setitem.py | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index a8988b159f9f8..67242aeeb49c6 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2092,7 +2092,7 @@ def _setitem_with_indexer_missing(self, indexer, value): if is_valid_na_for_dtype(value, self.obj.dtype): value = na_value_for_dtype(self.obj.dtype, compat=False) new_dtype = maybe_promote(self.obj.dtype, value)[0] - elif not is_valid_na_for_dtype(value, self.obj.dtype): + elif isna(value): new_dtype = None elif not self.obj.empty and not is_object_dtype(self.obj.dtype): # We should not cast, if we have object dtype because we can diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 65a0ef1ab2e79..b73aacae18bc5 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -550,6 +550,7 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype): (NaT, NaT, "int64", "object"), (np.nan, NA, "Int64", "Int64"), (np.nan, NA, "Float64", "Float64"), + (np.nan, np.nan, "int64", "float64"), ], ) def test_setitem_enlarge_with_na(self, na, target_na, dtype, target_dtype, indexer): @@ -560,13 +561,6 @@ def test_setitem_enlarge_with_na(self, na, target_na, dtype, target_dtype, index expected = Series(expected_values, dtype=target_dtype) tm.assert_series_equal(ser, expected) - def test_setitem_enlarge_with_nan(self): - # GH#32346 - ser = Series([1, 2]) - ser[2] = np.nan - expected = Series([1, 2, np.nan]) - tm.assert_series_equal(ser, expected) - def test_setitem_scalar_into_readonly_backing_data(): # GH#14359: test that you cannot mutate a read only buffer