From d65adc7aaaed20a552bcae0a0bd059bb03e37b66 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 31 Oct 2021 18:56:55 -0700 Subject: [PATCH 1/6] BUG: Series[int8][:3] = range(3) unnecessary upcasting to int64 --- pandas/core/dtypes/cast.py | 15 +++++++ .../dtypes/cast/test_can_hold_element.py | 41 +++++++++++++++++++ pandas/tests/series/indexing/test_setitem.py | 41 +++++++++++++++++++ 3 files changed, 97 insertions(+) create mode 100644 pandas/tests/dtypes/cast/test_can_hold_element.py diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d8c58d1eaf4c7..4cd6a8e3d3677 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2197,6 +2197,9 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if dtype.kind in ["i", "u"]: + if isinstance(element, range): + return _dtype_can_hold_range(element, dtype) + if tipo is not None: if tipo.kind not in ["i", "u"]: if is_float(element) and element.is_integer(): @@ -2209,6 +2212,7 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: # i.e. nullable IntegerDtype; we can put this into an ndarray # losslessly iff it has no NAs return not element._mask.any() + return True # We have not inferred an integer from the dtype @@ -2249,3 +2253,14 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: return isinstance(element, bytes) and len(element) <= dtype.itemsize raise NotImplementedError(dtype) + + +def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: + """ + maybe_infer_dtype_type infers to int64 (and float64 for very large endpoints), + but in many cases a range can be held by a smaller integer dtype. + Check if this is one of those cases. + """ + if not len(rng): + return True + return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype) diff --git a/pandas/tests/dtypes/cast/test_can_hold_element.py b/pandas/tests/dtypes/cast/test_can_hold_element.py new file mode 100644 index 0000000000000..e61915d595c41 --- /dev/null +++ b/pandas/tests/dtypes/cast/test_can_hold_element.py @@ -0,0 +1,41 @@ +import numpy as np + +from pandas.core.dtypes.cast import can_hold_element + + +def test_can_hold_element_range(any_int_numpy_dtype): + dtype = np.dtype(any_int_numpy_dtype) + arr = np.array([], dtype=dtype) + + rng = range(2, 127) + assert can_hold_element(arr, rng) + + # negatives -> can't be held by uint dtypes + rng = range(-2, 127) + if dtype.kind == "i": + assert can_hold_element(arr, rng) + else: + assert not can_hold_element(arr, rng) + + rng = range(2, 255) + if dtype == "int8": + assert not can_hold_element(arr, rng) + else: + assert can_hold_element(arr, rng) + + rng = range(-255, 65537) + if dtype.kind == "u": + assert not can_hold_element(arr, rng) + elif dtype.itemsize < 4: + assert not can_hold_element(arr, rng) + else: + assert can_hold_element(arr, rng) + + # empty + rng = range(-(10 ** 10), -(10 ** 10)) + assert len(rng) == 0 + # assert can_hold_element(arr, rng) + + rng = range(10 ** 10, 10 ** 10) + assert len(rng) == 0 + assert can_hold_element(arr, rng) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 5521bee09b19b..d001e6c13d78a 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_list_like + from pandas import ( Categorical, DataFrame, @@ -622,6 +624,16 @@ def test_mask_key(self, obj, key, expected, val, indexer_sli): tm.assert_series_equal(obj, expected) def test_series_where(self, obj, key, expected, val, is_inplace): + if is_list_like(val) and len(val) < len(obj): + # Series.where is not valid here + if isinstance(val, range): + return + + # FIXME: The remaining TestSetitemDT64IntoInt that go through here + # are relying on technically-incorrect behavior because Block.where + # uses np.putmask instead of expressions.where in those cases, + # which has different length-checking semantics. + mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -973,6 +985,35 @@ def expected(self, obj, val): return Series(idx) +class TestSetitemRangeIntoIntegerSeries(SetitemCastingEquivalents): + # Setting a range with sufficiently-small integers into small-itemsize + # integer dtypes should not need to upcast + + @pytest.fixture + def obj(self, any_int_numpy_dtype): + dtype = np.dtype(any_int_numpy_dtype) + ser = Series(range(5), dtype=dtype) + return ser + + @pytest.fixture + def val(self): + return range(2, 4) + + @pytest.fixture + def key(self): + return slice(0, 2) + + @pytest.fixture + def expected(self, any_int_numpy_dtype): + dtype = np.dtype(any_int_numpy_dtype) + exp = Series([2, 3, 2, 3, 4], dtype=dtype) + return exp + + @pytest.fixture + def inplace(self): + return True + + def test_setitem_int_as_positional_fallback_deprecation(): # GH#42215 deprecated falling back to positional on __setitem__ with an # int not contained in the index From 66ba2a02dbeaf873654a6c624678c46ddcdfed71 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 31 Oct 2021 18:58:25 -0700 Subject: [PATCH 2/6] whatsnew --- doc/source/whatsnew/v1.4.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2a718fdcf16e7..19b7ac0588ce8 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -537,7 +537,8 @@ Indexing - Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")`` (:issue:`43869`) - Bug in setting a scalar :class:`Interval` value into a :class:`Series` with ``IntervalDtype`` when the scalar's sides are floats and the values' sides are integers (:issue:`44201`) - Bug when setting string-backed :class:`Categorical` values that can be parsed to datetimes into a :class:`DatetimeArray` or :class:`Series` or :class:`DataFrame` column backed by :class:`DatetimeArray` failing to parse these strings (:issue:`44236`) - +- Bug in :meth:`Series.__setitem__` with an integer dtype other than ``int64`` setting with a ``range`` object unnecessarily upcasting to ``int64`` (:issue:`??`) +- Missing ^^^^^^^ From 622cffde07cfadf9f48b3b9801aafed44af53211 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 31 Oct 2021 18:59:46 -0700 Subject: [PATCH 3/6] GH refs --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/tests/dtypes/cast/test_can_hold_element.py | 1 + pandas/tests/series/indexing/test_setitem.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 19b7ac0588ce8..0430db0c9dda7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -537,7 +537,7 @@ Indexing - Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")`` (:issue:`43869`) - Bug in setting a scalar :class:`Interval` value into a :class:`Series` with ``IntervalDtype`` when the scalar's sides are floats and the values' sides are integers (:issue:`44201`) - Bug when setting string-backed :class:`Categorical` values that can be parsed to datetimes into a :class:`DatetimeArray` or :class:`Series` or :class:`DataFrame` column backed by :class:`DatetimeArray` failing to parse these strings (:issue:`44236`) -- Bug in :meth:`Series.__setitem__` with an integer dtype other than ``int64`` setting with a ``range`` object unnecessarily upcasting to ``int64`` (:issue:`??`) +- Bug in :meth:`Series.__setitem__` with an integer dtype other than ``int64`` setting with a ``range`` object unnecessarily upcasting to ``int64`` (:issue:`44261`) - Missing diff --git a/pandas/tests/dtypes/cast/test_can_hold_element.py b/pandas/tests/dtypes/cast/test_can_hold_element.py index e61915d595c41..c4776f2a1e143 100644 --- a/pandas/tests/dtypes/cast/test_can_hold_element.py +++ b/pandas/tests/dtypes/cast/test_can_hold_element.py @@ -4,6 +4,7 @@ def test_can_hold_element_range(any_int_numpy_dtype): + # GH#44261 dtype = np.dtype(any_int_numpy_dtype) arr = np.array([], dtype=dtype) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index d001e6c13d78a..5f0710dfbb85a 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -986,8 +986,8 @@ def expected(self, obj, val): class TestSetitemRangeIntoIntegerSeries(SetitemCastingEquivalents): - # Setting a range with sufficiently-small integers into small-itemsize - # integer dtypes should not need to upcast + # GH#44261 Setting a range with sufficiently-small integers into + # small-itemsize integer dtypes should not need to upcast @pytest.fixture def obj(self, any_int_numpy_dtype): From 0f6e5eb52d6d5a3ab601274cb5724ec9fa39c294 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 1 Nov 2021 12:34:20 -0700 Subject: [PATCH 4/6] BUG: broadcasting listlike values in Series.__setitem__ GH#44265 --- pandas/core/series.py | 17 +++++++++++++++++ pandas/tests/series/indexing/test_where.py | 22 +++++++++++++++------- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index b67f16008bb13..8e345cbfb4fa6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1096,9 +1096,26 @@ def __setitem__(self, key, value) -> None: if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) key = np.asarray(key, dtype=bool) + + if ( + is_list_like(value) + and len(value) != len(self) + and not isinstance(value, Series) + and not is_object_dtype(self.dtype) + ): + # Series will be reindexed to have matching length inside + # _where call below + # GH#44265 + indexer = key.nonzero()[0] + self._set_values(indexer, value) + return + + # otherwise with listlike other we interpret series[mask] = other + # as series[mask] = other[mask] try: self._where(~key, value, inplace=True) except InvalidIndexError: + # test_where_dups self.iloc[key] = value return diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index fc9d3a1e1e6ab..1013d670945c8 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -88,7 +88,7 @@ def test_where_unsafe(): s = Series(np.arange(10)) mask = s > 5 - msg = "cannot assign mismatch length to masked array" + msg = "cannot set using a list-like indexer with a different length than the value" with pytest.raises(ValueError, match=msg): s[mask] = [5, 4, 3, 2, 1] @@ -161,13 +161,10 @@ def test_where_error(): tm.assert_series_equal(s, expected) # failures - msg = "cannot assign mismatch length to masked array" + msg = "cannot set using a list-like indexer with a different length than the value" with pytest.raises(ValueError, match=msg): s[[True, False]] = [0, 2, 3] - msg = ( - "NumPy boolean array indexing assignment cannot assign 0 input " - "values to the 1 output values where the mask is true" - ) + with pytest.raises(ValueError, match=msg): s[[True, False]] = [] @@ -298,6 +295,7 @@ def test_where_setitem_invalid(): "box", [lambda x: np.array([x]), lambda x: [x], lambda x: (x,)] ) def test_broadcast(size, mask, item, box): + # GH#8801, GH#4195 selection = np.resize(mask, size) data = np.arange(size, dtype=float) @@ -309,7 +307,17 @@ def test_broadcast(size, mask, item, box): ) s = Series(data) - s[selection] = box(item) + + if selection.sum() != 1: + msg = ( + "cannot set using a list-like indexer with a different " + "length than the value" + ) + with pytest.raises(ValueError, match=msg): + # GH#44265 + s[selection] = box(item) + + s[selection] = item tm.assert_series_equal(s, expected) s = Series(data) From 69e13ce803a841d1cd39d7217921ef85afd868e7 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Nov 2021 18:50:21 -0700 Subject: [PATCH 5/6] whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 0430db0c9dda7..6e103aa8def5b 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -538,6 +538,7 @@ Indexing - Bug in setting a scalar :class:`Interval` value into a :class:`Series` with ``IntervalDtype`` when the scalar's sides are floats and the values' sides are integers (:issue:`44201`) - Bug when setting string-backed :class:`Categorical` values that can be parsed to datetimes into a :class:`DatetimeArray` or :class:`Series` or :class:`DataFrame` column backed by :class:`DatetimeArray` failing to parse these strings (:issue:`44236`) - Bug in :meth:`Series.__setitem__` with an integer dtype other than ``int64`` setting with a ``range`` object unnecessarily upcasting to ``int64`` (:issue:`44261`) +- Bug in :meth:`Series.__setitem__` with a boolean mask indexer setting a listlike value of length 1 incorrectly broadcasting that value (:issue:`44265`) - Missing From a860225f0f427ca43d267922b3e4009d58403700 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 3 Nov 2021 21:04:03 -0700 Subject: [PATCH 6/6] separate tests --- pandas/tests/series/indexing/test_setitem.py | 40 ++++++++++++++++++++ pandas/tests/series/indexing/test_where.py | 9 ----- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 5f0710dfbb85a..4706025b70db6 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1064,3 +1064,43 @@ def test_setitem_with_bool_indexer(): df.loc[[True, False, False], "a"] = 10 expected = DataFrame({"a": [10, 2, 3]}) tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("size", range(2, 6)) +@pytest.mark.parametrize( + "mask", [[True, False, False, False, False], [True, False], [False]] +) +@pytest.mark.parametrize( + "item", [2.0, np.nan, np.finfo(float).max, np.finfo(float).min] +) +# Test numpy arrays, lists and tuples as the input to be +# broadcast +@pytest.mark.parametrize( + "box", [lambda x: np.array([x]), lambda x: [x], lambda x: (x,)] +) +def test_setitem_bool_indexer_dont_broadcast_length1_values(size, mask, item, box): + # GH#44265 + # see also tests.series.indexing.test_where.test_broadcast + + selection = np.resize(mask, size) + + data = np.arange(size, dtype=float) + + ser = Series(data) + + if selection.sum() != 1: + msg = ( + "cannot set using a list-like indexer with a different " + "length than the value" + ) + with pytest.raises(ValueError, match=msg): + # GH#44265 + ser[selection] = box(item) + else: + # In this corner case setting is equivalent to setting with the unboxed + # item + ser[selection] = box(item) + + expected = Series(np.arange(size, dtype=float)) + expected[selection] = item + tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 1013d670945c8..88b75164d2f3e 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -308,15 +308,6 @@ def test_broadcast(size, mask, item, box): s = Series(data) - if selection.sum() != 1: - msg = ( - "cannot set using a list-like indexer with a different " - "length than the value" - ) - with pytest.raises(ValueError, match=msg): - # GH#44265 - s[selection] = box(item) - s[selection] = item tm.assert_series_equal(s, expected)