From 804ec68e14696aba6bec9f5cf16acea8f75536d2 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 1 Jun 2021 11:34:40 -0700 Subject: [PATCH 1/6] DEPR: DataFrame(floaty, dtype=inty) match Series --- pandas/core/construction.py | 18 +++++++++++++++++ pandas/core/dtypes/cast.py | 8 +++++++- pandas/core/internals/construction.py | 24 ++++++----------------- pandas/tests/frame/methods/test_to_csv.py | 6 ++++-- pandas/tests/frame/test_constructors.py | 21 ++++++++++++++++++++ 5 files changed, 56 insertions(+), 21 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 92f94f4424ee8..0ea8af0de678b 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -12,6 +12,7 @@ Sequence, cast, ) +import warnings import numpy as np import numpy.ma as ma @@ -27,6 +28,7 @@ Dtype, DtypeObj, ) +from pandas.errors import IntCastingNaNError from pandas.core.dtypes.base import ( ExtensionDtype, @@ -523,7 +525,23 @@ def sanitize_array( # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) + except IntCastingNaNError: + subarr = np.array(data, copy=copy) except ValueError: + if not raise_cast_failure: + # i.e. called via DataFrame constructor + warnings.warn( + "In a future version, passing float-dtype values and an " + "integer dtype to DataFrame will retain floating dtype " + "if they cannot be cast losslessly (matching Series behavior). " + "To retain the old behavior, use DataFrame(data).astype(dtype)", + FutureWarning, + stacklevel=4, + ) + # GH#40110 until the deprecation is enforced, we _dont_ + # ignore the dtype for DataFrame, and _do_ cast even though + # it is lossy. + return np.array(data, dtype=dtype, copy=copy) subarr = np.array(data, copy=copy) else: # we will try to copy by-definition here diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 40883dd8f747b..aeccce510a5f0 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2088,7 +2088,13 @@ def maybe_cast_to_integer_array( if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): raise OverflowError("Trying to coerce negative values to unsigned integers") - if is_float_dtype(arr.dtype) or is_object_dtype(arr.dtype): + if is_float_dtype(arr.dtype): + if not np.isfinite(arr).all(): + raise IntCastingNaNError( + "Cannot convert non-finite values (NA or inf) to integer" + ) + raise ValueError("Trying to coerce float values to integers") + if is_object_dtype(arr.dtype): raise ValueError("Trying to coerce float values to integers") diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 5c2bed109e3bf..70164241d4765 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -22,11 +22,9 @@ DtypeObj, Manager, ) -from pandas.errors import IntCastingNaNError from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, - construct_1d_ndarray_preserving_na, dict_compat, maybe_cast_to_datetime, maybe_convert_platform, @@ -306,22 +304,12 @@ def ndarray_to_mgr( shape = values.shape flat = values.ravel() - if not is_integer_dtype(dtype): - # TODO: skipping integer_dtype is needed to keep the tests passing, - # not clear it is correct - # Note: we really only need _try_cast, but keeping to exposed funcs - values = sanitize_array( - flat, None, dtype=dtype, copy=copy, raise_cast_failure=True - ) - else: - try: - values = construct_1d_ndarray_preserving_na( - flat, dtype=dtype, copy=False - ) - except IntCastingNaNError: - # following Series, we ignore the dtype and retain floating - # values instead of casting nans to meaningless ints - pass + # GH#40110 see similar check inside sanitize_array + rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f") + + values = sanitize_array( + flat, None, dtype=dtype, copy=copy, raise_cast_failure=rcf + ) values = values.reshape(shape) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 769b08373b890..5156d0371e9b7 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -714,7 +714,9 @@ def create_cols(name): np.random.randn(100, 5), dtype="float64", columns=create_cols("float") ) df_int = DataFrame( - np.random.randn(100, 5), dtype="int64", columns=create_cols("int") + np.random.randn(100, 5).astype("int64"), + dtype="int64", + columns=create_cols("int"), ) df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) df_object = DataFrame( @@ -765,7 +767,7 @@ def test_to_csv_dups_cols(self): tm.assert_frame_equal(result, df) df_float = DataFrame(np.random.randn(1000, 3), dtype="float64") - df_int = DataFrame(np.random.randn(1000, 3), dtype="int64") + df_int = DataFrame(np.random.randn(1000, 3)).astype("int64") df_bool = DataFrame(True, index=df_float.index, columns=range(3)) df_object = DataFrame("foo", index=df_float.index, columns=range(3)) df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index d118a376b56ec..6fca9b20eb9be 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2479,6 +2479,27 @@ def test_nested_list_columns(self): tm.assert_frame_equal(result, expected) +class TestDataFrameConstructorWithDtypeCoercion: + def test_floating_values_integer_dtype(self): + # GH#40110 make DataFrame behavior with arraylike floating data and + # inty dtype match Series behavior + + arr = np.random.randn(10, 5) + + msg = "if they cannot be cast losslessly" + with tm.assert_produces_warning(FutureWarning, match=msg): + DataFrame(arr, dtype="i8") + + with tm.assert_produces_warning(None): + # if they can be cast losslessly, no warning + DataFrame(arr.round(), dtype="i8") + + # with NaNs, we already have the correct behavior, so no warning + arr[0, 0] = np.nan + with tm.assert_produces_warning(None): + DataFrame(arr, dtype="i8") + + class TestDataFrameConstructorWithDatetimeTZ: @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_construction_preserves_tzaware_dtypes(self, tz): From 75a04fa942ee37f5f0877071755e90ecc1bcc1fb Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 1 Jun 2021 11:34:40 -0700 Subject: [PATCH 2/6] DEPR: DataFrame(floaty, dtype=inty) match Series --- pandas/core/construction.py | 17 ++++++++++++++++ pandas/core/dtypes/cast.py | 8 +++++++- pandas/core/internals/construction.py | 24 ++++++----------------- pandas/tests/frame/methods/test_to_csv.py | 6 ++++-- pandas/tests/frame/test_constructors.py | 21 ++++++++++++++++++++ 5 files changed, 55 insertions(+), 21 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index ff73bc227fdb2..f4926d3b05e1f 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -24,6 +24,7 @@ Dtype, DtypeObj, ) +from pandas.errors import IntCastingNaNError from pandas.core.dtypes.base import ( ExtensionDtype, @@ -516,7 +517,23 @@ def sanitize_array( # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) + except IntCastingNaNError: + subarr = np.array(data, copy=copy) except ValueError: + if not raise_cast_failure: + # i.e. called via DataFrame constructor + warnings.warn( + "In a future version, passing float-dtype values and an " + "integer dtype to DataFrame will retain floating dtype " + "if they cannot be cast losslessly (matching Series behavior). " + "To retain the old behavior, use DataFrame(data).astype(dtype)", + FutureWarning, + stacklevel=4, + ) + # GH#40110 until the deprecation is enforced, we _dont_ + # ignore the dtype for DataFrame, and _do_ cast even though + # it is lossy. + return np.array(data, dtype=dtype, copy=copy) subarr = np.array(data, copy=copy) else: # we will try to copy by-definition here diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5c7211a5d1852..2466e1a2d37c1 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2104,7 +2104,13 @@ def maybe_cast_to_integer_array( if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): raise OverflowError("Trying to coerce negative values to unsigned integers") - if is_float_dtype(arr.dtype) or is_object_dtype(arr.dtype): + if is_float_dtype(arr.dtype): + if not np.isfinite(arr).all(): + raise IntCastingNaNError( + "Cannot convert non-finite values (NA or inf) to integer" + ) + raise ValueError("Trying to coerce float values to integers") + if is_object_dtype(arr.dtype): raise ValueError("Trying to coerce float values to integers") if casted.dtype < arr.dtype: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 46eb138dc74d1..1aedfeb4008e1 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -22,11 +22,9 @@ DtypeObj, Manager, ) -from pandas.errors import IntCastingNaNError from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, - construct_1d_ndarray_preserving_na, dict_compat, maybe_cast_to_datetime, maybe_convert_platform, @@ -305,22 +303,12 @@ def ndarray_to_mgr( shape = values.shape flat = values.ravel() - if not is_integer_dtype(dtype): - # TODO: skipping integer_dtype is needed to keep the tests passing, - # not clear it is correct - # Note: we really only need _try_cast, but keeping to exposed funcs - values = sanitize_array( - flat, None, dtype=dtype, copy=copy, raise_cast_failure=True - ) - else: - try: - values = construct_1d_ndarray_preserving_na( - flat, dtype=dtype, copy=False - ) - except IntCastingNaNError: - # following Series, we ignore the dtype and retain floating - # values instead of casting nans to meaningless ints - pass + # GH#40110 see similar check inside sanitize_array + rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f") + + values = sanitize_array( + flat, None, dtype=dtype, copy=copy, raise_cast_failure=rcf + ) values = values.reshape(shape) diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 769b08373b890..5156d0371e9b7 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -714,7 +714,9 @@ def create_cols(name): np.random.randn(100, 5), dtype="float64", columns=create_cols("float") ) df_int = DataFrame( - np.random.randn(100, 5), dtype="int64", columns=create_cols("int") + np.random.randn(100, 5).astype("int64"), + dtype="int64", + columns=create_cols("int"), ) df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool")) df_object = DataFrame( @@ -765,7 +767,7 @@ def test_to_csv_dups_cols(self): tm.assert_frame_equal(result, df) df_float = DataFrame(np.random.randn(1000, 3), dtype="float64") - df_int = DataFrame(np.random.randn(1000, 3), dtype="int64") + df_int = DataFrame(np.random.randn(1000, 3)).astype("int64") df_bool = DataFrame(True, index=df_float.index, columns=range(3)) df_object = DataFrame("foo", index=df_float.index, columns=range(3)) df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3)) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 784969c199c9f..039df6ae746b5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2484,6 +2484,27 @@ def test_nested_list_columns(self): tm.assert_frame_equal(result, expected) +class TestDataFrameConstructorWithDtypeCoercion: + def test_floating_values_integer_dtype(self): + # GH#40110 make DataFrame behavior with arraylike floating data and + # inty dtype match Series behavior + + arr = np.random.randn(10, 5) + + msg = "if they cannot be cast losslessly" + with tm.assert_produces_warning(FutureWarning, match=msg): + DataFrame(arr, dtype="i8") + + with tm.assert_produces_warning(None): + # if they can be cast losslessly, no warning + DataFrame(arr.round(), dtype="i8") + + # with NaNs, we already have the correct behavior, so no warning + arr[0, 0] = np.nan + with tm.assert_produces_warning(None): + DataFrame(arr, dtype="i8") + + class TestDataFrameConstructorWithDatetimeTZ: @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) def test_construction_preserves_tzaware_dtypes(self, tz): From 97f35887a474eed0d0b18ac20a64db69cfdebd42 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 2 Jun 2021 07:13:51 -0700 Subject: [PATCH 3/6] catch dt64-int case --- pandas/core/dtypes/cast.py | 11 +++++++++++ pandas/tests/frame/methods/test_sort_index.py | 4 ++-- pandas/tests/frame/test_constructors.py | 13 ++++++++++++- pandas/tests/frame/test_nonunique_indexes.py | 2 +- pandas/tests/indexing/test_coercion.py | 5 ++++- 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2466e1a2d37c1..cd25c35051e7e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2124,6 +2124,17 @@ def maybe_cast_to_integer_array( ) return casted + if arr.dtype.kind in ["m", "M"]: + # test_constructor_maskedarray_nonfloat + warnings.warn( + f"Constructing Series or DataFrame from {arr.dtype} values and " + f"dtype={dtype} is deprecated and will raise in a future version. " + "Use values.view(dtype) instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + return + # No known cases that get here, but raising explicitly to cover our bases. raise ValueError(f"values cannot be losslessly cast to {dtype}") diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 6e176310da6b4..dac3c0382df01 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -603,7 +603,7 @@ def test_sort_index_level_large_cardinality(self): # GH#2684 (int64) index = MultiIndex.from_arrays([np.arange(4000)] * 3) - df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64) + df = DataFrame(np.random.randn(4000).astype("int64"), index=index) # it works! result = df.sort_index(level=0) @@ -611,7 +611,7 @@ def test_sort_index_level_large_cardinality(self): # GH#2684 (int32) index = MultiIndex.from_arrays([np.arange(4000)] * 3) - df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32) + df = DataFrame(np.random.randn(4000).astype("int32"), index=index) # it works! result = df.sort_index(level=0) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 039df6ae746b5..6e0013c196760 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -10,6 +10,7 @@ import functools import itertools import re +import warnings import numpy as np import numpy.ma as ma @@ -999,7 +1000,17 @@ def test_constructor_maskedarray_nonfloat(self): assert isna(frame).values.all() # cast type - frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) + msg = r"datetime64\[ns\] values and dtype=int64" + with tm.assert_produces_warning(FutureWarning, match=msg): + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=DeprecationWarning, + message="elementwise comparison failed", + ) + frame = DataFrame( + mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64 + ) assert frame.values.dtype == np.int64 # Check non-masked values diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index c9a39eb460cf4..d010426bee53e 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -294,7 +294,7 @@ def test_multi_dtype2(self): def test_dups_across_blocks(self, using_array_manager): # dups across blocks df_float = DataFrame(np.random.randn(10, 3), dtype="float64") - df_int = DataFrame(np.random.randn(10, 3), dtype="int64") + df_int = DataFrame(np.random.randn(10, 3).astype("int64")) df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns) df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns) df_dt = DataFrame( diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 6f4949267c00c..26f2ba577d184 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -134,7 +134,10 @@ def test_setitem_series_int8(self, val, exp_dtype, request): ) request.node.add_marker(mark) - exp = pd.Series([1, val, 3, 4], dtype=np.int8) + warn = None if exp_dtype is np.int8 else FutureWarning + msg = "Values are too large to be losslessly cast to int8" + with tm.assert_produces_warning(warn, match=msg): + exp = pd.Series([1, val, 3, 4], dtype=np.int8) self._assert_setitem_series_conversion(obj, val, exp, exp_dtype) @pytest.mark.parametrize( From 0a287c99eb9ce3aac9f883dac76865faaee9fe08 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 2 Jun 2021 07:16:33 -0700 Subject: [PATCH 4/6] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 1556c88aaecc6..ef65d5db4376e 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -700,6 +700,7 @@ Deprecations - Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`) - Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`,:issue:`33401`) - Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`) +- Deprecated behavior of :class:`DataFrame` construction with floating data and integer dtype casting even when lossy; in a future version this will remain floating, matching :class:`Series` behavior (:issue:`41770`) - Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`) - In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`) - Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`) From 852bc17fff54e311b147acce0ad01051c1a8e6d7 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 2 Jun 2021 11:10:19 -0700 Subject: [PATCH 5/6] mypy fixup --- pandas/core/construction.py | 1 + pandas/core/dtypes/cast.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index eeaab34389fe8..c877d27fd2392 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -528,6 +528,7 @@ def sanitize_array( # GH#40110 until the deprecation is enforced, we _dont_ # ignore the dtype for DataFrame, and _do_ cast even though # it is lossy. + dtype = cast(np.dtype, dtype) return np.array(data, dtype=dtype, copy=copy) subarr = np.array(data, copy=copy) else: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index cd25c35051e7e..690172d5dd960 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2133,7 +2133,7 @@ def maybe_cast_to_integer_array( FutureWarning, stacklevel=find_stack_level(), ) - return + return casted # No known cases that get here, but raising explicitly to cover our bases. raise ValueError(f"values cannot be losslessly cast to {dtype}") From b57f94a683b6cda8885f67d5611db79a5473535b Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 2 Jun 2021 18:12:00 -0700 Subject: [PATCH 6/6] avoid OverflowError --- asv_bench/benchmarks/frame_methods.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 760da36a30075..c32eda4928da7 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -652,7 +652,9 @@ class Rank: ] def setup(self, dtype): - self.df = DataFrame(np.random.randn(10000, 10), columns=range(10), dtype=dtype) + self.df = DataFrame( + np.random.randn(10000, 10).astype(dtype), columns=range(10), dtype=dtype + ) def time_rank(self, dtype): self.df.rank()