From d0675f4f48972eb2523e3e25386c68cd0807e796 Mon Sep 17 00:00:00 2001 From: Robert Schmidtke Date: Fri, 16 Feb 2024 13:01:41 +0100 Subject: [PATCH 1/6] attempt failing test --- pandas/tests/tslibs/test_array_to_datetime.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index f8939d1d8ccd4..9b0a66609dbf2 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -272,6 +272,17 @@ def test_to_datetime_barely_out_of_bounds(): tslib.array_to_datetime(arr) +def test_to_datetime_barely_inside_bounds(): + # see gh-57150 + # + # Close enough to bounds that scaling micros to nanos overflows + # but adding nanos would result in an in-bounds datetime. + arr = np.array(["1677-09-21T00:12:43.145224193"], dtype=object) + result, _ = tslib.array_to_datetime(arr) + expected = ["1677-09-21T00:12:43.145224193"] + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[ns]")) + + class SubDatetime(datetime): pass From b4c825da2f87131b379ee81119ccdc3ba8237fec Mon Sep 17 00:00:00 2001 From: Robert Schmidtke Date: Fri, 16 Feb 2024 13:53:40 +0100 Subject: [PATCH 2/6] expand test for demonstration purposes --- pandas/tests/tslibs/test_array_to_datetime.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 9b0a66609dbf2..97995175af8ed 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -272,15 +272,21 @@ def test_to_datetime_barely_out_of_bounds(): tslib.array_to_datetime(arr) -def test_to_datetime_barely_inside_bounds(): +@pytest.mark.parametrize( + "timestamp", + [ + # Close enough to bounds that scaling micros to nanos overflows + # but adding nanos would result in an in-bounds datetime. + "1677-09-21T00:12:43.145224193", + "1677-09-21T00:12:43.145224999", + # this always worked + "1677-09-21T00:12:43.145225000", + ], +) +def test_to_datetime_barely_inside_bounds(timestamp): # see gh-57150 - # - # Close enough to bounds that scaling micros to nanos overflows - # but adding nanos would result in an in-bounds datetime. - arr = np.array(["1677-09-21T00:12:43.145224193"], dtype=object) - result, _ = tslib.array_to_datetime(arr) - expected = ["1677-09-21T00:12:43.145224193"] - tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[ns]")) + result, _ = tslib.array_to_datetime(np.array([timestamp], dtype=object)) + tm.assert_numpy_array_equal(result, np.array([timestamp], dtype="M8[ns]")) class SubDatetime(datetime): From 58bc627994196ebf82e1b21bcdd1750d72d797bd Mon Sep 17 00:00:00 2001 From: Robert Schmidtke Date: Fri, 16 Feb 2024 14:02:08 +0100 Subject: [PATCH 3/6] fix near-minimum timestamp overflow when scaling from microseconds to nanoseconds --- doc/source/whatsnew/v2.2.1.rst | 1 + .../_libs/src/vendored/numpy/datetime/np_datetime.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 9733aff0e6eb5..0f17b418523be 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -15,6 +15,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) +- Fixed regression causing overflow for near-minimum timestamps (:issue:`57150`) - Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 06e3251db8315..b949412caad3f 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -481,6 +481,17 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, } if (base == NPY_FR_ns) { + // for near-minimum timestamps, scaling microseconds to nanoseconds + // overflows but adding nanoseconds puts the timestamp back in a valid range + const int64_t min_nanoseconds = NPY_MIN_INT64 + 1; + if (microseconds == min_nanoseconds / 1000 - 1) { + // calculate final nanoseconds from minimum without scaling microseconds + int64_t nanoseconds = min_nanoseconds; + PD_CHECK_OVERFLOW(checked_int64_add( + nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds)); + return nanoseconds; + } + int64_t nanoseconds; PD_CHECK_OVERFLOW( scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); From d9cf1a611560b68024181a5f73802cda0cdb1962 Mon Sep 17 00:00:00 2001 From: Robert Schmidtke Date: Fri, 16 Feb 2024 15:49:59 +0100 Subject: [PATCH 4/6] minor refactor --- .../src/vendored/numpy/datetime/np_datetime.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index b949412caad3f..a243f281c8fe1 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -481,23 +481,22 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, } if (base == NPY_FR_ns) { + int64_t nanoseconds; + // for near-minimum timestamps, scaling microseconds to nanoseconds // overflows but adding nanoseconds puts the timestamp back in a valid range const int64_t min_nanoseconds = NPY_MIN_INT64 + 1; if (microseconds == min_nanoseconds / 1000 - 1) { // calculate final nanoseconds from minimum without scaling microseconds - int64_t nanoseconds = min_nanoseconds; PD_CHECK_OVERFLOW(checked_int64_add( - nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds)); - return nanoseconds; + min_nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds)); + } else { + PD_CHECK_OVERFLOW( + scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); + PD_CHECK_OVERFLOW( + checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); } - int64_t nanoseconds; - PD_CHECK_OVERFLOW( - scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); - PD_CHECK_OVERFLOW( - checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); - return nanoseconds; } From af7c8eaf30db87aa921139350c37d7a7ffb64059 Mon Sep 17 00:00:00 2001 From: Robert Schmidtke Date: Wed, 21 Feb 2024 10:04:15 +0100 Subject: [PATCH 5/6] add comments around specifically handling near-minimum microsecond and nanosecond timestamps --- .../src/vendored/numpy/datetime/np_datetime.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index a243f281c8fe1..ebbea117e20e5 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -483,14 +483,27 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, if (base == NPY_FR_ns) { int64_t nanoseconds; - // for near-minimum timestamps, scaling microseconds to nanoseconds - // overflows but adding nanoseconds puts the timestamp back in a valid range + // Minimum valid timestamp in nanoseconds (1677-09-21 00:12:43.145224193). const int64_t min_nanoseconds = NPY_MIN_INT64 + 1; + + // For near-minimum timestamps (1677-09-21 00:12:43.145224193 through + // 1677-09-21 00:12:43.145224999), scaling microseconds to nanoseconds + // overflows (1677-09-21 00:12:43.145224 -> 1677-09-21 00:12:43.145224000), + // but adding nanoseconds can put the timestamp back in a valid range for + // nanosecond parts >= 193. + + // (min_nanoseconds / 1000 - 1) * 1000 would overflow, so do not scale. + // This happens if microseconds corresponds to 1677-09-21 00:12:43.145224. if (microseconds == min_nanoseconds / 1000 - 1) { - // calculate final nanoseconds from minimum without scaling microseconds + // Instead, use minimum nanosecond timestamp as base and offset it with + // nanosecond delta between dts and the minimum (_NS_MIN_DTS.ps = 193000). + // If dts->ps >= _NS_MIN_DTS.ps, timestamp is at/above the minimum. + // If dts->ps < _NS_MIN_DTS.ps, timestamp is below minimum and overflows. PD_CHECK_OVERFLOW(checked_int64_add( min_nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds)); } else { + // microseconds does not correspond to near-minimum timestamp, use default + // scaling and addition approach, handling any other overflows. PD_CHECK_OVERFLOW( scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); PD_CHECK_OVERFLOW( From 9c7c0d31309393a6021c31819ecf22209326e564 Mon Sep 17 00:00:00 2001 From: Robert Schmidtke Date: Thu, 22 Feb 2024 09:17:00 +0100 Subject: [PATCH 6/6] consolidate comments --- .../src/vendored/numpy/datetime/np_datetime.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index ebbea117e20e5..277d01807f2f3 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -485,25 +485,12 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, // Minimum valid timestamp in nanoseconds (1677-09-21 00:12:43.145224193). const int64_t min_nanoseconds = NPY_MIN_INT64 + 1; - - // For near-minimum timestamps (1677-09-21 00:12:43.145224193 through - // 1677-09-21 00:12:43.145224999), scaling microseconds to nanoseconds - // overflows (1677-09-21 00:12:43.145224 -> 1677-09-21 00:12:43.145224000), - // but adding nanoseconds can put the timestamp back in a valid range for - // nanosecond parts >= 193. - - // (min_nanoseconds / 1000 - 1) * 1000 would overflow, so do not scale. - // This happens if microseconds corresponds to 1677-09-21 00:12:43.145224. if (microseconds == min_nanoseconds / 1000 - 1) { - // Instead, use minimum nanosecond timestamp as base and offset it with - // nanosecond delta between dts and the minimum (_NS_MIN_DTS.ps = 193000). - // If dts->ps >= _NS_MIN_DTS.ps, timestamp is at/above the minimum. - // If dts->ps < _NS_MIN_DTS.ps, timestamp is below minimum and overflows. + // For values within one microsecond of min_nanoseconds, use it as base + // and offset it with nanosecond delta to avoid overflow during scaling. PD_CHECK_OVERFLOW(checked_int64_add( min_nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds)); } else { - // microseconds does not correspond to near-minimum timestamp, use default - // scaling and addition approach, handling any other overflows. PD_CHECK_OVERFLOW( scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); PD_CHECK_OVERFLOW(