From 9d29eeadae51b80be8d3575cc6a98e9b21807e74 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 5 Jan 2023 16:25:48 +0000 Subject: [PATCH 1/3] refactor and fix bug --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/tslib.pyx | 238 ++++++++++++------------- pandas/tests/tools/test_to_datetime.py | 29 ++- 3 files changed, 139 insertions(+), 129 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index ea6a832d25058..828ca4f7e6d93 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -824,6 +824,7 @@ Datetimelike - Bug in :func:`to_datetime` was giving incorrect results when using ``format='%Y%m%d'`` and ``errors='ignore'`` (:issue:`26493`) - Bug in :func:`to_datetime` was failing to parse date strings ``'today'`` and ``'now'`` if ``format`` was not ISO8601 (:issue:`50359`) - Bug in :func:`Timestamp.utctimetuple` raising a ``TypeError`` (:issue:`32174`) +- Bug in :func:`to_datetime` was raising ``ValueError`` when parsing mixed-offset :class:`Timestamp` with ``errors='ignore'`` (:issue:`50585`) Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index bfb5d81a3cc76..d83603b2adffa 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -505,144 +505,134 @@ cpdef array_to_datetime( result = np.empty(n, dtype="M8[ns]") iresult = result.view("i8") - try: - for i in range(n): - val = values[i] - - try: - if checknull_with_nat_and_na(val): - iresult[i] = NPY_NAT + for i in range(n): + val = values[i] - elif PyDateTime_Check(val): - if val.tzinfo is not None: - found_tz = True - else: - found_naive = True - tz_out = convert_timezone( - val.tzinfo, - tz_out, - found_naive, - found_tz, - utc_convert, - ) - result[i] = parse_pydatetime(val, &dts, utc_convert) + try: + if checknull_with_nat_and_na(val): + iresult[i] = NPY_NAT - elif PyDate_Check(val): - iresult[i] = pydate_to_dt64(val, &dts) - check_dts_bounds(&dts) + elif PyDateTime_Check(val): + if val.tzinfo 
is not None: + found_tz = True + else: + found_naive = True + tz_out = convert_timezone( + val.tzinfo, + tz_out, + found_naive, + found_tz, + utc_convert, + ) + result[i] = parse_pydatetime(val, &dts, utc_convert) + + elif PyDate_Check(val): + iresult[i] = pydate_to_dt64(val, &dts) + check_dts_bounds(&dts) - elif is_datetime64_object(val): - iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) + elif is_datetime64_object(val): + iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) - elif is_integer_object(val) or is_float_object(val): - # these must be ns unit by-definition + elif is_integer_object(val) or is_float_object(val): + # these must be ns unit by-definition - if val != val or val == NPY_NAT: - iresult[i] = NPY_NAT - elif is_raise or is_ignore: - iresult[i] = val - else: - # coerce - # we now need to parse this as if unit='ns' - # we can ONLY accept integers at this point - # if we have previously (or in future accept - # datetimes/strings, then we must coerce) - try: - iresult[i] = cast_from_unit(val, "ns") - except OverflowError: - iresult[i] = NPY_NAT - - elif isinstance(val, str): - # string - if type(val) is not str: - # GH#32264 np.str_ object - val = str(val) - - if len(val) == 0 or val in nat_strings: + if val != val or val == NPY_NAT: + iresult[i] = NPY_NAT + elif is_raise or is_ignore: + iresult[i] = val + else: + # coerce + # we now need to parse this as if unit='ns' + # we can ONLY accept integers at this point + # if we have previously (or in future accept + # datetimes/strings, then we must coerce) + try: + iresult[i] = cast_from_unit(val, "ns") + except OverflowError: iresult[i] = NPY_NAT - continue - string_to_dts_failed = string_to_dts( - val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, None, False - ) - if string_to_dts_failed: - # An error at this point is a _parsing_ error - # specifically _not_ OutOfBoundsDatetime - if parse_today_now(val, &iresult[i], utc): - continue - - try: - py_dt = parse_datetime_string(val, - 
dayfirst=dayfirst, - yearfirst=yearfirst) - # If the dateutil parser returned tzinfo, capture it - # to check if all arguments have the same tzinfo - tz = py_dt.utcoffset() - - except (ValueError, OverflowError): - if is_coerce: - iresult[i] = NPY_NAT - continue - raise TypeError( - f"invalid string coercion to datetime " - f"for \"{val}\", at position {i}" - ) + elif isinstance(val, str): + # string + if type(val) is not str: + # GH#32264 np.str_ object + val = str(val) - if tz is not None: - seen_datetime_offset = True - # dateutil timezone objects cannot be hashed, so - # store the UTC offsets in seconds instead - out_tzoffset_vals.add(tz.total_seconds()) - else: - # Add a marker for naive string, to track if we are - # parsing mixed naive and aware strings - out_tzoffset_vals.add("naive") - - _ts = convert_datetime_to_tsobject(py_dt, None) - iresult[i] = _ts.value - if not string_to_dts_failed: - # No error reported by string_to_dts, pick back up - # where we left off - value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) - if out_local == 1: - seen_datetime_offset = True - # Store the out_tzoffset in seconds - # since we store the total_seconds of - # dateutil.tz.tzoffset objects - out_tzoffset_vals.add(out_tzoffset * 60.) 
- tz = timezone(timedelta(minutes=out_tzoffset)) - value = tz_localize_to_utc_single(value, tz) - out_local = 0 - out_tzoffset = 0 - else: - # Add a marker for naive string, to track if we are - # parsing mixed naive and aware strings - out_tzoffset_vals.add("naive") - iresult[i] = value - check_dts_bounds(&dts) + if len(val) == 0 or val in nat_strings: + iresult[i] = NPY_NAT + continue - else: - if is_coerce: - iresult[i] = NPY_NAT + string_to_dts_failed = string_to_dts( + val, &dts, &out_bestunit, &out_local, + &out_tzoffset, False, None, False + ) + if string_to_dts_failed: + # An error at this point is a _parsing_ error + # specifically _not_ OutOfBoundsDatetime + if parse_today_now(val, &iresult[i], utc): + continue + + py_dt = parse_datetime_string(val, + dayfirst=dayfirst, + yearfirst=yearfirst) + # If the dateutil parser returned tzinfo, capture it + # to check if all arguments have the same tzinfo + tz = py_dt.utcoffset() + + if tz is not None: + seen_datetime_offset = True + # dateutil timezone objects cannot be hashed, so + # store the UTC offsets in seconds instead + out_tzoffset_vals.add(tz.total_seconds()) else: - raise TypeError(f"{type(val)} is not convertible to datetime") + # Add a marker for naive string, to track if we are + # parsing mixed naive and aware strings + out_tzoffset_vals.add("naive") - except OutOfBoundsDatetime as ex: - ex.args = (f"{ex}, at position {i}",) - if is_coerce: - iresult[i] = NPY_NAT - continue - raise + _ts = convert_datetime_to_tsobject(py_dt, None) + iresult[i] = _ts.value + else: + # No error reported by string_to_dts, pick back up + # where we left off + value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) + if out_local == 1: + seen_datetime_offset = True + # Store the out_tzoffset in seconds + # since we store the total_seconds of + # dateutil.tz.tzoffset objects + out_tzoffset_vals.add(out_tzoffset * 60.) 
+ tz = timezone(timedelta(minutes=out_tzoffset)) + value = tz_localize_to_utc_single(value, tz) + out_local = 0 + out_tzoffset = 0 + else: + # Add a marker for naive string, to track if we are + # parsing mixed naive and aware strings + out_tzoffset_vals.add("naive") + iresult[i] = value + check_dts_bounds(&dts) - except OutOfBoundsDatetime: - if is_raise: - raise + else: + raise TypeError(f"{type(val)} is not convertible to datetime") - return ignore_errors_out_of_bounds_fallback(values), tz_out + except (OutOfBoundsDatetime,) as ex: + ex.args = (f"{ex}, at position {i}",) + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise + if isinstance(ex, OutOfBoundsDatetime): + return ignore_errors_out_of_bounds_fallback(values), tz_out + return values, None - except TypeError: - return _array_to_datetime_object(values, errors, dayfirst, yearfirst) + except (TypeError, OverflowError, ValueError) as ex: + ex.args = (f"{ex}, at position {i}",) + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise + return values, None if seen_datetime_offset and not utc_convert: # GH#17697 diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index d6e862ed11d36..315fc2f8de582 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1094,8 +1094,9 @@ def test_to_datetime_tz(self, cache): ) tm.assert_index_equal(result, expected) - def test_to_datetime_tz_mixed_raises(self, cache): - # mixed tzs will raise + def test_to_datetime_tz_mixed(self, cache): + # mixed tzs will raise if errors='raise' + # https://github.com/pandas-dev/pandas/issues/50585 arr = [ Timestamp("2013-01-01 13:00:00", tz="US/Pacific"), Timestamp("2013-01-02 14:00:00", tz="US/Eastern"), @@ -1107,6 +1108,21 @@ def test_to_datetime_tz_mixed_raises(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(arr, cache=cache) + result = to_datetime(arr, cache=cache, errors="ignore") + expected = 
Index( + [ + Timestamp("2013-01-01 13:00:00-08:00"), + Timestamp("2013-01-02 14:00:00-05:00"), + ], + dtype="object", + ) + tm.assert_index_equal(result, expected) + result = to_datetime(arr, cache=cache, errors="coerce") + expected = DatetimeIndex( + ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[ns, US/Pacific]" + ) + tm.assert_index_equal(result, expected) + def test_to_datetime_different_offsets(self, cache): # inspired by asv timeseries.ToDatetimeNONISO8601 benchmark # see GH-26097 for more @@ -1540,7 +1556,10 @@ def test_to_datetime_malformed_raise(self): ts_strings = ["200622-12-31", "111111-24-11"] with pytest.raises( ValueError, - match=r"^hour must be in 0\.\.23: 111111-24-11, at position 1$", + match=( + r"^offset must be a timedelta strictly between " + r"-timedelta\(hours=24\) and timedelta\(hours=24\)., at position 0$" + ), ): with tm.assert_produces_warning( UserWarning, match="Could not infer format" @@ -2381,8 +2400,8 @@ def test_to_datetime_unprocessable_input(self, cache): expected = Index(np.array([1, "1"], dtype="O")) tm.assert_equal(result, expected) - msg = "invalid string coercion to datetime" - with pytest.raises(TypeError, match=msg): + msg = '^Given date string "1" not likely a datetime, at position 1$' + with pytest.raises(ValueError, match=msg): to_datetime([1, "1"], errors="raise", cache=cache) def test_to_datetime_unhashable_input(self, cache): From a334a8412d23a63eff6562001a0eb638228b42a7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 5 Jan 2023 18:08:28 +0000 Subject: [PATCH 2/3] simplify --- pandas/_libs/tslib.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d83603b2adffa..65829bc431a8c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -614,16 +614,14 @@ cpdef array_to_datetime( else: raise TypeError(f"{type(val)} is not convertible to datetime") - except (OutOfBoundsDatetime,) as ex: + except OutOfBoundsDatetime 
as ex: ex.args = (f"{ex}, at position {i}",) if is_coerce: iresult[i] = NPY_NAT continue elif is_raise: raise - if isinstance(ex, OutOfBoundsDatetime): - return ignore_errors_out_of_bounds_fallback(values), tz_out - return values, None + return ignore_errors_out_of_bounds_fallback(values), tz_out except (TypeError, OverflowError, ValueError) as ex: ex.args = (f"{ex}, at position {i}",) From 1a7d888c0dfe4d905c15a7ca89908097636224d9 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 5 Jan 2023 18:12:30 +0000 Subject: [PATCH 3/3] whatsnew notes --- doc/source/whatsnew/v2.0.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 828ca4f7e6d93..962cebfd866ba 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -105,7 +105,6 @@ Other enhancements - :meth:`DataFrame.plot.hist` now recognizes ``xlabel`` and ``ylabel`` arguments (:issue:`49793`) - Improved error message in :func:`to_datetime` for non-ISO8601 formats, informing users about the position of the first error (:issue:`50361`) - Improved error message when trying to align :class:`DataFrame` objects (for example, in :func:`DataFrame.compare`) to clarify that "identically labelled" refers to both index and columns (:issue:`50083`) -- Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) - .. 
--------------------------------------------------------------------------- @@ -786,6 +785,8 @@ Performance improvements - Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`) - Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`) - Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`) +- Performance improvement in :func:`to_datetime` when format is given or can be inferred (:issue:`50465`) +- Performance improvement in :func:`read_csv` when passing :func:`to_datetime` lambda-function to ``date_parser`` and inputs have mixed timezone offsets (:issue:`35296`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: