-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: parsing mixed-offset Timestamps with errors='ignore' and no format raises #50586
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -505,144 +505,132 @@ cpdef array_to_datetime( | |
result = np.empty(n, dtype="M8[ns]") | ||
iresult = result.view("i8") | ||
|
||
try: | ||
for i in range(n): | ||
val = values[i] | ||
|
||
try: | ||
if checknull_with_nat_and_na(val): | ||
iresult[i] = NPY_NAT | ||
for i in range(n): | ||
val = values[i] | ||
|
||
elif PyDateTime_Check(val): | ||
if val.tzinfo is not None: | ||
found_tz = True | ||
else: | ||
found_naive = True | ||
tz_out = convert_timezone( | ||
val.tzinfo, | ||
tz_out, | ||
found_naive, | ||
found_tz, | ||
utc_convert, | ||
) | ||
result[i] = parse_pydatetime(val, &dts, utc_convert) | ||
try: | ||
if checknull_with_nat_and_na(val): | ||
iresult[i] = NPY_NAT | ||
|
||
elif PyDate_Check(val): | ||
iresult[i] = pydate_to_dt64(val, &dts) | ||
check_dts_bounds(&dts) | ||
elif PyDateTime_Check(val): | ||
if val.tzinfo is not None: | ||
found_tz = True | ||
else: | ||
found_naive = True | ||
tz_out = convert_timezone( | ||
val.tzinfo, | ||
tz_out, | ||
found_naive, | ||
found_tz, | ||
utc_convert, | ||
) | ||
result[i] = parse_pydatetime(val, &dts, utc_convert) | ||
|
||
elif PyDate_Check(val): | ||
iresult[i] = pydate_to_dt64(val, &dts) | ||
check_dts_bounds(&dts) | ||
|
||
elif is_datetime64_object(val): | ||
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) | ||
elif is_datetime64_object(val): | ||
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) | ||
|
||
elif is_integer_object(val) or is_float_object(val): | ||
# these must be ns unit by-definition | ||
elif is_integer_object(val) or is_float_object(val): | ||
# these must be ns unit by-definition | ||
|
||
if val != val or val == NPY_NAT: | ||
iresult[i] = NPY_NAT | ||
elif is_raise or is_ignore: | ||
iresult[i] = val | ||
else: | ||
# coerce | ||
# we now need to parse this as if unit='ns' | ||
# we can ONLY accept integers at this point | ||
# if we have previously (or in future accept | ||
# datetimes/strings, then we must coerce) | ||
try: | ||
iresult[i] = cast_from_unit(val, "ns") | ||
except OverflowError: | ||
iresult[i] = NPY_NAT | ||
|
||
elif isinstance(val, str): | ||
# string | ||
if type(val) is not str: | ||
# GH#32264 np.str_ object | ||
val = str(val) | ||
|
||
if len(val) == 0 or val in nat_strings: | ||
if val != val or val == NPY_NAT: | ||
iresult[i] = NPY_NAT | ||
elif is_raise or is_ignore: | ||
iresult[i] = val | ||
else: | ||
# coerce | ||
# we now need to parse this as if unit='ns' | ||
# we can ONLY accept integers at this point | ||
# if we have previously (or in future accept | ||
# datetimes/strings, then we must coerce) | ||
try: | ||
iresult[i] = cast_from_unit(val, "ns") | ||
except OverflowError: | ||
iresult[i] = NPY_NAT | ||
continue | ||
|
||
string_to_dts_failed = string_to_dts( | ||
val, &dts, &out_bestunit, &out_local, | ||
&out_tzoffset, False, None, False | ||
) | ||
if string_to_dts_failed: | ||
# An error at this point is a _parsing_ error | ||
# specifically _not_ OutOfBoundsDatetime | ||
if parse_today_now(val, &iresult[i], utc): | ||
continue | ||
|
||
try: | ||
py_dt = parse_datetime_string(val, | ||
dayfirst=dayfirst, | ||
yearfirst=yearfirst) | ||
# If the dateutil parser returned tzinfo, capture it | ||
# to check if all arguments have the same tzinfo | ||
tz = py_dt.utcoffset() | ||
|
||
except (ValueError, OverflowError): | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
continue | ||
raise TypeError( | ||
f"invalid string coercion to datetime " | ||
f"for \"{val}\", at position {i}" | ||
) | ||
elif isinstance(val, str): | ||
# string | ||
if type(val) is not str: | ||
# GH#32264 np.str_ object | ||
val = str(val) | ||
|
||
if tz is not None: | ||
seen_datetime_offset = True | ||
# dateutil timezone objects cannot be hashed, so | ||
# store the UTC offsets in seconds instead | ||
out_tzoffset_vals.add(tz.total_seconds()) | ||
else: | ||
# Add a marker for naive string, to track if we are | ||
# parsing mixed naive and aware strings | ||
out_tzoffset_vals.add("naive") | ||
|
||
_ts = convert_datetime_to_tsobject(py_dt, None) | ||
iresult[i] = _ts.value | ||
if not string_to_dts_failed: | ||
# No error reported by string_to_dts, pick back up | ||
# where we left off | ||
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) | ||
if out_local == 1: | ||
seen_datetime_offset = True | ||
# Store the out_tzoffset in seconds | ||
# since we store the total_seconds of | ||
# dateutil.tz.tzoffset objects | ||
out_tzoffset_vals.add(out_tzoffset * 60.) | ||
tz = timezone(timedelta(minutes=out_tzoffset)) | ||
value = tz_localize_to_utc_single(value, tz) | ||
out_local = 0 | ||
out_tzoffset = 0 | ||
else: | ||
# Add a marker for naive string, to track if we are | ||
# parsing mixed naive and aware strings | ||
out_tzoffset_vals.add("naive") | ||
iresult[i] = value | ||
check_dts_bounds(&dts) | ||
if len(val) == 0 or val in nat_strings: | ||
iresult[i] = NPY_NAT | ||
continue | ||
|
||
else: | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
Comment on lines
-625
to
-627
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. taking this outside the loop |
||
string_to_dts_failed = string_to_dts( | ||
val, &dts, &out_bestunit, &out_local, | ||
&out_tzoffset, False, None, False | ||
) | ||
if string_to_dts_failed: | ||
# An error at this point is a _parsing_ error | ||
# specifically _not_ OutOfBoundsDatetime | ||
if parse_today_now(val, &iresult[i], utc): | ||
continue | ||
|
||
py_dt = parse_datetime_string(val, | ||
dayfirst=dayfirst, | ||
yearfirst=yearfirst) | ||
# If the dateutil parser returned tzinfo, capture it | ||
# to check if all arguments have the same tzinfo | ||
tz = py_dt.utcoffset() | ||
|
||
if tz is not None: | ||
seen_datetime_offset = True | ||
# dateutil timezone objects cannot be hashed, so | ||
# store the UTC offsets in seconds instead | ||
out_tzoffset_vals.add(tz.total_seconds()) | ||
else: | ||
raise TypeError(f"{type(val)} is not convertible to datetime") | ||
# Add a marker for naive string, to track if we are | ||
# parsing mixed naive and aware strings | ||
out_tzoffset_vals.add("naive") | ||
|
||
except OutOfBoundsDatetime as ex: | ||
ex.args = (f"{ex}, at position {i}",) | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
continue | ||
raise | ||
_ts = convert_datetime_to_tsobject(py_dt, None) | ||
iresult[i] = _ts.value | ||
else: | ||
Comment on lines
-604
to
+593
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we have here:
Might as well change the second one to |
||
# No error reported by string_to_dts, pick back up | ||
# where we left off | ||
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) | ||
if out_local == 1: | ||
seen_datetime_offset = True | ||
# Store the out_tzoffset in seconds | ||
# since we store the total_seconds of | ||
# dateutil.tz.tzoffset objects | ||
out_tzoffset_vals.add(out_tzoffset * 60.) | ||
tz = timezone(timedelta(minutes=out_tzoffset)) | ||
value = tz_localize_to_utc_single(value, tz) | ||
out_local = 0 | ||
out_tzoffset = 0 | ||
else: | ||
# Add a marker for naive string, to track if we are | ||
# parsing mixed naive and aware strings | ||
out_tzoffset_vals.add("naive") | ||
iresult[i] = value | ||
check_dts_bounds(&dts) | ||
|
||
except OutOfBoundsDatetime: | ||
if is_raise: | ||
raise | ||
else: | ||
raise TypeError(f"{type(val)} is not convertible to datetime") | ||
|
||
return ignore_errors_out_of_bounds_fallback(values), tz_out | ||
except OutOfBoundsDatetime as ex: | ||
ex.args = (f"{ex}, at position {i}",) | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
continue | ||
elif is_raise: | ||
raise | ||
return ignore_errors_out_of_bounds_fallback(values), tz_out | ||
|
||
except TypeError: | ||
return _array_to_datetime_object(values, errors, dayfirst, yearfirst) | ||
except (TypeError, OverflowError, ValueError) as ex: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. currenly, If it's OK, I'll address that in a separate PR, to not make the diff too big here |
||
ex.args = (f"{ex}, at position {i}",) | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
continue | ||
elif is_raise: | ||
raise | ||
return values, None | ||
|
||
if seen_datetime_offset and not utc_convert: | ||
# GH#17697 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1094,8 +1094,9 @@ def test_to_datetime_tz(self, cache): | |
) | ||
tm.assert_index_equal(result, expected) | ||
|
||
def test_to_datetime_tz_mixed_raises(self, cache): | ||
# mixed tzs will raise | ||
def test_to_datetime_tz_mixed(self, cache): | ||
# mixed tzs will raise if errors='raise' | ||
# https://github.com/pandas-dev/pandas/issues/50585 | ||
arr = [ | ||
Timestamp("2013-01-01 13:00:00", tz="US/Pacific"), | ||
Timestamp("2013-01-02 14:00:00", tz="US/Eastern"), | ||
|
@@ -1107,6 +1108,21 @@ def test_to_datetime_tz_mixed_raises(self, cache): | |
with pytest.raises(ValueError, match=msg): | ||
to_datetime(arr, cache=cache) | ||
|
||
result = to_datetime(arr, cache=cache, errors="ignore") | ||
expected = Index( | ||
[ | ||
Timestamp("2013-01-01 13:00:00-08:00"), | ||
Timestamp("2013-01-02 14:00:00-05:00"), | ||
], | ||
dtype="object", | ||
) | ||
tm.assert_index_equal(result, expected) | ||
result = to_datetime(arr, cache=cache, errors="coerce") | ||
expected = DatetimeIndex( | ||
["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[ns, US/Pacific]" | ||
) | ||
tm.assert_index_equal(result, expected) | ||
|
||
def test_to_datetime_different_offsets(self, cache): | ||
# inspired by asv timeseries.ToDatetimeNONISO8601 benchmark | ||
# see GH-26097 for more | ||
|
@@ -1540,7 +1556,10 @@ def test_to_datetime_malformed_raise(self): | |
ts_strings = ["200622-12-31", "111111-24-11"] | ||
with pytest.raises( | ||
ValueError, | ||
match=r"^hour must be in 0\.\.23: 111111-24-11, at position 1$", | ||
match=( | ||
r"^offset must be a timedelta strictly between " | ||
r"-timedelta\(hours=24\) and timedelta\(hours=24\)., at position 0$" | ||
), | ||
Comment on lines
1556
to
+1562
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is a fun one. on 1.5.x, if you only parse the first input, you get the "offset must be a timedelta strictly..." message, but not if you include the second one In [6]: to_datetime(["200622-12-31"])
ValueError: offset must be a timedelta strictly between -timedelta(hours=24) and timedelta(hours=24).
In [7]: to_datetime(["200622-12-31", "111111-24-11"])
ParserError: hour must be in 0..23: 111111-24-11 Now, you'll get the same message in both cases |
||
): | ||
with tm.assert_produces_warning( | ||
UserWarning, match="Could not infer format" | ||
|
@@ -2381,8 +2400,8 @@ def test_to_datetime_unprocessable_input(self, cache): | |
|
||
expected = Index(np.array([1, "1"], dtype="O")) | ||
tm.assert_equal(result, expected) | ||
msg = "invalid string coercion to datetime" | ||
with pytest.raises(TypeError, match=msg): | ||
msg = '^Given date string "1" not likely a datetime, at position 1$' | ||
with pytest.raises(ValueError, match=msg): | ||
to_datetime([1, "1"], errors="raise", cache=cache) | ||
|
||
def test_to_datetime_unhashable_input(self, cache): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
taking this outside the loop (also, why was a
ValueError
being re-raised asTypeError
? Doesn't look right to me)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I would think this should have just raised the original ValueError