-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG refactor datetime parsing and fix 8 bugs #50242
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG refactor datetime parsing and fix 8 bugs #50242
Conversation
adff421
to
73a909d
Compare
37d8b15
to
7617774
Compare
pandas/_libs/tslibs/strptime.pyx
Outdated
if (iso_format and not (fmt == "%Y%m%d" and len(val) != 8)): | ||
# There is a fast-path for ISO8601-formatted strings. | ||
# BUT for %Y%m%d, it only works if the string is 8-digits long. | ||
string_to_dts_failed = string_to_dts( | ||
val, &dts, &out_bestunit, &out_local, | ||
&out_tzoffset, False, fmt, exact | ||
) | ||
if string_to_dts_failed: | ||
# An error at this point is a _parsing_ error | ||
# specifically _not_ OutOfBoundsDatetime | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
continue | ||
raise ValueError( | ||
f"time data \"{val}\" at position {i} doesn't " | ||
f"match format \"{fmt}\"" | ||
) | ||
# No error reported by string_to_dts, pick back up | ||
# where we left off | ||
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) | ||
if out_local == 1: | ||
# Store the out_tzoffset in seconds | ||
# since we store the total_seconds of | ||
# dateutil.tz.tzoffset objects | ||
# out_tzoffset_vals.add(out_tzoffset * 60.) | ||
tz = timezone(timedelta(minutes=out_tzoffset)) | ||
result_timezone[i] = tz | ||
# value = tz_localize_to_utc_single(value, tz) | ||
out_local = 0 | ||
out_tzoffset = 0 | ||
iresult[i] = value | ||
try: | ||
check_dts_bounds(&dts) | ||
except ValueError: | ||
if is_coerce: | ||
iresult[i] = NPY_NAT | ||
continue | ||
raise | ||
continue |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this pretty-much matches
Lines 598 to 668 in 6598797
string_to_dts_failed = string_to_dts( | |
val, &dts, &out_bestunit, &out_local, | |
&out_tzoffset, False, format, exact | |
) | |
if string_to_dts_failed: | |
# An error at this point is a _parsing_ error | |
# specifically _not_ OutOfBoundsDatetime | |
if _parse_today_now(val, &iresult[i], utc): | |
continue | |
elif require_iso8601: | |
# if requiring iso8601 strings, skip trying | |
# other formats | |
if is_coerce: | |
iresult[i] = NPY_NAT | |
continue | |
elif is_raise: | |
raise ValueError( | |
f"time data \"{val}\" at position {i} doesn't " | |
f"match format \"{format}\"" | |
) | |
return values, tz_out | |
try: | |
py_dt = parse_datetime_string(val, | |
dayfirst=dayfirst, | |
yearfirst=yearfirst) | |
# If the dateutil parser returned tzinfo, capture it | |
# to check if all arguments have the same tzinfo | |
tz = py_dt.utcoffset() | |
except (ValueError, OverflowError): | |
if is_coerce: | |
iresult[i] = NPY_NAT | |
continue | |
raise TypeError( | |
f"invalid string coercion to datetime for \"{val}\" " | |
f"at position {i}" | |
) | |
if tz is not None: | |
seen_datetime_offset = True | |
# dateutil timezone objects cannot be hashed, so | |
# store the UTC offsets in seconds instead | |
out_tzoffset_vals.add(tz.total_seconds()) | |
else: | |
# Add a marker for naive string, to track if we are | |
# parsing mixed naive and aware strings | |
out_tzoffset_vals.add("naive") | |
_ts = convert_datetime_to_tsobject(py_dt, None) | |
iresult[i] = _ts.value | |
if not string_to_dts_failed: | |
# No error reported by string_to_dts, pick back up | |
# where we left off | |
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) | |
if out_local == 1: | |
seen_datetime_offset = True | |
# Store the out_tzoffset in seconds | |
# since we store the total_seconds of | |
# dateutil.tz.tzoffset objects | |
out_tzoffset_vals.add(out_tzoffset * 60.) | |
tz = timezone(timedelta(minutes=out_tzoffset)) | |
value = tz_localize_to_utc_single(value, tz) | |
out_local = 0 | |
out_tzoffset = 0 | |
else: | |
# Add a marker for naive string, to track if we are | |
# parsing mixed naive and aware strings | |
out_tzoffset_vals.add("naive") | |
iresult[i] = value | |
check_dts_bounds(&dts) |
but it's simpler as we don't need to try parse_datetime_string
. That's because if we got here, we know that we're expecting some specific ISO8601 format, so if string_to_dts
can't parse it, then we need to coerce/raise/ignore, but there's no need to try other formats
datetime.datetime(1300, 1, 1, 0, 0) | ||
'13000101' |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], | ||
dtype='datetime64[ns, UTC-01:00]', freq=None) | ||
Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object') |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
# The +9 format for offsets is supported by dateutil, | ||
# but don't round-trip, see https://github.com/pandas-dev/pandas/issues/48921 | ||
("2011-12-30T00:00:00+9", None), | ||
("2011-12-30T00:00:00+09", None), | ||
("2011-12-30T00:00:00+9", "%Y-%m-%dT%H:%M:%S%z"), | ||
("2011-12-30T00:00:00+09", "%Y-%m-%dT%H:%M:%S%z"), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is nice! In
pandas/pandas/_libs/tslibs/parsing.pyx
Lines 1003 to 1007 in 113bdb3
try: | |
array_strptime(np.asarray([dt_str], dtype=object), guessed_format) | |
except ValueError: | |
# Doesn't parse, so this can't be the correct format. | |
return None |
we check that array_strptime
can parse the first non-null element with the guessed format. Now that array_strptime
can parse both ISO and non-ISO formats, we're expanding on the list of formats which can be guessed!
This comment was marked as outdated.
This comment was marked as outdated.
f72d3d6
to
3283b81
Compare
a177975
to
d37e743
Compare
d37e743
to
0c95207
Compare
0c95207
to
3257a31
Compare
3257a31
to
2f8fade
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
lgtm. have comments on a couple things I think can happen as follow ups
""" | ||
excluded_formats = ["%Y%m"] | ||
|
||
for date_sep in [" ", "/", "\\", "-", ".", ""]: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Instead of a loop can you express this as a regular expression? Seems like it would help the performance that way as well
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah nevermind I see this is the way it is currently written - something to consider for another PR though. My guess is can only help
@@ -550,25 +536,13 @@ cpdef array_to_datetime( | |||
|
|||
string_to_dts_failed = string_to_dts( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The error messaging here is a bit confusing to me - looks like string_to_dts
is already labeled ?except -1
. Is there a reason why Cython doesn't propogate an error before your check of if string_to_dts_failed
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's because here want_exc
is False
:
pandas/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
Lines 665 to 671 in a37b78d
parse_error: | |
if (want_exc) { | |
PyErr_Format(PyExc_ValueError, | |
"Error parsing datetime string \"%s\" at position %d", str, | |
(int)(substr - str)); | |
} | |
return -1; |
The only place where it's True is
Lines 145 to 162 in 2f54a47
object[::1] res_flat = result.ravel() # should NOT be a copy | |
cnp.flatiter it = cnp.PyArray_IterNew(values) | |
if na_rep is None: | |
na_rep = "NaT" | |
if tz is None: | |
# if we don't have a format nor tz, then choose | |
# a format based on precision | |
basic_format = format is None | |
if basic_format: | |
reso_obj = get_resolution(values, tz=tz, reso=reso) | |
show_ns = reso_obj == Resolution.RESO_NS | |
show_us = reso_obj == Resolution.RESO_US | |
show_ms = reso_obj == Resolution.RESO_MS | |
elif format == "%Y-%m-%d %H:%M:%S": | |
# Same format as default, but with hardcoded precision (s) |
which is only a testing function. So, perhaps the ?except -1
can just be removed, and the testing function removed (I think it would be better to test to_datetime
directly).
I'd keep that to a separate PR anyway, but thanks for catching this!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cool thanks for review. Yea I'd be OK with your suggestion in a separate PR. Always good to clean this up - not sure we've handled consistently in the past
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice!
Thanks! Can I ask that we get #50366 in first though? That'll reduce the diff in this one |
9c5c378
to
392d239
Compare
Cool, that's in, and I've rebased. Thanks for your reviews and approvals - @jbrockmendel any further thoughts? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice, thanks for being persistent on this
Thanks! @WillAyd @mroeschke any further comments, or good-to-merge? |
Thanks @MarcoGorelli |
# Store the out_tzoffset in seconds | ||
# since we store the total_seconds of | ||
# dateutil.tz.tzoffset objects | ||
tz = timezone(timedelta(minutes=out_tzoffset)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
in the analogous block in tslib we then adjust value using tz_localize_to_utc. do we need to do that here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it happens a few levels up, here:
pandas/pandas/core/tools/datetimes.py
Lines 335 to 344 in ef0eaa4
tz_results = np.empty(len(result), dtype=object) | |
for zone in unique(timezones): | |
mask = timezones == zone | |
dta = DatetimeArray(result[mask]).tz_localize(zone) | |
if utc: | |
if dta.tzinfo is None: | |
dta = dta.tz_localize("utc") | |
else: | |
dta = dta.tz_convert("utc") | |
tz_results[mask] = dta |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
makes sense, thanks. would it be viable to use the same pattern so we can share more code?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
that would indeed be good, I'll see what I can do
doc/source/whatsnew/vX.X.X.rst
file if fixing a bug or adding a new feature.this'd solve a number of issues
work in progress
Performance: this maintains the fastness for ISO formats:
upstream/main:
here
Demo of how this addresses #17410
Now real difference for non-ISO formats:
1.5.2:
here:
note
gonna try to get #50361 in first, so marking as draft for now