
Commit 612d390

Add use_cftime option to open_dataset (#2759)
* Add use_cftime option to open_dataset
* Remove f-strings
* Fix test-skipping logic and remove 'dummy' from warning
* Note that use_cftime is only relevant for standard calendar dates
* Move use_cftime option to CFDatetimeCoder constructor
1 parent 57cd76d commit 612d390

6 files changed: +420, -75 lines changed

doc/whats-new.rst

Lines changed: 9 additions & 1 deletion
@@ -68,7 +68,15 @@ Enhancements
 - :py:meth:`pandas.Series.dropna` is now supported for a
   :py:class:`pandas.Series` indexed by a :py:class:`~xarray.CFTimeIndex`
   (:issue:`2688`). By `Spencer Clark <https://github.com/spencerkclark>`_.
-
+- :py:meth:`~xarray.open_dataset` now accepts a ``use_cftime`` argument, which
+  can be used to require that ``cftime.datetime`` objects are always used, or
+  never used when decoding dates encoded with a standard calendar. This can be
+  used to ensure consistent date types are returned when using
+  :py:meth:`~xarray.open_mfdataset` (:issue:`1263`) and/or to silence
+  serialization warnings raised if dates from a standard calendar are found to
+  be outside the :py:class:`pandas.Timestamp`-valid range (:issue:`2754`). By
+  `Spencer Clark <https://github.com/spencerkclark>`_.
+
 Bug fixes
 ~~~~~~~~~
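The consistency benefit described in this entry is easiest to see through the public API. A minimal usage sketch, not part of the commit; the file pattern 'output_*.nc' is hypothetical, and open_mfdataset passes extra keyword arguments such as use_cftime through to open_dataset:

    import xarray as xr

    # Hypothetical collection of files in which some time values fall inside
    # the pandas.Timestamp-valid range and some outside it.  With the default
    # use_cftime=None the decoded date type could differ from file to file;
    # forcing use_cftime=True decodes every file to cftime.datetime objects,
    # so the combined dataset is indexed by a single CFTimeIndex.
    ds = xr.open_mfdataset('output_*.nc', use_cftime=True)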

xarray/backends/api.py

Lines changed: 27 additions & 5 deletions
@@ -161,7 +161,7 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
                  mask_and_scale=None, decode_times=True, autoclose=None,
                  concat_characters=True, decode_coords=True, engine=None,
                  chunks=None, lock=None, cache=None, drop_variables=None,
-                 backend_kwargs=None):
+                 backend_kwargs=None, use_cftime=None):
     """Load and decode a dataset from a file or file-like object.

     Parameters
@@ -231,6 +231,16 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
         A dictionary of keyword arguments to pass on to the backend. This
         may be useful when backend options would improve performance or
         allow user control of dataset processing.
+    use_cftime: bool, optional
+        Only relevant if encoded dates come from a standard calendar
+        (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not
+        specified). If None (default), attempt to decode times to
+        ``np.datetime64[ns]`` objects; if this is not possible, decode times to
+        ``cftime.datetime`` objects. If True, always decode times to
+        ``cftime.datetime`` objects, regardless of whether or not they can be
+        represented using ``np.datetime64[ns]`` objects. If False, always
+        decode times to ``np.datetime64[ns]`` objects; if this is not possible
+        raise an error.

     Returns
     -------
@@ -269,7 +279,7 @@ def maybe_decode_store(store, lock=False):
         ds = conventions.decode_cf(
             store, mask_and_scale=mask_and_scale, decode_times=decode_times,
             concat_characters=concat_characters, decode_coords=decode_coords,
-            drop_variables=drop_variables)
+            drop_variables=drop_variables, use_cftime=use_cftime)

         _protect_dataset_variables_inplace(ds, cache)

@@ -284,7 +294,8 @@ def maybe_decode_store(store, lock=False):
         mtime = None
         token = tokenize(filename_or_obj, mtime, group, decode_cf,
                          mask_and_scale, decode_times, concat_characters,
-                         decode_coords, engine, chunks, drop_variables)
+                         decode_coords, engine, chunks, drop_variables,
+                         use_cftime)
         name_prefix = 'open_dataset-%s' % token
         ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)
         ds2._file_obj = ds._file_obj
@@ -360,7 +371,7 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
                    mask_and_scale=None, decode_times=True, autoclose=None,
                    concat_characters=True, decode_coords=True, engine=None,
                    chunks=None, lock=None, cache=None, drop_variables=None,
-                   backend_kwargs=None):
+                   backend_kwargs=None, use_cftime=None):
     """Open an DataArray from a netCDF file containing a single data variable.

     This is designed to read netCDF files with only one data variable. If
@@ -428,6 +439,16 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
         A dictionary of keyword arguments to pass on to the backend. This
         may be useful when backend options would improve performance or
         allow user control of dataset processing.
+    use_cftime: bool, optional
+        Only relevant if encoded dates come from a standard calendar
+        (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not
+        specified). If None (default), attempt to decode times to
+        ``np.datetime64[ns]`` objects; if this is not possible, decode times to
+        ``cftime.datetime`` objects. If True, always decode times to
+        ``cftime.datetime`` objects, regardless of whether or not they can be
+        represented using ``np.datetime64[ns]`` objects. If False, always
+        decode times to ``np.datetime64[ns]`` objects; if this is not possible
+        raise an error.

     Notes
     -----
@@ -450,7 +471,8 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
                            decode_coords=decode_coords, engine=engine,
                            chunks=chunks, lock=lock, cache=cache,
                            drop_variables=drop_variables,
-                           backend_kwargs=backend_kwargs)
+                           backend_kwargs=backend_kwargs,
+                           use_cftime=use_cftime)

     if len(dataset.data_vars) != 1:
         raise ValueError('Given file dataset contains more than one data '
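To make the three decoding modes documented above concrete, here is a small sketch against the public open_dataset API (not part of the diff; 'example.nc' is a hypothetical file whose time variable uses a standard calendar):

    import xarray as xr

    # use_cftime=None (default): decode to np.datetime64[ns] when possible,
    # otherwise fall back to cftime.datetime and warn.
    ds_default = xr.open_dataset('example.nc')

    # use_cftime=True: always decode to cftime.datetime objects.
    ds_cftime = xr.open_dataset('example.nc', use_cftime=True)

    # use_cftime=False: always decode to np.datetime64[ns]; raise an error if
    # the dates cannot be represented in that dtype.
    ds_numpy = xr.open_dataset('example.nc', use_cftime=False)

open_dataarray accepts the same keyword and forwards it to open_dataset, as the last hunk above shows.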

xarray/coding/times.py

Lines changed: 76 additions & 62 deletions
@@ -80,32 +80,7 @@ def _unpack_netcdf_time_units(units):
     return delta_units, ref_date


-def _decode_datetime_with_cftime(num_dates, units, calendar):
-    cftime = _import_cftime()
-
-    if cftime.__name__ == 'cftime':
-        dates = np.asarray(cftime.num2date(num_dates, units, calendar,
-                                           only_use_cftime_datetimes=True))
-    else:
-        # Must be using num2date from an old version of netCDF4 which
-        # does not have the only_use_cftime_datetimes option.
-        dates = np.asarray(cftime.num2date(num_dates, units, calendar))
-
-    if (dates[np.nanargmin(num_dates)].year < 1678 or
-            dates[np.nanargmax(num_dates)].year >= 2262):
-        if calendar in _STANDARD_CALENDARS:
-            warnings.warn(
-                'Unable to decode time axis into full '
-                'numpy.datetime64 objects, continuing using dummy '
-                'cftime.datetime objects instead, reason: dates out '
-                'of range', SerializationWarning, stacklevel=3)
-    else:
-        if calendar in _STANDARD_CALENDARS:
-            dates = cftime_to_nptime(dates)
-    return dates
-
-
-def _decode_cf_datetime_dtype(data, units, calendar):
+def _decode_cf_datetime_dtype(data, units, calendar, use_cftime):
     # Verify that at least the first and last date can be decoded
     # successfully. Otherwise, tracebacks end up swallowed by
     # Dataset.__repr__ when users try to view their lazily decoded array.
@@ -115,7 +90,8 @@ def _decode_cf_datetime_dtype(data, units, calendar):
                                     last_item(values) or [0]])

     try:
-        result = decode_cf_datetime(example_value, units, calendar)
+        result = decode_cf_datetime(example_value, units, calendar,
+                                    use_cftime)
     except Exception:
         calendar_msg = ('the default calendar' if calendar is None
                        else 'calendar %r' % calendar)
@@ -129,7 +105,52 @@ def _decode_cf_datetime_dtype(data, units, calendar):
     return dtype


-def decode_cf_datetime(num_dates, units, calendar=None):
+def _decode_datetime_with_cftime(num_dates, units, calendar):
+    cftime = _import_cftime()
+
+    if cftime.__name__ == 'cftime':
+        return np.asarray(cftime.num2date(num_dates, units, calendar,
+                                          only_use_cftime_datetimes=True))
+    else:
+        # Must be using num2date from an old version of netCDF4 which
+        # does not have the only_use_cftime_datetimes option.
+        return np.asarray(cftime.num2date(num_dates, units, calendar))
+
+
+def _decode_datetime_with_pandas(flat_num_dates, units, calendar):
+    if calendar not in _STANDARD_CALENDARS:
+        raise OutOfBoundsDatetime(
+            'Cannot decode times from a non-standard calendar, {!r}, using '
+            'pandas.'.format(calendar))
+
+    delta, ref_date = _unpack_netcdf_time_units(units)
+    delta = _netcdf_to_numpy_timeunit(delta)
+    try:
+        ref_date = pd.Timestamp(ref_date)
+    except ValueError:
+        # ValueError is raised by pd.Timestamp for non-ISO timestamp
+        # strings, in which case we fall back to using cftime
+        raise OutOfBoundsDatetime
+
+    # fixes: https://github.com/pydata/pandas/issues/14068
+    # these lines check if the the lowest or the highest value in dates
+    # cause an OutOfBoundsDatetime (Overflow) error
+    with warnings.catch_warnings():
+        warnings.filterwarnings('ignore', 'invalid value encountered',
+                                RuntimeWarning)
+        pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
+        pd.to_timedelta(flat_num_dates.max(), delta) + ref_date
+
+    # Cast input dates to integers of nanoseconds because `pd.to_datetime`
+    # works much faster when dealing with integers
+    # make _NS_PER_TIME_DELTA an array to ensure type upcasting
+    flat_num_dates_ns_int = (flat_num_dates.astype(np.float64) *
+                             _NS_PER_TIME_DELTA[delta]).astype(np.int64)
+
+    return (pd.to_timedelta(flat_num_dates_ns_int, 'ns') + ref_date).values
+
+
+def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
     """Given an array of numeric dates in netCDF format, convert it into a
     numpy array of date time objects.

@@ -149,41 +170,30 @@ def decode_cf_datetime(num_dates, units, calendar=None):
     if calendar is None:
         calendar = 'standard'

-    delta, ref_date = _unpack_netcdf_time_units(units)
-
-    try:
-        if calendar not in _STANDARD_CALENDARS:
-            raise OutOfBoundsDatetime
-
-        delta = _netcdf_to_numpy_timeunit(delta)
+    if use_cftime is None:
         try:
-            ref_date = pd.Timestamp(ref_date)
-        except ValueError:
-            # ValueError is raised by pd.Timestamp for non-ISO timestamp
-            # strings, in which case we fall back to using cftime
-            raise OutOfBoundsDatetime
-
-        # fixes: https://github.com/pydata/pandas/issues/14068
-        # these lines check if the the lowest or the highest value in dates
-        # cause an OutOfBoundsDatetime (Overflow) error
-        with warnings.catch_warnings():
-            warnings.filterwarnings('ignore', 'invalid value encountered',
-                                    RuntimeWarning)
-            pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
-            pd.to_timedelta(flat_num_dates.max(), delta) + ref_date
-
-        # Cast input dates to integers of nanoseconds because `pd.to_datetime`
-        # works much faster when dealing with integers
-        # make _NS_PER_TIME_DELTA an array to ensure type upcasting
-        flat_num_dates_ns_int = (flat_num_dates.astype(np.float64) *
-                                 _NS_PER_TIME_DELTA[delta]).astype(np.int64)
-
-        dates = (pd.to_timedelta(flat_num_dates_ns_int, 'ns') +
-                 ref_date).values
-
-    except (OutOfBoundsDatetime, OverflowError):
+            dates = _decode_datetime_with_pandas(flat_num_dates, units,
+                                                 calendar)
+        except (OutOfBoundsDatetime, OverflowError):
+            dates = _decode_datetime_with_cftime(
+                flat_num_dates.astype(np.float), units, calendar)
+
+            if (dates[np.nanargmin(num_dates)].year < 1678 or
+                    dates[np.nanargmax(num_dates)].year >= 2262):
+                if calendar in _STANDARD_CALENDARS:
+                    warnings.warn(
+                        'Unable to decode time axis into full '
+                        'numpy.datetime64 objects, continuing using '
+                        'cftime.datetime objects instead, reason: dates out '
+                        'of range', SerializationWarning, stacklevel=3)
+            else:
+                if calendar in _STANDARD_CALENDARS:
+                    dates = cftime_to_nptime(dates)
+    elif use_cftime:
         dates = _decode_datetime_with_cftime(
             flat_num_dates.astype(np.float), units, calendar)
+    else:
+        dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar)

     return dates.reshape(num_dates.shape)
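The new dispatch in decode_cf_datetime can be exercised directly. A sketch based on the hunk above, not part of the commit; the units string is chosen so that the decoded dates fall outside the pandas.Timestamp-valid range:

    import numpy as np
    from xarray.coding.times import decode_cf_datetime

    num_dates = np.array([0, 1, 2])
    units = 'days since 0001-01-01'   # year 1 is outside the ns-datetime64 range

    # use_cftime=None: the pandas path raises OutOfBoundsDatetime, so the
    # decoder falls back to cftime and emits a SerializationWarning.
    dates = decode_cf_datetime(num_dates, units)

    # use_cftime=True: decode straight to cftime.datetime, no fallback, no warning.
    dates = decode_cf_datetime(num_dates, units, use_cftime=True)

    # use_cftime=False: the cftime fallback is disabled, so this call would
    # raise OutOfBoundsDatetime.
    # decode_cf_datetime(num_dates, units, use_cftime=False)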

@@ -383,6 +393,8 @@ def encode_cf_timedelta(timedeltas, units=None):


 class CFDatetimeCoder(VariableCoder):
+    def __init__(self, use_cftime=None):
+        self.use_cftime = use_cftime

     def encode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)
@@ -403,9 +415,11 @@ def decode(self, variable, name=None):
         if 'units' in attrs and 'since' in attrs['units']:
             units = pop_to(attrs, encoding, 'units')
             calendar = pop_to(attrs, encoding, 'calendar')
-            dtype = _decode_cf_datetime_dtype(data, units, calendar)
+            dtype = _decode_cf_datetime_dtype(data, units, calendar,
+                                              self.use_cftime)
             transform = partial(
-                decode_cf_datetime, units=units, calendar=calendar)
+                decode_cf_datetime, units=units, calendar=calendar,
+                use_cftime=self.use_cftime)
             data = lazy_elemwise_func(data, transform, dtype)

         return Variable(dims, data, attrs, encoding)
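Because the option now lives on the coder itself (see the commit message), it can also be used without going through a backend. A minimal sketch, not part of the diff, decoding an in-memory Variable (requires the cftime package to be installed):

    from xarray import Variable
    from xarray.coding.times import CFDatetimeCoder

    raw = Variable(('time',), [0, 1, 2],
                   attrs={'units': 'days since 2000-01-01'})

    coder = CFDatetimeCoder(use_cftime=True)
    decoded = coder.decode(raw, name='time')
    print(decoded.values)   # array of cftime.datetime objects, decoded lazily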

xarray/conventions.py

Lines changed: 28 additions & 6 deletions
@@ -240,7 +240,7 @@ def encode_cf_variable(var, needs_copy=True, name=None):

 def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True,
                        decode_times=True, decode_endianness=True,
-                       stack_char_dim=True):
+                       stack_char_dim=True, use_cftime=None):
     """
     Decodes a variable which may hold CF encoded information.

@@ -270,6 +270,16 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True,
         Whether to stack characters into bytes along the last dimension of this
         array. Passed as an argument because we need to look at the full
         dataset to figure out if this is appropriate.
+    use_cftime: bool, optional
+        Only relevant if encoded dates come from a standard calendar
+        (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not
+        specified). If None (default), attempt to decode times to
+        ``np.datetime64[ns]`` objects; if this is not possible, decode times to
+        ``cftime.datetime`` objects. If True, always decode times to
+        ``cftime.datetime`` objects, regardless of whether or not they can be
+        represented using ``np.datetime64[ns]`` objects. If False, always
+        decode times to ``np.datetime64[ns]`` objects; if this is not possible
+        raise an error.

     Returns
     -------
@@ -292,7 +302,7 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True,

     if decode_times:
         for coder in [times.CFTimedeltaCoder(),
-                      times.CFDatetimeCoder()]:
+                      times.CFDatetimeCoder(use_cftime=use_cftime)]:
             var = coder.decode(var, name=name)

     dimensions, data, attributes, encoding = (
@@ -346,7 +356,8 @@ def _update_bounds_attributes(variables):

 def decode_cf_variables(variables, attributes, concat_characters=True,
                         mask_and_scale=True, decode_times=True,
-                        decode_coords=True, drop_variables=None):
+                        decode_coords=True, drop_variables=None,
+                        use_cftime=None):
     """
     Decode several CF encoded variables.

@@ -387,7 +398,7 @@ def stackable(dim):
         new_vars[k] = decode_cf_variable(
             k, v, concat_characters=concat_characters,
             mask_and_scale=mask_and_scale, decode_times=decode_times,
-            stack_char_dim=stack_char_dim)
+            stack_char_dim=stack_char_dim, use_cftime=use_cftime)
         if decode_coords:
             var_attrs = new_vars[k].attrs
             if 'coordinates' in var_attrs:
@@ -406,7 +417,8 @@ def stackable(dim):


 def decode_cf(obj, concat_characters=True, mask_and_scale=True,
-              decode_times=True, decode_coords=True, drop_variables=None):
+              decode_times=True, decode_coords=True, drop_variables=None,
+              use_cftime=None):
     """Decode the given Dataset or Datastore according to CF conventions into
     a new Dataset.

@@ -430,6 +442,16 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    use_cftime: bool, optional
+        Only relevant if encoded dates come from a standard calendar
+        (e.g. 'gregorian', 'proleptic_gregorian', 'standard', or not
+        specified). If None (default), attempt to decode times to
+        ``np.datetime64[ns]`` objects; if this is not possible, decode times to
+        ``cftime.datetime`` objects. If True, always decode times to
+        ``cftime.datetime`` objects, regardless of whether or not they can be
+        represented using ``np.datetime64[ns]`` objects. If False, always
+        decode times to ``np.datetime64[ns]`` objects; if this is not possible
+        raise an error.

     Returns
     -------
@@ -454,7 +476,7 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,

     vars, attrs, coord_names = decode_cf_variables(
         vars, attrs, concat_characters, mask_and_scale, decode_times,
-        decode_coords, drop_variables=drop_variables)
+        decode_coords, drop_variables=drop_variables, use_cftime=use_cftime)
     ds = Dataset(vars, attrs=attrs)
     ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))
     ds._file_obj = file_obj
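The same keyword is exposed on the public decode_cf entry point, which is useful when a dataset was first read with decode_cf=False or constructed in memory. A minimal sketch, not part of the commit, with an illustrative in-memory dataset:

    import numpy as np
    import xarray as xr

    raw = xr.Dataset({'time': ('time', np.arange(3),
                               {'units': 'days since 2000-01-01',
                                'calendar': 'standard'})})

    decoded = xr.decode_cf(raw, use_cftime=True)   # 'time' becomes cftime.datetime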
