
Commit 4f3fa30

Allow for pd.TimedeltaIndex and serialize it to netCDF
Fixes GH55
1 parent a31e0e5 commit 4f3fa30

9 files changed, 154 insertions(+), 41 deletions(-)
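
For orientation, here is a minimal sketch of what this change enables, pieced together from the tests added below (the Dataset construction and .sel calls mirror the new test_dataset.py test; nothing outside the APIs touched by this commit is assumed):

import numpy as np
import pandas as pd
from xray import Dataset

# timedelta64 data is now accepted directly and normalized to ns precision
td = pd.to_timedelta(np.arange(3), unit='days')
ds = Dataset({'x': ('td', np.arange(3)), 'td': td})

# label-based selection works with timedelta strings and slices
ds.sel(td='0 days')
ds.sel(td=slice('1 days', '2 days'))

On write, the new encode_cf_timedelta path in conventions.py stores such a variable as plain numbers plus a CF-style units attribute ('days', 'hours', 'minutes' or 'seconds'); on read it decodes back to timedelta64[ns].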

.travis.yml (+1, -1)
@@ -7,7 +7,7 @@ matrix:
   fast_finish: true
   include:
   - python: 2.6
-    env: UPDATE_ENV="conda install unittest2 pandas==0.13.1"
+    env: UPDATE_ENV="conda install unittest2 pandas==0.15.0"
   # Test on Python 2.7 with and without netCDF4/scipy
   - python: 2.7
     env: UPDATE_ENV="pip install cyordereddict"

xray/conventions.py (+93, -25)
@@ -147,23 +147,48 @@ def nan_safe_num2date(num):
     return dates
 
 
-def guess_time_units(dates):
-    """Given an array of dates suitable for input to `pandas.DatetimeIndex`,
-    returns a CF compatible time-unit string of the form "{time_unit} since
-    {date[0]}", where `time_unit` is 'days', 'hours', 'minutes' or 'seconds'
-    (the first one that can evenly divide all unique time deltas in `dates`)
+def decode_cf_timedelta(num_timedeltas, units):
+    """Given an array of numeric timedeltas in netCDF format, convert it into a
+    numpy timedelta64[ns] array.
     """
-    dates = pd.DatetimeIndex(np.asarray(dates).reshape(-1))
-    unique_timedeltas = np.unique(np.diff(dates.values[pd.notnull(dates)]))
+    # rename 'seconds', 'minutes' and 'hours' to formats pandas recognizes
+    units = {'seconds': 's', 'minutes': 'm', 'hours': 'h'}.get(units, units)
+    return pd.to_timedelta(np.asarray(num_timedeltas), unit=units, box=False)
+
+
+TIME_UNITS = set(['days', 'hours', 'minutes', 'seconds'])
+
+def _infer_time_units_from_diff(unique_timedeltas):
     for time_unit, delta in [('days', 86400), ('hours', 3600),
                              ('minutes', 60), ('seconds', 1)]:
         unit_delta = np.timedelta64(10 ** 9 * delta, 'ns')
         diffs = unique_timedeltas / unit_delta
         if np.all(diffs == diffs.astype(int)):
-            break
-    else:
-        raise ValueError('could not automatically determine time units')
-    return '%s since %s' % (time_unit, dates[0])
+            return time_unit
+    raise ValueError('could not automatically determine time units')
+
+
+def infer_datetime_units(dates):
+    """Given an array of datetimes, returns a CF compatible time-unit string of
+    the form "{time_unit} since {date[0]}", where `time_unit` is 'days',
+    'hours', 'minutes' or 'seconds' (the first one that can evenly divide all
+    unique time deltas in `dates`)
+    """
+    dates = pd.to_datetime(dates, box=False)
+    unique_timedeltas = np.unique(np.diff(dates[pd.notnull(dates)]))
+    units = _infer_time_units_from_diff(unique_timedeltas)
+    return '%s since %s' % (units, pd.Timestamp(dates[0]))
+
+
+def infer_timedelta_units(deltas):
+    """Given an array of timedeltas, returns a CF compatible time-unit from
+    {'days', 'hours', 'minutes' 'seconds'} (the first one that can evenly
+    divide all unique time deltas in `deltas`)
+    """
+    deltas = pd.to_timedelta(deltas, box=False)
+    unique_timedeltas = np.unique(deltas[pd.notnull(deltas)])
+    units = _infer_time_units_from_diff(unique_timedeltas)
+    return units
 
 
 def nctime_to_nptime(times):

@@ -193,7 +218,7 @@ def encode_cf_datetime(dates, units=None, calendar=None):
     dates = np.asarray(dates)
 
     if units is None:
-        units = guess_time_units(dates)
+        units = infer_datetime_units(dates)
     if calendar is None:
         calendar = 'proleptic_gregorian'
 

@@ -211,6 +236,21 @@ def encode_datetime(d):
     return (num, units, calendar)
 
 
+def encode_cf_timedelta(timedeltas, units=None):
+    if units is None:
+        units = infer_timedelta_units(timedeltas)
+
+    np_unit = {'seconds': 's', 'minutes': 'm', 'hours': 'h', 'days': 'D'}[units]
+    num = timedeltas.astype('timedelta64[%s]' % np_unit).view(np.int64)
+
+    missing = pd.isnull(timedeltas)
+    if np.any(missing):
+        num = num.astype(float)
+        num[missing] = np.nan
+
+    return (num, units)
+
+
 class MaskedAndScaledArray(utils.NDArrayMixin):
     """Wrapper around array-like objects to create a new indexable object where
     values, when accessesed, are automatically scaled and masked according to

@@ -288,6 +328,23 @@ def __getitem__(self, key):
                                   calendar=self.calendar)
 
 
+class DecodedCFTimedeltaArray(utils.NDArrayMixin):
+    """Wrapper around array-like objects to create a new indexable object where
+    values, when accessesed, are automatically converted into timedelta objects
+    using decode_cf_timedelta.
+    """
+    def __init__(self, array, units):
+        self.array = array
+        self.units = units
+
+    @property
+    def dtype(self):
+        return np.dtype('timedelta64[ns]')
+
+    def __getitem__(self, key):
+        return decode_cf_timedelta(self.array[key], units=self.units)
+
+
 class CharToStringArray(utils.NDArrayMixin):
     """Wrapper around array-like objects to create a new indexable object where
     values, when accessesed, are automatically concatenated along the last

@@ -358,7 +415,7 @@ def char_to_string(arr):
         return arr.view(kind + str(arr.shape[-1]))[..., 0]
 
 
-def _safe_setitem(dest, key, value):
+def safe_setitem(dest, key, value):
     if key in dest:
         raise ValueError('Failed hard to prevent overwriting key %r' % key)
     dest[key] = value

@@ -370,9 +427,9 @@ def pop_to(source, dest, key, default=None):
     None values are not passed on. If k already exists in dest an
     error is raised.
     """
-    value = source.pop(key, default)
+    value = source.pop(key, None)
     if value is not None:
-        _safe_setitem(dest, key, value)
+        safe_setitem(dest, key, value)
     return value
 

@@ -384,16 +441,21 @@ def maybe_encode_datetime(var):
     if (np.issubdtype(var.dtype, np.datetime64)
             or (var.dtype.kind == 'O'
                 and isinstance(var.values.flat[0], datetime))):
-
         dims, values, attrs, encoding = _var_as_tuple(var)
-        if 'units' in attrs or 'calendar' in attrs:
-            raise ValueError(
-                "Failed hard to prevent overwriting 'units' or 'calendar'")
-
         (values, units, calendar) = encode_cf_datetime(
             values, encoding.pop('units', None), encoding.pop('calendar', None))
-        attrs['units'] = units
-        attrs['calendar'] = calendar
+        safe_setitem(attrs, 'units', units)
+        safe_setitem(attrs, 'calendar', calendar)
+        var = Variable(dims, values, attrs, encoding)
+    return var
+
+
+def maybe_encode_timedelta(var):
+    if np.issubdtype(var.dtype, np.timedelta64):
+        dims, values, attrs, encoding = _var_as_tuple(var)
+        values, units = encode_cf_timedelta(
+            values, encoding.pop('units', None))
+        safe_setitem(attrs, 'units', units)
         var = Variable(dims, values, attrs, encoding)
     return var
 

@@ -452,7 +514,7 @@ def _infer_dtype(array):
     else:
         dtype = np.array(array.flat[0]).dtype
         if dtype.kind in ['S', 'U']:
-            # don't just use inferred_dtype to avoid truncating arrays to
+            # don't just use inferred dtype to avoid truncating arrays to
             # the length of their first element
             dtype = np.dtype(dtype.kind)
         elif dtype.kind == 'O':

@@ -511,6 +573,7 @@ def encode_cf_variable(var, needs_copy=True):
         A variable which has been encoded as described above.
     """
     var = maybe_encode_datetime(var)
+    var = maybe_encode_timedelta(var)
     var, needs_copy = maybe_encode_offset_and_scale(var, needs_copy)
     var, needs_copy = maybe_encode_fill_value(var, needs_copy)
     var = maybe_encode_dtype(var, needs_copy)

@@ -585,11 +648,16 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
         data = MaskedAndScaledArray(data, fill_value, scale_factor,
                                     add_offset, dtype)
 
-    if decode_times:
-        if 'units' in attributes and 'since' in attributes['units']:
+    if decode_times and 'units' in attributes:
+        if 'since' in attributes['units']:
+            # datetime
             units = pop_to(attributes, encoding, 'units')
             calendar = pop_to(attributes, encoding, 'calendar')
             data = DecodedCFDatetimeArray(data, units, calendar)
+        elif attributes['units'] in TIME_UNITS:
+            # timedelta
+            units = pop_to(attributes, encoding, 'units')
+            data = DecodedCFTimedeltaArray(data, units)
 
     return Variable(dimensions, indexing.LazilyIndexedArray(data),
                     attributes, encoding=encoding)
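
To make the new helpers concrete, here is an illustrative round trip through the functions defined above (a sketch only; it assumes the module is importable as xray.conventions, as the new tests do):

import numpy as np
import pandas as pd
from xray import conventions

deltas = pd.to_timedelta(['1 day', '2 days', 'NaT']).values   # timedelta64[ns]

# infer_timedelta_units picks the coarsest unit that evenly divides every delta
units = conventions.infer_timedelta_units(deltas)              # 'days'

# encode_cf_timedelta converts to plain numbers; NaT forces a float array with NaN
num, units = conventions.encode_cf_timedelta(deltas)           # array([1., 2., nan]), 'days'

# decode_cf_timedelta reverses the conversion, restoring timedelta64[ns] (NaN -> NaT)
roundtripped = conventions.decode_cf_timedelta(num, units)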

xray/core/utils.py (+2, -3)
@@ -107,9 +107,8 @@ def safe_cast_to_index(array):
         index = array.to_index()
     else:
         kwargs = {}
-        if hasattr(array, 'dtype'):
-            if array.dtype == object or array.dtype.kind == 'm':
-                kwargs['dtype'] = object
+        if hasattr(array, 'dtype') and array.dtype.kind == 'O':
+            kwargs['dtype'] = object
         index = pd.Index(np.asarray(array), **kwargs)
     return index
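
The effect of this small change, sketched with hypothetical values (compare the updated TestSafeCastToIndex below): timedelta64 arrays are no longer forced into an object-dtype index, so pandas 0.15 is free to build a TimedeltaIndex, while object arrays are still pinned to dtype=object.

import numpy as np
from xray.core import utils

td = np.arange(5) * np.timedelta64(1, 'D')

# now yields a pd.TimedeltaIndex (previously an object-dtype Index)
utils.safe_cast_to_index(td)

# object arrays keep dtype=object so their elements are not coerced
utils.safe_cast_to_index(td.astype(object))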

xray/core/variable.py (+11, -2)
@@ -1,3 +1,4 @@
+from datetime import timedelta
 import functools
 
 import numpy as np

@@ -72,13 +73,15 @@ def _as_compatible_data(data):
 
     if isinstance(data, pd.Timestamp):
         # TODO: convert, handle datetime objects, too
-        data = np.datetime64(data, 'ns')
+        data = np.datetime64(data.value, 'ns')
+    if isinstance(data, timedelta):
+        data = np.timedelta64(getattr(data, 'value', data), 'ns')
 
     # don't check for __len__ or __iter__ so as not to cast if data is a numpy
     # numeric type like np.float32
     required = ['dtype', 'shape', 'size', 'ndim']
     if (any(not hasattr(data, attr) for attr in required)
-            or isinstance(data, (np.string_, np.datetime64))):
+            or isinstance(data, (np.string_, np.datetime64, np.timedelta64))):
         # data must be ndarray-like
         data = np.asarray(data)
 

@@ -103,6 +106,8 @@ def _as_compatible_data(data):
         if data.dtype.kind == 'M':
             # TODO: automatically cast arrays of datetime objects as well
             data = np.asarray(data, 'datetime64[ns]')
+        if data.dtype.kind == 'm':
+            data = np.asarray(data, 'timedelta64[ns]')
         data = NumpyArrayAdapter(data)
 
     return data

@@ -170,6 +175,8 @@ def __getitem__(self, key):
                 # pd.Timestamp rather np.than datetime64 but this is easier
                 # (for now)
                 value = np.datetime64('NaT', 'ns')
+            elif isinstance(value, timedelta):
+                value = np.timedelta64(getattr(value, 'value', value), 'ns')
             else:
                 value = np.asarray(value, dtype=self.dtype)
         else:

@@ -205,6 +212,8 @@ def _as_array_or_item(data):
             # convert to a np.datetime64 object, because 0-dimensional ndarrays
             # with dtype=datetime64 are broken :(
             data = np.datetime64(data, 'ns')
+        elif data.dtype.kind == 'm':
+            data = np.timedelta64(data, 'ns')
     return data
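A small illustration of the casting rules these hunks add, using plain numpy/pandas only (the to_ns helper is purely illustrative, not part of the codebase): scalar timedeltas and coarse-resolution timedelta64 arrays are normalized to nanosecond precision before being wrapped.

import numpy as np
import pandas as pd
from datetime import timedelta

def to_ns(value):
    # mirror of the scalar rule above: pd.Timedelta exposes .value as integer
    # nanoseconds, a plain datetime.timedelta is handed to np.timedelta64 as-is
    return np.timedelta64(getattr(value, 'value', value), 'ns')

to_ns(timedelta(hours=1))     # numpy.timedelta64(3600000000000, 'ns')
to_ns(pd.Timedelta('1h'))     # same value

# array rule: any timedelta64 array is re-cast to nanosecond resolution
np.asarray(np.array([1, 2], dtype='timedelta64[D]'), 'timedelta64[ns]')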
xray/test/test_backends.py (+7, -5)
@@ -1,11 +1,7 @@
-from xray.conventions import cf_decoder
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
 from io import BytesIO
 import contextlib
 import os.path
+import pickle
 import tempfile
 import unittest
 import sys

@@ -154,6 +150,12 @@ def test_roundtrip_datetime_data(self):
         with self.roundtrip(expected) as actual:
             self.assertDatasetIdentical(expected, actual)
 
+    def test_roundtrip_timedelta_data(self):
+        time_deltas = pd.to_timedelta(['1h', '2h', 'NaT'])
+        expected = Dataset({'td': ('td', time_deltas)})
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetIdentical(expected, actual)
+
     def test_roundtrip_example_1_netcdf(self):
         expected = open_example_dataset('example_1.nc')
         with self.roundtrip(expected) as actual:

xray/test/test_conventions.py (+12, -2)
@@ -264,7 +264,7 @@ def test_cf_datetime_nan(self):
         expected = np.array(expected_list, dtype='datetime64[ns]')
         self.assertArrayEqual(expected, actual)
 
-    def test_guess_time_units(self):
+    def test_infer_datetime_units(self):
         for dates, expected in [(pd.date_range('1900-01-01', periods=5),
                                  'days since 1900-01-01 00:00:00'),
                                 (pd.date_range('1900-01-01 12:00:00', freq='H',

@@ -275,14 +275,24 @@ def test_guess_time_units(self):
                                  'seconds since 1900-01-01 00:00:00'),
                                 (pd.to_datetime(['1900-01-01', '1900-01-02', 'NaT']),
                                  'days since 1900-01-01 00:00:00')]:
-            self.assertEqual(expected, conventions.guess_time_units(dates))
+            self.assertEqual(expected, conventions.infer_datetime_units(dates))
 
+    def test_infer_timedelta_units(self):
+        for deltas, expected in [
+                (pd.to_timedelta(['1 day', '2 days']), 'days'),
+                (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'),
+                (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'),
+                (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')]:
+            self.assertEqual(expected, conventions.infer_timedelta_units(deltas))
 
+
+@requires_netCDF4
 class TestEncodeCFVariable(TestCase):
     def test_incompatible_attributes(self):
         invalid_vars = [
             Variable(['t'], pd.date_range('2000-01-01', periods=3),
                      {'units': 'foobar'}),
+            Variable(['t'], pd.to_timedelta(['1 day']), {'units': 'foobar'}),
             Variable(['t'], [0, 1, 2], {'add_offset': 0}, {'add_offset': 2}),
             Variable(['t'], [0, 1, 2], {'_FillValue': 0}, {'_FillValue': 2}),
         ]

xray/test/test_dataset.py (+9)
@@ -457,6 +457,15 @@ def test_sel(self):
         self.assertDatasetEqual(data.isel(time=slice(3)),
                                 data.sel(time=(data['time.dayofyear'] <= 3)))
 
+        td = pd.to_timedelta(np.arange(3), unit='days')
+        data = Dataset({'x': ('td', np.arange(3)), 'td': td})
+        self.assertDatasetEqual(data, data.sel(td=td))
+        self.assertDatasetEqual(data, data.sel(td=slice('3 days')))
+        self.assertDatasetEqual(data.isel(td=0), data.sel(td='0 days'))
+        self.assertDatasetEqual(data.isel(td=0), data.sel(td='0h'))
+        self.assertDatasetEqual(data.isel(td=slice(1, 3)),
+                                data.sel(td=slice('1 days', '2 days')))
+
     def test_loc(self):
         data = create_test_data()
         expected = data.sel(dim3='a')

xray/test/test_utils.py (+3, -2)
@@ -10,11 +10,12 @@ class TestSafeCastToIndex(TestCase):
     def test(self):
         dates = pd.date_range('2000-01-01', periods=10)
         x = np.arange(5)
-        timedeltas = x * np.timedelta64(1, 'D')
+        td = x * np.timedelta64(1, 'D')
         for expected, array in [
                 (dates, dates.values),
                 (pd.Index(x, dtype=object), x.astype(object)),
-                (pd.Index(timedeltas, dtype=object), timedeltas),
+                (pd.Index(td), td),
+                (pd.Index(td, dtype=object), td.astype(object)),
         ]:
             actual = utils.safe_cast_to_index(array)
             self.assertArrayEqual(expected, actual)
