
Commit 4f3fa30

Allow for pd.TimedeltaIndex and serialize it to netCDF
Fixes GH55
1 parent a31e0e5 commit 4f3fa30

9 files changed, 154 insertions(+), 41 deletions(-)
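
For orientation, here is a minimal sketch of what this change enables, pieced together from the tests added below (the Dataset construction and .sel calls mirror the new test_dataset.py test; nothing outside the APIs touched by this commit is assumed):

import numpy as np
import pandas as pd
from xray import Dataset

# timedelta64 data is now accepted directly and normalized to ns precision
td = pd.to_timedelta(np.arange(3), unit='days')
ds = Dataset({'x': ('td', np.arange(3)), 'td': td})

# label-based selection works with timedelta strings and slices
ds.sel(td='0 days')
ds.sel(td=slice('1 days', '2 days'))

On write, the new encode_cf_timedelta path in conventions.py stores such a variable as plain numbers plus a CF-style units attribute ('days', 'hours', 'minutes' or 'seconds'); on read it decodes back to timedelta64[ns].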

.travis.yml (+1, -1)
@@ -7,7 +7,7 @@ matrix:
   fast_finish: true
   include:
   - python: 2.6
-    env: UPDATE_ENV="conda install unittest2 pandas==0.13.1"
+    env: UPDATE_ENV="conda install unittest2 pandas==0.15.0"
   # Test on Python 2.7 with and without netCDF4/scipy
   - python: 2.7
     env: UPDATE_ENV="pip install cyordereddict"

xray/conventions.py (+93, -25)
@@ -147,23 +147,48 @@ def nan_safe_num2date(num):
     return dates
 
 
-def guess_time_units(dates):
-    """Given an array of dates suitable for input to `pandas.DatetimeIndex`,
-    returns a CF compatible time-unit string of the form "{time_unit} since
-    {date[0]}", where `time_unit` is 'days', 'hours', 'minutes' or 'seconds'
-    (the first one that can evenly divide all unique time deltas in `dates`)
+def decode_cf_timedelta(num_timedeltas, units):
+    """Given an array of numeric timedeltas in netCDF format, convert it into a
+    numpy timedelta64[ns] array.
     """
-    dates = pd.DatetimeIndex(np.asarray(dates).reshape(-1))
-    unique_timedeltas = np.unique(np.diff(dates.values[pd.notnull(dates)]))
+    # rename 'seconds', 'minutes' and 'hours' to formats pandas recognizes
+    units = {'seconds': 's', 'minutes': 'm', 'hours': 'h'}.get(units, units)
+    return pd.to_timedelta(np.asarray(num_timedeltas), unit=units, box=False)
+
+
+TIME_UNITS = set(['days', 'hours', 'minutes', 'seconds'])
+
+def _infer_time_units_from_diff(unique_timedeltas):
     for time_unit, delta in [('days', 86400), ('hours', 3600),
                              ('minutes', 60), ('seconds', 1)]:
         unit_delta = np.timedelta64(10 ** 9 * delta, 'ns')
         diffs = unique_timedeltas / unit_delta
         if np.all(diffs == diffs.astype(int)):
-            break
-    else:
-        raise ValueError('could not automatically determine time units')
-    return '%s since %s' % (time_unit, dates[0])
+            return time_unit
+    raise ValueError('could not automatically determine time units')
+
+
+def infer_datetime_units(dates):
+    """Given an array of datetimes, returns a CF compatible time-unit string of
+    the form "{time_unit} since {date[0]}", where `time_unit` is 'days',
+    'hours', 'minutes' or 'seconds' (the first one that can evenly divide all
+    unique time deltas in `dates`)
+    """
+    dates = pd.to_datetime(dates, box=False)
+    unique_timedeltas = np.unique(np.diff(dates[pd.notnull(dates)]))
+    units = _infer_time_units_from_diff(unique_timedeltas)
+    return '%s since %s' % (units, pd.Timestamp(dates[0]))
+
+
+def infer_timedelta_units(deltas):
+    """Given an array of timedeltas, returns a CF compatible time-unit from
+    {'days', 'hours', 'minutes' 'seconds'} (the first one that can evenly
+    divide all unique time deltas in `deltas`)
+    """
+    deltas = pd.to_timedelta(deltas, box=False)
+    unique_timedeltas = np.unique(deltas[pd.notnull(deltas)])
+    units = _infer_time_units_from_diff(unique_timedeltas)
+    return units
 
 
 def nctime_to_nptime(times):

@@ -193,7 +218,7 @@ def encode_cf_datetime(dates, units=None, calendar=None):
     dates = np.asarray(dates)
 
     if units is None:
-        units = guess_time_units(dates)
+        units = infer_datetime_units(dates)
     if calendar is None:
         calendar = 'proleptic_gregorian'
 

@@ -211,6 +236,21 @@ def encode_datetime(d):
     return (num, units, calendar)
 
 
+def encode_cf_timedelta(timedeltas, units=None):
+    if units is None:
+        units = infer_timedelta_units(timedeltas)
+
+    np_unit = {'seconds': 's', 'minutes': 'm', 'hours': 'h', 'days': 'D'}[units]
+    num = timedeltas.astype('timedelta64[%s]' % np_unit).view(np.int64)
+
+    missing = pd.isnull(timedeltas)
+    if np.any(missing):
+        num = num.astype(float)
+        num[missing] = np.nan
+
+    return (num, units)
+
+
 class MaskedAndScaledArray(utils.NDArrayMixin):
     """Wrapper around array-like objects to create a new indexable object where
     values, when accessesed, are automatically scaled and masked according to

@@ -288,6 +328,23 @@ def __getitem__(self, key):
                                   calendar=self.calendar)
 
 
+class DecodedCFTimedeltaArray(utils.NDArrayMixin):
+    """Wrapper around array-like objects to create a new indexable object where
+    values, when accessesed, are automatically converted into timedelta objects
+    using decode_cf_timedelta.
+    """
+    def __init__(self, array, units):
+        self.array = array
+        self.units = units
+
+    @property
+    def dtype(self):
+        return np.dtype('timedelta64[ns]')
+
+    def __getitem__(self, key):
+        return decode_cf_timedelta(self.array[key], units=self.units)
+
+
 class CharToStringArray(utils.NDArrayMixin):
     """Wrapper around array-like objects to create a new indexable object where
     values, when accessesed, are automatically concatenated along the last

@@ -358,7 +415,7 @@ def char_to_string(arr):
         return arr.view(kind + str(arr.shape[-1]))[..., 0]
 
 
-def _safe_setitem(dest, key, value):
+def safe_setitem(dest, key, value):
     if key in dest:
         raise ValueError('Failed hard to prevent overwriting key %r' % key)
     dest[key] = value

@@ -370,9 +427,9 @@ def pop_to(source, dest, key, default=None):
     None values are not passed on. If k already exists in dest an
     error is raised.
     """
-    value = source.pop(key, default)
+    value = source.pop(key, None)
     if value is not None:
-        _safe_setitem(dest, key, value)
+        safe_setitem(dest, key, value)
     return value
 

@@ -384,16 +441,21 @@ def maybe_encode_datetime(var):
     if (np.issubdtype(var.dtype, np.datetime64)
             or (var.dtype.kind == 'O'
                 and isinstance(var.values.flat[0], datetime))):
-
         dims, values, attrs, encoding = _var_as_tuple(var)
-        if 'units' in attrs or 'calendar' in attrs:
-            raise ValueError(
-                "Failed hard to prevent overwriting 'units' or 'calendar'")
-
         (values, units, calendar) = encode_cf_datetime(
             values, encoding.pop('units', None), encoding.pop('calendar', None))
-        attrs['units'] = units
-        attrs['calendar'] = calendar
+        safe_setitem(attrs, 'units', units)
+        safe_setitem(attrs, 'calendar', calendar)
+        var = Variable(dims, values, attrs, encoding)
+    return var
+
+
+def maybe_encode_timedelta(var):
+    if np.issubdtype(var.dtype, np.timedelta64):
+        dims, values, attrs, encoding = _var_as_tuple(var)
+        values, units = encode_cf_timedelta(
+            values, encoding.pop('units', None))
+        safe_setitem(attrs, 'units', units)
         var = Variable(dims, values, attrs, encoding)
     return var
 

@@ -452,7 +514,7 @@ def _infer_dtype(array):
     else:
         dtype = np.array(array.flat[0]).dtype
         if dtype.kind in ['S', 'U']:
-            # don't just use inferred_dtype to avoid truncating arrays to
+            # don't just use inferred dtype to avoid truncating arrays to
             # the length of their first element
             dtype = np.dtype(dtype.kind)
         elif dtype.kind == 'O':

@@ -511,6 +573,7 @@ def encode_cf_variable(var, needs_copy=True):
         A variable which has been encoded as described above.
     """
     var = maybe_encode_datetime(var)
+    var = maybe_encode_timedelta(var)
     var, needs_copy = maybe_encode_offset_and_scale(var, needs_copy)
     var, needs_copy = maybe_encode_fill_value(var, needs_copy)
     var = maybe_encode_dtype(var, needs_copy)

@@ -585,11 +648,16 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
         data = MaskedAndScaledArray(data, fill_value, scale_factor,
                                     add_offset, dtype)
 
-    if decode_times:
-        if 'units' in attributes and 'since' in attributes['units']:
+    if decode_times and 'units' in attributes:
+        if 'since' in attributes['units']:
+            # datetime
             units = pop_to(attributes, encoding, 'units')
             calendar = pop_to(attributes, encoding, 'calendar')
             data = DecodedCFDatetimeArray(data, units, calendar)
+        elif attributes['units'] in TIME_UNITS:
+            # timedelta
+            units = pop_to(attributes, encoding, 'units')
+            data = DecodedCFTimedeltaArray(data, units)
 
     return Variable(dimensions, indexing.LazilyIndexedArray(data),
                     attributes, encoding=encoding)
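
To make the new helpers concrete, here is an illustrative round trip through the functions defined above (a sketch only; it assumes the module is importable as xray.conventions, as the new tests do):

import numpy as np
import pandas as pd
from xray import conventions

deltas = pd.to_timedelta(['1 day', '2 days', 'NaT']).values   # timedelta64[ns]

# infer_timedelta_units picks the coarsest unit that evenly divides every delta
units = conventions.infer_timedelta_units(deltas)              # 'days'

# encode_cf_timedelta converts to plain numbers; NaT forces a float array with NaN
num, units = conventions.encode_cf_timedelta(deltas)           # array([1., 2., nan]), 'days'

# decode_cf_timedelta reverses the conversion, restoring timedelta64[ns] (NaN -> NaT)
roundtripped = conventions.decode_cf_timedelta(num, units)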

xray/core/utils.py (+2, -3)
@@ -107,9 +107,8 @@ def safe_cast_to_index(array):
         index = array.to_index()
     else:
         kwargs = {}
-        if hasattr(array, 'dtype'):
-            if array.dtype == object or array.dtype.kind == 'm':
-                kwargs['dtype'] = object
+        if hasattr(array, 'dtype') and array.dtype.kind == 'O':
+            kwargs['dtype'] = object
         index = pd.Index(np.asarray(array), **kwargs)
     return index
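
The effect of this small change, sketched with hypothetical values (compare the updated TestSafeCastToIndex below): timedelta64 arrays are no longer forced into an object-dtype index, so pandas 0.15 is free to build a TimedeltaIndex, while object arrays are still pinned to dtype=object.

import numpy as np
from xray.core import utils

td = np.arange(5) * np.timedelta64(1, 'D')

# now yields a pd.TimedeltaIndex (previously an object-dtype Index)
utils.safe_cast_to_index(td)

# object arrays keep dtype=object so their elements are not coerced
utils.safe_cast_to_index(td.astype(object))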

xray/core/variable.py (+11, -2)
@@ -1,3 +1,4 @@
+from datetime import timedelta
 import functools
 
 import numpy as np

@@ -72,13 +73,15 @@ def _as_compatible_data(data):
 
     if isinstance(data, pd.Timestamp):
         # TODO: convert, handle datetime objects, too
-        data = np.datetime64(data, 'ns')
+        data = np.datetime64(data.value, 'ns')
+    if isinstance(data, timedelta):
+        data = np.timedelta64(getattr(data, 'value', data), 'ns')
 
     # don't check for __len__ or __iter__ so as not to cast if data is a numpy
     # numeric type like np.float32
     required = ['dtype', 'shape', 'size', 'ndim']
     if (any(not hasattr(data, attr) for attr in required)
-            or isinstance(data, (np.string_, np.datetime64))):
+            or isinstance(data, (np.string_, np.datetime64, np.timedelta64))):
         # data must be ndarray-like
         data = np.asarray(data)
 

@@ -103,6 +106,8 @@ def _as_compatible_data(data):
         if data.dtype.kind == 'M':
             # TODO: automatically cast arrays of datetime objects as well
             data = np.asarray(data, 'datetime64[ns]')
+        if data.dtype.kind == 'm':
+            data = np.asarray(data, 'timedelta64[ns]')
         data = NumpyArrayAdapter(data)
 
     return data

@@ -170,6 +175,8 @@ def __getitem__(self, key):
                 # pd.Timestamp rather np.than datetime64 but this is easier
                 # (for now)
                 value = np.datetime64('NaT', 'ns')
+            elif isinstance(value, timedelta):
+                value = np.timedelta64(getattr(value, 'value', value), 'ns')
             else:
                 value = np.asarray(value, dtype=self.dtype)
         else:

@@ -205,6 +212,8 @@ def _as_array_or_item(data):
             # convert to a np.datetime64 object, because 0-dimensional ndarrays
             # with dtype=datetime64 are broken :(
             data = np.datetime64(data, 'ns')
+        elif data.dtype.kind == 'm':
+            data = np.timedelta64(data, 'ns')
     return data
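A small illustration of the casting rules these hunks add, using plain numpy/pandas only (the to_ns helper is purely illustrative, not part of the codebase): scalar timedeltas and coarse-resolution timedelta64 arrays are normalized to nanosecond precision before being wrapped.

import numpy as np
import pandas as pd
from datetime import timedelta

def to_ns(value):
    # mirror of the scalar rule above: pd.Timedelta exposes .value as integer
    # nanoseconds, a plain datetime.timedelta is handed to np.timedelta64 as-is
    return np.timedelta64(getattr(value, 'value', value), 'ns')

to_ns(timedelta(hours=1))     # numpy.timedelta64(3600000000000, 'ns')
to_ns(pd.Timedelta('1h'))     # same value

# array rule: any timedelta64 array is re-cast to nanosecond resolution
np.asarray(np.array([1, 2], dtype='timedelta64[D]'), 'timedelta64[ns]')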
xray/test/test_backends.py (+7, -5)
@@ -1,11 +1,7 @@
-from xray.conventions import cf_decoder
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
 from io import BytesIO
 import contextlib
 import os.path
+import pickle
 import tempfile
 import unittest
 import sys

@@ -154,6 +150,12 @@ def test_roundtrip_datetime_data(self):
         with self.roundtrip(expected) as actual:
             self.assertDatasetIdentical(expected, actual)
 
+    def test_roundtrip_timedelta_data(self):
+        time_deltas = pd.to_timedelta(['1h', '2h', 'NaT'])
+        expected = Dataset({'td': ('td', time_deltas)})
+        with self.roundtrip(expected) as actual:
+            self.assertDatasetIdentical(expected, actual)
+
     def test_roundtrip_example_1_netcdf(self):
         expected = open_example_dataset('example_1.nc')
         with self.roundtrip(expected) as actual:

xray/test/test_conventions.py (+12, -2)
@@ -264,7 +264,7 @@ def test_cf_datetime_nan(self):
         expected = np.array(expected_list, dtype='datetime64[ns]')
         self.assertArrayEqual(expected, actual)
 
-    def test_guess_time_units(self):
+    def test_infer_datetime_units(self):
         for dates, expected in [(pd.date_range('1900-01-01', periods=5),
                                  'days since 1900-01-01 00:00:00'),
                                 (pd.date_range('1900-01-01 12:00:00', freq='H',

@@ -275,14 +275,24 @@ def test_guess_time_units(self):
                                  'seconds since 1900-01-01 00:00:00'),
                                 (pd.to_datetime(['1900-01-01', '1900-01-02', 'NaT']),
                                  'days since 1900-01-01 00:00:00')]:
-            self.assertEqual(expected, conventions.guess_time_units(dates))
+            self.assertEqual(expected, conventions.infer_datetime_units(dates))
 
+    def test_infer_timedelta_units(self):
+        for deltas, expected in [
+                (pd.to_timedelta(['1 day', '2 days']), 'days'),
+                (pd.to_timedelta(['1h', '1 day 1 hour']), 'hours'),
+                (pd.to_timedelta(['1m', '2m', np.nan]), 'minutes'),
+                (pd.to_timedelta(['1m3s', '1m4s']), 'seconds')]:
+            self.assertEqual(expected, conventions.infer_timedelta_units(deltas))
 
+
+@requires_netCDF4
 class TestEncodeCFVariable(TestCase):
     def test_incompatible_attributes(self):
         invalid_vars = [
             Variable(['t'], pd.date_range('2000-01-01', periods=3),
                      {'units': 'foobar'}),
+            Variable(['t'], pd.to_timedelta(['1 day']), {'units': 'foobar'}),
             Variable(['t'], [0, 1, 2], {'add_offset': 0}, {'add_offset': 2}),
             Variable(['t'], [0, 1, 2], {'_FillValue': 0}, {'_FillValue': 2}),
         ]

xray/test/test_dataset.py (+9)
@@ -457,6 +457,15 @@ def test_sel(self):
         self.assertDatasetEqual(data.isel(time=slice(3)),
                                 data.sel(time=(data['time.dayofyear'] <= 3)))
 
+        td = pd.to_timedelta(np.arange(3), unit='days')
+        data = Dataset({'x': ('td', np.arange(3)), 'td': td})
+        self.assertDatasetEqual(data, data.sel(td=td))
+        self.assertDatasetEqual(data, data.sel(td=slice('3 days')))
+        self.assertDatasetEqual(data.isel(td=0), data.sel(td='0 days'))
+        self.assertDatasetEqual(data.isel(td=0), data.sel(td='0h'))
+        self.assertDatasetEqual(data.isel(td=slice(1, 3)),
+                                data.sel(td=slice('1 days', '2 days')))
+
     def test_loc(self):
         data = create_test_data()
         expected = data.sel(dim3='a')

xray/test/test_utils.py (+3, -2)
@@ -10,11 +10,12 @@ class TestSafeCastToIndex(TestCase):
     def test(self):
         dates = pd.date_range('2000-01-01', periods=10)
         x = np.arange(5)
-        timedeltas = x * np.timedelta64(1, 'D')
+        td = x * np.timedelta64(1, 'D')
         for expected, array in [
                 (dates, dates.values),
                 (pd.Index(x, dtype=object), x.astype(object)),
-                (pd.Index(timedeltas, dtype=object), timedeltas),
+                (pd.Index(td), td),
+                (pd.Index(td, dtype=object), td.astype(object)),
         ]:
             actual = utils.safe_cast_to_index(array)
             self.assertArrayEqual(expected, actual)
