to_dict without data (#2659)

rabernat · web-flow · commit a7d55b9bcd0c · 2019-01-22T00:25:55.000+01:00
* add data=False to to_dict methods

* doc and whats-new

* fix pep8 errors

* small tweaks

* added shape and dtype
diff --git a/doc/io.rst b/doc/io.rst
@@ -81,6 +81,16 @@ require external libraries and dicts can easily be pickled, or converted to
 json, or geojson. All the values are converted to lists, so dicts might
 be quite large.
 
+To export just the dataset schema, without the data itself, use the
+``data=False`` option:
+
+.. ipython:: python
+
+    ds.to_dict(data=False)
+
+This can be useful for generating catalogs of dataset contents to expose to
+search indices or other automated data discovery tools.
+
 .. _io.netcdf:
 
 netCDF
@@ -665,7 +675,7 @@ To read a consolidated store, pass the ``consolidated=True`` option to
 :py:func:`~xarray.open_zarr`::
 
     ds = xr.open_zarr('foo.zarr', consolidated=True)
-    
+
 Xarray can't perform consolidation on pre-existing zarr datasets. This should
 be done directly from zarr, as described in the
 `zarr docs <https://zarr.readthedocs.io/en/latest/tutorial.html#consolidating-metadata>`_.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -28,6 +28,8 @@ Breaking changes
 Enhancements
 ~~~~~~~~~~~~
 
+- Add ``data=False`` option to ``to_dict()`` methods. (:issue:`2656`)
+  By `Ryan Abernathey <https://github.com/rabernat>`_.
 - :py:meth:`~xarray.DataArray.coarsen` and
   :py:meth:`~xarray.Dataset.coarsen` are newly added.
   See :ref:`comput.coarsen` for details.
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -1760,7 +1760,7 @@ def to_netcdf(self, *args, **kwargs):
 
         return dataset.to_netcdf(*args, **kwargs)
 
-    def to_dict(self):
+    def to_dict(self, data=True):
         """
         Convert this xarray.DataArray into a dictionary following xarray
         naming conventions.
@@ -1769,22 +1769,20 @@ def to_dict(self):
         Useful for coverting to json. To avoid datetime incompatibility
         use decode_times=False kwarg in xarrray.open_dataset.
 
+        Parameters
+        ----------
+        data : bool, optional
+            Whether to include the actual data in the dictionary. When set to
+            False, returns just the schema (dtype and shape replace the data).
+
         See also
         --------
         DataArray.from_dict
         """
-        d = {'coords': {}, 'attrs': decode_numpy_dict_values(self.attrs),
-             'dims': self.dims}
-
+        d = self.variable.to_dict(data=data)
+        d.update({'coords': {}, 'name': self.name})
         for k in self.coords:
-            data = ensure_us_time_resolution(self[k].values).tolist()
-            d['coords'].update({
-                k: {'data': data,
-                    'dims': self[k].dims,
-                    'attrs': decode_numpy_dict_values(self[k].attrs)}})
-
-        d.update({'data': ensure_us_time_resolution(self.values).tolist(),
-                  'name': self.name})
+            d['coords'][k] = self.coords[k].variable.to_dict(data=data)
         return d
 
     @classmethod
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -3221,7 +3221,7 @@ def to_dask_dataframe(self, dim_order=None, set_index=False):
 
         return df
 
-    def to_dict(self):
+    def to_dict(self, data=True):
         """
         Convert this dataset to a dictionary following xarray naming
         conventions.
@@ -3230,25 +3230,22 @@ def to_dict(self):
         Useful for coverting to json. To avoid datetime incompatibility
         use decode_times=False kwarg in xarrray.open_dataset.
 
+        Parameters
+        ----------
+        data : bool, optional
+            Whether to include the actual data in the dictionary. When set to
+            False, returns just the schema (dtype and shape replace the data).
+
         See also
         --------
         Dataset.from_dict
         """
         d = {'coords': {}, 'attrs': decode_numpy_dict_values(self.attrs),
              'dims': dict(self.dims), 'data_vars': {}}
-
         for k in self.coords:
-            data = ensure_us_time_resolution(self[k].values).tolist()
-            d['coords'].update({
-                k: {'data': data,
-                    'dims': self[k].dims,
-                    'attrs': decode_numpy_dict_values(self[k].attrs)}})
+            d['coords'].update({k: self[k].variable.to_dict(data=data)})
         for k in self.data_vars:
-            data = ensure_us_time_resolution(self[k].values).tolist()
-            d['data_vars'].update({
-                k: {'data': data,
-                    'dims': self[k].dims,
-                    'attrs': decode_numpy_dict_values(self[k].attrs)}})
+            d['data_vars'].update({k: self[k].variable.to_dict(data=data)})
         return d
 
     @classmethod
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -19,7 +19,8 @@
 from .options import _get_keep_attrs
 from .pycompat import (
     OrderedDict, basestring, dask_array_type, integer_types, zip)
-from .utils import OrderedSet, either_dict_or_kwargs
+from .utils import (OrderedSet, either_dict_or_kwargs,
+                    decode_numpy_dict_values, ensure_us_time_resolution)
 
 try:
     import dask.array as da
@@ -410,6 +411,16 @@ def to_index(self):
         """Convert this variable to a pandas.Index"""
         return self.to_index_variable().to_index()
 
+    def to_dict(self, data=True):
+        """Dictionary representation of variable."""
+        item = {'dims': self.dims,
+                'attrs': decode_numpy_dict_values(self.attrs)}
+        if data:
+            item['data'] = ensure_us_time_resolution(self.values).tolist()
+        else:
+            item.update({'dtype': str(self.dtype), 'shape': self.shape})
+        return item
+
     @property
     def dims(self):
         """Tuple of dimension names with which this variable is associated.
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
@@ -2909,6 +2909,15 @@ def test_to_and_from_dict(self):
                 ValueError, "cannot convert dict without the key 'data'"):
             DataArray.from_dict(d)
 
+        # check the data=False option
+        expected_no_data = expected.copy()
+        del expected_no_data['data']
+        del expected_no_data['coords']['x']['data']
+        expected_no_data['coords']['x'].update({'dtype': '<U1', 'shape': (2,)})
+        expected_no_data.update({'dtype': 'float64', 'shape': (2, 3)})
+        actual_no_data = array.to_dict(data=False)
+        assert expected_no_data == actual_no_data
+
     def test_to_and_from_dict_with_time_dim(self):
         x = np.random.randn(10, 3)
         t = pd.date_range('20130101', periods=10)
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -3042,11 +3042,25 @@ def test_to_and_from_dict(self):
         # check roundtrip
         assert_identical(ds, Dataset.from_dict(actual))
 
+        # check the data=False option
+        expected_no_data = expected.copy()
+        del expected_no_data['coords']['t']['data']
+        del expected_no_data['data_vars']['a']['data']
+        del expected_no_data['data_vars']['b']['data']
+        expected_no_data['coords']['t'].update({'dtype': '<U1',
+                                                'shape': (10,)})
+        expected_no_data['data_vars']['a'].update({'dtype': 'float64',
+                                                   'shape': (10,)})
+        expected_no_data['data_vars']['b'].update({'dtype': 'float64',
+                                                   'shape': (10,)})
+        actual_no_data = ds.to_dict(data=False)
+        assert expected_no_data == actual_no_data
+
         # verify coords are included roundtrip
-        expected = ds.set_coords('b')
-        actual = Dataset.from_dict(expected.to_dict())
+        expected_ds = ds.set_coords('b')
+        actual = Dataset.from_dict(expected_ds.to_dict())
 
-        assert_identical(expected, actual)
+        assert_identical(expected_ds, actual)
 
         # test some incomplete dicts:
         # this one has no attrs field, the dims are strings, and x, y are