Fix dtype=S1 encoding in to_netcdf() (#2158)

shoyer · web-flow · commit 4106b949091d · 2018-05-31T18:09:38.000-07:00
* Fix dtype=S1 encoding in to_netcdf()

Fixes GH2149

* Add test_encoding_kwarg_compression from crusaderky

* Fix dtype=S1 in kwargs for bytes, too

* Fix lint

* Move compression encoding kwarg test

* Remove no longer relevant changes

* Fix encoding dtype=str

* More lint

* Fix failed tests

* Review comments

* oops, we still need to skip that test

* check for presence in a tuple rather than making two comparisons
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -71,6 +71,11 @@ Enhancements
 Bug fixes
 ~~~~~~~~~
 
+- Fixed a regression in 0.10.4, where explicitly specifying ``dtype='S1'`` or
+  ``dtype=str`` in ``encoding`` with ``to_netcdf()`` raised an error
+  (:issue:`2149`).
+  By `Stephan Hoyer <https://github.com/shoyer>`_.
+
 - :py:func:`apply_ufunc` now directly validates output variables
   (:issue:`1931`).
   By `Stephan Hoyer <https://github.com/shoyer>`_.
diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
@@ -94,6 +94,8 @@ def __init__(self, filename, mode='r', format=None, group=None,
         super(H5NetCDFStore, self).__init__(writer, lock=lock)
 
     def open_store_variable(self, name, var):
+        import h5py
+
         with self.ensure_open(autoclose=False):
             dimensions = var.dimensions
             data = indexing.LazilyOuterIndexedArray(
@@ -119,6 +121,15 @@ def open_store_variable(self, name, var):
             encoding['source'] = self._filename
             encoding['original_shape'] = var.shape
 
+            vlen_dtype = h5py.check_dtype(vlen=var.dtype)
+            if vlen_dtype is unicode_type:
+                encoding['dtype'] = str
+            elif vlen_dtype is not None:  # pragma: no cover
+                # xarray doesn't support writing arbitrary vlen dtypes yet.
+                pass
+            else:
+                encoding['dtype'] = var.dtype
+
         return Variable(dimensions, data, attrs, encoding)
 
     def get_variables(self):
@@ -161,7 +172,8 @@ def prepare_variable(self, name, variable, check_encoding=False,
         import h5py
 
         attrs = variable.attrs.copy()
-        dtype = _get_datatype(variable)
+        dtype = _get_datatype(
+            variable, raise_on_invalid_encoding=check_encoding)
 
         fillvalue = attrs.pop('_FillValue', None)
         if dtype is str and fillvalue is not None:
@@ -189,8 +201,9 @@ def prepare_variable(self, name, variable, check_encoding=False,
                 raise ValueError("'zlib' and 'compression' encodings mismatch")
             encoding.setdefault('compression', 'gzip')
 
-        if (check_encoding and encoding.get('complevel') not in
-                (None, encoding.get('compression_opts'))):
+        if (check_encoding and
+                'complevel' in encoding and 'compression_opts' in encoding and
+                encoding['complevel'] != encoding['compression_opts']):
             raise ValueError("'complevel' and 'compression_opts' encodings "
                              "mismatch")
         complevel = encoding.pop('complevel', 0)
diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py
@@ -89,16 +89,33 @@ def _encode_nc4_variable(var):
     return var
 
 
-def _get_datatype(var, nc_format='NETCDF4'):
+def _check_encoding_dtype_is_vlen_string(dtype):
+    if dtype is not str:
+        raise AssertionError(  # pragma: no cover
+            "unexpected dtype encoding %r. This shouldn't happen: please "
+            "file a bug report at github.com/pydata/xarray" % dtype)
+
+
+def _get_datatype(var, nc_format='NETCDF4', raise_on_invalid_encoding=False):
     if nc_format == 'NETCDF4':
         datatype = _nc4_dtype(var)
     else:
+        if 'dtype' in var.encoding:
+            encoded_dtype = var.encoding['dtype']
+            _check_encoding_dtype_is_vlen_string(encoded_dtype)
+            if raise_on_invalid_encoding:
+                raise ValueError(
+                    'encoding dtype=str for vlen strings is only supported '
+                    'with format=\'NETCDF4\'.')
         datatype = var.dtype
     return datatype
 
 
 def _nc4_dtype(var):
-    if coding.strings.is_unicode_dtype(var.dtype):
+    if 'dtype' in var.encoding:
+        dtype = var.encoding.pop('dtype')
+        _check_encoding_dtype_is_vlen_string(dtype)
+    elif coding.strings.is_unicode_dtype(var.dtype):
         dtype = str
     elif var.dtype.kind in ['i', 'u', 'f', 'c', 'S']:
         dtype = var.dtype
@@ -172,7 +189,7 @@ def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
 
     safe_to_drop = set(['source', 'original_shape'])
     valid_encodings = set(['zlib', 'complevel', 'fletcher32', 'contiguous',
-                           'chunksizes', 'shuffle', '_FillValue'])
+                           'chunksizes', 'shuffle', '_FillValue', 'dtype'])
     if lsd_okay:
         valid_encodings.add('least_significant_digit')
     if h5py_okay:
@@ -344,6 +361,7 @@ def open_store_variable(self, name, var):
             # save source so __repr__ can detect if it's local or not
             encoding['source'] = self._filename
             encoding['original_shape'] = var.shape
+            encoding['dtype'] = var.dtype
 
         return Variable(dimensions, data, attributes, encoding)
 
@@ -398,7 +416,8 @@ def encode_variable(self, variable):
 
     def prepare_variable(self, name, variable, check_encoding=False,
                          unlimited_dims=None):
-        datatype = _get_datatype(variable, self.format)
+        datatype = _get_datatype(variable, self.format,
+                                 raise_on_invalid_encoding=check_encoding)
         attrs = variable.attrs.copy()
 
         fill_value = attrs.pop('_FillValue', None)
diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py
@@ -43,7 +43,10 @@ def encode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)
 
         contains_unicode = is_unicode_dtype(data.dtype)
-        encode_as_char = 'dtype' in encoding and encoding['dtype'] == 'S1'
+        encode_as_char = encoding.get('dtype') == 'S1'
+
+        if encode_as_char:
+            del encoding['dtype']  # no longer relevant
 
         if contains_unicode and (encode_as_char or not self.allows_unicode):
             if '_FillValue' in attrs:
@@ -100,7 +103,7 @@ def encode(self, variable, name=None):
         variable = ensure_fixed_length_bytes(variable)
 
         dims, data, attrs, encoding = unpack_for_encoding(variable)
-        if data.dtype.kind == 'S':
+        if data.dtype.kind == 'S' and encoding.get('dtype') is not str:
             data = bytes_to_char(data)
             dims = dims + ('string%s' % data.shape[-1],)
         return Variable(dims, data, attrs, encoding)
diff --git a/xarray/conventions.py b/xarray/conventions.py
@@ -79,7 +79,8 @@ def _var_as_tuple(var):
 
 
 def maybe_encode_nonstring_dtype(var, name=None):
-    if 'dtype' in var.encoding and var.encoding['dtype'] != 'S1':
+    if ('dtype' in var.encoding and
+            var.encoding['dtype'] not in ('S1', str)):
         dims, data, attrs, encoding = _var_as_tuple(var)
         dtype = np.dtype(encoding.pop('dtype'))
         if dtype != var.dtype:
@@ -307,12 +308,7 @@ def decode_cf_variable(name, var, concat_characters=True, mask_and_scale=True,
         data = NativeEndiannessArray(data)
         original_dtype = data.dtype
 
-    if 'dtype' in encoding:
-        if original_dtype != encoding['dtype']:
-            warnings.warn("CF decoding is overwriting dtype on variable {!r}"
-                          .format(name))
-    else:
-        encoding['dtype'] = original_dtype
+    encoding.setdefault('dtype', original_dtype)
 
     if 'dtype' in attributes and attributes['dtype'] == 'bool':
         del attributes['dtype']
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
@@ -753,13 +753,26 @@ def test_encoding_kwarg(self):
             with self.roundtrip(ds, save_kwargs=kwargs) as actual:
                 pass
 
+    def test_encoding_kwarg_dates(self):
         ds = Dataset({'t': pd.date_range('2000-01-01', periods=3)})
         units = 'days since 1900-01-01'
         kwargs = dict(encoding={'t': {'units': units}})
         with self.roundtrip(ds, save_kwargs=kwargs) as actual:
             self.assertEqual(actual.t.encoding['units'], units)
             assert_identical(actual, ds)
 
+    def test_encoding_kwarg_fixed_width_string(self):
+        # regression test for GH2149
+        for strings in [
+            [b'foo', b'bar', b'baz'],
+            [u'foo', u'bar', u'baz'],
+        ]:
+            ds = Dataset({'x': strings})
+            kwargs = dict(encoding={'x': {'dtype': 'S1'}})
+            with self.roundtrip(ds, save_kwargs=kwargs) as actual:
+                self.assertEqual(actual['x'].encoding['dtype'], 'S1')
+                assert_identical(actual, ds)
+
     def test_default_fill_value(self):
         # Test default encoding for float:
         ds = Dataset({'x': ('y', np.arange(10.0))})
@@ -879,8 +892,8 @@ def create_tmp_files(nfiles, suffix='.nc', allow_cleanup_failure=False):
         yield files
 
 
-@requires_netCDF4
 class BaseNetCDF4Test(CFEncodedDataTest):
+    """Tests for both netCDF4-python and h5netcdf."""
 
     engine = 'netcdf4'
 
@@ -942,6 +955,18 @@ def test_write_groups(self):
             with self.open(tmp_file, group='data/2') as actual2:
                 assert_identical(data2, actual2)
 
+    def test_encoding_kwarg_vlen_string(self):
+        for input_strings in [
+            [b'foo', b'bar', b'baz'],
+            [u'foo', u'bar', u'baz'],
+        ]:
+            original = Dataset({'x': input_strings})
+            expected = Dataset({'x': [u'foo', u'bar', u'baz']})
+            kwargs = dict(encoding={'x': {'dtype': str}})
+            with self.roundtrip(original, save_kwargs=kwargs) as actual:
+                assert actual['x'].encoding['dtype'] is str
+                assert_identical(actual, expected)
+
     def test_roundtrip_string_with_fill_value_vlen(self):
         values = np.array([u'ab', u'cdef', np.nan], dtype=object)
         expected = Dataset({'x': ('t', values)})
@@ -1054,6 +1079,23 @@ def test_compression_encoding(self):
         with self.roundtrip(expected) as actual:
             assert_equal(expected, actual)
 
+    def test_encoding_kwarg_compression(self):
+        ds = Dataset({'x': np.arange(10.0)})
+        encoding = dict(dtype='f4', zlib=True, complevel=9, fletcher32=True,
+                        chunksizes=(5,), shuffle=True)
+        kwargs = dict(encoding=dict(x=encoding))
+
+        with self.roundtrip(ds, save_kwargs=kwargs) as actual:
+            assert_equal(actual, ds)
+            self.assertEqual(actual.x.encoding['dtype'], 'f4')
+            self.assertEqual(actual.x.encoding['zlib'], True)
+            self.assertEqual(actual.x.encoding['complevel'], 9)
+            self.assertEqual(actual.x.encoding['fletcher32'], True)
+            self.assertEqual(actual.x.encoding['chunksizes'], (5,))
+            self.assertEqual(actual.x.encoding['shuffle'], True)
+
+        self.assertEqual(ds.x.encoding, {})
+
     def test_encoding_chunksizes_unlimited(self):
         # regression test for GH1225
         ds = Dataset({'x': [1, 2, 3], 'y': ('x', [2, 3, 4])})
@@ -1117,7 +1159,7 @@ def test_already_open_dataset(self):
                     expected = Dataset({'x': ((), 42)})
                     assert_identical(expected, ds)
 
-    def test_variable_len_strings(self):
+    def test_read_variable_len_strings(self):
         with create_tmp_file() as tmp_file:
             values = np.array(['foo', 'bar', 'baz'], dtype=object)
 
@@ -1410,6 +1452,10 @@ def test_group(self):
                             open_kwargs={'group': group}) as actual:
             assert_identical(original, actual)
 
+    def test_encoding_kwarg_fixed_width_string(self):
+        # not relevant for zarr, since we don't use EncodedStringCoder
+        pass
+
     # TODO: someone who understand caching figure out whether chaching
     # makes sense for Zarr backend
     @pytest.mark.xfail(reason="Zarr caching not implemented")
@@ -1579,6 +1625,13 @@ def create_store(self):
                     tmp_file, mode='w', format='NETCDF3_CLASSIC') as store:
                 yield store
 
+    def test_encoding_kwarg_vlen_string(self):
+        original = Dataset({'x': [u'foo', u'bar', u'baz']})
+        kwargs = dict(encoding={'x': {'dtype': str}})
+        with raises_regex(ValueError, 'encoding dtype=str for vlen'):
+            with self.roundtrip(original, save_kwargs=kwargs):
+                pass
+
 
 class NetCDF3ViaNetCDF4DataTestAutocloseTrue(NetCDF3ViaNetCDF4DataTest):
     autoclose = True
diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py
@@ -272,7 +272,14 @@ def test_roundtrip_coordinates(self):
                                 'CFEncodedInMemoryStore')
 
     def test_invalid_dataarray_names_raise(self):
+        # only relevant for on-disk file formats
         pass
 
     def test_encoding_kwarg(self):
+        # we haven't bothered to raise errors yet for unexpected encodings in
+        # this test dummy
+        pass
+
+    def test_encoding_kwarg_fixed_width_string(self):
+        # CFEncodedInMemoryStore doesn't support explicit string encodings.
         pass