Skip to content

Bytes attributes are decoded to strings with engine='h5netcdf' #477

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 16, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ Bug fixes
- Fixed a bug in serializing scalar datetime variable to netCDF.
- Fixed a bug that could occur in serialization of 0-dimensional integer arrays.
- Fixed a bug where concatenating DataArrays was not always lazy (:issue:`464`).
- When reading datasets with h5netcdf, bytes attributes are decoded to strings.
This allows conventions decoding to work properly on Python 3 (:issue:`451`).

v0.5.1 (15 June 2015)
---------------------
Expand Down
32 changes: 25 additions & 7 deletions xray/backends/h5netcdf_.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,33 @@
from .. import Variable
from ..conventions import cf_encoder
from ..core import indexing
from ..core.utils import FrozenOrderedDict, close_on_error
from ..core.pycompat import iteritems, basestring, unicode_type, OrderedDict
from ..core.utils import FrozenOrderedDict, close_on_error, Frozen
from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict

from .common import AbstractWritableDataStore
from .netCDF4_ import _nc4_group, _nc4_values_and_dtype


def maybe_decode_bytes(txt):
    """Return *txt* decoded from UTF-8 if it is a bytes object.

    Any non-bytes value (already-unicode strings, numbers, arrays) is
    passed through unchanged.
    """
    if not isinstance(txt, bytes_type):
        return txt
    return txt.decode('utf-8')


def _read_attributes(h5netcdf_var):
    """Read all attributes of an h5netcdf variable/group into an OrderedDict.

    GH451: to ensure conventions decoding works properly on Python 3,
    bytes attributes are decoded to unicode strings.  The fill/missing
    value attributes are left untouched so their type keeps matching the
    variable's data.
    """
    skip_decoding = ('_FillValue', 'missing_value')
    attrs = OrderedDict()
    for name in h5netcdf_var.ncattrs():
        value = h5netcdf_var.getncattr(name)
        if name in skip_decoding:
            attrs[name] = value
        else:
            attrs[name] = maybe_decode_bytes(value)
    return attrs


class H5NetCDFStore(AbstractWritableDataStore):
"""Store for reading and writing data via h5netcdf
"""
Expand All @@ -33,8 +53,7 @@ def store(self, variables, attributes):
def open_store_variable(self, var):
dimensions = var.dimensions
data = indexing.LazilyIndexedArray(var)
attributes = OrderedDict((k, var.getncattr(k))
for k in var.ncattrs())
attrs = _read_attributes(var)

# netCDF4 specific encoding
encoding = dict(var.filters())
Expand All @@ -44,15 +63,14 @@ def open_store_variable(self, var):
# save source so __repr__ can detect if it's local or not
encoding['source'] = self._filename

return Variable(dimensions, data, attributes, encoding)
return Variable(dimensions, data, attrs, encoding)

def get_variables(self):
    """Return a frozen mapping of variable name -> lazily-wrapped Variable."""
    wrapped = ((name, self.open_store_variable(raw))
               for name, raw in iteritems(self.ds.variables))
    return FrozenOrderedDict(wrapped)

def get_attrs(self):
return FrozenOrderedDict((k, self.ds.getncattr(k))
for k in self.ds.ncattrs())
return Frozen(_read_attributes(self.ds))

def get_dimensions(self):
    """Return the dimensions mapping of the underlying h5netcdf dataset."""
    return self.ds.dimensions
Expand Down
21 changes: 20 additions & 1 deletion xray/test/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,7 +661,7 @@ def test_engine(self):
with self.assertRaisesRegexp(ValueError, 'can only read'):
open_dataset(BytesIO(netcdf_bytes), engine='foobar')

def test_cross_engine_read_write(self):
def test_cross_engine_read_write_netcdf3(self):
data = create_test_data()
valid_engines = set()
if has_netCDF4:
Expand Down Expand Up @@ -704,6 +704,25 @@ def test_complex(self):
with self.roundtrip(expected) as actual:
self.assertDatasetEqual(expected, actual)

def test_cross_engine_read_write_netcdf4(self):
    """Datasets written by one netCDF4 engine must read back identically
    with the other (netcdf4 <-> h5netcdf)."""
    original = create_test_data().drop('dim3')
    original.attrs['foo'] = 'bar'
    engines = ['netcdf4', 'h5netcdf']
    for writer in engines:
        with create_tmp_file() as tmp_file:
            original.to_netcdf(tmp_file, engine=writer)
            for reader in engines:
                with open_dataset(tmp_file, engine=reader) as actual:
                    self.assertDatasetIdentical(original, actual)

def test_read_byte_attrs_as_unicode(self):
    """GH451: a bytes attribute written with netCDF4 should be read back
    as a unicode string so conventions decoding works on Python 3."""
    with create_tmp_file() as tmp_file:
        with nc4.Dataset(tmp_file, 'w') as nc:
            nc.foo = b'bar'
        expected = Dataset(attrs={'foo': 'bar'})
        # Use a context manager so the dataset's file handle is released;
        # the original left it dangling open, unlike the other tests here.
        with open_dataset(tmp_file) as actual:
            self.assertDatasetIdentical(expected, actual)


@requires_dask
@requires_netCDF4
Expand Down