diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 3a324810281..5e9aa06f507 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,5 +1,4 @@ - [ ] Closes #xxxx (remove if there is no corresponding issue, which should only be the case for minor changes) - [ ] Tests added (for all bug fixes or enhancements) - [ ] Tests passed (for all non-documentation changes) - - [ ] Passes ``git diff upstream/master **/*py | flake8 --diff`` (remove if you did not edit any Python files) - [ ] Fully documented, including `whats-new.rst` for all changes and `api.rst` for new API (remove if this change should not be visible to users, e.g., if it is an internal clean-up, or if this is part of a larger project that will be documented later) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 495b088c837..48de04ed194 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -57,6 +57,11 @@ Enhancements - :py:func:`~plot.line()` learned to draw multiple lines if provided with a 2D variable. By `Deepak Cherian `_. +- Reduce memory usage when decoding a variable with a scale_factor, by + converting 8-bit and 16-bit integers to float32 instead of float64 + (:pull:`1840`), and keeping float16 and float32 as float32 (:issue:`1842`). + Correspondingly, encoded variables may also be saved with a smaller dtype. + By `Zac Hatfield-Dodds `_. .. _Zarr: http://zarr.readthedocs.io/ @@ -76,14 +81,12 @@ Bug fixes - Fixed encoding of multi-dimensional coordinates in :py:meth:`~Dataset.to_netcdf` (:issue:`1763`). By `Mike Neish `_. - - Fixed chunking with non-file-based rasterio datasets (:issue:`1816`) and refactored rasterio test suite. By `Ryan Abernathey `_ - Bug fix in open_dataset(engine='pydap') (:issue:`1775`) By `Keisuke Fujii `_. - -- Bug fix in vectorized assignment (:issue:`1743`, `1744`). +- Bug fix in vectorized assignment (:issue:`1743`, :issue:`1744`). 
def _choose_float_dtype(dtype, has_offset):
    """Return a float dtype that can losslessly represent `dtype` values.

    Parameters
    ----------
    dtype : np.dtype or dtype-like
        Dtype of the data that is about to be scaled and/or offset.
        Anything accepted by ``np.dtype`` (e.g. ``np.int16`` or ``'i2'``)
        is normalised before inspection.
    has_offset : bool
        Whether an ``add_offset`` is applied in addition to any scale factor.

    Returns
    -------
    type
        ``np.float32`` when single precision is considered safe, otherwise
        ``np.float64``.
    """
    # NOTE(review): `has_offset` must be computed before 'add_offset' is
    # removed from the mapping being tested.  The decode call site appears to
    # test `'add_offset' in attrs` after `pop_to(attrs, ...)` -- confirm that
    # this does not make `has_offset` always False there.

    # `.itemsize` is only meaningful on an np.dtype instance, so normalise
    # dtype-likes (scalar types, strings) up front.
    dtype = np.dtype(dtype)

    # Keep float32 as-is.  Upcast half-precision to single-precision,
    # because float16 is "intended for storage but not computation".
    if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
        return np.float32

    # float32 can exactly represent all integers up to 24 bits, so 8-bit and
    # 16-bit integers fit.  A scale factor is entirely safe (vanishing into
    # the mantissa), but a large integer offset could lead to loss of
    # precision.  Sensitivity analysis can be tricky, so we just use float64
    # if there's any offset at all - better unoptimised than wrong!
    small_integer = dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer)
    if small_integer and not has_offset:
        return np.float32

    # For all other types and circumstances, we just use float64.
    # (safe because eg. complex numbers are not supported in NetCDF)
    return np.float64
@@ -216,7 +235,8 @@ def encode(self, variable, name=None): dims, data, attrs, encoding = unpack_for_encoding(variable) if 'scale_factor' in encoding or 'add_offset' in encoding: - data = data.astype(dtype=np.float64, copy=True) + dtype = _choose_float_dtype(data.dtype, 'add_offset' in encoding) + data = data.astype(dtype=dtype, copy=True) if 'add_offset' in encoding: data -= pop_to(encoding, attrs, 'add_offset', name=name) if 'scale_factor' in encoding: @@ -230,7 +250,7 @@ def decode(self, variable, name=None): if 'scale_factor' in attrs or 'add_offset' in attrs: scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name) add_offset = pop_to(attrs, encoding, 'add_offset', name=name) - dtype = np.float64 + dtype = _choose_float_dtype(data.dtype, 'add_offset' in attrs) transform = partial(_scale_offset_decoding, scale_factor=scale_factor, add_offset=add_offset, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 013131a1e7b..9c4db8b4ccb 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -65,7 +65,7 @@ def open_example_dataset(name, *args, **kwargs): def create_masked_and_scaled_data(): - x = np.array([np.nan, np.nan, 10, 10.1, 10.2]) + x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32) encoding = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': np.float32(0.1), 'dtype': 'i2'} return Dataset({'x': ('t', x, {}, encoding)}) @@ -80,7 +80,7 @@ def create_encoded_masked_and_scaled_data(): def create_unsigned_masked_scaled_data(): encoding = {'_FillValue': 255, '_Unsigned': 'true', 'dtype': 'i1', 'add_offset': 10, 'scale_factor': np.float32(0.1)} - x = np.array([10.0, 10.1, 22.7, 22.8, np.nan]) + x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32) return Dataset({'x': ('t', x, {}, encoding)}) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index d1b54fca95e..a6faea8749b 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -1,5 +1,7 
@pytest.mark.parametrize('dtype', 'u1 u2 i1 i2 f2 f4'.split())
def test_scaling_converts_to_float32(dtype):
    """Check that scaling small integer and float data yields float32.

    A variable carrying only a ``scale_factor`` encoding is encoded and then
    decoded again; both intermediate and final results must be float32, and
    the round trip must reproduce the original variable.
    """
    values = np.arange(10, dtype=dtype)
    original = xr.Variable(('x',), values, encoding={'scale_factor': 10})
    scale_coder = variables.CFScaleOffsetCoder()

    # Encoding alone should already pick the narrower float type.
    encoded = scale_coder.encode(original)
    assert encoded.dtype == np.float32

    # Decoding must restore the values and stay in single precision.
    roundtripped = scale_coder.decode(encoded)
    assert_identical(original, roundtripped)
    assert roundtripped.dtype == np.float32