Read small integers as float32, not float64

Zac-HD · Zac-HD · commit 2e25e5e62e7f · 2018-01-21T14:35:04.000+11:00
AKA the "I just wasted 4.6 TB of memory" patch.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -55,6 +55,11 @@ Enhancements
 - :py:func:`~plot.line()` learned to draw multiple lines if provided with a
   2D variable.
   By `Deepak Cherian <https://github.com/dcherian>`_.
+- Reduce memory usage when decoding a variable with a scale_factor, by
+  converting 8-bit and 16-bit integers to float32 instead of float64
+  (:pull:`1840`), and decoding float16 and float32 as float32 (:issue:`1842`).
+  Correspondingly, encoded variables may also be saved with a smaller dtype.
+  By `Zac Hatfield-Dodds <https://github.com/Zac-HD>`_.
 
 .. _Zarr: http://zarr.readthedocs.io/
 
@@ -74,11 +79,9 @@ Bug fixes
 - Fixed encoding of multi-dimensional coordinates in
   :py:meth:`~Dataset.to_netcdf` (:issue:`1763`).
   By `Mike Neish <https://github.com/neishm>`_.
-
 - Bug fix in open_dataset(engine='pydap') (:issue:`1775`)
   By `Keisuke Fujii <https://github.com/fujiisoup>`_.
-
-- Bug fix in vectorized assignment  (:issue:`1743`, `1744`).
+- Bug fix in vectorized assignment  (:issue:`1743`, :issue:`1744`).
   Now item assignment to :py:meth:`~DataArray.__setitem__` checks
 - Bug fix in vectorized assignment  (:issue:`1743`, :issue:`1744`).
   Now item assignment to :py:meth:`DataArray.__setitem__` checks
diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
@@ -205,6 +205,25 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype):
     return data
 
 
+def _choose_float_dtype(dtype, has_offset):
+    """Return a float dtype that can losslessly represent `dtype` values.
+
+    `has_offset` tells whether an add_offset is applied to the data too.
+    """
+    # Floats of up to 4 bytes: keep float32, and upcast float16 because it
+    # is "intended for storage but not computation".
+    if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
+        return np.float32
+    # float32 can exactly represent all integers up to 24 bits.  A scale
+    # factor is safe, but a large offset could lose precision; rather than
+    # analyse that, any offset means float64 - better unoptimised than wrong!
+    if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
+        if not has_offset:
+            return np.float32
+    # All else is float64 (safe: eg. complex is not supported in NetCDF).
+    return np.float64
+
+
 class CFScaleOffsetCoder(VariableCoder):
     """Scale and offset variables according to CF conventions.
 
@@ -216,7 +235,8 @@ def encode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)
 
         if 'scale_factor' in encoding or 'add_offset' in encoding:
-            data = data.astype(dtype=np.float64, copy=True)
+            dtype = _choose_float_dtype(data.dtype, 'add_offset' in encoding)
+            data = data.astype(dtype=dtype, copy=True)
             if 'add_offset' in encoding:
                 data -= pop_to(encoding, attrs, 'add_offset', name=name)
             if 'scale_factor' in encoding:
@@ -230,7 +250,9 @@ def decode(self, variable, name=None):
         if 'scale_factor' in attrs or 'add_offset' in attrs:
             scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name)
             add_offset = pop_to(attrs, encoding, 'add_offset', name=name)
-            dtype = np.float64
+            # pop_to() above has already moved 'add_offset' out of attrs and
+            # into encoding, so its presence has to be tested on encoding.
+            dtype = _choose_float_dtype(data.dtype, 'add_offset' in encoding)
             transform = partial(_scale_offset_decoding,
                                 scale_factor=scale_factor,
                                 add_offset=add_offset,
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
@@ -65,7 +65,7 @@ def open_example_dataset(name, *args, **kwargs):
 
 
 def create_masked_and_scaled_data():
-    x = np.array([np.nan, np.nan, 10, 10.1, 10.2])
+    x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32)
     encoding = {'_FillValue': -1, 'add_offset': 10,
                 'scale_factor': np.float32(0.1), 'dtype': 'i2'}
     return Dataset({'x': ('t', x, {}, encoding)})
@@ -80,7 +80,7 @@ def create_encoded_masked_and_scaled_data():
 def create_unsigned_masked_scaled_data():
     encoding = {'_FillValue': 255, '_Unsigned': 'true', 'dtype': 'i1',
                 'add_offset': 10, 'scale_factor': np.float32(0.1)}
-    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan])
+    x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
     return Dataset({'x': ('t', x, {}, encoding)})
 
 
diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py
@@ -1,5 +1,7 @@
 import numpy as np
 
+import pytest
+
 import xarray as xr
 from xarray.core.pycompat import suppress
 from xarray.coding import variables
@@ -36,3 +38,15 @@ def test_coder_roundtrip():
     coder = variables.CFMaskCoder()
     roundtripped = coder.decode(coder.encode(original))
     assert_identical(original, roundtripped)
+
+
+@pytest.mark.parametrize('dtype', 'u1 u2 i1 i2 f2 f4'.split())
+def test_scaling_converts_to_float32(dtype):
+    original = xr.Variable(('x',), np.arange(10, dtype=dtype),
+                           encoding=dict(scale_factor=10))  # no add_offset
+    coder = variables.CFScaleOffsetCoder()
+    encoded = coder.encode(original)
+    assert encoded.dtype == np.float32  # encode must not upcast to float64
+    roundtripped = coder.decode(encoded)
+    assert_identical(original, roundtripped)
+    assert roundtripped.dtype == np.float32  # nor may decode