Commit f659398

Read small integers as float32, not float64
AKA the "I just wasted 4.6 TB of memory" patch.
1 parent f2ea7b6 · commit f659398

2 files changed: +22 −5 lines

doc/whats-new.rst

+4 −3
@@ -51,6 +51,9 @@ Enhancements
 - :py:func:`~plot.line()` learned to draw multiple lines if provided with a
   2D variable.
   By `Deepak Cherian <https://github.com/dcherian>`_.
+- Reduce memory usage when decoding a variable with a scale_factor, by
+  converting 8-bit and 16-bit integers to float32 instead of float64 (:pull:`1840`).
+  By `Zac Hatfield-Dodds <https://github.com/Zac-HD>`_.

 .. _Zarr: http://zarr.readthedocs.io/

@@ -67,11 +70,9 @@ Bug fixes
 - Fixed encoding of multi-dimensional coordinates in
   :py:meth:`~Dataset.to_netcdf` (:issue:`1763`).
   By `Mike Neish <https://github.com/neishm>`_.
-
 - Bug fix in open_dataset(engine='pydap') (:issue:`1775`)
   By `Keisuke Fujii <https://github.com/fujiisoup>`_.
-
-- Bug fix in vectorized assignment (:issue:`1743`, `1744`).
+- Bug fix in vectorized assignment (:issue:`1743`, :issue:`1744`).
   Now item assignment to :py:meth:`~DataArray.__setitem__` checks
 - Bug fix in vectorized assignment (:issue:`1743`, :issue:`1744`).
   Now item assignment to :py:meth:`DataArray.__setitem__` checks
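
The new entry above describes behaviour users see through the normal CF-decoding path. As a rough illustration (mine, not part of the commit), an 8- or 16-bit integer variable with a scale_factor attribute now decodes to float32, halving the memory of the decoded values:

    import numpy as np
    import xarray as xr

    # One int16 variable carrying a CF scale_factor attribute, as it would
    # look when read from disk with decoding switched off.
    raw = xr.Dataset(
        {"t": ("x", np.arange(10, dtype="int16"), {"scale_factor": 0.01})}
    )

    decoded = xr.decode_cf(raw)
    print(decoded["t"].dtype)   # float32 after this patch (previously float64)
    print(decoded["t"].nbytes)  # 40 bytes for 10 values, instead of 80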

xarray/coding/variables.py

+18 −2
@@ -212,11 +212,27 @@ class CFScaleOffsetCoder(VariableCoder):
         decode_values = encoded_values * scale_factor + add_offset
     """

+    @staticmethod
+    def _choose_float_dtype(data, has_offset):
+        """Return a float dtype sufficient to losslessly represent `data`."""
+        # float32 can exactly represent all integers up to 24 bits
+        if data.dtype.itemsize <= 2 and np.issubdtype(data.dtype, np.integer):
+            # A scale factor is entirely safe (vanishing into the mantissa),
+            # but a large integer offset could lead to loss of precision.
+            # Sensitivity analysis can be tricky, so we just use a float64
+            # if there's any offset at all - better unoptimised than wrong!
+            if not has_offset:
+                return np.float32
+        # For all other types and circumstances, we just use float64.
+        # (safe because eg. complex numbers are not supported in NetCDF)
+        return np.float64
+
     def encode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)

         if 'scale_factor' in encoding or 'add_offset' in encoding:
-            data = data.astype(dtype=np.float64, copy=True)
+            dtype = self._choose_float_dtype(data, 'add_offset' in encoding)
+            data = data.astype(dtype=dtype, copy=True)
             if 'add_offset' in encoding:
                 data -= pop_to(encoding, attrs, 'add_offset', name=name)
             if 'scale_factor' in encoding:
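
The comments in the new _choose_float_dtype helper above rest on two claims: float32 has a 24-bit significand, so every 8- or 16-bit integer survives the cast exactly, while a large add_offset may not. A quick numpy check of both (mine, not part of the commit):

    import numpy as np

    # Every int16 value round-trips exactly through float32.
    ints = np.arange(-32768, 32768, dtype=np.int16)
    assert np.array_equal(ints.astype(np.float32).astype(np.int64), ints)

    # Beyond 2**24, float32 can no longer separate neighbouring integers,
    # which is why the coder keeps float64 whenever an add_offset is present.
    offset = np.float32(2 ** 25)
    assert offset + np.float32(1) == offset                # lost in float32
    assert np.float64(offset) + 1.0 != np.float64(offset)  # exact in float64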
@@ -230,7 +246,7 @@ def decode(self, variable, name=None):
         if 'scale_factor' in attrs or 'add_offset' in attrs:
             scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name)
             add_offset = pop_to(attrs, encoding, 'add_offset', name=name)
-            dtype = np.float64
+            dtype = self._choose_float_dtype(data, 'add_offset' in attrs)
             transform = partial(_scale_offset_decoding,
                                 scale_factor=scale_factor,
                                 add_offset=add_offset,
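
The decode hunk does not transform the data eagerly; it binds the chosen dtype into a functools.partial so decoding can run lazily later. _scale_offset_decoding itself sits outside this hunk, so the following is only a sketch consistent with the keyword arguments bound here, not the actual implementation:

    from functools import partial

    import numpy as np

    # Hypothetical stand-in for the real _scale_offset_decoding: cast to the
    # chosen float dtype first, then apply the linear CF transform in place.
    def _scale_offset_decoding(data, scale_factor, add_offset, dtype):
        data = np.array(data, dtype=dtype, copy=True)
        if scale_factor is not None:
            data *= scale_factor
        if add_offset is not None:
            data += add_offset
        return data

    transform = partial(_scale_offset_decoding,
                        scale_factor=0.01, add_offset=None, dtype=np.float32)
    print(transform(np.arange(5, dtype=np.int16)).dtype)  # float32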
