Skip to content

Commit 2e25e5e

Browse files
committed
Read small integers as float32, not float64
AKA the "I just wasted 4.6 TB of memory" patch.
1 parent 5320826 commit 2e25e5e

File tree

4 files changed

+44
-7
lines changed

4 files changed

+44
-7
lines changed

doc/whats-new.rst

+6-3
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,11 @@ Enhancements
5555
- :py:func:`~plot.line()` learned to draw multiple lines if provided with a
5656
2D variable.
5757
By `Deepak Cherian <https://github.com/dcherian>`_.
58+
- Reduce memory usage when decoding a variable with a scale_factor, by
59+
converting 8-bit and 16-bit integers to float32 instead of float64
60+
(:pull:`1840`), and keeping float16 and float32 as float32 (:issue:`1842`).
61+
Correspondingly, encoded variables may also be saved with a smaller dtype.
62+
By `Zac Hatfield-Dodds <https://github.com/Zac-HD>`_.
5863

5964
.. _Zarr: http://zarr.readthedocs.io/
6065

@@ -74,11 +79,9 @@ Bug fixes
7479
- Fixed encoding of multi-dimensional coordinates in
7580
:py:meth:`~Dataset.to_netcdf` (:issue:`1763`).
7681
By `Mike Neish <https://github.com/neishm>`_.
77-
7882
- Bug fix in open_dataset(engine='pydap') (:issue:`1775`)
7983
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
80-
81-
- Bug fix in vectorized assignment (:issue:`1743`, `1744`).
84+
- Bug fix in vectorized assignment (:issue:`1743`, :issue:`1744`).
8285
Now item assignment to :py:meth:`~DataArray.__setitem__` checks
8386
- Bug fix in vectorized assignment (:issue:`1743`, :issue:`1744`).
8487
Now item assignment to :py:meth:`DataArray.__setitem__` checks

xarray/coding/variables.py

+22-2
Original file line number | Diff line number | Diff line change
@@ -205,6 +205,25 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype):
205205
return data
206206

207207

208+
def _choose_float_dtype(dtype, has_offset):
    """Return a float dtype that can losslessly represent `dtype` values.

    Parameters
    ----------
    dtype : numpy.dtype or dtype-like
        Dtype of the data that is about to be scaled and/or offset.
        Anything accepted by ``np.dtype`` (for example ``np.int16`` or
        ``'i2'``) is normalised to an ``np.dtype`` instance first.
    has_offset : bool
        Whether an offset is applied in addition to any scale factor.

    Returns
    -------
    type
        Either ``np.float32`` or ``np.float64``.
    """
    # Normalise the argument so that ``.itemsize`` is meaningful even when a
    # scalar type or a dtype string is passed instead of an ``np.dtype``.
    dtype = np.dtype(dtype)
    # Keep float32 as-is.  Upcast half-precision to single-precision,
    # because float16 is "intended for storage but not computation"
    if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
        return np.float32
    # float32 can exactly represent all integers up to 24 bits
    if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
        # A scale factor is entirely safe (vanishing into the mantissa),
        # but a large integer offset could lead to loss of precision.
        # Sensitivity analysis can be tricky, so we just use a float64
        # if there's any offset at all - better unoptimised than wrong!
        if not has_offset:
            return np.float32
    # For all other types and circumstances, we just use float64.
    # (safe because eg. complex numbers are not supported in NetCDF)
    return np.float64
225+
226+
208227
class CFScaleOffsetCoder(VariableCoder):
209228
"""Scale and offset variables according to CF conventions.
210229
@@ -216,7 +235,8 @@ def encode(self, variable, name=None):
216235
dims, data, attrs, encoding = unpack_for_encoding(variable)
217236

218237
if 'scale_factor' in encoding or 'add_offset' in encoding:
219-
data = data.astype(dtype=np.float64, copy=True)
238+
dtype = _choose_float_dtype(data.dtype, 'add_offset' in encoding)
239+
data = data.astype(dtype=dtype, copy=True)
220240
if 'add_offset' in encoding:
221241
data -= pop_to(encoding, attrs, 'add_offset', name=name)
222242
if 'scale_factor' in encoding:
@@ -230,7 +250,7 @@ def decode(self, variable, name=None):
230250
if 'scale_factor' in attrs or 'add_offset' in attrs:
231251
scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name)
232252
add_offset = pop_to(attrs, encoding, 'add_offset', name=name)
233-
dtype = np.float64
253+
dtype = _choose_float_dtype(data.dtype, 'add_offset' in attrs)
234254
transform = partial(_scale_offset_decoding,
235255
scale_factor=scale_factor,
236256
add_offset=add_offset,

xarray/tests/test_backends.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,7 @@ def open_example_dataset(name, *args, **kwargs):
6565

6666

6767
def create_masked_and_scaled_data():
68-
x = np.array([np.nan, np.nan, 10, 10.1, 10.2])
68+
x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32)
6969
encoding = {'_FillValue': -1, 'add_offset': 10,
7070
'scale_factor': np.float32(0.1), 'dtype': 'i2'}
7171
return Dataset({'x': ('t', x, {}, encoding)})
@@ -80,7 +80,7 @@ def create_encoded_masked_and_scaled_data():
8080
def create_unsigned_masked_scaled_data():
8181
encoding = {'_FillValue': 255, '_Unsigned': 'true', 'dtype': 'i1',
8282
'add_offset': 10, 'scale_factor': np.float32(0.1)}
83-
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan])
83+
x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
8484
return Dataset({'x': ('t', x, {}, encoding)})
8585

8686

xarray/tests/test_coding.py

+14
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,7 @@
11
import numpy as np
22

3+
import pytest
4+
35
import xarray as xr
46
from xarray.core.pycompat import suppress
57
from xarray.coding import variables
@@ -36,3 +38,15 @@ def test_coder_roundtrip():
3638
coder = variables.CFMaskCoder()
3739
roundtripped = coder.decode(coder.encode(original))
3840
assert_identical(original, roundtripped)
41+
42+
43+
@pytest.mark.parametrize('dtype', 'u1 u2 i1 i2 f2 f4'.split())
def test_scaling_converts_to_float32(dtype):
    """Check that scale-only coding of `dtype` data yields float32.

    Encodes a ten-element variable carrying only a ``scale_factor``
    encoding, then decodes it again, asserting float32 at both stages and
    that the round-tripped variable is identical to the original.
    """
    values = np.arange(10, dtype=dtype)
    original = xr.Variable(('x',), values, encoding={'scale_factor': 10})
    coder = variables.CFScaleOffsetCoder()

    # Encoding step: the scaled data must be stored as single precision.
    encoded = coder.encode(original)
    assert encoded.dtype == np.float32

    # Decoding step: same contents come back, still as single precision.
    roundtripped = coder.decode(encoded)
    assert_identical(original, roundtripped)
    assert roundtripped.dtype == np.float32

0 commit comments

Comments
 (0)