From cddfeec21a51dc8d79a20781614f8ea0e6b77bb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Thu, 20 Apr 2023 10:47:21 +0200
Subject: [PATCH 1/7] implement scale_factor/add_offset CF conformance test,
 add and align tests

---
 xarray/coding/variables.py | 102 ++++++++++++++++++++++++++++++++
 xarray/tests/test_backends.py | 91 +++++++++++++++++++----------
 xarray/tests/test_coding.py | 107 +++++++++++++++++++++++++++-------
 3 files changed, 248 insertions(+), 52 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 5c6e51c2215..ca030df9fd0 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -318,6 +318,97 @@ def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[A
 return np.float64

+def _ensure_scale_offset_conformance(
+ mapping: MutableMapping, strict: bool = False
+) -> bool | None:
+ """Check conformance of scale_factor and add_offset for cf encoding/decoding.
+
+ scale_factor and/or add_offset as well as packed dtype are needed within mapping
+ """
+ conforms = True
+ # https://cfconventions.org/cf-conventions/cf-conventions.html#packed-data
+ scale_factor = mapping.get("scale_factor")
+ if np.ndim(scale_factor) > 0:
+ scale_factor = np.asarray(scale_factor).item()
+ add_offset = mapping.get("add_offset")
+ if np.ndim(add_offset) > 0:
+ add_offset = np.asarray(add_offset).item()
+ dtype = mapping.get("dtype")
+ ptype = np.dtype(dtype) if dtype is not None else None
+
+ # get the type from scale_factor/add_offset
+ scale_offset_dtype = list(
+ {np.dtype(type(att)) for att in [scale_factor, add_offset] if att is not None}
+ )
+
+ # raise early, aligns with netcdf4-python
+ if np.float16 in scale_offset_dtype:
+ raise ValueError(
+ f"scale_factor and/or add_offset dtype {scale_offset_dtype} mismatch. "
+ "float16 is not allowed."
+ )
+
+ ptype_exists = ptype is not None
+
+ # no packing information available at all, do nothing
+ if not scale_offset_dtype and not ptype_exists:
+ return None
+
+ # no scale_factor/add_offset available, do nothing
+ if not scale_offset_dtype:
+ return None
+
+ # mandatory packing information missing
+ if scale_offset_dtype and not ptype_exists:
+ raise ValueError("Packed dtype information is missing!")
+
+ if len(scale_offset_dtype) == 1:
+ # OK, we have at least one of scale_factor or add_offset
+ # and if both are given, they are of the same dtype
+ scale_offset_dtype = scale_offset_dtype[0]
+
+ if scale_offset_dtype != ptype:
+ if scale_offset_dtype not in [np.float32, np.float64]:
+ msg = (
+ f"scale_factor and/or add_offset dtype {scale_offset_dtype} "
+ "mismatch. Must be either float32 or float64 dtype."
+ )
+ if strict:
+ raise ValueError(msg)
+ else:
+ warnings.warn(msg, SerializationWarning, stacklevel=3)
+ conforms = False
+ if np.issubdtype(ptype, np.integer) and ptype not in [
+ np.int8,
+ np.int16,
+ np.int32,
+ ]:
+ msg = f"packed dtype {ptype} mismatch. Must be of type byte, short or int."
+ if strict:
+ raise ValueError(msg)
+ else:
+ warnings.warn(msg, SerializationWarning, stacklevel=3)
+ conforms = False
+ if ptype == np.int32 and scale_offset_dtype == np.float32:
+ warnings.warn(
+ "Trying to pack float32 into int32. This is not advised per CF Convention "
+ "because of potential precision loss!",
+ SerializationWarning,
+ stacklevel=3,
+ )
+ else:
+ msg = (
+ f"scale_factor dtype {np.dtype(type(scale_factor))} and add_offset dtype "
+ f"{np.dtype(type(add_offset))} mismatch! Must be of same dtype."
+ ) + if strict: + raise ValueError(msg) + else: + warnings.warn(msg, SerializationWarning, stacklevel=3) + conforms = False + return conforms + + class CFScaleOffsetCoder(VariableCoder): """Scale and offset variables according to CF conventions. @@ -329,6 +420,9 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: dims, data, attrs, encoding = unpack_for_encoding(variable) if "scale_factor" in encoding or "add_offset" in encoding: + # strict checking, raise error on encoding + # we do not want to write non-conforming data + _ensure_scale_offset_conformance(encoding, strict=True) dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) data = data.astype(dtype=dtype, copy=True) if "add_offset" in encoding: @@ -345,6 +439,14 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) add_offset = pop_to(attrs, encoding, "add_offset", name=name) + + # for decoding we need the original dtype + encoding.setdefault("dtype", data.dtype) + + # only warn on decoding + # we try to decode non-conforming data + _ensure_scale_offset_conformance(encoding, strict=False) + dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) if np.ndim(scale_factor) > 0: scale_factor = np.asarray(scale_factor).item() diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7d58c5bfed2..f897beb495b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -17,7 +17,7 @@ from contextlib import ExitStack from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Any, Final, cast +from typing import TYPE_CHECKING, Any, Callable, Final, cast import numpy as np import pandas as pd @@ -138,96 +138,110 @@ def open_example_mfdataset(names, *args, **kwargs) -> Dataset: ) -def create_masked_and_scaled_data() -> Dataset: - x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=np.float32) +def create_masked_and_scaled_data(dtype: type[np.number] = np.float32) -> Dataset: + x = np.array([np.nan, np.nan, 10, 10.1, 10.2], dtype=dtype) encoding = { "_FillValue": -1, - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype(10), + "scale_factor": dtype(0.1), "dtype": "i2", } return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_masked_and_scaled_data() -> Dataset: - attributes = {"_FillValue": -1, "add_offset": 10, "scale_factor": np.float32(0.1)} +def create_encoded_masked_and_scaled_data( + dtype: type[np.number] = np.float32, +) -> Dataset: + attributes = {"_FillValue": -1, "add_offset": dtype(10), "scale_factor": dtype(0.1)} return Dataset( {"x": ("t", np.array([-1, -1, 0, 1, 2], dtype=np.int16), attributes)} ) -def create_unsigned_masked_scaled_data() -> Dataset: +def create_unsigned_masked_scaled_data( + dtype: type[np.number] = np.float32, +) -> Dataset: encoding = { "_FillValue": 255, "_Unsigned": "true", "dtype": "i1", - "add_offset": 10, - "scale_factor": np.float32(0.1), + "add_offset": dtype(10), + "scale_factor": dtype(0.1), } - x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32) + x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype) return Dataset({"x": ("t", x, {}, encoding)}) -def create_encoded_unsigned_masked_scaled_data() -> Dataset: +def create_encoded_unsigned_masked_scaled_data( + dtype: type[np.number] = np.float32, +) -> Dataset: # These are values as written to the file: the _FillValue will # be represented in the signed form. 
 attributes = {
 "_FillValue": -1,
 "_Unsigned": "true",
- "add_offset": 10,
- "scale_factor": np.float32(0.1),
+ "add_offset": dtype(10),
+ "scale_factor": dtype(0.1),
 }
 # Create unsigned data corresponding to [0, 1, 127, 128, 255] unsigned
 sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
 return Dataset({"x": ("t", sb, attributes)})

-def create_bad_unsigned_masked_scaled_data() -> Dataset:
+def create_bad_unsigned_masked_scaled_data(
+ dtype: type[np.number] = np.float32,
+) -> Dataset:
 encoding = {
 "_FillValue": 255,
 "_Unsigned": True,
 "dtype": "i1",
- "add_offset": 10,
- "scale_factor": np.float32(0.1),
+ "add_offset": dtype(0),
+ "scale_factor": dtype(0.1),
 }
- x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=np.float32)
+ x = np.array([10.0, 10.1, 22.7, 22.8, np.nan], dtype=dtype)
 return Dataset({"x": ("t", x, {}, encoding)})

-def create_bad_encoded_unsigned_masked_scaled_data() -> Dataset:
+def create_bad_encoded_unsigned_masked_scaled_data(
+ dtype: type[np.number] = np.float32,
+) -> Dataset:
 # These are values as written to the file: the _FillValue will
 # be represented in the signed form.
 attributes = {
 "_FillValue": -1,
 "_Unsigned": True,
- "add_offset": 10,
- "scale_factor": np.float32(0.1),
+ "add_offset": dtype(10),
+ "scale_factor": dtype(0.1),
 }
 # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
 sb = np.asarray([0, 1, 127, -128, -1], dtype="i1")
 return Dataset({"x": ("t", sb, attributes)})

-def create_signed_masked_scaled_data() -> Dataset:
+def create_signed_masked_scaled_data(
+ dtype: type[np.number] = np.float32,
+) -> Dataset:
 encoding = {
 "_FillValue": -127,
 "_Unsigned": "false",
 "dtype": "i1",
- "add_offset": 10,
- "scale_factor": np.float32(0.1),
+ "add_offset": dtype(10),
+ "scale_factor": dtype(0.1),
 }
- x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=np.float32)
+ x = np.array([-1.0, 10.1, 22.7, np.nan], dtype=dtype)
 return Dataset({"x": ("t", x, {}, encoding)})

-def create_encoded_signed_masked_scaled_data() -> Dataset:
+def create_encoded_signed_masked_scaled_data(
+ dtype: type[np.number] = np.float32,
+) -> Dataset:
 # These are values as written to the file: the _FillValue will
 # be represented in the signed form.
 attributes = {
 "_FillValue": -127,
 "_Unsigned": "false",
- "add_offset": 10,
- "scale_factor": np.float32(0.1),
+ "add_offset": dtype(10),
+ "scale_factor": dtype(0.1),
 }
 # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned
 sb = np.asarray([-110, 1, 127, -127], dtype="i1")
@@ -859,6 +873,8 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
 with self.roundtrip(original) as actual:
 assert_identical(expected, actual)

+ # Todo: (kmuehlbauer) make this work for np.float64
+ @pytest.mark.parametrize("dtype", [np.float32])
 @pytest.mark.parametrize(
 "decoded_fn, encoded_fn",
 [
@@ -878,9 +894,20 @@ def test_roundtrip_string_with_fill_value_nchar(self) -> None:
 (create_masked_and_scaled_data, create_encoded_masked_and_scaled_data),
 ],
 )
- def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None:
- decoded = decoded_fn()
- encoded = encoded_fn()
+ def test_roundtrip_mask_and_scale(
+ self,
+ decoded_fn: Callable[[type[np.number]], Dataset],
+ encoded_fn: Callable[[type[np.number]], Dataset],
+ dtype: type[np.number],
+ ) -> None:
+ if dtype == np.float32 and isinstance(
+ self, (TestZarrDirectoryStore, TestZarrDictStore)
+ ):
+ pytest.skip(
+ "zarr attributes such as 
`scale_factor` are unconditionally promoted to `float64`" + ) + decoded = decoded_fn(dtype) + encoded = encoded_fn(dtype) with self.roundtrip(decoded) as actual: for k in decoded.variables: @@ -901,7 +928,7 @@ def test_roundtrip_mask_and_scale(self, decoded_fn, encoded_fn) -> None: # make sure roundtrip encoding didn't change the # original dataset. - assert_allclose(encoded, encoded_fn(), decode_bytes=False) + assert_allclose(encoded, encoded_fn(dtype), decode_bytes=False) with self.roundtrip(encoded) as actual: for k in decoded.variables: diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index f7579c4b488..b74e6b4dfd4 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -6,9 +6,18 @@ import pandas as pd import pytest -import xarray as xr +from xarray import ( + DataArray, + SerializationWarning, + Variable, +) from xarray.coding import variables -from xarray.conventions import decode_cf_variable, encode_cf_variable +from xarray.conventions import ( + cf_encoder, + decode_cf, + decode_cf_variable, + encode_cf_variable, +) from xarray.tests import assert_allclose, assert_equal, assert_identical, requires_dask with suppress(ImportError): @@ -16,8 +25,8 @@ def test_CFMaskCoder_decode() -> None: - original = xr.Variable(("x",), [0, -1, 1], {"_FillValue": -1}) - expected = xr.Variable(("x",), [0, np.nan, 1]) + original = Variable(("x",), [0, -1, 1], {"_FillValue": -1}) + expected = Variable(("x",), [0, np.nan, 1]) coder = variables.CFMaskCoder() encoded = coder.decode(original) assert_identical(expected, encoded) @@ -45,7 +54,7 @@ def test_CFMaskCoder_decode() -> None: ids=list(CFMASKCODER_ENCODE_DTYPE_CONFLICT_TESTS.keys()), ) def test_CFMaskCoder_encode_missing_fill_values_conflict(data, encoding) -> None: - original = xr.Variable(("x",), data, encoding=encoding) + original = Variable(("x",), data, encoding=encoding) encoded = encode_cf_variable(original) assert encoded.dtype == encoded.attrs["missing_value"].dtype @@ -57,27 +66,27 @@ def test_CFMaskCoder_encode_missing_fill_values_conflict(data, encoding) -> None def test_CFMaskCoder_missing_value() -> None: - expected = xr.DataArray( + expected = DataArray( np.array([[26915, 27755, -9999, 27705], [25595, -9999, 28315, -9999]]), dims=["npts", "ntimes"], name="tmpk", ) expected.attrs["missing_value"] = -9999 - decoded = xr.decode_cf(expected.to_dataset()) - encoded, _ = xr.conventions.cf_encoder(decoded.variables, decoded.attrs) + decoded = decode_cf(expected.to_dataset()) + encoded, _ = cf_encoder(decoded.variables, decoded.attrs) assert_equal(encoded["tmpk"], expected.variable) decoded.tmpk.encoding["_FillValue"] = -9940 with pytest.raises(ValueError): - encoded, _ = xr.conventions.cf_encoder(decoded.variables, decoded.attrs) + encoded, _ = cf_encoder(decoded.variables, decoded.attrs) @requires_dask def test_CFMaskCoder_decode_dask() -> None: - original = xr.Variable(("x",), [0, -1, 1], {"_FillValue": -1}).chunk() - expected = xr.Variable(("x",), [0, np.nan, 1]) + original = Variable(("x",), [0, -1, 1], {"_FillValue": -1}).chunk() + expected = Variable(("x",), [0, np.nan, 1]) coder = variables.CFMaskCoder() encoded = coder.decode(original) assert isinstance(encoded.data, da.Array) @@ -89,16 +98,18 @@ def test_CFMaskCoder_decode_dask() -> None: # TODO(shoyer): parameterize when we have more coders def test_coder_roundtrip() -> None: - original = xr.Variable(("x",), [0.0, np.nan, 1.0]) + original = Variable(("x",), [0.0, np.nan, 1.0]) coder = variables.CFMaskCoder() roundtripped = 
coder.decode(coder.encode(original)) assert_identical(original, roundtripped) -@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split()) +@pytest.mark.parametrize("dtype", "i1 i2 f2 f4".split()) def test_scaling_converts_to_float32(dtype) -> None: - original = xr.Variable( - ("x",), np.arange(10, dtype=dtype), encoding=dict(scale_factor=10) + original = Variable( + ("x",), + np.arange(10, dtype=dtype), + encoding=dict(scale_factor=np.float64(10), dtype=dtype), ) coder = variables.CFScaleOffsetCoder() encoded = coder.encode(original) @@ -108,12 +119,12 @@ def test_scaling_converts_to_float32(dtype) -> None: assert roundtripped.dtype == np.float32 -@pytest.mark.parametrize("scale_factor", (10, [10])) +@pytest.mark.parametrize("scale_factor", (10.0, [10.0])) @pytest.mark.parametrize("add_offset", (0.1, [0.1])) def test_scaling_offset_as_list(scale_factor, add_offset) -> None: # test for #4631 - encoding = dict(scale_factor=scale_factor, add_offset=add_offset) - original = xr.Variable(("x",), np.arange(10.0), encoding=encoding) + encoding = dict(scale_factor=scale_factor, add_offset=add_offset, dtype="i2") + original = Variable(("x",), np.arange(10.0), encoding=encoding) coder = variables.CFScaleOffsetCoder() encoded = coder.encode(original) roundtripped = coder.decode(encoded) @@ -125,7 +136,7 @@ def test_decode_unsigned_from_signed(bits) -> None: unsigned_dtype = np.dtype(f"u{bits}") signed_dtype = np.dtype(f"i{bits}") original_values = np.array([np.iinfo(unsigned_dtype).max], dtype=unsigned_dtype) - encoded = xr.Variable( + encoded = Variable( ("x",), original_values.astype(signed_dtype), attrs={"_Unsigned": "true"} ) coder = variables.UnsignedIntegerCoder() @@ -139,10 +150,66 @@ def test_decode_signed_from_unsigned(bits) -> None: unsigned_dtype = np.dtype(f"u{bits}") signed_dtype = np.dtype(f"i{bits}") original_values = np.array([-1], dtype=signed_dtype) - encoded = xr.Variable( + encoded = Variable( ("x",), original_values.astype(unsigned_dtype), attrs={"_Unsigned": "false"} ) coder = variables.UnsignedIntegerCoder() decoded = coder.decode(encoded) assert decoded.dtype == signed_dtype assert decoded.values == original_values + + +def test_ensure_scale_offset_conformance(): + # scale offset dtype mismatch + mapping = dict(scale_factor=np.float16(10), add_offset=np.float64(-10), dtype="i2") + with pytest.raises(ValueError, match="float16 is not allowed"): + variables._ensure_scale_offset_conformance(mapping) + + # do nothing + mapping = dict() + assert variables._ensure_scale_offset_conformance(mapping) is None + + # do nothing + mapping = dict(dtype="i2") + assert variables._ensure_scale_offset_conformance(mapping) is None + + # mandatory packing information missing + mapping = dict(scale_factor=np.float32(10)) + with pytest.raises(ValueError, match="Packed dtype information is missing!"): + variables._ensure_scale_offset_conformance(mapping) + + # scale offset dtype mismatch 1 + mapping = dict(scale_factor=np.int32(10), add_offset=np.int32(10), dtype="i2") + with pytest.warns( + SerializationWarning, match="Must be either float32 or float64 dtype." + ): + variables._ensure_scale_offset_conformance(mapping, strict=False) + with pytest.raises(ValueError, match="Must be either float32 or float64 dtype."): + variables._ensure_scale_offset_conformance(mapping, strict=True) + + # packed dtype mismatch + mapping = dict(scale_factor=np.float32(10), add_offset=np.float32(10), dtype="u2") + with pytest.warns( + SerializationWarning, match="Must be of type byte, short or int." 
+ ): + variables._ensure_scale_offset_conformance(mapping, strict=False) + with pytest.raises(ValueError, match="Must be of type byte, short or int."): + variables._ensure_scale_offset_conformance(mapping, strict=True) + + # pack float32 into int32 + mapping = dict(scale_factor=np.float32(10), add_offset=np.float32(10), dtype="i4") + with pytest.warns(SerializationWarning, match="Trying to pack float32 into int32."): + variables._ensure_scale_offset_conformance(mapping) + + # scale offset dtype mismatch + mapping = dict(scale_factor=np.float32(10), add_offset=np.float64(-10), dtype="i2") + with pytest.warns( + SerializationWarning, + match="scale_factor dtype float32 and add_offset dtype float64 mismatch!", + ): + variables._ensure_scale_offset_conformance(mapping, strict=False) + with pytest.raises( + ValueError, + match="scale_factor dtype float32 and add_offset dtype float64 mismatch!", + ): + variables._ensure_scale_offset_conformance(mapping, strict=True) From 445ba916076fe5b9ae5f8df02ebd65b96353a5d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 20 Apr 2023 13:08:18 +0200 Subject: [PATCH 2/7] fix typing --- xarray/coding/variables.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index ca030df9fd0..3421f9ca140 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -319,7 +319,7 @@ def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[A def _ensure_scale_offset_conformance( - mapping: MutableMapping, strict: bool = False + mapping: MutableMapping[str, Any], strict: bool = False ) -> bool | None: """Check conformance of scale_factor and add_offset for cf encoding/decoding. @@ -365,12 +365,10 @@ def _ensure_scale_offset_conformance( if len(scale_offset_dtype) == 1: # OK, we have at least one of scale_factor or add_offset # and if both are given, they are of the same dtype - scale_offset_dtype = scale_offset_dtype[0] - - if scale_offset_dtype != ptype: - if scale_offset_dtype not in [np.float32, np.float64]: + if scale_offset_dtype[0] != ptype: + if scale_offset_dtype[0] not in [np.float32, np.float64]: msg = ( - f"scale_factor and/or add_offset dtype {scale_offset_dtype} " + f"scale_factor and/or add_offset dtype {scale_offset_dtype[0]} " "mismatch. Must be either float32 or float64 dtype." ) if strict: @@ -389,7 +387,7 @@ def _ensure_scale_offset_conformance( else: warnings.warn(msg, SerializationWarning, stacklevel=3) conforms = False - if ptype == np.int32 and scale_offset_dtype == np.float32: + if ptype == np.int32 and scale_offset_dtype[0] == np.float32: warnings.warn( "Trying to pack float32 into int32. 
This is not advised per CF Convention " "because of potential precision loss!", From a88d987363230f1f66480cd97472f1e536d3307f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 20 Apr 2023 13:57:38 +0200 Subject: [PATCH 3/7] fix check, adapt tests --- xarray/coding/variables.py | 15 +++++++++++++-- xarray/tests/test_coding.py | 20 ++++++++++++++------ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 3421f9ca140..4383761d515 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -329,10 +329,18 @@ def _ensure_scale_offset_conformance( # https://cfconventions.org/cf-conventions/cf-conventions.html#packed-data scale_factor = mapping.get("scale_factor") if np.ndim(scale_factor) > 0: - scale_factor = np.asarray(scale_factor).item() + if strict: + raise ValueError(f"scale_factor {scale_factor} mismatch, scalar expected.") + else: + scale_factor = np.asarray(scale_factor).item() + add_offset = mapping.get("add_offset") if np.ndim(add_offset) > 0: - add_offset = np.asarray(add_offset).item() + if strict: + raise ValueError(f"add_offset {add_offset} mismatch, scalar expected.") + else: + add_offset = np.asarray(add_offset).item() + dtype = mapping.get("dtype") ptype = np.dtype(dtype) if dtype is not None else None @@ -446,10 +454,13 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: _ensure_scale_offset_conformance(encoding, strict=False) dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) + # if scale/offset are non-conforming, extract and overwrite if np.ndim(scale_factor) > 0: scale_factor = np.asarray(scale_factor).item() + encoding["scale_factor"] = scale_factor if np.ndim(add_offset) > 0: add_offset = np.asarray(add_offset).item() + encoding["add_offset"] = add_offset transform = partial( _scale_offset_decoding, scale_factor=scale_factor, diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index b74e6b4dfd4..f01ea86a577 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -18,7 +18,7 @@ decode_cf_variable, encode_cf_variable, ) -from xarray.tests import assert_allclose, assert_equal, assert_identical, requires_dask +from xarray.tests import assert_equal, assert_identical, requires_dask with suppress(ImportError): import dask.array as da @@ -123,12 +123,20 @@ def test_scaling_converts_to_float32(dtype) -> None: @pytest.mark.parametrize("add_offset", (0.1, [0.1])) def test_scaling_offset_as_list(scale_factor, add_offset) -> None: # test for #4631 - encoding = dict(scale_factor=scale_factor, add_offset=add_offset, dtype="i2") - original = Variable(("x",), np.arange(10.0), encoding=encoding) + # start with encoded Variables + attrs = dict(scale_factor=scale_factor, add_offset=add_offset) + bad_original = Variable(("x",), np.arange(10, dtype="i2"), attrs=attrs) + if np.ndim(scale_factor) > 0: + scale_factor = np.asarray(scale_factor).item() + if np.ndim(add_offset) > 0: + add_offset = np.asarray(add_offset).item() + attrs = dict(scale_factor=scale_factor, add_offset=add_offset) + good_original = Variable(("x",), np.arange(10, dtype="i2"), attrs=attrs) + coder = variables.CFScaleOffsetCoder() - encoded = coder.encode(original) - roundtripped = coder.decode(encoded) - assert_allclose(original, roundtripped) + decoded = coder.decode(bad_original) + roundtripped = coder.encode(decoded) + assert_identical(good_original, roundtripped) @pytest.mark.parametrize("bits", [1, 2, 4, 8]) From 
08bc49532d9002c93f27f850274dc9ce9b49a226 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Thu, 20 Apr 2023 14:12:41 +0200
Subject: [PATCH 4/7] use function to fix wrong values on decoding

---
 xarray/coding/variables.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 4383761d515..3ba9ecd50fb 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -333,6 +333,7 @@ def _ensure_scale_offset_conformance(
 raise ValueError(f"scale_factor {scale_factor} mismatch, scalar expected.")
 else:
 scale_factor = np.asarray(scale_factor).item()
+ mapping["scale_factor"] = scale_factor

 add_offset = mapping.get("add_offset")
 if np.ndim(add_offset) > 0:
@@ -340,6 +341,7 @@
 raise ValueError(f"add_offset {add_offset} mismatch, scalar expected.")
 else:
 add_offset = np.asarray(add_offset).item()
+ mapping["add_offset"] = add_offset

 dtype = mapping.get("dtype")
 ptype = np.dtype(dtype) if dtype is not None else None
@@ -449,18 +451,11 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:

 # for decoding we need the original dtype
 encoding.setdefault("dtype", data.dtype)

- # only warn on decoding
+ # only warn on decoding, but fix erroneous encoding
 # we try to decode non-conforming data
 _ensure_scale_offset_conformance(encoding, strict=False)

 dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
- # if scale/offset are non-conforming, extract and overwrite
- if np.ndim(scale_factor) > 0:
- scale_factor = np.asarray(scale_factor).item()
- encoding["scale_factor"] = scale_factor
- if np.ndim(add_offset) > 0:
- add_offset = np.asarray(add_offset).item()
- encoding["add_offset"] = add_offset
 transform = partial(
 _scale_offset_decoding,
 scale_factor=scale_factor,
 add_offset=add_offset,
 dtype=dtype,
 )
 data = lazy_elemwise_func(data, transform, dtype)

From abb7bb45fedb26f75aa9f7f97a4b6a06c08209b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Thu, 20 Apr 2023 15:55:32 +0200
Subject: [PATCH 5/7] use attributes from within encoding

---
 xarray/coding/variables.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
index 3ba9ecd50fb..21e726a6e0e 100644
--- a/xarray/coding/variables.py
+++ b/xarray/coding/variables.py
@@ -445,8 +445,8 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
 if "scale_factor" in _attrs or "add_offset" in _attrs:
 dims, data, attrs, encoding = unpack_for_decoding(variable)

- scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
- add_offset = pop_to(attrs, encoding, "add_offset", name=name)
+ pop_to(attrs, encoding, "scale_factor", name=name)
+ pop_to(attrs, encoding, "add_offset", name=name)

 # for decoding we need the original dtype
 encoding.setdefault("dtype", data.dtype)
@@ -458,8 +458,8 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
 dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
 transform = partial(
 _scale_offset_decoding,
- scale_factor=scale_factor,
- add_offset=add_offset,
+ scale_factor=encoding["scale_factor"],
+ add_offset=encoding["add_offset"],
 dtype=dtype,
 )
 data = lazy_elemwise_func(data, transform, dtype)

From 4267f288b71e65cb426d73361514748b9b85c20d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Thu, 20 Apr 2023 17:21:06 +0200
Subject: [PATCH 6/7] prevent KeyError

---
 xarray/coding/variables.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git 
a/xarray/coding/variables.py b/xarray/coding/variables.py index 21e726a6e0e..52dd833cac1 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -328,7 +328,7 @@ def _ensure_scale_offset_conformance( conforms = True # https://cfconventions.org/cf-conventions/cf-conventions.html#packed-data scale_factor = mapping.get("scale_factor") - if np.ndim(scale_factor) > 0: + if scale_factor is not None and np.ndim(scale_factor) > 0: if strict: raise ValueError(f"scale_factor {scale_factor} mismatch, scalar expected.") else: @@ -336,7 +336,7 @@ def _ensure_scale_offset_conformance( mapping["scale_factor"] = scale_factor add_offset = mapping.get("add_offset") - if np.ndim(add_offset) > 0: + if add_offset is not None and np.ndim(add_offset) > 0: if strict: raise ValueError(f"add_offset {add_offset} mismatch, scalar expected.") else: @@ -458,8 +458,8 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) transform = partial( _scale_offset_decoding, - scale_factor=encoding["scale_factor"], - add_offset=encoding["add_offset"], + scale_factor=encoding.get("scale_factor"), + add_offset=encoding.get("add_offset"), dtype=dtype, ) data = lazy_elemwise_func(data, transform, dtype) From 3c745ed959657e5da8e6d4aa66f1e06f0a4e1655 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 6 May 2023 10:01:17 +0200 Subject: [PATCH 7/7] Update xarray/tests/test_coding.py --- xarray/tests/test_coding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index f01ea86a577..6998f047f8f 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -167,7 +167,7 @@ def test_decode_signed_from_unsigned(bits) -> None: assert decoded.values == original_values -def test_ensure_scale_offset_conformance(): +def test_ensure_scale_offset_conformance() -> None: # scale offset dtype mismatch mapping = dict(scale_factor=np.float16(10), add_offset=np.float64(-10), dtype="i2") with pytest.raises(ValueError, match="float16 is not allowed"):
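---

Usage note (editor's addition, not part of the patch series): the snippet below is a
minimal sketch of how the new conformance check is expected to behave once the series
is applied. It uses only pieces exercised by the tests above (`Variable`,
`variables.CFScaleOffsetCoder`); the sample data values are made up for illustration.

    import numpy as np
    from xarray import Variable
    from xarray.coding import variables

    coder = variables.CFScaleOffsetCoder()

    # Conforming per CF: float32 scale_factor/add_offset packed into int16.
    # The strict check performed on encode passes silently.
    good = Variable(
        ("x",),
        np.array([10.0, 10.1, 10.2], dtype="f4"),
        encoding={
            "scale_factor": np.float32(0.1),
            "add_offset": np.float32(10),
            "dtype": "i2",
        },
    )
    coder.encode(good)  # no warning, no error

    # Non-conforming: float16 packing attributes are rejected outright,
    # aligning with netcdf4-python.
    bad = Variable(
        ("x",),
        np.array([10.0, 10.1, 10.2]),
        encoding={"scale_factor": np.float16(0.1), "dtype": "i2"},
    )
    try:
        coder.encode(bad)
    except ValueError as err:
        print(err)  # "... float16 is not allowed."

On decode the same check runs with strict=False, so files written by non-conforming
producers still open: a SerializationWarning is emitted and repairable issues (e.g.
one-element lists as scale_factor/add_offset) are fixed up in the encoding instead of
raising.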