Bitround Codec (#299)

rabernat · web-flow · commit aab539276448 · 2022-05-18T12:15:53.000-07:00
diff --git a/docs/bitround.rst b/docs/bitround.rst
@@ -0,0 +1,11 @@
+Bitround
+========
+.. automodule:: numcodecs.bitround
+
+.. autoclass:: BitRound
+
+    .. autoattribute:: codec_id
+    .. automethod:: encode
+    .. automethod:: decode
+    .. automethod:: get_config
+    .. automethod:: from_config
diff --git a/docs/index.rst b/docs/index.rst
@@ -71,6 +71,7 @@ Contents
     delta
     fixedscaleoffset
     quantize
+    bitround
     packbits
     categorize
     checksum32
diff --git a/docs/release.rst b/docs/release.rst
@@ -36,6 +36,9 @@ Unreleased
   By :user:`Dimitri Papadopoulos Orfanos <DimitriPapadopoulos>`,
   :issue:`295`, :issue:`294`, :issue:`293`, and :issue:`292`.
 
+* Add bitround codec
+  By :user:`Ryan Abernathy <rabernat>` and :user:`Martin Durant <martindurant>`, :issue:`298`.
+
 * Drop Python 3.6
   By :user:`Josh Moore <joshmoore>`, :issue:`318`.
 
diff --git a/numcodecs/bitround.py b/numcodecs/bitround.py
@@ -0,0 +1,80 @@
+import numpy as np
+
+
+from .abc import Codec
+from .compat import ensure_ndarray_like, ndarray_copy
+
+# The size in bits of the mantissa/significand for the various floating types
+# You cannot keep more bits of data than you have available
+# https://en.wikipedia.org/wiki/IEEE_754
+max_bits = {
+    "float16": 10,
+    "float32": 23,
+    "float64": 52,
+}
+
+
+class BitRound(Codec):
+    """Floating-point bit rounding codec
+
+    Drops a specified number of bits from the floating point mantissa,
+    leaving an array more amenable to compression. The number of bits to keep should
+    be determined by an information analysis of the data to be compressed.
+    The approach is based on the paper by Klöwer et al. 2021
+    (https://www.nature.com/articles/s43588-021-00156-2). See
+    https://github.com/zarr-developers/numcodecs/issues/298 for discussion
+    and the original implementation in Julia referred to at
+    https://github.com/milankl/BitInformation.jl
+
+    Parameters
+    ----------
+
+    keepbits: int
+        The number of bits of the mantissa to keep. The range allowed
+        depends on the dtype input data. If keepbits is
+        equal to the maximum allowed for the data type, this is equivalent
+        to no transform.
+    """
+
+    codec_id = 'bitround'
+
+    def __init__(self, keepbits: int):
+        if keepbits < 0:
+            raise ValueError("keepbits must be zero or positive")
+        self.keepbits = keepbits
+
+    def encode(self, buf):
+        """Create int array by rounding floating-point data
+
+        The itemsize will be preserved, but the output should be much more
+        compressible.
+        """
+        a = ensure_ndarray_like(buf)
+        if not a.dtype.kind == "f" or a.dtype.itemsize > 8:
+            raise TypeError("Only float arrays (16-64bit) can be bit-rounded")
+        bits = max_bits[str(a.dtype)]
+        # cast float to int type of same width (preserve endianness)
+        a_int_dtype = np.dtype(a.dtype.str.replace("f", "i"))
+        all_set = np.array(-1, dtype=a_int_dtype)
+        if self.keepbits == bits:
+            return a
+        if self.keepbits > bits:
+            raise ValueError("Keepbits too large for given dtype")
+        b = a.view(a_int_dtype)
+        maskbits = bits - self.keepbits
+        mask = (all_set >> maskbits) << maskbits
+        half_quantum1 = (1 << (maskbits - 1)) - 1
+        b += ((b >> maskbits) & 1) + half_quantum1
+        b &= mask
+        return b
+
+    def decode(self, buf, out=None):
+        """Remake floats from ints
+
+        As with ``encode``, preserves itemsize.
+        """
+        buf = ensure_ndarray_like(buf)
+        # Cast back from `int` to `float` type (noop if a `float`ing type buffer is provided)
+        dt = np.dtype(buf.dtype.str.replace("i", "f"))
+        data = buf.view(dt)
+        return ndarray_copy(data, out)
diff --git a/numcodecs/tests/test_bitround.py b/numcodecs/tests/test_bitround.py
@@ -0,0 +1,80 @@
+import numpy as np
+
+import pytest
+
+from numcodecs.bitround import BitRound, max_bits
+
+# adapted from https://github.com/milankl/BitInformation.jl/blob/main/test/round_nearest.jl
+
+
+# TODO: add other dtypes
+@pytest.fixture(params=["float32", "float64"])
+def dtype(request):
+    return request.param
+
+
+def round(data, keepbits):
+    codec = BitRound(keepbits=keepbits)
+    data = data.copy()  # otherwise overwrites the input
+    encoded = codec.encode(data)
+    return codec.decode(encoded)
+
+
+def test_round_zero_to_zero(dtype):
+    a = np.zeros((3, 2), dtype=dtype)
+    # Don't understand Milan's original test:
+    # How is it possible to have negative keepbits?
+    # for k in range(-5, 50):
+    for k in range(0, max_bits[dtype]):
+        ar = round(a, k)
+        np.testing.assert_equal(a, ar)
+
+
+def test_round_one_to_one(dtype):
+    a = np.ones((3, 2), dtype=dtype)
+    for k in range(0, max_bits[dtype]):
+        ar = round(a, k)
+        np.testing.assert_equal(a, ar)
+
+
+def test_round_minus_one_to_minus_one(dtype):
+    a = -np.ones((3, 2), dtype=dtype)
+    for k in range(0, max_bits[dtype]):
+        ar = round(a, k)
+        np.testing.assert_equal(a, ar)
+
+
+def test_no_rounding(dtype):
+    a = np.random.random_sample((300, 200)).astype(dtype)
+    keepbits = max_bits[dtype]
+    ar = round(a, keepbits)
+    np.testing.assert_equal(a, ar)
+
+
+APPROX_KEEPBITS = {"float32": 11, "float64": 18}
+
+
+def test_approx_equal(dtype):
+    a = np.random.random_sample((300, 200)).astype(dtype)
+    ar = round(a, APPROX_KEEPBITS[dtype])
+    # Mimic julia behavior - https://docs.julialang.org/en/v1/base/math/#Base.isapprox
+    rtol = np.sqrt(np.finfo(np.float32).eps)
+    # This gets us much closer but still failing for ~6% of the array
+    # It does pass if we add 1 to keepbits (11 instead of 10)
+    # Is there an off-by-one issue here?
+    np.testing.assert_allclose(a, ar, rtol=rtol)
+
+
+def test_idempotence(dtype):
+    a = np.random.random_sample((300, 200)).astype(dtype)
+    for k in range(20):
+        ar = round(a, k)
+        ar2 = round(a, k)
+        np.testing.assert_equal(ar, ar2)
+
+
+def test_errors():
+    with pytest.raises(ValueError):
+        BitRound(keepbits=99).encode(np.array([0], dtype="float32"))
+    with pytest.raises(TypeError):
+        BitRound(keepbits=10).encode(np.array([0]))