data_diff/lexicographic_space.py

"""Contains the implementation of two classes:

- LexicographicSpace - a lexicographic space of arbitrary dimensions.
- BoundedLexicographicSpace - a lexicographic space, where the lowest point may be non-zero.

A lexicographic space is a space of increasing natural values, ordered by lexicographic order.
Read more: https://mathworld.wolfram.com/LexicographicOrder.html

These abstractions were written to support compound keys in the hashdiff algorithm.
In the hashdiff algorithm, we rely on the order of the column keys, to segment the table correctly.
SQL orders the columns of tables based on lexicographic ordering.
Since we need an evenly spaced "range" function over the space, which has arbitrary dimensions, we have
to implement it ourself.

As a further optimization, since we each time operate on segments of the ordered table, we add support
for working with a restricted space, which will reduce the likelihood of gaps in our "select", when the
keys are not evenly distributed.
"""

from random import randint, randrange

from typing import Tuple

import attrs

from data_diff.utils import safezip

Vector = Tuple[int]
Interval = Tuple[int]


class Overflow(ValueError):
    pass


def neg_interval(interval):
    return tuple(-i for i in interval)


def neg_v(v: Vector):
    return tuple(-i for i in v)


def sub_v(v1: Vector, v2: Vector):
    return tuple(i1 - i2 for i1, i2 in safezip(v1, v2))


def add_v(v1: Vector, v2: Vector):
    return tuple(i1 + i2 for i1, i2 in safezip(v1, v2))


def rand_v_in_range(v1: Vector, v2: Vector):
    return tuple(irandrange(i1, i2) for i1, i2 in safezip(v1, v2))


def irandrange(start, stop):
    if start == stop:
        return start
    return randrange(start, stop)


@attrs.define(frozen=True)
class LexicographicSpace:
    """Lexicographic space of arbitrary dimensions.

    All elements must be of the same length as the number of dimensions. (no rpadding)
    """

    def __init__(self, dims: Vector) -> None:
        super().__init__()
        self.dims = dims

    def __contains__(self, v: Vector) -> bool:
        return all(0 <= i < d for i, d in safezip(v, self.dims))

    def add(self, v1: Vector, v2: Vector) -> Vector:
        # assert v1 in self and v2 in self, (v1, v2)

        carry = 0
        res = []
        for i1, i2, d in reversed(list(safezip(v1, v2, self.dims))):
            n = i1 + i2 + carry
            carry = n // d
            assert carry <= 1
            n %= d
            res.append(n)

        if carry:
            raise Overflow("Overflow")

        new_v = tuple(reversed(res))
        assert new_v in self
        return new_v

    def sub(self, v1: Vector, v2: Vector):
        return self.add(v1, neg_v(v2))

    def _divide(self, v: Vector, count: int):
        n = 0
        for x, d in zip(v, self.dims[1:] + (1,), strict=True):
            x += n
            rem = x % count
            n = rem * d
            yield x // count

    def divide(self, v: Vector, count: int) -> Vector:
        return tuple(self._divide(v, count))

    def range(self, min_value: Vector, max_value: Vector, count: int):
        assert min_value in self and max_value in self
        count -= 1
        size = self.sub(max_value, min_value)
        interval = self.divide(size, count)
        n = min_value
        for i in range(count):
            yield n
            n = self.add(n, interval)
        yield n


class BoundedLexicographicSpace:
    """Lexicographic space of arbitrary dimensions, where the lowest point may be non-zero.

    i.e. a space resticted by a "bounding-box" between two arbitrary points.
    """

    def __init__(self, min_bound: Vector, max_bound: Vector) -> None:
        super().__init__()

        dims = tuple(mx - mn for mn, mx in safezip(min_bound, max_bound))
        if not all(d >= 0 for d in dims):
            raise ValueError("Error: Negative dimension!")
        if not (dims[0] > 0):
            raise ValueError("First dimension must be non-zero!")

        self.min_bound = min_bound
        self.max_bound = max_bound

        self.uspace = LexicographicSpace(dims)

    def __contains__(self, p: Vector) -> bool:
        return all(mn <= i < mx for i, mn, mx in safezip(p, self.min_bound, self.max_bound))

    def to_uspace(self, v: Vector) -> Vector:
        assert v in self
        return sub_v(v, self.min_bound)

    def from_uspace(self, v: Vector) -> Vector:
        res = add_v(v, self.min_bound)
        assert res in self
        return res

    def add_interval(self, v1: Vector, interval: Interval) -> Vector:
        return self.from_uspace(self.uspace.add(self.to_uspace(v1), interval))

    def sub_interval(self, v1: Vector, interval: Interval) -> Vector:
        return self.from_uspace(self.uspace.sub(self.to_uspace(v1), interval))

    def sub(self, v1: Vector, v2: Vector) -> Interval:
        return self.uspace.sub(self.to_uspace(v1), self.to_uspace(v2))

    def range(self, min_value: Vector, max_value: Vector, count: int):
        return [
            self.from_uspace(v) for v in self.uspace.range(self.to_uspace(min_value), self.to_uspace(max_value), count)
        ]


def test_lex_space():
    # Test add
    binspace = LexicographicSpace((2, 2, 2, 2))
    zero = (0, 0, 0, 0)
    one = (0, 0, 0, 1)
    bin_nums = [zero]
    for i in range(15):
        last = bin_nums[-1]
        bin_nums.append(binspace.add(last, one))
    five = bin_nums[5]
    seven = bin_nums[7]
    eight = bin_nums[8]
    fifteen = bin_nums[15]

    assert binspace.add(binspace.add(one, five), one) == seven
    assert binspace.add(one, seven) == eight
    assert binspace.add(seven, eight) == fifteen

    assert binspace.sub(eight, one) == seven
    assert binspace.sub(fifteen, seven) == eight

    r = list(binspace.range(one, seven, 4))
    assert r == [one, bin_nums[3], five, seven], r

    decspace = LexicographicSpace((10, 10, 10))
    assert decspace.divide((4, 5, 2), 2) == (2, 2, 6)
    assert decspace.divide((3, 0, 2), 2) == (1, 5, 1)

    # Restricted space

    rspace1 = BoundedLexicographicSpace((2, 2), (8, 8))
    assert rspace1.add_interval((2, 2), (0, 0)) == (2, 2)
    assert rspace1.add_interval((2, 2), (0, 1)) == (2, 3)
    assert rspace1.add_interval((2, 2), (0, 6)) == (3, 2)
    assert rspace1.add_interval((2, 2), (0, 7)) == (3, 3)
    # space.add((2,2), (6, 0))    # Overflow

    rspace2 = BoundedLexicographicSpace((4, 4, 4, 4), (6, 6, 6, 6))
    _one = (4, 4, 4, 5)
    _three = (4, 4, 5, 5)
    _five = (4, 5, 4, 5)
    _seven = (4, 5, 5, 5)
    assert rspace2.add_interval(rspace2.add_interval(_five, one), one) == _seven
    assert rspace2.sub_interval(rspace2.sub_interval(_seven, one), one) == _five

    r = list(rspace2.range(_one, _seven, 4))
    assert r == [_one, _three, _five, _seven], r

    # Test range -
    # For random bounds and min/max values, assert that range() generates steps with uniform distances
    MAX_COLUMNS = 16
    MAX_DIM = 10000
    MAX_BISECTION = 128

    for n in range(1, MAX_COLUMNS):
        min_bound = tuple(randint(0, MAX_DIM) for i in range(n))
        size = tuple(randint(1, MAX_DIM) for i in range(n))
        max_bound = add_v(min_bound, size)

        sp = BoundedLexicographicSpace(min_bound, max_bound)

        max_value = rand_v_in_range(min_bound, max_bound)
        min_value = rand_v_in_range(min_bound, max_value)
        for count in range(2, MAX_BISECTION):
            r = sp.range(min_value, max_value, count)
            assert len(r) == count
            diffs = [sp.sub(b, a) for a, b in zip(r[:-1], r[1:])]
            assert len(set(diffs)) == 1  # Uniform!
            # print('.', end='')


if __name__ == "__main__":
    test_lex_space()