Cohort utilities

tomwhite · tomwhite · commit 55e8e89bb87d · 2020-11-20T11:52:03.000Z
diff --git a/sgkit/cohorts.py b/sgkit/cohorts.py
@@ -0,0 +1,60 @@
+from typing import Optional, Sequence, Tuple, Union
+
+import numpy as np
+import pandas as pd
+
+from sgkit.typing import ArrayLike
+
+
+def _tuple_len(t: Union[int, Tuple[int, ...], str, Tuple[str, ...]]) -> int:
+    """Return the length of a tuple, or 1 for an int (incl. NumPy ints) or string."""
+    if isinstance(t, (int, np.integer, str)):  # scalar cohorts count as one element
+        return 1
+    return len(t)
+
+
+def _cohorts_to_array(
+    cohorts: Sequence[Union[int, Tuple[int, ...], str, Tuple[str, ...]]],
+    index: Optional[pd.Index] = None,
+) -> ArrayLike:
+    """Convert a sequence of cohorts or cohort tuples to an array of cohort indexes.
+
+    Cohorts may be indexes (as in ``sample_cohorts``) or labels resolved via ``index``.
+
+    Parameters
+    ----------
+    cohorts
+        A sequence of values or tuples representing cohorts or cohort tuples.
+    index
+        An index to turn labels into index locations, by default None.
+
+    Returns
+    -------
+    An array of shape ``(len(cohorts), tuple_len)``, where ``tuple_len`` is the length
+    of the tuples, or 1 for plain values. Empty ``cohorts`` gives an empty 1-D array.
+
+    Raises
+    ------
+    ValueError
+        If the cohort tuples are not all the same length.
+    KeyError
+        If a cohort label is not found in ``index``.
+    """
+    if len(cohorts) == 0:
+        return np.array([], np.int32)
+
+    tuple_len = _tuple_len(cohorts[0])
+    if not all(_tuple_len(cohort) == tuple_len for cohort in cohorts):
+        raise ValueError("Cohort tuples must all be the same length")
+
+    if index is not None:  # convert labels, including 1-tuples such as ("c0",)
+        first = cohorts[0]
+        if isinstance(first, str):
+            cohorts = [index.get_loc(label) for label in cohorts]
+        elif tuple_len > 0 and not np.isscalar(first) and isinstance(first[0], str):  # type: ignore
+            cohorts = [tuple(index.get_loc(label) for label in t) for t in cohorts]  # type: ignore
+
+    ct = np.empty((len(cohorts), tuple_len), np.int32)
+    for n, t in enumerate(cohorts):
+        ct[n, :] = t
+    return ct
diff --git a/sgkit/tests/test_cohorts.py b/sgkit/tests/test_cohorts.py
@@ -0,0 +1,53 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from sgkit.cohorts import _cohorts_to_array, _tuple_len
+
+
+def test_tuple_len():
+    """Scalars count as one element; tuples report their own length."""
+    # the empty tuple is the only input of length zero
+    assert _tuple_len(tuple()) == 0
+    # ints and strings are single values, however many characters a string has,
+    # and 1-tuples wrapping them have length one too
+    for single in (1, "a", "ab", (1,), ("a",), ("ab",)):
+        assert _tuple_len(single) == 1
+    # longer tuples report their number of elements
+    for pair in ((1, 2), ("a", "b"), ("ab", "cd")):
+        assert _tuple_len(pair) == 2
+
+
+def test_cohorts_to_array__indexes():
+    """Cohorts given as integer indexes need no ``index`` argument."""
+    # tuples of differing lengths are rejected
+    with pytest.raises(ValueError, match="Cohort tuples must all be the same length"):
+        _cohorts_to_array([(0, 1), (0, 1, 2)])
+
+    np.testing.assert_equal(_cohorts_to_array([]), np.array([]))
+    # scalars become single-column rows; tuples become one row each
+    singles, pairs, triples = [0, 1], [(0, 1), (2, 1)], [(0, 1, 2), (3, 1, 2)]
+    np.testing.assert_equal(_cohorts_to_array(singles), np.array([[0], [1]]))
+    np.testing.assert_equal(_cohorts_to_array(pairs), np.array(pairs))
+    np.testing.assert_equal(_cohorts_to_array(triples), np.array(triples))
+
+
+def test_cohorts_to_array__ids():
+    """Cohorts given as string labels are resolved to positions via a ``pd.Index``."""
+    # the length check precedes label lookup, so no index is needed to trigger it
+    with pytest.raises(ValueError, match="Cohort tuples must all be the same length"):
+        _cohorts_to_array([("c0", "c1"), ("c0", "c1", "c2")])
+
+    ids = pd.Index(["c0", "c1", "c2", "c3"])
+    # empty input yields an empty array even when an index is supplied
+    np.testing.assert_equal(_cohorts_to_array([], ids), np.array([]))
+    # labels map to their position in the index, for scalars and tuples alike
+    np.testing.assert_equal(_cohorts_to_array(["c0", "c1"], ids), np.array([[0], [1]]))
+    np.testing.assert_equal(
+        _cohorts_to_array([("c0", "c1"), ("c2", "c1")], ids),
+        np.array([[0, 1], [2, 1]]),
+    )
+    np.testing.assert_equal(
+        _cohorts_to_array([("c0", "c1", "c2"), ("c3", "c1", "c2")], ids),
+        np.array([[0, 1, 2], [3, 1, 2]]),
+    )