Add cohort_statistic function pystatgen#730

timothymillar · timothymillar · commit 778d71bbef14 · 2021-12-08T21:13:03.000+13:00
diff --git a/sgkit/cohorts.py b/sgkit/cohorts.py
@@ -1,5 +1,6 @@
-from typing import Optional, Sequence, Tuple, Union
+from typing import Any, Callable, Optional, Sequence, Tuple, Union
 
+import dask.array as da
 import numpy as np
 import pandas as pd
 
@@ -70,3 +71,40 @@ def _cohorts_to_array(
     for n, t in enumerate(cohorts):
         ct[n, :] = t
     return ct
+
+
+def cohort_statistic(
+    values: ArrayLike,
+    statistic: Callable[..., ArrayLike],
+    cohorts: ArrayLike,
+    sample_axis: int = 1,
+    **kwargs: Any,
+) -> da.Array:
+    """Calculate a statistic for each cohort of samples.
+
+    Parameters
+    ----------
+    values
+        An n-dimensional array of sample values.
+    statistic
+        A callable applied once per cohort to the samples of that cohort. The
+        callable is expected to consume the samples axis.
+    cohorts
+        An array of integers indicating which cohort each sample is assigned to.
+        Negative integers indicate that a sample is not assigned to any cohort.
+    sample_axis
+        Integer indicating the samples axis of the values array.
+    kwargs
+        Keyword arguments to pass to the callable statistic.
+
+    Returns
+    -------
+    Dask array of per-cohort results, stacked along ``sample_axis``.
+    """
+    values = da.asarray(values)
+    cohorts = np.array(cohorts)
+    n_cohorts = cohorts.max() + 1  # cohort ids run from 0 to the largest id
+    idx = [cohorts == c for c in range(n_cohorts)]  # one boolean mask per cohort
+    seq = [da.take(values, i, axis=sample_axis) for i in idx]
+    out = da.stack([statistic(c, **kwargs) for c in seq], axis=sample_axis)
+    return out
diff --git a/sgkit/tests/test_cohorts.py b/sgkit/tests/test_cohorts.py
@@ -1,8 +1,9 @@
+import dask.array as da
 import numpy as np
 import pandas as pd
 import pytest
 
-from sgkit.cohorts import _cohorts_to_array, _tuple_len
+from sgkit.cohorts import _cohorts_to_array, _tuple_len, cohort_statistic
 
 
 def test_tuple_len():
@@ -51,3 +52,53 @@ def test_cohorts_to_array__ids():
         ),
         np.array([[0, 1, 2], [3, 1, 2]]),
     )
+
+
+@pytest.mark.parametrize(
+    "statistic,expect",
+    [
+        (
+            np.mean,
+            [
+                [1.0, 0.75, 0.5],
+                [2 / 3, 0.25, 0.0],
+                [2 / 3, 0.75, 0.5],
+                [2 / 3, 0.5, 1.0],
+                [1 / 3, 0.5, 0.0],
+            ],
+        ),
+        (np.sum, [[3, 3, 1], [2, 1, 0], [2, 3, 1], [2, 2, 2], [1, 2, 0]]),
+    ],
+)
+@pytest.mark.parametrize(
+    "chunks",
+    [
+        ((5,), (10,)),  # a single chunk
+        ((3, 2), (10,)),  # chunked along the first axis only
+        ((3, 2), (5, 5)),  # chunked along both axes
+    ],
+)
+def test_cohort_statistic(statistic, expect, chunks):
+    variables = da.asarray(
+        [
+            [1, 1, 1, 0, 1, 1, 0, 0, 1, 1],
+            [0, 0, 1, 0, 1, 0, 0, 0, 1, 0],
+            [0, 0, 1, 1, 1, 1, 1, 1, 1, 0],
+            [1, 1, 1, 1, 0, 0, 0, 0, 1, 1],
+            [0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
+        ],
+        chunks=chunks,
+    )
+    cohorts = np.array([0, 1, 0, 2, 0, 1, -1, 1, 1, 2])  # -1: unassigned sample
+    np.testing.assert_array_equal(  # axis=1 is forwarded to statistic via kwargs
+        expect, cohort_statistic(variables, statistic, cohorts, axis=1)
+    )
+
+
+def test_cohort_statistic_axis0():
+    variables = da.asarray([2, 3, 2, 4, 3, 1, 4, 5, 3, 1])  # 1-d: samples on axis 0
+    cohorts = np.array([0, 0, 0, 0, 0, -1, 1, 1, 1, 2])  # -1: unassigned sample
+    np.testing.assert_array_equal(
+        [2.8, 4.0, 1.0],  # per-cohort means: 14/5, 12/3, 1/1
+        cohort_statistic(variables, np.mean, cohorts, sample_axis=0, axis=0),
+    )