Cohort subsets for Garud H

tomwhite · tomwhite · commit 74a9229881d9 · 2020-11-20T11:52:03.000Z
diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py
@@ -1,5 +1,5 @@
 import collections
-from typing import Hashable, Optional
+from typing import Hashable, Optional, Sequence, Union
 
 import dask.array as da
 import numpy as np
@@ -718,12 +718,12 @@ def _Garud_h(haplotypes: ArrayLike) -> ArrayLike:
 
 
 def _Garud_h_cohorts(
-    gt: ArrayLike, sample_cohort: ArrayLike, n_cohorts: int
+    gt: ArrayLike, sample_cohort: ArrayLike, n_cohorts: int, ct: ArrayLike
 ) -> ArrayLike:
     # transpose to hash columns (haplotypes)
     haplotypes = hash_array(gt.transpose()).transpose().flatten()
-    arr = np.empty((n_cohorts, N_GARUD_H_STATS))
-    for c in range(n_cohorts):
+    arr = np.full((n_cohorts, N_GARUD_H_STATS), np.nan)  # rows not in ct stay NaN
+    for c in np.nditer(ct):  # only the cohort indexes listed in ct
         arr[c, :] = _Garud_h(haplotypes[sample_cohort == c])
     return arr
 
@@ -732,6 +732,7 @@ def Garud_h(
     ds: Dataset,
     *,
     call_genotype: Hashable = variables.call_genotype,
+    cohorts: Optional[Sequence[Union[int, str]]] = None,
     merge: bool = True,
 ) -> Dataset:
     """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
@@ -749,6 +750,10 @@ def Garud_h(
         Input variable name holding call_genotype as defined by
         :data:`sgkit.variables.call_genotype_spec`.
         Must be present in ``ds``.
+    cohorts
+        The cohorts to compute statistics for, specified as a sequence of
+        cohort indexes or IDs. None (the default) means compute statistics
+        for all cohorts.
     merge
         If True (the default), merge the input dataset and the computed
         output variables into a single dataset, otherwise return only
@@ -824,10 +829,12 @@ def Garud_h(
     sc = ds.sample_cohort.values
     hsc = np.stack((sc, sc), axis=1).ravel()  # TODO: assumes diploid
     n_cohorts = sc.max() + 1  # 0-based indexing
+    cohorts = cohorts or range(n_cohorts)
+    ct = _cohorts_to_array(cohorts, ds.indexes.get("cohorts", None))
 
     gh = window_statistic(
         gt,
-        lambda gt: _Garud_h_cohorts(gt, hsc, n_cohorts),
+        lambda gt: _Garud_h_cohorts(gt, hsc, n_cohorts, ct),
         ds.window_start.values,
         ds.window_stop.values,
         dtype=np.float64,
diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py
@@ -362,18 +362,25 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
         ac_j = ds.cohort_allele_count.values[:, j, :]
         ac_k = ds.cohort_allele_count.values[:, k, :]
 
-        ska_pbs_value = allel.pbs(ac_i, ac_j, ac_k, window_size=25, window_step=25)
+        ska_pbs_value = allel.pbs(ac_i, ac_j, ac_k, window_size=25)
 
         # scikit-allel has final window missing
         np.testing.assert_allclose(stat_pbs[:-1], ska_pbs_value)
 
 
 @pytest.mark.parametrize(
-    "n_variants, n_samples, n_contigs, n_cohorts",
-    [(9, 5, 1, 1), (9, 5, 1, 2)],
+    "n_variants, n_samples, n_contigs, n_cohorts, cohorts, cohort_indexes",
+    [
+        (9, 5, 1, 1, None, None),
+        (9, 5, 1, 2, None, None),
+        (9, 5, 1, 2, [1], [1]),
+        (9, 5, 1, 2, ["co_1"], [1]),
+    ],
 )
 @pytest.mark.parametrize("chunks", [(-1, -1), (5, -1)])
-def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
+def test_Garud_h(
+    n_variants, n_samples, n_contigs, n_cohorts, cohorts, cohort_indexes, chunks
+):
     ds = simulate_genotype_call_dataset(
         n_variant=n_variants, n_sample=n_samples, n_contig=n_contigs
     )
@@ -383,25 +390,37 @@ def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
         [np.full_like(subset, i) for i, subset in enumerate(subsets)]
     )
     ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
+    cohort_names = [f"co_{i}" for i in range(n_cohorts)]
+    coords = {k: cohort_names for k in ["cohorts"]}
+    ds = ds.assign_coords(coords)  # type: ignore[no-untyped-call]
     ds = window(ds, size=3)
 
-    gh = Garud_h(ds)
+    gh = Garud_h(ds, cohorts=cohorts)
     h1 = gh.stat_Garud_h1.values
     h12 = gh.stat_Garud_h12.values
     h123 = gh.stat_Garud_h123.values
     h2_h1 = gh.stat_Garud_h2_h1.values
 
     # scikit-allel
     for c in range(n_cohorts):
-        gt = ds.call_genotype.values[:, sample_cohorts == c, :]
-        ska_gt = allel.GenotypeArray(gt)
-        ska_ha = ska_gt.to_haplotypes()
-        ska_h = allel.moving_garud_h(ska_ha, size=3)
-
-        np.testing.assert_allclose(h1[:, c], ska_h[0])
-        np.testing.assert_allclose(h12[:, c], ska_h[1])
-        np.testing.assert_allclose(h123[:, c], ska_h[2])
-        np.testing.assert_allclose(h2_h1[:, c], ska_h[3])
+        if cohort_indexes is not None and c not in cohort_indexes:
+            # cohorts that were not computed should be nan
+            np.testing.assert_array_equal(h1[:, c], np.full_like(h1[:, c], np.nan))
+            np.testing.assert_array_equal(h12[:, c], np.full_like(h12[:, c], np.nan))
+            np.testing.assert_array_equal(h123[:, c], np.full_like(h123[:, c], np.nan))
+            np.testing.assert_array_equal(
+                h2_h1[:, c], np.full_like(h2_h1[:, c], np.nan)
+            )
+        else:
+            gt = ds.call_genotype.values[:, sample_cohorts == c, :]
+            ska_gt = allel.GenotypeArray(gt)
+            ska_ha = ska_gt.to_haplotypes()
+            ska_h = allel.moving_garud_h(ska_ha, size=3)
+
+            np.testing.assert_allclose(h1[:, c], ska_h[0])
+            np.testing.assert_allclose(h12[:, c], ska_h[1])
+            np.testing.assert_allclose(h123[:, c], ska_h[2])
+            np.testing.assert_allclose(h2_h1[:, c], ska_h[3])
 
 
 def test_Garud_h__raise_on_non_diploid():