Merge branch 'master' into cohort-subsets

mergify[bot] · web-flow · commit 189c9646049f · 2020-11-24T14:58:35.000Z
diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py
@@ -88,7 +88,9 @@ def _count_cohort_alleles(
     n_samples, n_alleles = ac.shape
     for i in range(n_samples):
         for j in range(n_alleles):
-            out[cohorts[i], j] += ac[i, j]
+            c = cohorts[i]
+            if c >= 0:
+                out[c, j] += ac[i, j]
 
 
 def count_call_alleles(
diff --git a/sgkit/stats/association.py b/sgkit/stats/association.py
@@ -96,9 +96,10 @@ def linear_regression(
     T = B / np.sqrt(RSS / dof / XLPS)
     assert T.shape == (n_loop_covar, n_outcome)
     # Match to p-values
-    # Note: t dist not implemented in Dask so this must be delayed
+    # Note: t dist not implemented in Dask so this must be delayed,
+    # see https://github.com/dask/dask/issues/6857
     P = da.map_blocks(
-        lambda t: 2 * stats.distributions.t.sf(np.abs(T), dof), T, dtype="float64"
+        lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof), T, dtype="float64"
     )
     assert P.shape == (n_loop_covar, n_outcome)
 
diff --git a/sgkit/stats/popgen.py b/sgkit/stats/popgen.py
@@ -149,13 +149,14 @@ def _divergence(ac: ArrayLike, out: ArrayLike) -> None:  # pragma: no cover
     for i in range(n_cohorts):
         for j in range(i + 1, n_cohorts):
             n_pairs = an[i] * an[j]
-            n_same = 0
-            for k in range(n_alleles):
-                n_same += ac[i, k] * ac[j, k]
-            n_diff = n_pairs - n_same
-            div = n_diff / n_pairs
-            out[i, j] = div
-            out[j, i] = div
+            if n_pairs != 0.0:
+                n_same = 0
+                for k in range(n_alleles):
+                    n_same += ac[i, k] * ac[j, k]
+                n_diff = n_pairs - n_same
+                div = n_diff / n_pairs
+                out[i, j] = div
+                out[j, i] = div
 
     # calculate the diversity for each cohort
     for i in range(n_cohorts):
diff --git a/sgkit/tests/test_aggregation.py b/sgkit/tests/test_aggregation.py
@@ -216,13 +216,14 @@ def test_count_call_alleles__chunked():
 def test_count_cohort_alleles__multi_variant_multi_sample():
     ds = get_dataset(
         [
-            [[0, 0], [0, 0], [0, 0]],
-            [[0, 0], [0, 0], [0, 1]],
-            [[1, 1], [0, 1], [1, 0]],
-            [[1, 1], [1, 1], [1, 1]],
+            [[0, 0], [0, 0], [0, 0], [0, 0]],
+            [[0, 0], [0, 0], [0, 1], [0, 1]],
+            [[1, 1], [0, 1], [1, 0], [1, 0]],
+            [[1, 1], [1, 1], [1, 1], [1, 1]],
         ]
     )
-    ds["sample_cohort"] = xr.DataArray(np.array([0, 1, 1]), dims="samples")
+    # -1 means that the sample is not in any cohort
+    ds["sample_cohort"] = xr.DataArray(np.array([0, 1, 1, -1]), dims="samples")
     ds = count_cohort_alleles(ds)
     ac = ds.cohort_allele_count
     np.testing.assert_equal(
diff --git a/sgkit/tests/test_association.py b/sgkit/tests/test_association.py
@@ -9,7 +9,6 @@
 from pandas import DataFrame
 from xarray import Dataset
 
-from sgkit import variables
 from sgkit.stats.association import gwas_linear_regression, linear_regression
 from sgkit.typing import ArrayLike
 
@@ -175,16 +174,24 @@ def validate(dfp: DataFrame, dft: DataFrame) -> None:
 
 def test_gwas_linear_regression__lazy_results(ds):
     res = gwas_linear_regression(
-        ds, dosage="dosage", covariates="covar_0", traits="trait_0"
+        ds, dosage="dosage", covariates="covar_0", traits="trait_0", merge=False
     )
-    for v in [
-        variables.variant_beta,
-        variables.variant_t_value,
-        variables.variant_p_value,
-    ]:
+    for v in res:
         assert isinstance(res[v].data, da.Array)
 
 
+@pytest.mark.parametrize("chunks", [5, -1, "auto"])
+def test_gwas_linear_regression__variable_shapes(ds, chunks):
+    ds = ds.chunk(chunks=chunks)
+    res = gwas_linear_regression(
+        ds, dosage="dosage", covariates="covar_0", traits="trait_0", merge=False
+    )
+    shape = (ds.dims["variants"], 1)
+    for v in res:
+        assert res[v].data.shape == shape
+        assert res[v].data.compute().shape == shape
+
+
 def test_gwas_linear_regression__multi_trait(ds):
     def run(traits: Sequence[str]) -> Dataset:
         return gwas_linear_regression(
diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py
@@ -22,6 +22,8 @@
 )
 from sgkit.window import window
 
+from .test_aggregation import get_dataset
+
 
 def ts_to_dataset(ts, chunks=None, samples=None):
     """
@@ -203,6 +205,17 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
     )  # scikit-allel has final window missing
 
 
+def test_divergence__missing_calls():
+    ds = get_dataset(
+        [
+            [[0, 0], [-1, -1], [-1, -1]],  # all calls in cohort 1 are missing
+        ]
+    )
+    ds["sample_cohort"] = xr.DataArray(np.array([0, 1, 1]), dims="samples")
+    ds = divergence(ds)
+    np.testing.assert_equal(ds["stat_divergence"].values[0, 1], np.nan)
+
+
 @pytest.mark.parametrize("sample_size", [2, 3, 10, 100])
 def test_Fst__Hudson(sample_size):
     # scikit-allel can only calculate Fst for pairs of cohorts (populations)