Skip to content

Commit 189c964

Browse files
authored
Merge branch 'master' into cohort-subsets
2 parents 2332fe7 + 3b13a7b commit 189c964

File tree

6 files changed

+47
-22
lines changed

6 files changed

+47
-22
lines changed

sgkit/stats/aggregation.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,9 @@ def _count_cohort_alleles(
8888
n_samples, n_alleles = ac.shape
8989
for i in range(n_samples):
9090
for j in range(n_alleles):
91-
out[cohorts[i], j] += ac[i, j]
91+
c = cohorts[i]
92+
if c >= 0:
93+
out[c, j] += ac[i, j]
9294

9395

9496
def count_call_alleles(

sgkit/stats/association.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,10 @@ def linear_regression(
9696
T = B / np.sqrt(RSS / dof / XLPS)
9797
assert T.shape == (n_loop_covar, n_outcome)
9898
# Match to p-values
99-
# Note: t dist not implemented in Dask so this must be delayed
99+
# Note: t dist not implemented in Dask so this must be delayed,
100+
# see https://github.com/dask/dask/issues/6857
100101
P = da.map_blocks(
101-
lambda t: 2 * stats.distributions.t.sf(np.abs(T), dof), T, dtype="float64"
102+
lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof), T, dtype="float64"
102103
)
103104
assert P.shape == (n_loop_covar, n_outcome)
104105

sgkit/stats/popgen.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -149,13 +149,14 @@ def _divergence(ac: ArrayLike, out: ArrayLike) -> None: # pragma: no cover
149149
for i in range(n_cohorts):
150150
for j in range(i + 1, n_cohorts):
151151
n_pairs = an[i] * an[j]
152-
n_same = 0
153-
for k in range(n_alleles):
154-
n_same += ac[i, k] * ac[j, k]
155-
n_diff = n_pairs - n_same
156-
div = n_diff / n_pairs
157-
out[i, j] = div
158-
out[j, i] = div
152+
if n_pairs != 0.0:
153+
n_same = 0
154+
for k in range(n_alleles):
155+
n_same += ac[i, k] * ac[j, k]
156+
n_diff = n_pairs - n_same
157+
div = n_diff / n_pairs
158+
out[i, j] = div
159+
out[j, i] = div
159160

160161
# calculate the diversity for each cohort
161162
for i in range(n_cohorts):

sgkit/tests/test_aggregation.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -216,13 +216,14 @@ def test_count_call_alleles__chunked():
216216
def test_count_cohort_alleles__multi_variant_multi_sample():
217217
ds = get_dataset(
218218
[
219-
[[0, 0], [0, 0], [0, 0]],
220-
[[0, 0], [0, 0], [0, 1]],
221-
[[1, 1], [0, 1], [1, 0]],
222-
[[1, 1], [1, 1], [1, 1]],
219+
[[0, 0], [0, 0], [0, 0], [0, 0]],
220+
[[0, 0], [0, 0], [0, 1], [0, 1]],
221+
[[1, 1], [0, 1], [1, 0], [1, 0]],
222+
[[1, 1], [1, 1], [1, 1], [1, 1]],
223223
]
224224
)
225-
ds["sample_cohort"] = xr.DataArray(np.array([0, 1, 1]), dims="samples")
225+
# -1 means that the sample is not in any cohort
226+
ds["sample_cohort"] = xr.DataArray(np.array([0, 1, 1, -1]), dims="samples")
226227
ds = count_cohort_alleles(ds)
227228
ac = ds.cohort_allele_count
228229
np.testing.assert_equal(

sgkit/tests/test_association.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from pandas import DataFrame
1010
from xarray import Dataset
1111

12-
from sgkit import variables
1312
from sgkit.stats.association import gwas_linear_regression, linear_regression
1413
from sgkit.typing import ArrayLike
1514

@@ -175,16 +174,24 @@ def validate(dfp: DataFrame, dft: DataFrame) -> None:
175174

176175
def test_gwas_linear_regression__lazy_results(ds):
177176
res = gwas_linear_regression(
178-
ds, dosage="dosage", covariates="covar_0", traits="trait_0"
177+
ds, dosage="dosage", covariates="covar_0", traits="trait_0", merge=False
179178
)
180-
for v in [
181-
variables.variant_beta,
182-
variables.variant_t_value,
183-
variables.variant_p_value,
184-
]:
179+
for v in res:
185180
assert isinstance(res[v].data, da.Array)
186181

187182

183+
@pytest.mark.parametrize("chunks", [5, -1, "auto"])
184+
def test_gwas_linear_regression__variable_shapes(ds, chunks):
185+
ds = ds.chunk(chunks=chunks)
186+
res = gwas_linear_regression(
187+
ds, dosage="dosage", covariates="covar_0", traits="trait_0", merge=False
188+
)
189+
shape = (ds.dims["variants"], 1)
190+
for v in res:
191+
assert res[v].data.shape == shape
192+
assert res[v].data.compute().shape == shape
193+
194+
188195
def test_gwas_linear_regression__multi_trait(ds):
189196
def run(traits: Sequence[str]) -> Dataset:
190197
return gwas_linear_regression(

sgkit/tests/test_popgen.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
)
2323
from sgkit.window import window
2424

25+
from .test_aggregation import get_dataset
26+
2527

2628
def ts_to_dataset(ts, chunks=None, samples=None):
2729
"""
@@ -203,6 +205,17 @@ def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, ch
203205
) # scikit-allel has final window missing
204206

205207

208+
def test_divergence__missing_calls():
209+
ds = get_dataset(
210+
[
211+
[[0, 0], [-1, -1], [-1, -1]], # all of cohort 1 calls are missing
212+
]
213+
)
214+
ds["sample_cohort"] = xr.DataArray(np.array([0, 1, 1]), dims="samples")
215+
ds = divergence(ds)
216+
np.testing.assert_equal(ds["stat_divergence"].values[0, 1], np.nan)
217+
218+
206219
@pytest.mark.parametrize("sample_size", [2, 3, 10, 100])
207220
def test_Fst__Hudson(sample_size):
208221
# scikit-allel can only calculate Fst for pairs of cohorts (populations)

0 commit comments

Comments
 (0)