-
Notifications
You must be signed in to change notification settings - Fork 35
Fst windowing #303
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fst windowing #303
Changes from all commits
420da80
72358d2
f23ac1d
a0dd4c3
47d042f
7aa3513
2fda1fd
7e22bb5
5be490f
718c9a3
7eb61e5
3773658
7f99d15
1d2d878
f6a5b09
231328a
d57aefd
0c5a3f4
e28d0c6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,21 @@ | ||
import itertools | ||
|
||
import allel | ||
import msprime # type: ignore | ||
import numpy as np | ||
import pytest | ||
import xarray as xr | ||
from allel import hudson_fst | ||
|
||
from sgkit import Fst, Tajimas_D, create_genotype_call_dataset, divergence, diversity | ||
from sgkit import ( | ||
Fst, | ||
Tajimas_D, | ||
count_variant_alleles, | ||
create_genotype_call_dataset, | ||
divergence, | ||
diversity, | ||
) | ||
from sgkit.window import window | ||
|
||
|
||
def ts_to_dataset(ts, chunks=None, samples=None): | ||
|
@@ -39,28 +48,57 @@ def ts_to_dataset(ts, chunks=None, samples=None): | |
return ds | ||
|
||
|
||
@pytest.mark.parametrize("size", [2, 3, 10, 100]) | ||
@pytest.mark.parametrize("sample_size", [2, 3, 10, 100]) | ||
@pytest.mark.parametrize("chunks", [(-1, -1), (10, -1)]) | ||
def test_diversity(size, chunks): | ||
ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) | ||
def test_diversity(sample_size, chunks): | ||
ts = msprime.simulate(sample_size, length=100, mutation_rate=0.05, random_seed=42) | ||
ds = ts_to_dataset(ts, chunks) # type: ignore[no-untyped-call] | ||
ds = ds.chunk(dict(zip(["variants", "samples"], chunks))) | ||
sample_cohorts = np.full_like(ts.samples(), 0) | ||
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") | ||
ds = ds.assign_coords({"cohorts": ["co_0"]}) | ||
ds = diversity(ds) | ||
div = ds.stat_diversity.sel(cohorts="co_0").values | ||
div = ds.stat_diversity.sum(axis=0, skipna=False).sel(cohorts="co_0").values | ||
ts_div = ts.diversity(span_normalise=False) | ||
np.testing.assert_allclose(div, ts_div) | ||
|
||
|
||
@pytest.mark.parametrize("sample_size", [10]) | ||
def test_diversity__windowed(sample_size): | ||
ts = msprime.simulate(sample_size, length=200, mutation_rate=0.05, random_seed=42) | ||
ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] | ||
sample_cohorts = np.full_like(ts.samples(), 0) | ||
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") | ||
ds = ds.assign_coords({"cohorts": ["co_0"]}) | ||
ds = window(ds, size=25, step=25) | ||
ds = diversity(ds) | ||
div = ds["stat_diversity"].sel(cohorts="co_0").compute() | ||
|
||
# Calculate diversity using tskit windows | ||
# Find the variant positions so we can have windows with a fixed number of variants | ||
positions = ts.tables.sites.position | ||
windows = np.concatenate(([0], positions[::25][1:], [ts.sequence_length])) | ||
ts_div = ts.diversity(windows=windows, span_normalise=False) | ||
np.testing.assert_allclose(div, ts_div) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably a bit overarching question, not necessary this PR, but since we heavily depend on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good idea. I opened #332 |
||
|
||
# Calculate diversity using scikit-allel moving_statistic | ||
# (Don't use windowed_diversity, since it treats the last window differently) | ||
ds = count_variant_alleles(ts_to_dataset(ts)) # type: ignore[no-untyped-call] | ||
ac = ds["variant_allele_count"].values | ||
mpd = allel.mean_pairwise_difference(ac, fill=0) | ||
ska_div = allel.moving_statistic(mpd, np.sum, size=25, step=25) | ||
np.testing.assert_allclose( | ||
div[:-1], ska_div | ||
) # scikit-allel has final window missing | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"size, n_cohorts", | ||
"sample_size, n_cohorts", | ||
[(2, 2), (3, 2), (3, 3), (10, 2), (10, 3), (10, 4), (100, 2), (100, 3), (100, 4)], | ||
) | ||
@pytest.mark.parametrize("chunks", [(-1, -1), (10, -1)]) | ||
def test_divergence(size, n_cohorts, chunks): | ||
ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) | ||
def test_divergence(sample_size, n_cohorts, chunks): | ||
ts = msprime.simulate(sample_size, length=100, mutation_rate=0.05, random_seed=42) | ||
subsets = np.array_split(ts.samples(), n_cohorts) | ||
ds = ts_to_dataset(ts, chunks) # type: ignore[no-untyped-call] | ||
sample_cohorts = np.concatenate( | ||
|
@@ -70,7 +108,7 @@ def test_divergence(size, n_cohorts, chunks): | |
cohort_names = [f"co_{i}" for i in range(n_cohorts)] | ||
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names}) | ||
ds = divergence(ds) | ||
div = ds.stat_divergence.values | ||
div = ds.stat_divergence.sum(axis=0, skipna=False).values | ||
|
||
# entries on the diagonal are diversity values | ||
for i in range(n_cohorts): | ||
|
@@ -86,20 +124,86 @@ def test_divergence(size, n_cohorts, chunks): | |
np.testing.assert_allclose(div, ts_div) | ||
|
||
|
||
@pytest.mark.parametrize("size", [2, 3, 10, 100]) | ||
@pytest.mark.parametrize("chunks", [(-1, -1), (10, -1)]) | ||
def test_Fst__Hudson(size, chunks): | ||
@pytest.mark.parametrize("sample_size, n_cohorts", [(10, 2)]) | ||
@pytest.mark.parametrize("chunks", [(-1, -1), (50, -1)]) | ||
def test_divergence__windowed(sample_size, n_cohorts, chunks): | ||
ts = msprime.simulate(sample_size, length=200, mutation_rate=0.05, random_seed=42) | ||
subsets = np.array_split(ts.samples(), n_cohorts) | ||
ds = ts_to_dataset(ts, chunks) # type: ignore[no-untyped-call] | ||
sample_cohorts = np.concatenate( | ||
[np.full_like(subset, i) for i, subset in enumerate(subsets)] | ||
) | ||
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") | ||
cohort_names = [f"co_{i}" for i in range(n_cohorts)] | ||
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names}) | ||
ds = window(ds, size=25, step=25) | ||
ds = divergence(ds) | ||
div = ds["stat_divergence"].values | ||
# test off-diagonal entries, by replacing diagonal with NaNs | ||
div[:, np.arange(2), np.arange(2)] = np.nan | ||
|
||
# Calculate diversity using tskit windows | ||
# Find the variant positions so we can have windows with a fixed number of variants | ||
positions = ts.tables.sites.position | ||
windows = np.concatenate(([0], positions[::25][1:], [ts.sequence_length])) | ||
n_windows = len(windows) - 1 | ||
ts_div = np.full([n_windows, n_cohorts, n_cohorts], np.nan) | ||
for i, j in itertools.combinations(range(n_cohorts), 2): | ||
ts_div[:, i, j] = ts.divergence( | ||
[subsets[i], subsets[j]], windows=windows, span_normalise=False | ||
) | ||
ts_div[:, j, i] = ts_div[:, i, j] | ||
np.testing.assert_allclose(div, ts_div) | ||
|
||
|
||
@pytest.mark.parametrize("sample_size, n_cohorts", [(10, 2)]) | ||
@pytest.mark.parametrize("chunks", [(-1, -1), (50, -1)]) | ||
@pytest.mark.xfail() # combine with test_divergence__windowed when this is passing | ||
def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, chunks): | ||
ts = msprime.simulate(sample_size, length=200, mutation_rate=0.05, random_seed=42) | ||
subsets = np.array_split(ts.samples(), n_cohorts) | ||
ds = ts_to_dataset(ts, chunks) # type: ignore[no-untyped-call] | ||
sample_cohorts = np.concatenate( | ||
[np.full_like(subset, i) for i, subset in enumerate(subsets)] | ||
) | ||
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") | ||
cohort_names = [f"co_{i}" for i in range(n_cohorts)] | ||
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names}) | ||
ds = window(ds, size=25, step=25) | ||
ds = divergence(ds) | ||
div = ds["stat_divergence"].values | ||
# test off-diagonal entries, by replacing diagonal with NaNs | ||
div[:, np.arange(2), np.arange(2)] = np.nan | ||
|
||
# Calculate divergence using scikit-allel moving_statistic | ||
# (Don't use windowed_divergence, since it treats the last window differently) | ||
ds1 = count_variant_alleles(ts_to_dataset(ts, samples=ts.samples()[:1])) # type: ignore[no-untyped-call] | ||
ds2 = count_variant_alleles(ts_to_dataset(ts, samples=ts.samples()[1:])) # type: ignore[no-untyped-call] | ||
ac1 = ds1["variant_allele_count"].values | ||
ac2 = ds2["variant_allele_count"].values | ||
mpd = allel.mean_pairwise_difference_between(ac1, ac2, fill=0) | ||
ska_div = allel.moving_statistic(mpd, np.sum, size=25, step=25) # noqa: F841 | ||
# TODO: investigate why numbers are different | ||
np.testing.assert_allclose( | ||
div[:-1], ska_div | ||
) # scikit-allel has final window missing | ||
|
||
|
||
@pytest.mark.parametrize("sample_size", [2, 3, 10, 100]) | ||
def test_Fst__Hudson(sample_size): | ||
# scikit-allel can only calculate Fst for pairs of cohorts (populations) | ||
n_cohorts = 2 | ||
ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) | ||
ts = msprime.simulate(sample_size, length=100, mutation_rate=0.05, random_seed=42) | ||
subsets = np.array_split(ts.samples(), n_cohorts) | ||
ds = ts_to_dataset(ts, chunks) # type: ignore[no-untyped-call] | ||
ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] | ||
sample_cohorts = np.concatenate( | ||
[np.full_like(subset, i) for i, subset in enumerate(subsets)] | ||
) | ||
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") | ||
cohort_names = [f"co_{i}" for i in range(n_cohorts)] | ||
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names}) | ||
n_variants = ds.dims["variants"] | ||
ds = window(ds, size=n_variants, step=n_variants) # single window | ||
ds = Fst(ds, estimator="Hudson") | ||
fst = ds.stat_Fst.sel(cohorts_0="co_0", cohorts_1="co_1").values | ||
|
||
|
@@ -113,27 +217,28 @@ def test_Fst__Hudson(size, chunks): | |
|
||
|
||
@pytest.mark.parametrize( | ||
"size, n_cohorts", | ||
"sample_size, n_cohorts", | ||
[(2, 2), (3, 2), (3, 3), (10, 2), (10, 3), (10, 4), (100, 2), (100, 3), (100, 4)], | ||
) | ||
@pytest.mark.parametrize("chunks", [(-1, -1), (10, -1)]) | ||
def test_Fst__Nei(size, n_cohorts, chunks): | ||
ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) | ||
def test_Fst__Nei(sample_size, n_cohorts): | ||
ts = msprime.simulate(sample_size, length=100, mutation_rate=0.05, random_seed=42) | ||
subsets = np.array_split(ts.samples(), n_cohorts) | ||
ds = ts_to_dataset(ts, chunks) # type: ignore[no-untyped-call] | ||
ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] | ||
sample_cohorts = np.concatenate( | ||
[np.full_like(subset, i) for i, subset in enumerate(subsets)] | ||
) | ||
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") | ||
cohort_names = [f"co_{i}" for i in range(n_cohorts)] | ||
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names}) | ||
n_variants = ds.dims["variants"] | ||
ds = window(ds, size=n_variants, step=n_variants) # single window | ||
ds = Fst(ds, estimator="Nei") | ||
fst = ds.stat_Fst.values | ||
|
||
ts_fst = np.full([n_cohorts, n_cohorts], np.nan) | ||
ts_fst = np.full([1, n_cohorts, n_cohorts], np.nan) | ||
for i, j in itertools.combinations(range(n_cohorts), 2): | ||
ts_fst[i, j] = ts.Fst([subsets[i], subsets[j]]) | ||
ts_fst[j, i] = ts_fst[i, j] | ||
ts_fst[0, i, j] = ts.Fst([subsets[i], subsets[j]]) | ||
ts_fst[0, j, i] = ts_fst[0, i, j] | ||
np.testing.assert_allclose(fst, ts_fst) | ||
|
||
|
||
|
@@ -146,13 +251,59 @@ def test_Fst__unknown_estimator(): | |
Fst(ds, estimator="Unknown") | ||
|
||
|
||
@pytest.mark.parametrize("size", [2, 3, 10, 100]) | ||
@pytest.mark.parametrize("chunks", [(-1, -1), (10, -1)]) | ||
def test_Tajimas_D(size, chunks): | ||
ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42) | ||
@pytest.mark.parametrize( | ||
"sample_size, n_cohorts", | ||
[(10, 2)], | ||
) | ||
@pytest.mark.parametrize("chunks", [(-1, -1), (50, -1)]) | ||
def test_Fst__windowed(sample_size, n_cohorts, chunks): | ||
ts = msprime.simulate(sample_size, length=200, mutation_rate=0.05, random_seed=42) | ||
subsets = np.array_split(ts.samples(), n_cohorts) | ||
ds = ts_to_dataset(ts, chunks) # type: ignore[no-untyped-call] | ||
sample_cohorts = np.concatenate( | ||
[np.full_like(subset, i) for i, subset in enumerate(subsets)] | ||
) | ||
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") | ||
cohort_names = [f"co_{i}" for i in range(n_cohorts)] | ||
ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names}) | ||
ds = window(ds, size=25, step=25) | ||
fst_ds = Fst(ds, estimator="Nei") | ||
fst = fst_ds["stat_Fst"].values | ||
|
||
# Calculate Fst using tskit windows | ||
# Find the variant positions so we can have windows with a fixed number of variants | ||
positions = ts.tables.sites.position | ||
windows = np.concatenate(([0], positions[::25][1:], [ts.sequence_length])) | ||
n_windows = len(windows) - 1 | ||
ts_fst = np.full([n_windows, n_cohorts, n_cohorts], np.nan) | ||
for i, j in itertools.combinations(range(n_cohorts), 2): | ||
ts_fst[:, i, j] = ts.Fst( | ||
[subsets[i], subsets[j]], windows=windows, span_normalise=False | ||
) | ||
ts_fst[:, j, i] = ts_fst[:, i, j] | ||
|
||
np.testing.assert_allclose(fst, ts_fst) | ||
|
||
fst_ds = Fst(ds, estimator="Hudson") | ||
fst = fst_ds["stat_Fst"].sel(cohorts_0="co_0", cohorts_1="co_1").values | ||
|
||
ac1 = fst_ds.cohort_allele_count.values[:, 0, :] | ||
ac2 = fst_ds.cohort_allele_count.values[:, 1, :] | ||
ska_fst = allel.moving_hudson_fst(ac1, ac2, size=25, step=25) | ||
|
||
np.testing.assert_allclose( | ||
fst[:-1], ska_fst | ||
) # scikit-allel has final window missing | ||
|
||
|
||
@pytest.mark.parametrize("sample_size", [2, 3, 10, 100]) | ||
def test_Tajimas_D(sample_size): | ||
ts = msprime.simulate(sample_size, length=100, mutation_rate=0.05, random_seed=42) | ||
ds = ts_to_dataset(ts) # type: ignore[no-untyped-call] | ||
sample_cohorts = np.full_like(ts.samples(), 0) | ||
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples") | ||
n_variants = ds.dims["variants"] | ||
ds = window(ds, size=n_variants, step=n_variants) # single window | ||
ds = Tajimas_D(ds) | ||
d = ds.stat_Tajimas_D.compute() | ||
ts_d = ts.Tajimas_D() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ds.window_{start, stop}
is never lazy, right? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, not at the moment at least.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should it be? The loose guideline I have in my head is that anything that doesn't run interactively (<1s or so) across a whole genome dataset (~100M variants) should probably be lazy. Is it worth adding an issue to make this lazy or test that the latter is true?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Opened #340