Skip to content

Commit e5486c8

Browse files
committed
add minimal diversity and divergence
Squashed history: add minimal diversity, divergence and Fst; add Tajima's D; add ts_to_dataset helper (later removed from the public API); make divergence take two datasets; add read_vcfzarr (sgkit-dev#40).
1 parent 10c25f8 commit e5486c8

File tree

5 files changed

+187
-29
lines changed

5 files changed

+187
-29
lines changed

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ ignore =
5959
profile = black
6060
default_section = THIRDPARTY
6161
known_first_party = sgkit
62-
known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
62+
known_third_party = dask,fire,glow,hail,hypothesis,invoke,msprime,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
6363
multi_line_output = 3
6464
include_trailing_comma = True
6565
force_grid_wrap = 0

sgkit/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from .stats.aggregation import count_call_alleles, count_variant_alleles
1212
from .stats.association import gwas_linear_regression
1313
from .stats.hwe import hardy_weinberg_test
14+
from .stats.popgen import Fst, Tajimas_D, divergence, diversity
1415
from .stats.regenie import regenie
1516

1617
__all__ = [
@@ -27,4 +28,8 @@
2728
"read_vcfzarr",
2829
"regenie",
2930
"hardy_weinberg_test",
31+
"diversity",
32+
"divergence",
33+
"Fst",
34+
"Tajimas_D",
3035
]

sgkit/api.py

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def create_genotype_call_dataset(
2424
variant_id: Any = None,
2525
) -> xr.Dataset:
2626
"""Create a dataset of genotype calls.
27+
2728
Parameters
2829
----------
2930
variant_contig_names : list of str
@@ -45,10 +46,12 @@ def create_genotype_call_dataset(
4546
omitted all calls are unphased.
4647
variant_id: array_like, str or object, optional
4748
The unique identifier of the variant.
49+
4850
Returns
4951
-------
5052
:class:`xarray.Dataset`
5153
The dataset of genotype calls.
54+
5255
"""
5356
check_array_like(variant_contig, kind="i", ndim=1)
5457
check_array_like(variant_position, kind="i", ndim=1)
@@ -91,6 +94,7 @@ def create_genotype_dosage_dataset(
9194
variant_id: Any = None,
9295
) -> xr.Dataset:
9396
"""Create a dataset of genotype calls.
97+
9498
Parameters
9599
----------
96100
variant_contig_names : list of str
@@ -111,10 +115,12 @@ def create_genotype_dosage_dataset(
111115
missing value.
112116
variant_id: array_like, str or object, optional
113117
The unique identifier of the variant.
118+
114119
Returns
115120
-------
116121
xr.Dataset
117122
The dataset of genotype calls.
123+
118124
"""
119125
check_array_like(variant_contig, kind="i", ndim=1)
120126
check_array_like(variant_position, kind="i", ndim=1)
@@ -143,31 +149,3 @@ def create_genotype_dosage_dataset(
143149
data_vars["variant_id"] = ([DIM_VARIANT], variant_id)
144150
attrs: Dict[Hashable, Any] = {"contigs": variant_contig_names}
145151
return xr.Dataset(data_vars=data_vars, attrs=attrs)
146-
147-
148-
def ts_to_dataset(ts, samples=None):
149-
"""
150-
Convert the specified tskit tree sequence into an sgkit dataset.
151-
Note this just generates haploids for now. With msprime 1.0, we'll be
152-
able to generate diploid/whatever-ploid individuals easily.
153-
"""
154-
if samples is None:
155-
samples = ts.samples()
156-
tables = ts.dump_tables()
157-
alleles = []
158-
genotypes = []
159-
for var in ts.variants(samples=samples):
160-
alleles.append(var.alleles)
161-
genotypes.append(var.genotypes)
162-
alleles = np.array(alleles).astype("S")
163-
genotypes = np.expand_dims(genotypes, axis=2)
164-
165-
df = create_genotype_call_dataset(
166-
variant_contig_names=["1"],
167-
variant_contig=np.zeros(len(tables.sites), dtype=int),
168-
variant_position=tables.sites.position.astype(int),
169-
variant_alleles=alleles,
170-
sample_id=np.array([f"tsk_{u}" for u in samples]).astype("U"),
171-
call_genotype=genotypes,
172-
)
173-
return df

sgkit/stats/popgen.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
from typing import Hashable
2+
3+
import dask.array as da
4+
import xarray as xr
5+
from xarray import DataArray, Dataset
6+
7+
from .aggregation import count_alleles
8+
9+
10+
def diversity(
    ds: Dataset, allele_counts: Hashable = "variant_allele_counts",
) -> DataArray:
    """Return the total nucleotide diversity (pi) summed over all variants.

    Per-variant allele counts are computed with :func:`count_alleles` and
    cached on ``ds`` under ``allele_counts`` if not already present.
    """
    if allele_counts not in ds:
        ds[allele_counts] = count_alleles(ds)
    counts = ds[allele_counts]
    totals = counts.sum(axis=1)
    # Number of distinct pairs of sampled chromosomes per variant.
    pairs_total = totals * (totals - 1) / 2
    # Pairs that carry the same allele.
    pairs_same = (counts * (counts - 1) / 2).sum(axis=1)
    # Let's ignore missing data and division by zero for now.
    pi = (pairs_total - pairs_same) / pairs_total
    # Because we're not providing any arguments on windowing, etc,
    # we return the total over the whole region. Maybe this isn't
    # the behaviour we want, but it's a starting point. Note that
    # this is different to the tskit default behaviour where we
    # normalise by the size of windows so that results
    # in different windows are comparable. However, we don't have
    # any information about the overall length of the sequence here
    # so we can't normalise by it.
    return pi.sum()  # type: ignore[no-any-return]
31+
32+
33+
def divergence(
    ds1: Dataset, ds2: Dataset, allele_counts: Hashable = "variant_allele_counts",
) -> DataArray:
    """Return the total divergence (dxy) between two datasets.

    Per-variant allele counts are computed with :func:`count_alleles` and
    cached on each dataset under ``allele_counts`` if not already present.
    The two datasets are assumed to describe the same variants in the same
    order. As with :func:`diversity`, the result is the sum over the whole
    region (no windowing, no span normalisation).
    """
    if allele_counts not in ds1:
        ds1[allele_counts] = count_alleles(ds1)
    ac1 = ds1[allele_counts]
    if allele_counts not in ds2:
        ds2[allele_counts] = count_alleles(ds2)
    ac2 = ds2[allele_counts]
    # Total called alleles per variant in each dataset (reuse the bound
    # count arrays rather than re-indexing the datasets).
    an1 = ac1.sum(axis=1)
    an2 = ac2.sum(axis=1)

    # Cross-dataset pairs of chromosomes, and pairs sharing an allele.
    n_pairs = an1 * an2
    n_same = (ac1 * ac2).sum(axis=1)
    n_diff = n_pairs - n_same
    # Ignore missing data and division by zero for now.
    div = n_diff / n_pairs
    return div.sum()  # type: ignore[no-any-return]
51+
52+
53+
def Fst(
    ds1: Dataset, ds2: Dataset, allele_counts: Hashable = "variant_allele_counts",
) -> DataArray:
    """Return Fst between two datasets, derived from within-population
    diversity and between-population divergence.

    ``allele_counts`` names the (possibly cached) allele-count variable on
    each dataset.
    """
    # Forward allele_counts so a non-default variable name is respected
    # (previously it was accepted here but silently ignored downstream).
    total_div = diversity(ds1, allele_counts) + diversity(ds2, allele_counts)
    gs = divergence(ds1, ds2, allele_counts)
    den = total_div + 2 * gs  # type: ignore[operator]
    fst = 1 - (2 * total_div / den)
    return fst  # type: ignore[no-any-return]
61+
62+
63+
def Tajimas_D(
    ds: Dataset, allele_counts: Hashable = "variant_allele_counts",
) -> DataArray:
    """Return Tajima's D for the dataset as a whole.

    Compares Watterson's theta (from the number of segregating sites) with
    nucleotide diversity, normalised by the standard deviation of their
    difference under neutrality. ``allele_counts`` names the (possibly
    cached) allele-count variable on ``ds``.
    """
    if allele_counts not in ds:
        ds[allele_counts] = count_alleles(ds)
    ac = ds[allele_counts]

    # Count segregating sites: variants with more than one observed allele.
    S = ((ac > 0).sum(axis=1) > 1).sum()

    # Assume number of chromosomes sampled is constant for all variants.
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    a1 = (1 / da.arange(1, n)).sum()

    # Calculate Watterson's theta (absolute value, not per-base).
    theta = S / a1

    # Calculate diversity, forwarding allele_counts so a non-default
    # variable name is respected (previously it was silently ignored here).
    div = diversity(ds, allele_counts)

    # N.B., both theta estimates are usually divided by the number of
    # (accessible) bases but here we want the absolute difference.
    d = div - theta

    # Calculate the denominator (standard deviation).
    a2 = (1 / (da.arange(1, n) ** 2)).sum()
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2))
    e1 = c1 / a1
    e2 = c2 / (a1 ** 2 + a2)
    # ``** 0.5`` is equivalent to the square root and avoids the
    # deprecated/removed ``xarray.ufuncs`` namespace.
    d_stdev = ((e1 * S) + (e2 * S * (S - 1))) ** 0.5

    # Finally calculate Tajima's D.
    D = d / d_stdev
    return D  # type: ignore[no-any-return]

sgkit/tests/test_popgen.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import msprime # type: ignore
2+
import numpy as np
3+
import pytest
4+
5+
from sgkit import Fst, Tajimas_D, create_genotype_call_dataset, divergence, diversity
6+
7+
8+
def ts_to_dataset(ts, samples=None):
    """
    Convert the specified tskit tree sequence into an sgkit dataset.
    Note this just generates haploids for now. With msprime 1.0, we'll be
    able to generate diploid/whatever-ploid individuals easily.
    """
    if samples is None:
        samples = ts.samples()
    tables = ts.dump_tables()
    site_alleles = []
    site_genotypes = []
    for variant in ts.variants(samples=samples):
        site_alleles.append(variant.alleles)
        site_genotypes.append(variant.genotypes)
    allele_array = np.array(site_alleles).astype("S")
    # Add a trailing ploidy axis of size 1 (haploid calls).
    genotype_array = np.expand_dims(site_genotypes, axis=2)
    sample_ids = np.array([f"tsk_{u}" for u in samples]).astype("U")

    return create_genotype_call_dataset(
        variant_contig_names=["1"],
        variant_contig=np.zeros(len(tables.sites), dtype=int),
        variant_position=tables.sites.position.astype(int),
        variant_alleles=allele_array,
        sample_id=sample_ids,
        call_genotype=genotype_array,
    )
34+
35+
36+
@pytest.mark.parametrize("size", [2, 3, 10, 100])
def test_diversity(size):
    """Check diversity against tskit's non-span-normalised value."""
    ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    expected = ts.diversity(span_normalise=False)
    actual = diversity(ds).compute()
    assert np.allclose(actual, expected)
43+
44+
45+
@pytest.mark.parametrize("size", [2, 3, 10, 100])
def test_divergence(size):
    """Check divergence between two sample halves against tskit."""
    ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42)
    half = ts.num_samples // 2
    subset_1 = ts.samples()[:half]
    subset_2 = ts.samples()[half:]
    ds1 = ts_to_dataset(ts, subset_1)  # type: ignore[no-untyped-call]
    ds2 = ts_to_dataset(ts, subset_2)  # type: ignore[no-untyped-call]
    expected = ts.divergence([subset_1, subset_2], span_normalise=False)
    actual = divergence(ds1, ds2).compute()
    assert np.allclose(actual, expected)
55+
56+
@pytest.mark.parametrize("size", [2, 3, 10, 100])
def test_Fst(size):
    """Check Fst between two sample halves against tskit."""
    ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42)
    half = ts.num_samples // 2
    subset_1 = ts.samples()[:half]
    subset_2 = ts.samples()[half:]
    ds1 = ts_to_dataset(ts, subset_1)  # type: ignore[no-untyped-call]
    ds2 = ts_to_dataset(ts, subset_2)  # type: ignore[no-untyped-call]
    expected = ts.Fst([subset_1, subset_2])
    actual = Fst(ds1, ds2).compute()
    assert np.allclose(actual, expected)
66+
67+
68+
@pytest.mark.parametrize("size", [2, 3, 10, 100])
def test_Tajimas_D(size):
    """Check Tajima's D against tskit's value."""
    ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    expected = ts.Tajimas_D()
    actual = Tajimas_D(ds).compute()
    assert np.allclose(actual, expected)

0 commit comments

Comments
 (0)