Skip to content

Commit e5486c8

Browse files
committed
add minimal diversity and divergence
Squashed history: add minimal diversity, divergence and Fst; add Tajima's D; add ts_to_dataset helper (later removed from the public API); make divergence take two datasets; add read_vcfzarr (sgkit-dev#40).
1 parent 10c25f8 commit e5486c8

File tree

5 files changed

+187
-29
lines changed

5 files changed

+187
-29
lines changed

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ ignore =
5959
profile = black
6060
default_section = THIRDPARTY
6161
known_first_party = sgkit
62-
known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
62+
known_third_party = dask,fire,glow,hail,hypothesis,invoke,msprime,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
6363
multi_line_output = 3
6464
include_trailing_comma = True
6565
force_grid_wrap = 0

sgkit/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from .stats.aggregation import count_call_alleles, count_variant_alleles
1212
from .stats.association import gwas_linear_regression
1313
from .stats.hwe import hardy_weinberg_test
14+
from .stats.popgen import Fst, Tajimas_D, divergence, diversity
1415
from .stats.regenie import regenie
1516

1617
__all__ = [
@@ -27,4 +28,8 @@
2728
"read_vcfzarr",
2829
"regenie",
2930
"hardy_weinberg_test",
31+
"diversity",
32+
"divergence",
33+
"Fst",
34+
"Tajimas_D",
3035
]

sgkit/api.py

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def create_genotype_call_dataset(
2424
variant_id: Any = None,
2525
) -> xr.Dataset:
2626
"""Create a dataset of genotype calls.
27+
2728
Parameters
2829
----------
2930
variant_contig_names : list of str
@@ -45,10 +46,12 @@ def create_genotype_call_dataset(
4546
omitted all calls are unphased.
4647
variant_id: array_like, str or object, optional
4748
The unique identifier of the variant.
49+
4850
Returns
4951
-------
5052
:class:`xarray.Dataset`
5153
The dataset of genotype calls.
54+
5255
"""
5356
check_array_like(variant_contig, kind="i", ndim=1)
5457
check_array_like(variant_position, kind="i", ndim=1)
@@ -91,6 +94,7 @@ def create_genotype_dosage_dataset(
9194
variant_id: Any = None,
9295
) -> xr.Dataset:
9396
"""Create a dataset of genotype calls.
97+
9498
Parameters
9599
----------
96100
variant_contig_names : list of str
@@ -111,10 +115,12 @@ def create_genotype_dosage_dataset(
111115
missing value.
112116
variant_id: array_like, str or object, optional
113117
The unique identifier of the variant.
118+
114119
Returns
115120
-------
116121
xr.Dataset
117122
The dataset of genotype calls.
123+
118124
"""
119125
check_array_like(variant_contig, kind="i", ndim=1)
120126
check_array_like(variant_position, kind="i", ndim=1)
@@ -143,31 +149,3 @@ def create_genotype_dosage_dataset(
143149
data_vars["variant_id"] = ([DIM_VARIANT], variant_id)
144150
attrs: Dict[Hashable, Any] = {"contigs": variant_contig_names}
145151
return xr.Dataset(data_vars=data_vars, attrs=attrs)
146-
147-
148-
def ts_to_dataset(ts, samples=None):
149-
"""
150-
Convert the specified tskit tree sequence into an sgkit dataset.
151-
Note this just generates haploids for now. With msprime 1.0, we'll be
152-
able to generate diploid/whatever-ploid individuals easily.
153-
"""
154-
if samples is None:
155-
samples = ts.samples()
156-
tables = ts.dump_tables()
157-
alleles = []
158-
genotypes = []
159-
for var in ts.variants(samples=samples):
160-
alleles.append(var.alleles)
161-
genotypes.append(var.genotypes)
162-
alleles = np.array(alleles).astype("S")
163-
genotypes = np.expand_dims(genotypes, axis=2)
164-
165-
df = create_genotype_call_dataset(
166-
variant_contig_names=["1"],
167-
variant_contig=np.zeros(len(tables.sites), dtype=int),
168-
variant_position=tables.sites.position.astype(int),
169-
variant_alleles=alleles,
170-
sample_id=np.array([f"tsk_{u}" for u in samples]).astype("U"),
171-
call_genotype=genotypes,
172-
)
173-
return df

sgkit/stats/popgen.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
from typing import Hashable
2+
3+
import dask.array as da
4+
import xarray as xr
5+
from xarray import DataArray, Dataset
6+
7+
from .aggregation import count_alleles
8+
9+
10+
def diversity(
    ds: Dataset, allele_counts: Hashable = "variant_allele_counts",
) -> DataArray:
    """Return the total nucleotide diversity (pi) summed over all variants.

    Per-variant allele counts are computed with :func:`count_alleles` and
    cached on ``ds`` under ``allele_counts`` if not already present.
    """
    if allele_counts not in ds:
        ds[allele_counts] = count_alleles(ds)
    counts = ds[allele_counts]
    totals = counts.sum(axis=1)
    # Number of distinct pairs of sampled chromosomes per variant.
    pairs_total = totals * (totals - 1) / 2
    # Pairs that carry the same allele.
    pairs_same = (counts * (counts - 1) / 2).sum(axis=1)
    # Let's ignore missing data and division by zero for now.
    pi = (pairs_total - pairs_same) / pairs_total
    # Because we're not providing any arguments on windowing, etc,
    # we return the total over the whole region. Maybe this isn't
    # the behaviour we want, but it's a starting point. Note that
    # this is different to the tskit default behaviour where we
    # normalise by the size of windows so that results
    # in different windows are comparable. However, we don't have
    # any information about the overall length of the sequence here
    # so we can't normalise by it.
    return pi.sum()  # type: ignore[no-any-return]
31+
32+
33+
def divergence(
    ds1: Dataset, ds2: Dataset, allele_counts: Hashable = "variant_allele_counts",
) -> DataArray:
    """Return the total divergence (dxy) between two datasets.

    Per-variant allele counts are computed with :func:`count_alleles` and
    cached on each dataset under ``allele_counts`` if not already present.
    The two datasets are assumed to describe the same variants in the same
    order. As with :func:`diversity`, the result is the sum over the whole
    region (no windowing, no span normalisation).
    """
    if allele_counts not in ds1:
        ds1[allele_counts] = count_alleles(ds1)
    ac1 = ds1[allele_counts]
    if allele_counts not in ds2:
        ds2[allele_counts] = count_alleles(ds2)
    ac2 = ds2[allele_counts]
    # Total called alleles per variant in each dataset (reuse the bound
    # count arrays rather than re-indexing the datasets).
    an1 = ac1.sum(axis=1)
    an2 = ac2.sum(axis=1)

    # Cross-dataset pairs of chromosomes, and pairs sharing an allele.
    n_pairs = an1 * an2
    n_same = (ac1 * ac2).sum(axis=1)
    n_diff = n_pairs - n_same
    # Ignore missing data and division by zero for now.
    div = n_diff / n_pairs
    return div.sum()  # type: ignore[no-any-return]
51+
52+
53+
def Fst(
    ds1: Dataset, ds2: Dataset, allele_counts: Hashable = "variant_allele_counts",
) -> DataArray:
    """Return Fst between two datasets, derived from within-population
    diversity and between-population divergence.

    ``allele_counts`` names the (possibly cached) allele-count variable on
    each dataset.
    """
    # Forward allele_counts so a non-default variable name is respected
    # (previously it was accepted here but silently ignored downstream).
    total_div = diversity(ds1, allele_counts) + diversity(ds2, allele_counts)
    gs = divergence(ds1, ds2, allele_counts)
    den = total_div + 2 * gs  # type: ignore[operator]
    fst = 1 - (2 * total_div / den)
    return fst  # type: ignore[no-any-return]
61+
62+
63+
def Tajimas_D(
    ds: Dataset, allele_counts: Hashable = "variant_allele_counts",
) -> DataArray:
    """Return Tajima's D for the dataset as a whole.

    Compares Watterson's theta (from the number of segregating sites) with
    nucleotide diversity, normalised by the standard deviation of their
    difference under neutrality. ``allele_counts`` names the (possibly
    cached) allele-count variable on ``ds``.
    """
    if allele_counts not in ds:
        ds[allele_counts] = count_alleles(ds)
    ac = ds[allele_counts]

    # Count segregating sites: variants with more than one observed allele.
    S = ((ac > 0).sum(axis=1) > 1).sum()

    # Assume number of chromosomes sampled is constant for all variants.
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    a1 = (1 / da.arange(1, n)).sum()

    # Calculate Watterson's theta (absolute value, not per-base).
    theta = S / a1

    # Calculate diversity, forwarding allele_counts so a non-default
    # variable name is respected (previously it was silently ignored here).
    div = diversity(ds, allele_counts)

    # N.B., both theta estimates are usually divided by the number of
    # (accessible) bases but here we want the absolute difference.
    d = div - theta

    # Calculate the denominator (standard deviation).
    a2 = (1 / (da.arange(1, n) ** 2)).sum()
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2))
    e1 = c1 / a1
    e2 = c2 / (a1 ** 2 + a2)
    # ``** 0.5`` is equivalent to the square root and avoids the
    # deprecated/removed ``xarray.ufuncs`` namespace.
    d_stdev = ((e1 * S) + (e2 * S * (S - 1))) ** 0.5

    # Finally calculate Tajima's D.
    D = d / d_stdev
    return D  # type: ignore[no-any-return]

sgkit/tests/test_popgen.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import msprime # type: ignore
2+
import numpy as np
3+
import pytest
4+
5+
from sgkit import Fst, Tajimas_D, create_genotype_call_dataset, divergence, diversity
6+
7+
8+
def ts_to_dataset(ts, samples=None):
    """
    Convert the specified tskit tree sequence into an sgkit dataset.
    Note this just generates haploids for now. With msprime 1.0, we'll be
    able to generate diploid/whatever-ploid individuals easily.
    """
    if samples is None:
        samples = ts.samples()
    tables = ts.dump_tables()
    site_alleles = []
    site_genotypes = []
    for variant in ts.variants(samples=samples):
        site_alleles.append(variant.alleles)
        site_genotypes.append(variant.genotypes)
    allele_array = np.array(site_alleles).astype("S")
    # Add a trailing ploidy axis of size 1 (haploid calls).
    genotype_array = np.expand_dims(site_genotypes, axis=2)
    sample_ids = np.array([f"tsk_{u}" for u in samples]).astype("U")

    return create_genotype_call_dataset(
        variant_contig_names=["1"],
        variant_contig=np.zeros(len(tables.sites), dtype=int),
        variant_position=tables.sites.position.astype(int),
        variant_alleles=allele_array,
        sample_id=sample_ids,
        call_genotype=genotype_array,
    )
34+
35+
36+
@pytest.mark.parametrize("size", [2, 3, 10, 100])
def test_diversity(size):
    """Check diversity against tskit's non-span-normalised value."""
    ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    expected = ts.diversity(span_normalise=False)
    actual = diversity(ds).compute()
    assert np.allclose(actual, expected)
43+
44+
45+
@pytest.mark.parametrize("size", [2, 3, 10, 100])
def test_divergence(size):
    """Check divergence between two sample halves against tskit."""
    ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42)
    half = ts.num_samples // 2
    subset_1 = ts.samples()[:half]
    subset_2 = ts.samples()[half:]
    ds1 = ts_to_dataset(ts, subset_1)  # type: ignore[no-untyped-call]
    ds2 = ts_to_dataset(ts, subset_2)  # type: ignore[no-untyped-call]
    expected = ts.divergence([subset_1, subset_2], span_normalise=False)
    actual = divergence(ds1, ds2).compute()
    assert np.allclose(actual, expected)
55+
56+
@pytest.mark.parametrize("size", [2, 3, 10, 100])
def test_Fst(size):
    """Check Fst between two sample halves against tskit."""
    ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42)
    half = ts.num_samples // 2
    subset_1 = ts.samples()[:half]
    subset_2 = ts.samples()[half:]
    ds1 = ts_to_dataset(ts, subset_1)  # type: ignore[no-untyped-call]
    ds2 = ts_to_dataset(ts, subset_2)  # type: ignore[no-untyped-call]
    expected = ts.Fst([subset_1, subset_2])
    actual = Fst(ds1, ds2).compute()
    assert np.allclose(actual, expected)
66+
67+
68+
@pytest.mark.parametrize("size", [2, 3, 10, 100])
def test_Tajimas_D(size):
    """Check Tajima's D against tskit's value."""
    ts = msprime.simulate(size, length=100, mutation_rate=0.05, random_seed=42)
    ds = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    expected = ts.Tajimas_D()
    actual = Tajimas_D(ds).compute()
    assert np.allclose(actual, expected)

0 commit comments

Comments
 (0)