Skip to content

HWE Test Implementation #76

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Aug 17, 2020
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ numpy
xarray
dask[array]
scipy
zarr
numba
zarr
4 changes: 3 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ ignore =
profile = black
default_section = THIRDPARTY
known_first_party = sgkit
known_third_party = dask,fire,glow,hail,hypothesis,invoke,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
known_third_party = dask,fire,glow,hail,hypothesis,invoke,numba,numpy,pandas,pkg_resources,pyspark,pytest,setuptools,sgkit_plink,xarray,yaml,zarr
multi_line_output = 3
include_trailing_comma = True
force_grid_wrap = 0
Expand All @@ -71,6 +71,8 @@ ignore_missing_imports = True
ignore_missing_imports = True
[mypy-pandas.*]
ignore_missing_imports = True
[mypy-numba.*]
ignore_missing_imports = True
[mypy-pytest.*]
ignore_missing_imports = True
[mypy-statsmodels.*]
Expand Down
2 changes: 2 additions & 0 deletions sgkit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .io.vcfzarr_reader import read_vcfzarr
from .stats.aggregation import count_alleles
from .stats.association import gwas_linear_regression
from .stats.hwe import hardy_weinberg_test
from .stats.regenie import regenie

__all__ = [
Expand All @@ -24,4 +25,5 @@
"gwas_linear_regression",
"read_vcfzarr",
"regenie",
"hardy_weinberg_test",
]
18 changes: 3 additions & 15 deletions sgkit/stats/association.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,24 +164,13 @@ def gwas_linear_regression(
-------
:class:`xarray.Dataset`
Dataset containing (N = num variants, O = num traits):
beta : (N, O) array-like
variant_beta : (N, O) array-like
Beta values associated with each variant and trait
t_value : (N, O) array-like
variant_t_value : (N, O) array-like
T statistics for each beta
p_value : (N, O) array-like
variant_p_value : (N, O) array-like
P values as float in [0, 1]

Warnings
--------
Regression statistics from this implementation are only valid when an
intercept is present. The `add_intercept` flag is a convenience for adding one
when not already present, but there is currently no parameterization for
intercept-free regression.

Additionally, both covariate and trait arrays will be rechunked to have blocks
along the sample (row) dimension but not the column dimension (i.e.
they must be tall and skinny).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this warning removed for a reason?

Copy link
Collaborator Author

@eric-czech eric-czech Aug 17, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was duplicated.

References
----------
- [1] Hastie, Trevor, Robert Tibshirani, and Jerome Friedman. 2009. The Elements
Expand All @@ -192,7 +181,6 @@ def gwas_linear_regression(
Bayesian Mixed-Model Analysis Increases Association Power in Large Cohorts.”
Nature Genetics 47 (3): 284–90.


"""
if isinstance(covariates, str):
covariates = [covariates]
Expand Down
178 changes: 178 additions & 0 deletions sgkit/stats/hwe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
from typing import Hashable, Optional

import dask.array as da
import numpy as np
import xarray as xr
from numba import njit
from numpy import ndarray
from xarray import Dataset


def hardy_weinberg_p_value(obs_hets: int, obs_hom1: int, obs_hom2: int) -> float:
    """Exact test for HWE as described in Wigginton et al. 2005 [1].

    Parameters
    ----------
    obs_hets : int
        Number of heterozygotes with minor variant.
    obs_hom1 : int
        Number of reference/major homozygotes.
    obs_hom2 : int
        Number of alternate/minor homozygotes.

    Returns
    -------
    float
        P value in [0, 1], or NaN when no genotypes were observed.

    References
    ----------
    - [1] Wigginton, Janis E., David J. Cutler, and Goncalo R. Abecasis. 2005.
        “A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of
        Human Genetics 76 (5): 887–93.

    Raises
    ------
    ValueError
        If any observed counts are negative.
    """
    if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0:
        # Zero counts are valid (the check is strictly negative), so the
        # requirement is non-negativity.
        raise ValueError("Observed genotype counts must be non-negative")

    # Common/rare homozygote counts, minor allele count, and sample count
    obs_homc = obs_hom2 if obs_hom1 < obs_hom2 else obs_hom1
    obs_homr = obs_hom1 if obs_hom1 < obs_hom2 else obs_hom2
    obs_mac = 2 * obs_homr + obs_hets
    obs_n = obs_hets + obs_homc + obs_homr

    # With no observations the p value is undefined; return before allocating
    if obs_n == 0:
        return np.nan  # type: ignore[no-any-return]

    # Unnormalized probability for each possible heterozygote count [0, obs_mac]
    het_probs = np.zeros(obs_mac + 1, dtype=np.float64)

    # Identify distribution midpoint; the het count must share the parity of
    # the minor allele count, so bump mid by one when parities disagree
    mid = int(obs_mac * (2 * obs_n - obs_mac) / (2 * obs_n))
    if (obs_mac & 1) ^ (mid & 1):
        mid += 1
    het_probs[mid] = 1.0
    prob_sum = het_probs[mid]

    # Integrate downward from distribution midpoint (recurrence from [1])
    curr_hets = mid
    curr_homr = int((obs_mac - mid) / 2)
    curr_homc = obs_n - curr_hets - curr_homr
    while curr_hets > 1:
        het_probs[curr_hets - 2] = (
            het_probs[curr_hets]
            * curr_hets
            * (curr_hets - 1.0)
            / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
        )
        prob_sum += het_probs[curr_hets - 2]
        curr_homr += 1
        curr_homc += 1
        curr_hets -= 2

    # Integrate upward from distribution midpoint
    curr_hets = mid
    curr_homr = int((obs_mac - mid) / 2)
    curr_homc = obs_n - curr_hets - curr_homr
    while curr_hets <= obs_mac - 2:
        het_probs[curr_hets + 2] = (
            het_probs[curr_hets]
            * 4.0
            * curr_homr
            * curr_homc
            / ((curr_hets + 2.0) * (curr_hets + 1.0))
        )
        prob_sum += het_probs[curr_hets + 2]
        curr_homr -= 1
        curr_homc -= 1
        curr_hets += 2

    if prob_sum <= 0:  # pragma: no cover
        return np.nan  # type: ignore[no-any-return]
    # Normalize and sum the probabilities no larger than that of the
    # observed het count (two-sided exact p value), clamped to [0, 1]
    het_probs = het_probs / prob_sum
    p = het_probs[het_probs <= het_probs[obs_hets]].sum()
    p = max(min(1.0, p), 0.0)

    return p  # type: ignore[no-any-return]


# JIT-compiled scalar kernel; benchmarks show ~25% improvement w/ fastmath
# on large (~10M) counts
hardy_weinberg_p_value_jit = njit(hardy_weinberg_p_value, fastmath=True)


def hardy_weinberg_p_value_vec(
    obs_hets: ndarray, obs_hom1: ndarray, obs_hom2: ndarray
) -> ndarray:
    """Vectorized HWE exact test.

    Applies the scalar test element-wise to three equal-length 1D arrays of
    per-variant genotype counts and returns the corresponding p values.

    Raises
    ------
    ValueError
        If the arrays differ in length or any of them is not one-dimensional.
    """
    if not (len(obs_hets) == len(obs_hom1) == len(obs_hom2)):
        raise ValueError("All arrays must have same length")
    if obs_hets.ndim != 1 or obs_hom1.ndim != 1 or obs_hom2.ndim != 1:
        raise ValueError("All arrays must be 1D")
    num_variants = len(obs_hets)
    p_values = np.empty(num_variants, dtype=np.float64)
    # NOTE(review): parallelization currently happens across dask blocks;
    # numba's parallel=True could be another dispatch option for this loop.
    for idx in range(num_variants):
        p_values[idx] = hardy_weinberg_p_value_jit(
            obs_hets[idx], obs_hom1[idx], obs_hom2[idx]
        )
    return p_values


# JIT-compiled vectorized kernel; mapped over dask blocks by hardy_weinberg_test
hardy_weinberg_p_value_vec_jit = njit(hardy_weinberg_p_value_vec, fastmath=True)


def hardy_weinberg_test(
    ds: Dataset, genotype_counts: Optional[Hashable] = None
) -> Dataset:
    """Exact test for HWE as described in Wigginton et al. 2005 [1].

    Parameters
    ----------
    ds : Dataset
        Dataset containing genotype calls or precomputed genotype counts.
    genotype_counts : Optional[Hashable], optional
        Name of variable containing precomputed genotype counts, by default
        None. If not provided, these counts will be computed automatically
        from genotype calls. If present, must correspond to an (`N`, 3) array
        where `N` is equal to the number of variants and the 3 columns contain
        heterozygous, homozygous reference, and homozygous alternate counts
        (in that order) across all samples for a variant.

    Warnings
    --------
    This function is only applicable to diploid, biallelic datasets.

    Returns
    -------
    Dataset
        Dataset containing (N = num variants):
        variant_hwe_p_value : (N,) ArrayLike
            P values from HWE test for each variant as float in [0, 1].

    References
    ----------
    - [1] Wigginton, Janis E., David J. Cutler, and Goncalo R. Abecasis. 2005.
        “A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of
        Human Genetics 76 (5): 887–93.

    Raises
    ------
    NotImplementedError
        * If ploidy of provided dataset != 2
        * If maximum number of alleles in provided dataset != 2
    """
    if ds.dims["ploidy"] != 2:
        raise NotImplementedError("HWE test only implemented for diploid genotypes")
    if ds.dims["alleles"] != 2:
        raise NotImplementedError("HWE test only implemented for biallelic genotypes")
    if genotype_counts is not None:
        # Precomputed counts: split the (N, 3) array into its three columns
        obs_counts = list(da.asarray(ds[genotype_counts]).T)
    else:
        # Otherwise derive counts from calls; calls with any missing allele
        # get an alt-allele total of -1 so they match none of the targets below.
        # TODO: Use API genotype counting function instead, e.g.
        # https://github.com/pystatgen/sgkit/issues/29#issuecomment-656691069
        missing = ds["call_genotype_mask"].any(dim="ploidy")
        alt_alleles = xr.where(missing, -1, ds["call_genotype"].sum(dim="ploidy"))  # type: ignore[no-untyped-call]
        # Alt-allele totals of 1, 0, 2 correspond to het, hom ref, hom alt
        obs_counts = [
            da.asarray((alt_alleles == count).sum(dim="samples"))
            for count in (1, 0, 2)
        ]
    p_values = da.map_blocks(hardy_weinberg_p_value_vec_jit, *obs_counts)
    return xr.Dataset({"variant_hwe_p_value": ("variants", p_values)})
Loading