Skip to content

Commit 3ddbd78

Browse files
committed
Doc updates and assert statement changes
1 parent 84c761f commit 3ddbd78

File tree

3 files changed

+96
-8
lines changed

3 files changed

+96
-8
lines changed

sgkit/stats/regenie.py

Lines changed: 83 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -140,16 +140,20 @@ def ridge_regression(
140140
if XtX.shape[0] != XtY.shape[0]:
141141
raise ValueError("Array arguments must have same size in first dimension")
142142
diags = []
143-
for i in range(len(alphas)):
143+
n_alpha, n_obs, n_outcome = len(alphas), XtX.shape[0], XtY.shape[1]
144+
for i in range(n_alpha):
144145
diag = np.ones(XtX.shape[1]) * alphas[i]
145146
if n_zero_reg:
146-
# Optionally remove regularization from leading covariates
147+
# Optionally fix regularization for leading covariates
148+
# TODO: This should probably be zero for consistency
149+
# with orthogonalization, see:
150+
# https://github.com/projectglow/glow/issues/266
147151
diag[:n_zero_reg] = 1
148152
diags.append(np.diag(diag))
149153
diags = np.stack(diags)
150154
B = np.linalg.inv(XtX + diags) @ XtY
151155
B = B.astype(dtype or XtX.dtype)
152-
# Coefficients have shape (n_alpha, n_covar, n_outcome)
156+
assert_array_shape(B, n_alpha, n_obs, n_outcome)
153157
return B
154158

155159

@@ -235,6 +239,22 @@ def _ridge_regression_cv(
235239

236240

237241
def _stage_1(G: Array, X: Array, Y: Array, alphas: Optional[ndarray] = None) -> Array:
242+
"""Stage 1 - WGR Base Regression
243+
244+
This stage will predict outcomes separately for each alpha parameter and variant
245+
block. This "compresses" the variant dimension into a smaller space that is
246+
much more amenable to efficient blockwise regressions in stage 2. Another
247+
interpretation for this operation is that all sample blocks are treated
248+
as folds in a K-fold CV fit within one single variant block. Predictions for
249+
any one combination of variant and sample block then correspond to a
250+
regression model fit across all sample blocks for that range of variants
251+
except for a single sample block. In other words, the predictions are
252+
out of sample which enables training of a stage 2 regressor based on
253+
these predictions, a technique commonly referred to as stacking.
254+
255+
For more details, see the level 0 regression model described in step 1
256+
of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
257+
"""
238258
assert G.ndim == 2
239259
assert X.ndim == 2
240260
assert Y.ndim == 2
@@ -283,6 +303,17 @@ def _stage_2(
283303
_glow_adj_alpha: bool = False,
284304
_glow_adj_scaling: bool = False,
285305
) -> Tuple[Array, Array]:
306+
"""Stage 2 - WGR Meta Regression
307+
308+
This stage will train separate ridge regression models for each outcome
309+
using the predictions from stage 1 for that same outcome as features. These
310+
predictions are then evaluated based on R2 score to determine an optimal
311+
"meta" estimator (see `_stage_1` for the "base" estimator description). Results
312+
then include only predictions and coefficients from this optimal model.
313+
314+
For more details, see the level 1 regression model described in step 1
315+
of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
316+
"""
286317
assert YP.ndim == 4
287318
assert X.ndim == 2
288319
assert Y.ndim == 2
@@ -406,6 +437,17 @@ def _stage_3(
406437
contigs: Array,
407438
variant_chunk_start: ndarray,
408439
) -> Optional[Array]:
440+
"""Stage 3 - Leave-one-chromosome-out (LOCO) Estimation
441+
442+
This stage will use the coefficients for the optimal model in
443+
stage 2 to re-estimate predictions in a LOCO scheme. This scheme
444+
involves omitting coefficients that correspond to all variant
445+
blocks for a single chromosome in the stage 2 model and then
446+
recomputing predictions without those coefficients.
447+
448+
For more details, see the "LOCO predictions" section of the Supplementary Methods
449+
in [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
450+
"""
409451
assert B.ndim == 2
410452
assert YP.ndim == 4
411453
assert X.ndim == 2
@@ -699,8 +741,8 @@ def regenie(
699741
tests. These estimates are subtracted from trait values and
700742
sampling statistics (p-values, standard errors, etc.) are evaluated
701743
against the residuals. See the REGENIE preprint [1] for more details.
702-
703-
[1] - https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2
744+
For a simpler technical overview, see [2], which gives a detailed description
745+
of the individual stages and separate regression models involved.
704746
705747
Parameters
706748
----------
@@ -738,6 +780,11 @@ def regenie(
738780
**Experimental**: Remove covariates through orthogonalization
739781
of genotypes and traits, by default False.
740782
783+
Warnings
784+
--------
785+
Binary traits are not yet supported, so all outcomes provided
786+
must be continuous.
787+
741788
Returns
742789
-------
743790
Dataset
@@ -752,11 +799,40 @@ def regenie(
752799
blocks on held out contigs. This will be absent if the
753800
data provided does not contain at least 2 contigs.
754801
802+
Examples
803+
--------
804+
805+
>>> import numpy as np
806+
>>> from sgkit.testing import simulate_genotype_call_dataset
807+
>>> from sgkit.stats.regenie import regenie
808+
>>> n_variant, n_sample, n_contig, n_covariate, n_trait, seed = 100, 50, 2, 3, 5, 0
809+
>>> rs = np.random.RandomState(seed)
810+
>>> ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample, n_contig=n_contig, seed=seed)
811+
>>> ds["call_dosage"] = (("variants", "samples"), rs.normal(size=(n_variant, n_sample)))
812+
>>> ds["sample_covariate"] = (("samples", "covariates"), rs.normal(size=(n_sample, n_covariate)))
813+
>>> ds["sample_trait"] = (("samples", "traits"), rs.normal(size=(n_sample, n_trait)))
814+
>>> res = regenie(ds, dosage="call_dosage", covariates="sample_covariate", traits="sample_trait")
815+
>>> res.compute() # doctest: +NORMALIZE_WHITESPACE
816+
<xarray.Dataset>
817+
Dimensions: (alphas: 5, blocks: 2, contigs: 2, outcomes: 5, samples: 50)
818+
Dimensions without coordinates: alphas, blocks, contigs, outcomes, samples
819+
Data variables:
820+
base_prediction (blocks, alphas, samples, outcomes) float64 0.3343 ... -...
821+
meta_prediction (samples, outcomes) float64 -0.4588 0.78 ... -0.3984 0.3734
822+
loco_prediction (contigs, samples, outcomes) float64 0.4886 ... -0.01498
823+
824+
References
825+
----------
826+
[1] - Mbatchou, J., L. Barnard, J. Backman, and A. Marcketta. 2020.
827+
“Computationally Efficient Whole Genome Regression for Quantitative and Binary
828+
Traits.” bioRxiv. https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2.abstract.
829+
[2] - https://glow.readthedocs.io/en/latest/tertiary/whole-genome-regression.html
830+
755831
Raises
756832
------
757833
ValueError
758-
If `G`, `X`, and `Y` do not have the same size along
759-
the first (samples) dimension.
834+
If dosage, covariates, and trait arrays do not have the same number
835+
of samples.
760836
"""
761837
if isinstance(covariates, str):
762838
covariates = [covariates]

sgkit/tests/test_regenie.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,18 @@ def test_ridge_regression():
373373
np.testing.assert_equal(res1, res2)
374374

375375

376+
def test_ridge_regression__raise_on_non_symmetric():
377+
with pytest.raises(ValueError, match="First argument must be symmetric"):
378+
ridge_regression(np.ones((2, 1)), np.ones((2, 1)), np.array([1.0]))
379+
380+
381+
def test_ridge_regression__raise_on_non_equal_first_dim():
382+
with pytest.raises(
383+
ValueError, match="Array arguments must have same size in first dimension"
384+
):
385+
ridge_regression(np.ones((2, 2)), np.ones((1, 1)), np.array([1.0]))
386+
387+
376388
@pytest.mark.parametrize( # type: ignore[misc]
377389
"x,size,expected_index,expected_sizes", # type: ignore[no-untyped-def]
378390
[

validation/gwas/method/regenie/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ All of the above are represented as pyinvoke tasks in [tasks.py](tasks.py).
1515

1616
The definition of each simulated dataset and parameterizations run against them can be seen in [config.yml](config.yml).
1717

18-
At TOW, these commands were used to generate the current test data:
18+
At the time of writing, these commands were used to generate the current test data:
1919

2020
```bash
2121
# Build the simulated inputs and outputs

0 commit comments

Comments
 (0)