@@ -140,16 +140,20 @@ def ridge_regression(
140
140
if XtX .shape [0 ] != XtY .shape [0 ]:
141
141
raise ValueError ("Array arguments must have same size in first dimension" )
142
142
diags = []
143
- for i in range (len (alphas )):
143
+ n_alpha , n_obs , n_outcome = len (alphas ), XtX .shape [0 ], XtY .shape [1 ]
144
+ for i in range (n_alpha ):
144
145
diag = np .ones (XtX .shape [1 ]) * alphas [i ]
145
146
if n_zero_reg :
146
- # Optionally remove regularization from leading covariates
147
+ # Optionally fix regularization for leading covariates
148
+ # TODO: This should probably be zero for consistency
149
+ # with orthogonalization, see:
150
+ # https://github.com/projectglow/glow/issues/266
147
151
diag [:n_zero_reg ] = 1
148
152
diags .append (np .diag (diag ))
149
153
diags = np .stack (diags )
150
154
B = np .linalg .inv (XtX + diags ) @ XtY
151
155
B = B .astype (dtype or XtX .dtype )
152
- # Coefficients have shape ( n_alpha, n_covar , n_outcome)
156
+ assert_array_shape ( B , n_alpha , n_obs , n_outcome )
153
157
return B
154
158
155
159
@@ -235,6 +239,22 @@ def _ridge_regression_cv(
235
239
236
240
237
241
def _stage_1 (G : Array , X : Array , Y : Array , alphas : Optional [ndarray ] = None ) -> Array :
242
+ """Stage 1 - WGR Base Regression
243
+
244
+ This stage will predict outcomes separately for each alpha parameter and variant
245
+ block. This "compresses" the variant dimension into a smaller space that is
246
+ much more amenable to efficient blockwise regressions in stage 2. Another
247
+ interpretation for this operation is that all sample blocks are treated
248
+ as folds in a K-fold CV fit within one single variant block. Predictions for
249
+ any one combination of variant and sample block then correspond to a
250
+ regression model fit all across sample blocks for that range of variants
251
+ except for a single sample block. In other words, the predictions are
252
+ out of sample which enables training of a stage 2 regressor based on
253
+ these predictions, a technique commonly referred to as stacking.
254
+
255
+ For more details, see the level 0 regression model described in step 1
256
+ of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
257
+ """
238
258
assert G .ndim == 2
239
259
assert X .ndim == 2
240
260
assert Y .ndim == 2
@@ -283,6 +303,17 @@ def _stage_2(
283
303
_glow_adj_alpha : bool = False ,
284
304
_glow_adj_scaling : bool = False ,
285
305
) -> Tuple [Array , Array ]:
306
+ """Stage 2 - WGR Meta Regression
307
+
308
+ This stage will train separate ridge regression models for each outcome
309
+ using the predictions from stage 1 for that same outcome as features. These
310
+ predictions are then evaluated based on R2 score to determine an optimal
311
+ "meta" estimator (see `_stage_1` for the "base" estimator description). Results
312
+ then include only predictions and coefficients from this optimal model.
313
+
314
+ For more details, see the level 1 regression model described in step 1
315
+ of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
316
+ """
286
317
assert YP .ndim == 4
287
318
assert X .ndim == 2
288
319
assert Y .ndim == 2
@@ -406,6 +437,17 @@ def _stage_3(
406
437
contigs : Array ,
407
438
variant_chunk_start : ndarray ,
408
439
) -> Optional [Array ]:
440
+ """Stage 3 - Leave-one-chromosome-out (LOCO) Estimation
441
+
442
+ This stage will use the coefficients for the optimal model in
443
+ stage 2 to re-estimate predictions in a LOCO scheme. This scheme
444
+ involves omitting coefficients that correspond to all variant
445
+ blocks for a single chromosome in the stage 2 model and then
446
+ recomputing predictions without those coefficients.
447
+
448
+ For more details, see the "LOCO predictions" section of the Supplementary Methods
449
+ in [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
450
+ """
409
451
assert B .ndim == 2
410
452
assert YP .ndim == 4
411
453
assert X .ndim == 2
@@ -699,8 +741,8 @@ def regenie(
699
741
tests. These estimates are subtracted from trait values and
700
742
sampling statistics (p-values, standard errors, etc.) are evaluated
701
743
against the residuals. See the REGENIE preprint [1] for more details.
702
-
703
- [1] - https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2
744
+ For a simpler technical overview, see [2] for a detailed description
745
+ of the individual stages and separate regression models involved.
704
746
705
747
Parameters
706
748
----------
@@ -738,6 +780,11 @@ def regenie(
738
780
**Experimental**: Remove covariates through orthogonalization
739
781
of genotypes and traits, by default False.
740
782
783
+ Warnings
784
+ --------
785
+ Binary traits are not yet supported so all outcomes provided
786
+ must be continuous.
787
+
741
788
Returns
742
789
-------
743
790
Dataset
@@ -752,11 +799,40 @@ def regenie(
752
799
blocks on held out contigs. This will be absent if the
753
800
data provided does not contain at least 2 contigs.
754
801
802
+ Examples
803
+ --------
804
+
805
+ >>> import numpy as np
806
+ >>> from sgkit.testing import simulate_genotype_call_dataset
807
+ >>> from sgkit.stats.regenie import regenie
808
+ >>> n_variant, n_sample, n_contig, n_covariate, n_trait, seed = 100, 50, 2, 3, 5, 0
809
+ >>> rs = np.random.RandomState(seed)
810
+ >>> ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample, n_contig=n_contig, seed=seed)
811
+ >>> ds["call_dosage"] = (("variants", "samples"), rs.normal(size=(n_variant, n_sample)))
812
+ >>> ds["sample_covariate"] = (("samples", "covariates"), rs.normal(size=(n_sample, n_covariate)))
813
+ >>> ds["sample_trait"] = (("samples", "traits"), rs.normal(size=(n_sample, n_trait)))
814
+ >>> res = regenie(ds, dosage="call_dosage", covariates="sample_covariate", traits="sample_trait")
815
+ >>> res.compute() # doctest: +NORMALIZE_WHITESPACE
816
+ <xarray.Dataset>
817
+ Dimensions: (alphas: 5, blocks: 2, contigs: 2, outcomes: 5, samples: 50)
818
+ Dimensions without coordinates: alphas, blocks, contigs, outcomes, samples
819
+ Data variables:
820
+ base_prediction (blocks, alphas, samples, outcomes) float64 0.3343 ... -...
821
+ meta_prediction (samples, outcomes) float64 -0.4588 0.78 ... -0.3984 0.3734
822
+ loco_prediction (contigs, samples, outcomes) float64 0.4886 ... -0.01498
823
+
824
+ References
825
+ ----------
826
+ [1] - Mbatchou, J., L. Barnard, J. Backman, and A. Marcketta. 2020.
827
+ “Computationally Efficient Whole Genome Regression for Quantitative and Binary
828
+ Traits.” bioRxiv. https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2.abstract.
829
+ [2] - https://glow.readthedocs.io/en/latest/tertiary/whole-genome-regression.html
830
+
755
831
Raises
756
832
------
757
833
ValueError
758
- If `G`, `X` , and `Y` do not have the same size along
759
- the first ( samples) dimension .
834
+ If dosage, covariates , and trait arrays do not have the same number
835
+ of samples.
760
836
"""
761
837
if isinstance (covariates , str ):
762
838
covariates = [covariates ]
0 commit comments