8
8
from xarray import Dataset
9
9
10
10
from sgkit import variables
11
+ from sgkit .stats .aggregation import genotype_count
11
12
from sgkit .utils import conditional_merge_datasets
12
13
13
14
@@ -153,11 +154,15 @@ def hardy_weinberg_test(
153
154
Input variable name holding call_genotype_mask.
154
155
Defined by :data:`sgkit.variables.call_genotype_mask_spec`
155
156
ploidy
156
- Genotype ploidy, defaults to ``ploidy`` dimension of genotype
157
- call array (:data:`sgkit.variables.call_genotype_spec`) if present.
158
- If that variable is not present, then this value must be set.
157
+ Genotype ploidy, defaults to ``ploidy`` dimension of provided dataset.
158
+ If the ``ploidy`` dimension is not present, then this value must be set explicitly.
159
159
Currently HWE calculations are only supported for diploid datasets,
160
160
i.e. ``ploidy`` must equal 2.
161
+ alleles
162
+ Genotype allele count, defaults to ``alleles`` dimension of provided dataset.
163
+ If the ``alleles`` dimension is not present, then this value must be set explicitly.
164
+ Currently HWE calculations are only supported for biallelic datasets,
165
+ i.e. ``alleles`` must equal 2.
161
166
merge
162
167
If True (the default), merge the input dataset and the computed
163
168
output variables into a single dataset, otherwise return only
@@ -171,8 +176,9 @@ def hardy_weinberg_test(
171
176
Returns
172
177
-------
173
178
Dataset containing (N = num variants):
179
+
174
180
variant_hwe_p_value : [array-like, shape: (N,)]
175
- P values from HWE test for each variant as float in [0, 1].
181
+ P values from HWE test for each variant as float in [0, 1].
176
182
177
183
References
178
184
----------
@@ -190,12 +196,19 @@ def hardy_weinberg_test(
190
196
ploidy = ploidy or ds .dims .get ("ploidy" )
191
197
if not ploidy :
192
198
raise ValueError (
193
- "`ploidy` parameter must be set when not present as array dimension."
199
+ "`ploidy` parameter must be set when not present as dataset dimension."
194
200
)
195
201
if ploidy != 2 :
196
202
raise NotImplementedError ("HWE test only implemented for diploid genotypes" )
197
- if ds .dims ["alleles" ] != 2 :
203
+
204
+ alleles = alleles or ds .dims .get ("alleles" )
205
+ if not alleles :
206
+ raise ValueError (
207
+ "`alleles` parameter must be set when not present as dataset dimension."
208
+ )
209
+ if alleles != 2 :
198
210
raise NotImplementedError ("HWE test only implemented for biallelic genotypes" )
211
+
199
212
# Use precomputed genotype counts if provided
200
213
if genotype_counts is not None :
201
214
variables .validate (ds , {genotype_counts : variables .genotype_counts_spec })
@@ -209,12 +222,16 @@ def hardy_weinberg_test(
209
222
call_genotype : variables .call_genotype_spec ,
210
223
},
211
224
)
212
- # TODO: Use API genotype counting function instead, e.g.
213
- # https://github.com/pystatgen/sgkit/issues/29#issuecomment-656691069
214
- M = ds [call_genotype_mask ].any (dim = "ploidy" )
215
- AC = xr .where (M , - 1 , ds [call_genotype ].sum (dim = "ploidy" )) # type: ignore[no-untyped-call]
216
- cts = [1 , 0 , 2 ] # arg order: hets, hom1, hom2
217
- obs = [da .asarray ((AC == ct ).sum (dim = "samples" )) for ct in cts ]
225
+ ds_ct = genotype_count (
226
+ ds ,
227
+ dim = "samples" ,
228
+ call_genotype = call_genotype ,
229
+ call_genotype_mask = call_genotype_mask ,
230
+ )
231
+ obs = [
232
+ da .asarray (ds_ct [v ])
233
+ for v in ["variant_n_het" , "variant_n_hom_ref" , "variant_n_hom_alt" ]
234
+ ]
218
235
p = da .map_blocks (hardy_weinberg_p_value_vec_jit , * obs )
219
236
new_ds = xr .Dataset ({variables .variant_hwe_p_value : ("variants" , p )})
220
237
return conditional_merge_datasets (ds , variables .validate (new_ds ), merge )
0 commit comments