@@ -161,7 +161,7 @@ def for_field(
161
161
) -> "VcfFieldHandler" :
162
162
if field == "FORMAT/GT" :
163
163
return GenotypeFieldHandler (
164
- vcf , chunk_length , ploidy , mixed_ploidy , truncate_calls
164
+ vcf , chunk_length , ploidy , mixed_ploidy , truncate_calls , max_alt_alleles
165
165
)
166
166
category = field .split ("/" )[0 ]
167
167
vcf_field_defs = _get_vcf_field_defs (vcf , category )
@@ -286,13 +286,15 @@ def __init__(
286
286
ploidy : int ,
287
287
mixed_ploidy : bool ,
288
288
truncate_calls : bool ,
289
+ max_alt_alleles : int ,
289
290
) -> None :
290
291
n_sample = len (vcf .samples )
291
292
self .call_genotype = np .empty ((chunk_length , n_sample , ploidy ), dtype = "i1" )
292
293
self .call_genotype_phased = np .empty ((chunk_length , n_sample ), dtype = bool )
293
294
self .ploidy = ploidy
294
295
self .mixed_ploidy = mixed_ploidy
295
296
self .truncate_calls = truncate_calls
297
+ self .max_alt_alleles = max_alt_alleles
296
298
297
299
def add_variant (self , i : int , variant : Any ) -> None :
298
300
fill = - 2 if self .mixed_ploidy else - 1
@@ -305,6 +307,10 @@ def add_variant(self, i: int, variant: Any) -> None:
305
307
self .call_genotype [i , ..., 0 :n ] = gt [..., 0 :n ]
306
308
self .call_genotype [i , ..., n :] = fill
307
309
self .call_genotype_phased [i ] = gt [..., - 1 ]
310
+
311
+ # set any calls that exceed maximum number of alt alleles as missing
312
+ self .call_genotype [i ][self .call_genotype [i ] > self .max_alt_alleles ] = - 1
313
+
308
314
else :
309
315
self .call_genotype [i ] = fill
310
316
self .call_genotype_phased [i ] = 0
@@ -616,7 +622,8 @@ def vcf_to_zarrs(
616
622
max_alt_alleles
617
623
The (maximum) number of alternate alleles in the VCF file. Any records with more than
618
624
this number of alternate alleles will have the extra alleles dropped (the `variant_allele`
619
- variable will be truncated). Call genotype fields will however be unaffected.
625
+ variable will be truncated). Any allele indexes in call genotype fields that refer to
626
+ the dropped alleles will be set to the missing-allele sentinel value of -1.
620
627
fields
621
628
Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes.
622
629
Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``.
@@ -784,7 +791,8 @@ def vcf_to_zarr(
784
791
max_alt_alleles
785
792
The (maximum) number of alternate alleles in the VCF file. Any records with more than
786
793
this number of alternate alleles will have the extra alleles dropped (the `variant_allele`
787
- variable will be truncated). Call genotype fields will however be unaffected.
794
+ variable will be truncated). Any allele indexes in call genotype fields that refer to
795
+ the dropped alleles will be set to the missing-allele sentinel value of -1.
788
796
fields
789
797
Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes.
790
798
Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``.
0 commit comments