Skip to content

Commit 45b9f21

Browse files
committed
Truncate genotype calls that exceed max alt alleles
1 parent 7c17bac commit 45b9f21

File tree

2 files changed

+18
-4
lines changed

2 files changed

+18
-4
lines changed

sgkit/io/vcf/vcf_reader.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def for_field(
152152
) -> "VcfFieldHandler":
153153
if field == "FORMAT/GT":
154154
return GenotypeFieldHandler(
155-
vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls
155+
vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls, max_alt_alleles
156156
)
157157
category = field.split("/")[0]
158158
vcf_field_defs = _get_vcf_field_defs(vcf, category)
@@ -277,13 +277,15 @@ def __init__(
277277
ploidy: int,
278278
mixed_ploidy: bool,
279279
truncate_calls: bool,
280+
max_alt_alleles: int,
280281
) -> None:
281282
n_sample = len(vcf.samples)
282283
self.call_genotype = np.empty((chunk_length, n_sample, ploidy), dtype="i1")
283284
self.call_genotype_phased = np.empty((chunk_length, n_sample), dtype=bool)
284285
self.ploidy = ploidy
285286
self.mixed_ploidy = mixed_ploidy
286287
self.truncate_calls = truncate_calls
288+
self.max_alt_alleles = max_alt_alleles
287289

288290
def add_variant(self, i: int, variant: Any) -> None:
289291
fill = -2 if self.mixed_ploidy else -1
@@ -296,6 +298,10 @@ def add_variant(self, i: int, variant: Any) -> None:
296298
self.call_genotype[i, ..., 0:n] = gt[..., 0:n]
297299
self.call_genotype[i, ..., n:] = fill
298300
self.call_genotype_phased[i] = gt[..., -1]
301+
302+
# set any calls that exceed maximum number of alt alleles as missing
303+
self.call_genotype[i][self.call_genotype[i] > self.max_alt_alleles] = -1
304+
299305
else:
300306
self.call_genotype[i] = fill
301307
self.call_genotype_phased[i] = 0
@@ -583,7 +589,8 @@ def vcf_to_zarrs(
583589
max_alt_alleles
584590
The (maximum) number of alternate alleles in the VCF file. Any records with more than
585591
this number of alternate alleles will have the extra alleles dropped (the `variant_allele`
586-
variable will be truncated). Call genotype fields will however be unaffected.
592+
variable will be truncated). Any call genotype fields with the extra alleles will
593+
be changed to the missing-allele sentinel value of -1.
587594
fields
588595
Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes.
589596
Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``.
@@ -739,7 +746,8 @@ def vcf_to_zarr(
739746
max_alt_alleles
740747
The (maximum) number of alternate alleles in the VCF file. Any records with more than
741748
this number of alternate alleles will have the extra alleles dropped (the `variant_allele`
742-
variable will be truncated). Call genotype fields will however be unaffected.
749+
variable will be truncated). Any call genotype fields with the extra alleles will
750+
be changed to the missing-allele sentinel value of -1.
743751
fields
744752
Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes.
745753
Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``.

sgkit/tests/io/vcf/test_vcf_reader.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,10 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
101101
output = tmp_path.joinpath("vcf.zarr").as_posix()
102102

103103
with pytest.warns(MaxAltAllelesExceededWarning):
104-
vcf_to_zarr(path, output, chunk_length=5, chunk_width=2, max_alt_alleles=1)
104+
max_alt_alleles = 1
105+
vcf_to_zarr(
106+
path, output, chunk_length=5, chunk_width=2, max_alt_alleles=max_alt_alleles
107+
)
105108
ds = xr.open_zarr(output)
106109

107110
# extra alt alleles are dropped
@@ -120,6 +123,9 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
120123
],
121124
)
122125

126+
# genotype calls are truncated
127+
assert np.all(ds["call_genotype"].values <= max_alt_alleles)
128+
123129
# the maximum number of alt alleles actually seen is stored as an attribute
124130
assert ds.attrs["max_alt_alleles_seen"] == 3
125131

0 commit comments

Comments
 (0)