Skip to content

Commit f487994

Browse files
tomwhitemergify[bot]
authored andcommitted
Truncate genotype calls that exceed max alt alleles
1 parent 3f58b3a commit f487994

File tree

3 files changed

+31
-9
lines changed

3 files changed

+31
-9
lines changed

sgkit/io/vcf/vcf_reader.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def for_field(
161161
) -> "VcfFieldHandler":
162162
if field == "FORMAT/GT":
163163
return GenotypeFieldHandler(
164-
vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls
164+
vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls, max_alt_alleles
165165
)
166166
category = field.split("/")[0]
167167
vcf_field_defs = _get_vcf_field_defs(vcf, category)
@@ -286,13 +286,15 @@ def __init__(
286286
ploidy: int,
287287
mixed_ploidy: bool,
288288
truncate_calls: bool,
289+
max_alt_alleles: int,
289290
) -> None:
290291
n_sample = len(vcf.samples)
291292
self.call_genotype = np.empty((chunk_length, n_sample, ploidy), dtype="i1")
292293
self.call_genotype_phased = np.empty((chunk_length, n_sample), dtype=bool)
293294
self.ploidy = ploidy
294295
self.mixed_ploidy = mixed_ploidy
295296
self.truncate_calls = truncate_calls
297+
self.max_alt_alleles = max_alt_alleles
296298

297299
def add_variant(self, i: int, variant: Any) -> None:
298300
fill = -2 if self.mixed_ploidy else -1
@@ -305,6 +307,10 @@ def add_variant(self, i: int, variant: Any) -> None:
305307
self.call_genotype[i, ..., 0:n] = gt[..., 0:n]
306308
self.call_genotype[i, ..., n:] = fill
307309
self.call_genotype_phased[i] = gt[..., -1]
310+
311+
# set any calls that exceed maximum number of alt alleles as missing
312+
self.call_genotype[i][self.call_genotype[i] > self.max_alt_alleles] = -1
313+
308314
else:
309315
self.call_genotype[i] = fill
310316
self.call_genotype_phased[i] = 0
@@ -616,7 +622,8 @@ def vcf_to_zarrs(
616622
max_alt_alleles
617623
The (maximum) number of alternate alleles in the VCF file. Any records with more than
618624
this number of alternate alleles will have the extra alleles dropped (the `variant_allele`
619-
variable will be truncated). Call genotype fields will however be unaffected.
625+
variable will be truncated). Any call genotype fields with the extra alleles will
626+
be changed to the missing-allele sentinel value of -1.
620627
fields
621628
Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes.
622629
Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``.
@@ -784,7 +791,8 @@ def vcf_to_zarr(
784791
max_alt_alleles
785792
The (maximum) number of alternate alleles in the VCF file. Any records with more than
786793
this number of alternate alleles will have the extra alleles dropped (the `variant_allele`
787-
variable will be truncated). Call genotype fields will however be unaffected.
794+
variable will be truncated). Any call genotype fields with the extra alleles will
795+
be changed to the missing-allele sentinel value of -1.
788796
fields
789797
Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes.
790798
Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``.

sgkit/tests/io/vcf/test_vcf_reader.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,10 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
103103
output = tmp_path.joinpath("vcf.zarr").as_posix()
104104

105105
with pytest.warns(MaxAltAllelesExceededWarning):
106-
vcf_to_zarr(path, output, chunk_length=5, chunk_width=2, max_alt_alleles=1)
106+
max_alt_alleles = 1
107+
vcf_to_zarr(
108+
path, output, chunk_length=5, chunk_width=2, max_alt_alleles=max_alt_alleles
109+
)
107110
ds = xr.open_zarr(output)
108111

109112
# extra alt alleles are dropped
@@ -122,6 +125,9 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
122125
],
123126
)
124127

128+
# genotype calls are truncated
129+
assert np.all(ds["call_genotype"].values <= max_alt_alleles)
130+
125131
# the maximum number of alt alleles actually seen is stored as an attribute
126132
assert ds.attrs["max_alt_alleles_seen"] == 3
127133

sgkit/tests/io/vcf/test_vcf_roundtrip.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -114,17 +114,23 @@ def test_DP_field(shared_datadir, tmpdir):
114114

115115

116116
@pytest.mark.parametrize(
117-
"vcf_file,allel_exclude_fields,sgkit_exclude_fields",
117+
"vcf_file,allel_exclude_fields,sgkit_exclude_fields,max_alt_alleles",
118118
[
119-
("sample.vcf.gz", None, None),
120-
("mixed.vcf.gz", None, None),
119+
("sample.vcf.gz", None, None, 3),
120+
("mixed.vcf.gz", None, None, 3),
121121
# exclude PL since it has Number=G, which is not yet supported
122-
("CEUTrio.20.21.gatk3.4.g.vcf.bgz", ["calldata/PL"], ["FORMAT/PL"]),
122+
# increase max_alt_alleles since scikit-allel does not truncate genotype calls
123+
("CEUTrio.20.21.gatk3.4.g.vcf.bgz", ["calldata/PL"], ["FORMAT/PL"], 7),
123124
],
124125
)
125126
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
126127
def test_all_fields(
127-
shared_datadir, tmpdir, vcf_file, allel_exclude_fields, sgkit_exclude_fields
128+
shared_datadir,
129+
tmpdir,
130+
vcf_file,
131+
allel_exclude_fields,
132+
sgkit_exclude_fields,
133+
max_alt_alleles,
128134
):
129135
# change scikit-allel type defaults back to the VCF default
130136
types = {
@@ -140,6 +146,7 @@ def test_all_fields(
140146
fields=["*"],
141147
exclude_fields=allel_exclude_fields,
142148
types=types,
149+
alt_number=max_alt_alleles,
143150
)
144151

145152
field_defs = {
@@ -159,6 +166,7 @@ def test_all_fields(
159166
exclude_fields=sgkit_exclude_fields,
160167
field_defs=field_defs,
161168
truncate_calls=True,
169+
max_alt_alleles=max_alt_alleles,
162170
)
163171
sg_ds = sg.load_dataset(str(sg_vcfzarr_path))
164172
sg_ds = sg_ds.drop_vars("call_genotype_phased") # not included in scikit-allel

0 commit comments

Comments
 (0)