Skip to content

Commit 8b3caf7

Browse files
committed
Truncate genotype calls that exceed max alt alleles
1 parent 7c17bac commit 8b3caf7

File tree

3 files changed

+31
-9
lines changed

3 files changed

+31
-9
lines changed

sgkit/io/vcf/vcf_reader.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def for_field(
152152
) -> "VcfFieldHandler":
153153
if field == "FORMAT/GT":
154154
return GenotypeFieldHandler(
155-
vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls
155+
vcf, chunk_length, ploidy, mixed_ploidy, truncate_calls, max_alt_alleles
156156
)
157157
category = field.split("/")[0]
158158
vcf_field_defs = _get_vcf_field_defs(vcf, category)
@@ -277,13 +277,15 @@ def __init__(
277277
ploidy: int,
278278
mixed_ploidy: bool,
279279
truncate_calls: bool,
280+
max_alt_alleles: int,
280281
) -> None:
281282
n_sample = len(vcf.samples)
282283
self.call_genotype = np.empty((chunk_length, n_sample, ploidy), dtype="i1")
283284
self.call_genotype_phased = np.empty((chunk_length, n_sample), dtype=bool)
284285
self.ploidy = ploidy
285286
self.mixed_ploidy = mixed_ploidy
286287
self.truncate_calls = truncate_calls
288+
self.max_alt_alleles = max_alt_alleles
287289

288290
def add_variant(self, i: int, variant: Any) -> None:
289291
fill = -2 if self.mixed_ploidy else -1
@@ -296,6 +298,10 @@ def add_variant(self, i: int, variant: Any) -> None:
296298
self.call_genotype[i, ..., 0:n] = gt[..., 0:n]
297299
self.call_genotype[i, ..., n:] = fill
298300
self.call_genotype_phased[i] = gt[..., -1]
301+
302+
# set any calls that exceed maximum number of alt alleles as missing
303+
self.call_genotype[i][self.call_genotype[i] > self.max_alt_alleles] = -1
304+
299305
else:
300306
self.call_genotype[i] = fill
301307
self.call_genotype_phased[i] = 0
@@ -583,7 +589,8 @@ def vcf_to_zarrs(
583589
max_alt_alleles
584590
The (maximum) number of alternate alleles in the VCF file. Any records with more than
585591
this number of alternate alleles will have the extra alleles dropped (the `variant_allele`
586-
variable will be truncated). Call genotype fields will however be unaffected.
592+
variable will be truncated). Any call genotype fields with the extra alleles will
593+
be changed to the missing-allele sentinel value of -1.
587594
fields
588595
Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes.
589596
Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``.
@@ -739,7 +746,8 @@ def vcf_to_zarr(
739746
max_alt_alleles
740747
The (maximum) number of alternate alleles in the VCF file. Any records with more than
741748
this number of alternate alleles will have the extra alleles dropped (the `variant_allele`
742-
variable will be truncated). Call genotype fields will however be unaffected.
749+
variable will be truncated). Any call genotype fields with the extra alleles will
750+
be changed to the missing-allele sentinel value of -1.
743751
fields
744752
Extra fields to extract data for. A list of strings, with ``INFO`` or ``FORMAT`` prefixes.
745753
Wildcards are permitted too, for example: ``["INFO/*", "FORMAT/DP"]``.

sgkit/tests/io/vcf/test_vcf_reader.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,10 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
101101
output = tmp_path.joinpath("vcf.zarr").as_posix()
102102

103103
with pytest.warns(MaxAltAllelesExceededWarning):
104-
vcf_to_zarr(path, output, chunk_length=5, chunk_width=2, max_alt_alleles=1)
104+
max_alt_alleles = 1
105+
vcf_to_zarr(
106+
path, output, chunk_length=5, chunk_width=2, max_alt_alleles=max_alt_alleles
107+
)
105108
ds = xr.open_zarr(output)
106109

107110
# extra alt alleles are dropped
@@ -120,6 +123,9 @@ def test_vcf_to_zarr__max_alt_alleles(shared_datadir, is_path, tmp_path):
120123
],
121124
)
122125

126+
# genotype calls are truncated
127+
assert np.all(ds["call_genotype"].values <= max_alt_alleles)
128+
123129
# the maximum number of alt alleles actually seen is stored as an attribute
124130
assert ds.attrs["max_alt_alleles_seen"] == 3
125131

sgkit/tests/io/vcf/test_vcf_roundtrip.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -114,17 +114,23 @@ def test_DP_field(shared_datadir, tmpdir):
114114

115115

116116
@pytest.mark.parametrize(
117-
"vcf_file,allel_exclude_fields,sgkit_exclude_fields",
117+
"vcf_file,allel_exclude_fields,sgkit_exclude_fields,max_alt_alleles",
118118
[
119-
("sample.vcf.gz", None, None),
120-
("mixed.vcf.gz", None, None),
119+
("sample.vcf.gz", None, None, 3),
120+
("mixed.vcf.gz", None, None, 3),
121121
# exclude PL since it has Number=G, which is not yet supported
122-
("CEUTrio.20.21.gatk3.4.g.vcf.bgz", ["calldata/PL"], ["FORMAT/PL"]),
122+
# increase max_alt_alleles since scikit-allel does not truncate genotype calls
123+
("CEUTrio.20.21.gatk3.4.g.vcf.bgz", ["calldata/PL"], ["FORMAT/PL"], 7),
123124
],
124125
)
125126
@pytest.mark.filterwarnings("ignore::sgkit.io.vcf.MaxAltAllelesExceededWarning")
126127
def test_all_fields(
127-
shared_datadir, tmpdir, vcf_file, allel_exclude_fields, sgkit_exclude_fields
128+
shared_datadir,
129+
tmpdir,
130+
vcf_file,
131+
allel_exclude_fields,
132+
sgkit_exclude_fields,
133+
max_alt_alleles,
128134
):
129135
# change scikit-allel type defaults back to the VCF default
130136
types = {
@@ -140,6 +146,7 @@ def test_all_fields(
140146
fields=["*"],
141147
exclude_fields=allel_exclude_fields,
142148
types=types,
149+
alt_number=max_alt_alleles,
143150
)
144151

145152
field_defs = {
@@ -159,6 +166,7 @@ def test_all_fields(
159166
exclude_fields=sgkit_exclude_fields,
160167
field_defs=field_defs,
161168
truncate_calls=True,
169+
max_alt_alleles=max_alt_alleles,
162170
)
163171
sg_ds = sg.load_dataset(str(sg_vcfzarr_path))
164172
sg_ds = sg_ds.drop_vars("call_genotype_phased") # not included in scikit-allel

0 commit comments

Comments
 (0)