Skip to content

Commit 34d640a

Browse files
tomwhitemergify[bot]
authored andcommitted
Filters encoding
1 parent 8f75b3e commit 34d640a

File tree

4 files changed

+34
-16
lines changed

4 files changed

+34
-16
lines changed

sgkit/io/vcf/vcf_reader.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
INT_FILL,
3232
INT_MISSING,
3333
STR_FILL,
34-
STR_MISSING,
3534
)
3635
from sgkit.io.vcf import partition_into_regions
3736
from sgkit.io.vcf.utils import build_url, chunks, temporary_directory, url_filename
@@ -40,6 +39,7 @@
4039
vcf_number_to_dimension_and_size,
4140
)
4241
from sgkit.model import (
42+
DIM_FILTER,
4343
DIM_PLOIDY,
4444
DIM_SAMPLE,
4545
DIM_VARIANT,
@@ -375,6 +375,16 @@ def vcf_to_zarr_sequential(
375375

376376
variant_contig_names = vcf.seqnames
377377

378+
filters = [
379+
h["ID"]
380+
for h in vcf.header_iter()
381+
if h["HeaderType"] == "FILTER" and isinstance(h["ID"], str)
382+
]
383+
# Ensure PASS is the first filter if present
384+
if "PASS" in filters:
385+
filters.remove("PASS")
386+
filters.insert(0, "PASS")
387+
378388
# Remember max lengths of variable-length strings
379389
max_variant_id_length = 0
380390
max_variant_allele_length = 0
@@ -417,7 +427,7 @@ def vcf_to_zarr_sequential(
417427
variant_ids = []
418428
variant_alleles = []
419429
variant_quality = np.empty(chunk_length, dtype="f4")
420-
variant_filter = np.empty(chunk_length, dtype="O")
430+
variant_filter = np.full((chunk_length, len(filters)), False, dtype="bool")
421431

422432
i = -1 # initialize in case of empty variants_chunk
423433
for i, variant in enumerate(variants_chunk):
@@ -446,14 +456,9 @@ def vcf_to_zarr_sequential(
446456
variant_quality[i] = (
447457
variant.QUAL if variant.QUAL is not None else FLOAT32_MISSING
448458
)
449-
# filters are stored as a semicolon-delimited string (like VCF) since Zarr can't store
450-
# variable-length arrays of variable-length strings
451-
# (related to https://github.com/pystatgen/sgkit/issues/634)
452-
variant_filter[i] = (
453-
";".join(variant.FILTERS)
454-
if len(variant.FILTERS) > 0
455-
else STR_MISSING
456-
)
459+
for f in variant.FILTERS:
460+
variant_filter[i][filters.index(f)] = True
461+
457462
for field_handler in field_handlers:
458463
field_handler.add_variant(i, variant)
459464

@@ -487,7 +492,8 @@ def vcf_to_zarr_sequential(
487492
variant_id_mask,
488493
)
489494
ds["variant_quality"] = ([DIM_VARIANT], variant_quality)
490-
ds["variant_filter"] = ([DIM_VARIANT], variant_filter)
495+
ds["variant_filter"] = ([DIM_VARIANT, DIM_FILTER], variant_filter)
496+
ds.attrs["filters"] = filters
491497
ds.attrs["vcf_zarr_version"] = "0.1"
492498
ds.attrs["vcf_header"] = vcf.raw_header
493499

sgkit/model.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
DIM_PLOIDY = "ploidy"
1414
DIM_ALLELE = "alleles"
1515
DIM_GENOTYPE = "genotypes"
16+
DIM_FILTER = "filters"
1617

1718

1819
def create_genotype_call_dataset(

sgkit/tests/io/vcf/test_vcf_reader.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1181,10 +1181,10 @@ def test_spec(shared_datadir, tmp_path):
11811181
check_field(
11821182
group,
11831183
"variant_filter",
1184-
ndim=1,
1185-
shape=(variants,),
1186-
dimension_names=["variants"],
1187-
dtype=str,
1184+
ndim=2,
1185+
shape=(variants, 3),
1186+
dimension_names=["variants", "filters"],
1187+
dtype=bool,
11881188
)
11891189

11901190
# INFO fields
@@ -1338,7 +1338,17 @@ def test_spec(shared_datadir, tmp_path):
13381338
) # missing nan
13391339
assert_array_equal(
13401340
group["variant_filter"],
1341-
[".", ".", "PASS", "q10;s50", "PASS", "PASS", "PASS", ".", "PASS"],
1341+
[
1342+
[False, False, False],
1343+
[False, False, False],
1344+
[True, False, False],
1345+
[False, True, True],
1346+
[True, False, False],
1347+
[True, False, False],
1348+
[True, False, False],
1349+
[False, False, False],
1350+
[True, False, False],
1351+
],
13421352
)
13431353

13441354
assert_array_equal(

sgkit/tests/io/vcf/test_vcf_roundtrip.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def fix_missing_fields(ds: Dataset) -> Dataset:
7979
# drop variables and attributes that are not included in scikit-allel
8080
ds = ds.drop_vars("call_genotype_phased")
8181
ds = ds.drop_vars("variant_filter")
82+
del ds.attrs["filters"]
8283
del ds.attrs["max_alt_alleles_seen"]
8384
del ds.attrs["vcf_zarr_version"]
8485
del ds.attrs["vcf_header"]

0 commit comments

Comments
 (0)