Skip to content

Commit 10ba9eb

Browse files
committed
Add read_vcfzarr
1 parent 21a03c9 commit 10ba9eb

11 files changed

+187
-7
lines changed

requirements-dev.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ pre-commit
33
pytest
44
pytest-cov
55
hypothesis
6-
statsmodels
6+
pytest-datadir
7+
statsmodels

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1+
dask[array]
12
numpy
23
xarray
3-
dask[array]
44
scipy

setup.cfg

+6-3
Original file line numberDiff line numberDiff line change
@@ -54,19 +54,19 @@ ignore =
5454
profile = black
5555
default_section = THIRDPARTY
5656
known_first_party = sgkit
57-
known_third_party = dask,hypothesis,numpy,pandas,pytest,setuptools,xarray
57+
known_third_party = dask,hypothesis,numpy,pandas,pytest,setuptools,xarray,zarr
5858
multi_line_output = 3
5959
include_trailing_comma = True
6060
force_grid_wrap = 0
6161
use_parentheses = True
6262
line_length = 88
6363

64+
[mypy-dask.*]
65+
ignore_missing_imports = True
6466
[mypy-numpy.*]
6567
ignore_missing_imports = True
6668
[mypy-pandas.*]
6769
ignore_missing_imports = True
68-
[mypy-dask.*]
69-
ignore_missing_imports = True
7070
[mypy-pytest.*]
7171
ignore_missing_imports = True
7272
[mypy-statsmodels.*]
@@ -75,5 +75,8 @@ ignore_missing_imports = True
7575
ignore_missing_imports = True
7676
[mypy-setuptools]
7777
ignore_missing_imports = True
78+
[mypy-zarr]
79+
ignore_missing_imports = True
7880
[mypy-sgkit.tests.*]
81+
disallow_untyped_calls = False
7982
disallow_untyped_defs = False

sgkit/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
create_genotype_call_dataset,
77
create_genotype_dosage_dataset,
88
)
9+
from .io.vcfzarr_reader import read_vcfzarr
910
from .stats.aggregation import count_alleles
1011
from .stats.association import gwas_linear_regression
1112

@@ -18,4 +19,5 @@
1819
"count_alleles",
1920
"create_genotype_dosage_dataset",
2021
"gwas_linear_regression",
22+
"read_vcfzarr",
2123
]

sgkit/api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def create_genotype_call_dataset(
3333
The (index of the) contig for each variant.
3434
variant_position : array_like, int
3535
The reference position of the variant.
36-
variant_alleles : array_like, S1
36+
variant_alleles : array_like, zero-terminated bytes, e.g. "S1"
3737
The possible alleles for the variant.
3838
sample_id : array_like, str
3939
The unique identifier of the sample.

sgkit/io/__init__.py

Whitespace-only changes.

sgkit/io/vcfzarr_reader.py

+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
from typing import Any
2+
3+
import dask.array as da
4+
import numpy as np
5+
import xarray as xr
6+
import zarr
7+
8+
from ..api import DIM_VARIANT, create_genotype_call_dataset
9+
from ..typing import ArrayLike, PathType
10+
from ..utils import encode_array
11+
12+
13+
def read_vcfzarr(path: PathType) -> xr.Dataset:
14+
"""Read a VCF Zarr file.
15+
16+
Loads VCF variant, sample, and genotype data as Dask arrays within a Dataset
17+
from a Zarr file created using scikit-allel's `vcf_to_zarr` function.
18+
19+
Since `vcf_to_zarr` does not preserve phasing information, there is no
20+
`call/genotype_phased` variable in the resulting dataset.
21+
22+
Parameters
23+
----------
24+
path : PathType
25+
Path to the Zarr file.
26+
27+
Returns
28+
-------
29+
xr.Dataset
30+
The dataset of genotype calls, created using `create_genotype_call_dataset`.
31+
"""
32+
33+
vcfzarr = zarr.open_group(str(path), mode="r")
34+
35+
# Index the contig names
36+
variants_chrom = da.from_zarr(vcfzarr["variants/CHROM"]).astype(str)
37+
variant_contig, variant_contig_names = encode_array(variants_chrom.compute())
38+
variant_contig = variant_contig.astype("int16")
39+
variant_contig_names = list(variant_contig_names)
40+
41+
# For variant alleles, combine REF and ALT into a single array
42+
# and calculate the number of alleles so we can set the dtype correctly
43+
variants_ref = da.from_zarr(vcfzarr["variants/REF"])
44+
variants_alt = da.from_zarr(vcfzarr["variants/ALT"])
45+
46+
def max_str_len(arr: ArrayLike) -> Any:
47+
return arr.map_blocks(
48+
lambda s: np.char.str_len(s.astype(str)), dtype=np.int8
49+
).max()
50+
51+
max_allele_length = max(
52+
da.compute(max_str_len(variants_ref), max_str_len(variants_alt))
53+
)
54+
variants_ref_alt = da.concatenate(
55+
[variants_ref.reshape(-1, 1), variants_alt], axis=1
56+
)
57+
variant_alleles = variants_ref_alt.astype(f"S{max_allele_length}")
58+
59+
variants_id = da.from_zarr(vcfzarr["variants/ID"]).astype(str)
60+
61+
ds = create_genotype_call_dataset(
62+
variant_contig_names=variant_contig_names,
63+
variant_contig=variant_contig,
64+
variant_position=da.from_zarr(vcfzarr["variants/POS"]),
65+
variant_alleles=variant_alleles,
66+
sample_id=da.from_zarr(vcfzarr["samples"]).astype(str),
67+
call_genotype=da.from_zarr(vcfzarr["calldata/GT"]),
68+
variant_id=variants_id,
69+
)
70+
71+
# Add a mask for variant ID
72+
ds["variant/id_mask"] = (
73+
[DIM_VARIANT],
74+
variants_id == ".",
75+
)
76+
77+
return ds

sgkit/tests/data/sample.vcf

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
##fileformat=VCFv4.0
2+
##fileDate=20090805
3+
##source=myImputationProgramV3.1
4+
##reference=1000GenomesPilot-NCBI36
5+
##phasing=partial
6+
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
7+
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
8+
##INFO=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
9+
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
10+
##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
11+
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
12+
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
13+
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
14+
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
15+
##FILTER=<ID=q10,Description="Quality below 10">
16+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
17+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
18+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
19+
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
20+
##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element">
21+
##ALT=<ID=CNV,Description="Copy number variable region">
22+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
23+
19 111 . A C 9.6 . . GT:HQ 0|0:10,15 0|0:10,10 0/1:3,3
24+
19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3
25+
20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
26+
20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,.
27+
20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,.
28+
20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,.
29+
20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 ./.:40:3
30+
20 1235237 . T . . . . GT 0/0 0|0 ./.
31+
X 10 rsTest AC A,ATG,C 10 PASS . GT 0 0/1 0|2

sgkit/tests/data/sample.vcf.zarr.zip

20.2 KB
Binary file not shown.

sgkit/tests/test_vcfzarr_reader.py

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import numpy as np
2+
from numpy.testing import assert_array_equal
3+
4+
from sgkit import read_vcfzarr
5+
6+
7+
def test_read_vcfzarr(shared_datadir):
8+
# The file sample.vcf.zarr.zip was created by running the following
9+
# in a python session with the scikit-allel package installed.
10+
#
11+
# import allel
12+
# allel.vcf_to_zarr("sample.vcf", "sample.vcf.zarr.zip")
13+
14+
path = shared_datadir / "sample.vcf.zarr.zip"
15+
ds = read_vcfzarr(path)
16+
17+
assert ds.attrs["contigs"] == ["19", "20", "X"]
18+
assert_array_equal(ds["variant/contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2])
19+
assert_array_equal(
20+
ds["variant/position"],
21+
[111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10],
22+
)
23+
assert_array_equal(
24+
ds["variant/alleles"],
25+
[
26+
[b"A", b"C", b"", b""],
27+
[b"A", b"G", b"", b""],
28+
[b"G", b"A", b"", b""],
29+
[b"T", b"A", b"", b""],
30+
[b"A", b"G", b"T", b""],
31+
[b"T", b"", b"", b""],
32+
[b"G", b"GA", b"GAC", b""],
33+
[b"T", b"", b"", b""],
34+
[b"AC", b"A", b"ATG", b"C"],
35+
],
36+
)
37+
assert_array_equal(
38+
ds["variant/id"],
39+
[".", ".", "rs6054257", ".", "rs6040355", ".", "microsat1", ".", "rsTest"],
40+
)
41+
assert_array_equal(
42+
ds["variant/id_mask"],
43+
[True, True, False, True, False, True, False, True, False],
44+
)
45+
46+
assert_array_equal(ds["sample/id"], ["NA00001", "NA00002", "NA00003"])
47+
48+
call_genotype = np.array(
49+
[
50+
[[0, 0], [0, 0], [0, 1]],
51+
[[0, 0], [0, 0], [0, 1]],
52+
[[0, 0], [1, 0], [1, 1]],
53+
[[0, 0], [0, 1], [0, 0]],
54+
[[1, 2], [2, 1], [2, 2]],
55+
[[0, 0], [0, 0], [0, 0]],
56+
[[0, 1], [0, 2], [-1, -1]],
57+
[[0, 0], [0, 0], [-1, -1]],
58+
[[0, -1], [0, 1], [0, 2]],
59+
],
60+
dtype="i1",
61+
)
62+
assert_array_equal(ds["call/genotype"], call_genotype)
63+
assert_array_equal(ds["call/genotype_mask"], call_genotype < 0)
64+
assert "call/genotype_phased" not in ds

sgkit/typing.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
from pathlib import Path
12
from typing import Any, Union
23

34
import dask.array as da
45
import numpy as np
56

6-
DType = Any
77
ArrayLike = Union[np.ndarray, da.Array]
8+
DType = Any
9+
PathType = Union[str, Path]

0 commit comments

Comments
 (0)