Skip to content

Commit a7ad4ae

Browse files
authored
Add read_vcfzarr (#40)
1 parent d4a6f73 commit a7ad4ae

10 files changed

+183
-8
lines changed

requirements.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
numpy
22
xarray
33
dask[array]
4-
scipy
4+
scipy
5+
zarr

setup.cfg

+2-2
Original file line numberDiff line numberDiff line change
@@ -64,12 +64,12 @@ force_grid_wrap = 0
6464
use_parentheses = True
6565
line_length = 88
6666

67+
[mypy-dask.*]
68+
ignore_missing_imports = True
6769
[mypy-numpy.*]
6870
ignore_missing_imports = True
6971
[mypy-pandas.*]
7072
ignore_missing_imports = True
71-
[mypy-dask.*]
72-
ignore_missing_imports = True
7373
[mypy-pytest.*]
7474
ignore_missing_imports = True
7575
[mypy-statsmodels.*]

sgkit/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
create_genotype_dosage_dataset,
88
)
99
from .display import display_genotypes
10+
from .io.vcfzarr_reader import read_vcfzarr
1011
from .stats.aggregation import count_alleles
1112
from .stats.association import gwas_linear_regression
1213
from .stats.regenie import regenie
@@ -21,5 +22,6 @@
2122
"create_genotype_dosage_dataset",
2223
"display_genotypes",
2324
"gwas_linear_regression",
25+
"read_vcfzarr",
2426
"regenie",
2527
]

sgkit/api.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def create_genotype_call_dataset(
3333
The (index of the) contig for each variant.
3434
variant_position : array_like, int
3535
The reference position of the variant.
36-
variant_alleles : array_like, S1
36+
variant_alleles : array_like, zero-terminated bytes, e.g. "S1", or object
3737
The possible alleles for the variant.
3838
sample_id : array_like, str
3939
The unique identifier of the sample.
@@ -55,7 +55,7 @@ def create_genotype_call_dataset(
5555
"""
5656
check_array_like(variant_contig, kind="i", ndim=1)
5757
check_array_like(variant_position, kind="i", ndim=1)
58-
check_array_like(variant_alleles, kind="S", ndim=2)
58+
check_array_like(variant_alleles, kind={"S", "O"}, ndim=2)
5959
check_array_like(sample_id, kind="U", ndim=1)
6060
check_array_like(call_genotype, kind="i", ndim=3)
6161
data_vars: Dict[Hashable, Any] = {
@@ -102,7 +102,7 @@ def create_genotype_dosage_dataset(
102102
The (index of the) contig for each variant.
103103
variant_position : array_like, int
104104
The reference position of the variant.
105-
variant_alleles : array_like, S1
105+
variant_alleles : array_like, zero-terminated bytes, e.g. "S1", or object
106106
The possible alleles for the variant.
107107
sample_id : array_like, str
108108
The unique identifier of the sample.
@@ -120,7 +120,7 @@ def create_genotype_dosage_dataset(
120120
"""
121121
check_array_like(variant_contig, kind="i", ndim=1)
122122
check_array_like(variant_position, kind="i", ndim=1)
123-
check_array_like(variant_alleles, kind="S", ndim=2)
123+
check_array_like(variant_alleles, kind={"S", "O"}, ndim=2)
124124
check_array_like(sample_id, kind="U", ndim=1)
125125
check_array_like(call_dosage, kind="f", ndim=2)
126126
data_vars: Dict[Hashable, Any] = {

sgkit/io/__init__.py

Whitespace-only changes.

sgkit/io/vcfzarr_reader.py

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import dask.array as da
2+
import xarray as xr
3+
import zarr
4+
5+
from ..api import DIM_VARIANT, create_genotype_call_dataset
6+
from ..typing import ArrayLike, PathType
7+
from ..utils import encode_array
8+
9+
10+
def _ensure_2d(arr: ArrayLike) -> ArrayLike:
11+
if arr.ndim == 1:
12+
arr = arr.reshape(-1, 1)
13+
return arr
14+
15+
16+
def read_vcfzarr(path: PathType) -> xr.Dataset:
17+
"""Read a VCF Zarr file.
18+
19+
Loads VCF variant, sample, and genotype data as Dask arrays within a Dataset
20+
from a Zarr file created using scikit-allel's `vcf_to_zarr` function.
21+
22+
Since `vcf_to_zarr` does not preserve phasing information, there is no
23+
`call/genotype_phased` variable in the resulting dataset.
24+
25+
Parameters
26+
----------
27+
path : PathType
28+
Path to the Zarr file.
29+
30+
Returns
31+
-------
32+
xr.Dataset
33+
The dataset of genotype calls, created using `create_genotype_call_dataset`.
34+
"""
35+
36+
vcfzarr = zarr.open_group(str(path), mode="r")
37+
38+
# Index the contig names
39+
variants_chrom = da.from_zarr(vcfzarr["variants/CHROM"]).astype(str)
40+
variant_contig, variant_contig_names = encode_array(variants_chrom.compute())
41+
variant_contig = variant_contig.astype("int16")
42+
variant_contig_names = list(variant_contig_names)
43+
44+
# For variant alleles, combine REF and ALT into a single array
45+
variants_ref = da.from_zarr(vcfzarr["variants/REF"])
46+
variants_alt = da.from_zarr(vcfzarr["variants/ALT"])
47+
variant_alleles = da.concatenate(
48+
[_ensure_2d(variants_ref), _ensure_2d(variants_alt)], axis=1
49+
)
50+
51+
variants_id = da.from_zarr(vcfzarr["variants/ID"]).astype(str)
52+
53+
ds = create_genotype_call_dataset(
54+
variant_contig_names=variant_contig_names,
55+
variant_contig=variant_contig,
56+
variant_position=da.from_zarr(vcfzarr["variants/POS"]),
57+
variant_alleles=variant_alleles,
58+
sample_id=da.from_zarr(vcfzarr["samples"]).astype(str),
59+
call_genotype=da.from_zarr(vcfzarr["calldata/GT"]),
60+
variant_id=variants_id,
61+
)
62+
63+
# Add a mask for variant ID
64+
ds["variant_id_mask"] = (
65+
[DIM_VARIANT],
66+
variants_id == ".",
67+
)
68+
69+
return ds

sgkit/tests/data/sample.vcf

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
##fileformat=VCFv4.0
2+
##fileDate=20090805
3+
##source=myImputationProgramV3.1
4+
##reference=1000GenomesPilot-NCBI36
5+
##phasing=partial
6+
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
7+
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
8+
##INFO=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
9+
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
10+
##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
11+
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
12+
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
13+
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
14+
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
15+
##FILTER=<ID=q10,Description="Quality below 10">
16+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
17+
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
18+
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
19+
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
20+
##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element">
21+
##ALT=<ID=CNV,Description="Copy number variable region">
22+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
23+
19 111 . A C 9.6 . . GT:HQ 0|0:10,15 0|0:10,10 0/1:3,3
24+
19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3
25+
20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
26+
20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,.
27+
20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,.
28+
20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,.
29+
20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 ./.:40:3
30+
20 1235237 . T . . . . GT 0/0 0|0 ./.
31+
X 10 rsTest AC A,ATG,C 10 PASS . GT 0 0/1 0|2

sgkit/tests/data/sample.vcf.zarr.zip

20.2 KB
Binary file not shown.

sgkit/tests/test_vcfzarr_reader.py

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import numpy as np
2+
from numpy.testing import assert_array_equal
3+
4+
from sgkit import read_vcfzarr
5+
from sgkit.io.vcfzarr_reader import _ensure_2d
6+
7+
8+
def test_read_vcfzarr(shared_datadir):
9+
# The file sample.vcf.zarr.zip was created by running the following
10+
# in a python session with the scikit-allel package installed.
11+
#
12+
# import allel
13+
# allel.vcf_to_zarr("sample.vcf", "sample.vcf.zarr.zip")
14+
15+
path = shared_datadir / "sample.vcf.zarr.zip"
16+
ds = read_vcfzarr(path)
17+
18+
assert ds.attrs["contigs"] == ["19", "20", "X"]
19+
assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2])
20+
assert_array_equal(
21+
ds["variant_position"],
22+
[111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10],
23+
)
24+
assert_array_equal(
25+
ds["variant_allele"],
26+
[
27+
["A", "C", "", ""],
28+
["A", "G", "", ""],
29+
["G", "A", "", ""],
30+
["T", "A", "", ""],
31+
["A", "G", "T", ""],
32+
["T", "", "", ""],
33+
["G", "GA", "GAC", ""],
34+
["T", "", "", ""],
35+
["AC", "A", "ATG", "C"],
36+
],
37+
)
38+
assert_array_equal(
39+
ds["variant_id"],
40+
[".", ".", "rs6054257", ".", "rs6040355", ".", "microsat1", ".", "rsTest"],
41+
)
42+
assert_array_equal(
43+
ds["variant_id_mask"],
44+
[True, True, False, True, False, True, False, True, False],
45+
)
46+
47+
assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"])
48+
49+
call_genotype = np.array(
50+
[
51+
[[0, 0], [0, 0], [0, 1]],
52+
[[0, 0], [0, 0], [0, 1]],
53+
[[0, 0], [1, 0], [1, 1]],
54+
[[0, 0], [0, 1], [0, 0]],
55+
[[1, 2], [2, 1], [2, 2]],
56+
[[0, 0], [0, 0], [0, 0]],
57+
[[0, 1], [0, 2], [-1, -1]],
58+
[[0, 0], [0, 0], [-1, -1]],
59+
[[0, -1], [0, 1], [0, 2]],
60+
],
61+
dtype="i1",
62+
)
63+
assert_array_equal(ds["call_genotype"], call_genotype)
64+
assert_array_equal(ds["call_genotype_mask"], call_genotype < 0)
65+
assert "call_genotype_phased" not in ds
66+
67+
68+
def test_ensure_2d():
69+
assert_array_equal(_ensure_2d(np.array([0, 2, 1])), np.array([[0], [2], [1]]))
70+
assert_array_equal(_ensure_2d(np.array([[0], [2], [1]])), np.array([[0], [2], [1]]))

sgkit/typing.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
from pathlib import Path
12
from typing import Any, Union
23

34
import dask.array as da
45
import numpy as np
56

6-
DType = Any
77
ArrayLike = Union[np.ndarray, da.Array]
8+
DType = Any
9+
PathType = Union[str, Path]

0 commit comments

Comments
 (0)