Skip to content

Add read_vcfzarr #40

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Aug 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
numpy
xarray
dask[array]
scipy
scipy
zarr
4 changes: 2 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,12 @@ force_grid_wrap = 0
use_parentheses = True
line_length = 88

[mypy-dask.*]
ignore_missing_imports = True
[mypy-numpy.*]
ignore_missing_imports = True
[mypy-pandas.*]
ignore_missing_imports = True
[mypy-dask.*]
ignore_missing_imports = True
[mypy-pytest.*]
ignore_missing_imports = True
[mypy-statsmodels.*]
Expand Down
2 changes: 2 additions & 0 deletions sgkit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
create_genotype_dosage_dataset,
)
from .display import display_genotypes
from .io.vcfzarr_reader import read_vcfzarr
from .stats.aggregation import count_alleles
from .stats.association import gwas_linear_regression
from .stats.regenie import regenie
Expand All @@ -21,5 +22,6 @@
"create_genotype_dosage_dataset",
"display_genotypes",
"gwas_linear_regression",
"read_vcfzarr",
"regenie",
]
8 changes: 4 additions & 4 deletions sgkit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def create_genotype_call_dataset(
The (index of the) contig for each variant.
variant_position : array_like, int
The reference position of the variant.
variant_alleles : array_like, S1
variant_alleles : array_like, zero-terminated bytes, e.g. "S1", or object
The possible alleles for the variant.
sample_id : array_like, str
The unique identifier of the sample.
Expand All @@ -55,7 +55,7 @@ def create_genotype_call_dataset(
"""
check_array_like(variant_contig, kind="i", ndim=1)
check_array_like(variant_position, kind="i", ndim=1)
check_array_like(variant_alleles, kind="S", ndim=2)
check_array_like(variant_alleles, kind={"S", "O"}, ndim=2)
check_array_like(sample_id, kind="U", ndim=1)
check_array_like(call_genotype, kind="i", ndim=3)
data_vars: Dict[Hashable, Any] = {
Expand Down Expand Up @@ -102,7 +102,7 @@ def create_genotype_dosage_dataset(
The (index of the) contig for each variant.
variant_position : array_like, int
The reference position of the variant.
variant_alleles : array_like, S1
variant_alleles : array_like, zero-terminated bytes, e.g. "S1", or object
The possible alleles for the variant.
sample_id : array_like, str
The unique identifier of the sample.
Expand All @@ -120,7 +120,7 @@ def create_genotype_dosage_dataset(
"""
check_array_like(variant_contig, kind="i", ndim=1)
check_array_like(variant_position, kind="i", ndim=1)
check_array_like(variant_alleles, kind="S", ndim=2)
check_array_like(variant_alleles, kind={"S", "O"}, ndim=2)
check_array_like(sample_id, kind="U", ndim=1)
check_array_like(call_dosage, kind="f", ndim=2)
data_vars: Dict[Hashable, Any] = {
Expand Down
Empty file added sgkit/io/__init__.py
Empty file.
69 changes: 69 additions & 0 deletions sgkit/io/vcfzarr_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import dask.array as da
import xarray as xr
import zarr

from ..api import DIM_VARIANT, create_genotype_call_dataset
from ..typing import ArrayLike, PathType
from ..utils import encode_array


def _ensure_2d(arr: ArrayLike) -> ArrayLike:
if arr.ndim == 1:
arr = arr.reshape(-1, 1)
return arr


def read_vcfzarr(path: PathType) -> xr.Dataset:
"""Read a VCF Zarr file.

Loads VCF variant, sample, and genotype data as Dask arrays within a Dataset
from a Zarr file created using scikit-allel's `vcf_to_zarr` function.

Since `vcf_to_zarr` does not preserve phasing information, there is no
`call/genotype_phased` variable in the resulting dataset.

Parameters
----------
path : PathType
Path to the Zarr file.

Returns
-------
xr.Dataset
The dataset of genotype calls, created using `create_genotype_call_dataset`.
"""

vcfzarr = zarr.open_group(str(path), mode="r")
Comment on lines +16 to +36
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def read_vcfzarr(path: PathType) -> xr.Dataset:
"""Read a VCF Zarr file.
Loads VCF variant, sample, and genotype data as Dask arrays within a Dataset
from a Zarr file created using scikit-allel's `vcf_to_zarr` function.
Since `vcf_to_zarr` does not preserve phasing information, there is no
`call/genotype_phased` variable in the resulting dataset.
Parameters
----------
path : PathType
Path to the Zarr file.
Returns
-------
xr.Dataset
The dataset of genotype calls, created using `create_genotype_call_dataset`.
"""
vcfzarr = zarr.open_group(str(path), mode="r")
def read_vcfzarr(store: Union[str, Mapping], consolidated=False: bool) -> xr.Dataset:
"""Read data in Zarr format that was created using scikit-allel's `vcf_to_zarr` function.
Loads variant, sample, and genotype data as Dask arrays within a Dataset.
Since `vcf_to_zarr` does not preserve phasing information, there is no
`call/genotype_phased` variable in the resulting dataset.
Parameters
----------
store : Union[str, Mapping]
Either a string providing a URL or local file system path, or a store object.
consolidated : bool
Set True if the store uses consolidated metadata.
Returns
-------
xr.Dataset
The dataset of genotype calls, created using `create_genotype_call_dataset`.
"""
if isinstance(path, str):
# assume path is an fsspec-style url path
import fsspec
store = fsspec.get_mapper(path)
else:
# assume path is a store
store = path
if consolidated:
vcfzarr = zarr.open_consolidated(store=store, mode="r")
else:
vcfzarr = zarr.open(store=store, mode="r")

Two suggestions here.

First suggestion is to allow the first argument to be either a string or a mapping (dict-like) object. If a string then assume it's an fsspec-style URL and load via fsspec - this gives the ability to read from cloud object stores. If it's a mapping then treat it as a store - this gives full flexibility to pass in any type of zarr store.

Second suggestion is to expose a consolidated argument, which allows the user to specify if consolidated metadata has been used (this is an optimisation for cloud stores).


# Index the contig names
variants_chrom = da.from_zarr(vcfzarr["variants/CHROM"]).astype(str)
variant_contig, variant_contig_names = encode_array(variants_chrom.compute())
variant_contig = variant_contig.astype("int16")
variant_contig_names = list(variant_contig_names)

# For variant alleles, combine REF and ALT into a single array
variants_ref = da.from_zarr(vcfzarr["variants/REF"])
variants_alt = da.from_zarr(vcfzarr["variants/ALT"])
variant_alleles = da.concatenate(
[_ensure_2d(variants_ref), _ensure_2d(variants_alt)], axis=1
)

variants_id = da.from_zarr(vcfzarr["variants/ID"]).astype(str)

ds = create_genotype_call_dataset(
variant_contig_names=variant_contig_names,
variant_contig=variant_contig,
variant_position=da.from_zarr(vcfzarr["variants/POS"]),
variant_alleles=variant_alleles,
sample_id=da.from_zarr(vcfzarr["samples"]).astype(str),
call_genotype=da.from_zarr(vcfzarr["calldata/GT"]),
variant_id=variants_id,
)

# Add a mask for variant ID
ds["variant_id_mask"] = (
[DIM_VARIANT],
variants_id == ".",
)

return ds
31 changes: 31 additions & 0 deletions sgkit/tests/data/sample.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
##fileformat=VCFv4.0
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=1000GenomesPilot-NCBI36
##phasing=partial
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
##FILTER=<ID=q10,Description="Quality below 10">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element">
##ALT=<ID=CNV,Description="Copy number variable region">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
19 111 . A C 9.6 . . GT:HQ 0|0:10,15 0|0:10,10 0/1:3,3
19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3
20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,.
20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,.
20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,.
20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,.
20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 ./.:40:3
20 1235237 . T . . . . GT 0/0 0|0 ./.
X 10 rsTest AC A,ATG,C 10 PASS . GT 0 0/1 0|2
Binary file added sgkit/tests/data/sample.vcf.zarr.zip
Binary file not shown.
70 changes: 70 additions & 0 deletions sgkit/tests/test_vcfzarr_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import numpy as np
from numpy.testing import assert_array_equal

from sgkit import read_vcfzarr
from sgkit.io.vcfzarr_reader import _ensure_2d


def test_read_vcfzarr(shared_datadir):
# The file sample.vcf.zarr.zip was created by running the following
# in a python session with the scikit-allel package installed.
#
# import allel
# allel.vcf_to_zarr("sample.vcf", "sample.vcf.zarr.zip")

path = shared_datadir / "sample.vcf.zarr.zip"
ds = read_vcfzarr(path)

assert ds.attrs["contigs"] == ["19", "20", "X"]
assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2])
assert_array_equal(
ds["variant_position"],
[111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10],
)
assert_array_equal(
ds["variant_allele"],
[
["A", "C", "", ""],
["A", "G", "", ""],
["G", "A", "", ""],
["T", "A", "", ""],
["A", "G", "T", ""],
["T", "", "", ""],
["G", "GA", "GAC", ""],
["T", "", "", ""],
["AC", "A", "ATG", "C"],
],
)
assert_array_equal(
ds["variant_id"],
[".", ".", "rs6054257", ".", "rs6040355", ".", "microsat1", ".", "rsTest"],
)
assert_array_equal(
ds["variant_id_mask"],
[True, True, False, True, False, True, False, True, False],
)

assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"])

call_genotype = np.array(
[
[[0, 0], [0, 0], [0, 1]],
[[0, 0], [0, 0], [0, 1]],
[[0, 0], [1, 0], [1, 1]],
[[0, 0], [0, 1], [0, 0]],
[[1, 2], [2, 1], [2, 2]],
[[0, 0], [0, 0], [0, 0]],
[[0, 1], [0, 2], [-1, -1]],
[[0, 0], [0, 0], [-1, -1]],
[[0, -1], [0, 1], [0, 2]],
],
dtype="i1",
)
assert_array_equal(ds["call_genotype"], call_genotype)
assert_array_equal(ds["call_genotype_mask"], call_genotype < 0)
assert "call_genotype_phased" not in ds


def test_ensure_2d():
assert_array_equal(_ensure_2d(np.array([0, 2, 1])), np.array([[0], [2], [1]]))
assert_array_equal(_ensure_2d(np.array([[0], [2], [1]])), np.array([[0], [2], [1]]))
4 changes: 3 additions & 1 deletion sgkit/typing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from pathlib import Path
from typing import Any, Union

import dask.array as da
import numpy as np

DType = Any
ArrayLike = Union[np.ndarray, da.Array]
DType = Any
PathType = Union[str, Path]