Skip to content

Commit bae0a22

Browse files
committed
Accept object dtype for variant_allele (don't change to string dtype)
1 parent 8f494ee commit bae0a22

File tree

3 files changed

+14
-28
lines changed

3 files changed

+14
-28
lines changed

sgkit/api.py

+4 -4
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@ def create_genotype_call_dataset(
3333
The (index of the) contig for each variant.
3434
variant_position : array_like, int
3535
The reference position of the variant.
36-
variant_alleles : array_like, zero-terminated bytes, e.g. "S1"
36+
variant_alleles : array_like, zero-terminated bytes, e.g. "S1", or object
3737
The possible alleles for the variant.
3838
sample_id : array_like, str
3939
The unique identifier of the sample.
@@ -55,7 +55,7 @@ def create_genotype_call_dataset(
5555
"""
5656
check_array_like(variant_contig, kind="i", ndim=1)
5757
check_array_like(variant_position, kind="i", ndim=1)
58-
check_array_like(variant_alleles, kind="S", ndim=2)
58+
check_array_like(variant_alleles, kind={"S", "O"}, ndim=2)
5959
check_array_like(sample_id, kind="U", ndim=1)
6060
check_array_like(call_genotype, kind="i", ndim=3)
6161
data_vars: Dict[Hashable, Any] = {
@@ -102,7 +102,7 @@ def create_genotype_dosage_dataset(
102102
The (index of the) contig for each variant.
103103
variant_position : array_like, int
104104
The reference position of the variant.
105-
variant_alleles : array_like, S1
105+
variant_alleles : array_like, zero-terminated bytes, e.g. "S1", or object
106106
The possible alleles for the variant.
107107
sample_id : array_like, str
108108
The unique identifier of the sample.
@@ -120,7 +120,7 @@ def create_genotype_dosage_dataset(
120120
"""
121121
check_array_like(variant_contig, kind="i", ndim=1)
122122
check_array_like(variant_position, kind="i", ndim=1)
123-
check_array_like(variant_alleles, kind="S", ndim=2)
123+
check_array_like(variant_alleles, kind={"S", "O"}, ndim=2)
124124
check_array_like(sample_id, kind="U", ndim=1)
125125
check_array_like(call_dosage, kind="f", ndim=2)
126126
data_vars: Dict[Hashable, Any] = {

sgkit/io/vcfzarr_reader.py

+1 -15
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,4 @@
1-
from typing import Any
2-
31
import dask.array as da
4-
import numpy as np
52
import xarray as xr
63
import zarr
74

@@ -45,22 +42,11 @@ def read_vcfzarr(path: PathType) -> xr.Dataset:
4542
variant_contig_names = list(variant_contig_names)
4643

4744
# For variant alleles, combine REF and ALT into a single array
48-
# and calculate the number of alleles so we can set the dtype correctly
4945
variants_ref = da.from_zarr(vcfzarr["variants/REF"])
5046
variants_alt = da.from_zarr(vcfzarr["variants/ALT"])
51-
52-
def max_str_len(arr: ArrayLike) -> Any:
53-
return arr.map_blocks(
54-
lambda s: np.char.str_len(s.astype(str)), dtype=np.int8
55-
).max()
56-
57-
max_allele_length = max(
58-
da.compute(max_str_len(variants_ref), max_str_len(variants_alt))
59-
)
60-
variants_ref_alt = da.concatenate(
47+
variant_alleles = da.concatenate(
6148
[_ensure_2d(variants_ref), _ensure_2d(variants_alt)], axis=1
6249
)
63-
variant_alleles = variants_ref_alt.astype(f"S{max_allele_length}")
6450

6551
variants_id = da.from_zarr(vcfzarr["variants/ID"]).astype(str)
6652

sgkit/tests/test_vcfzarr_reader.py

+9 -9
Original file line number | Diff line number | Diff line change
@@ -24,15 +24,15 @@ def test_read_vcfzarr(shared_datadir):
2424
assert_array_equal(
2525
ds["variant_allele"],
2626
[
27-
[b"A", b"C", b"", b""],
28-
[b"A", b"G", b"", b""],
29-
[b"G", b"A", b"", b""],
30-
[b"T", b"A", b"", b""],
31-
[b"A", b"G", b"T", b""],
32-
[b"T", b"", b"", b""],
33-
[b"G", b"GA", b"GAC", b""],
34-
[b"T", b"", b"", b""],
35-
[b"AC", b"A", b"ATG", b"C"],
27+
["A", "C", "", ""],
28+
["A", "G", "", ""],
29+
["G", "A", "", ""],
30+
["T", "A", "", ""],
31+
["A", "G", "T", ""],
32+
["T", "", "", ""],
33+
["G", "GA", "GAC", ""],
34+
["T", "", "", ""],
35+
["AC", "A", "ATG", "C"],
3636
],
3737
)
3838
assert_array_equal(

0 commit comments

Comments
 (0)