Skip to content

Commit 82aa817

Browse files
committed
Ensure chunking is not excessive in samples dimension.
For VCFs with a small number of samples, a Zarr chunk size of 1000 was being used by default in the samples dimension, which was wasteful and slow. This change limits the samples-dimension chunk size to the actual number of samples.
1 parent 843eb2a commit 82aa817

File tree

2 files changed

+9
-0
lines changed

2 files changed

+9
-0
lines changed

sgkit/io/vcf/vcf_reader.py

+4
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,10 @@ def vcf_to_zarr_sequential(
531531
ds.attrs["max_alt_alleles_seen"] = max_alt_alleles_seen
532532

533533
if first_variants_chunk:
534+
# limit chunk width to actual number of samples seen in first chunk
535+
if ds.dims["samples"] > 0:
536+
chunk_width = min(chunk_width, ds.dims["samples"])
537+
534538
# ensure that booleans are not stored as int8 by xarray https://github.com/pydata/xarray/issues/4386
535539
for var in ds.data_vars:
536540
if ds[var].dtype.kind == "b":

sgkit/tests/io/vcf/test_vcf_reader.py

+5
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,11 @@ def test_vcf_to_zarr__large_vcf(shared_datadir, is_path, tmp_path):
168168
assert ds["variant_allele"].dtype == "O"
169169
assert ds["variant_id"].dtype == "O"
170170

171+
# check underlying zarr chunk size is 1 in samples dim
172+
za = zarr.open(output)
173+
assert za["sample_id"].chunks == (1,)
174+
assert za["call_genotype"].chunks == (5000, 1, 2)
175+
171176

172177
def test_vcf_to_zarr__plain_vcf_with_no_index(shared_datadir, tmp_path):
173178
path = path_for_test(

0 commit comments

Comments
 (0)