Skip to content

Commit 82aa817

Browse files
committed
Ensure chunking is not excessive in samples dimension.
For VCFs with a small number of samples, a Zarr chunk size of 1000 was being used by default in the samples dimension, which was wasteful and slow. This change limits the samples-dimension chunk size to the actual number of samples.
1 parent 843eb2a commit 82aa817

File tree

2 files changed

+9
-0
lines changed

2 files changed

+9
-0
lines changed

sgkit/io/vcf/vcf_reader.py

+4
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,10 @@ def vcf_to_zarr_sequential(
531531
ds.attrs["max_alt_alleles_seen"] = max_alt_alleles_seen
532532

533533
if first_variants_chunk:
534+
# limit chunk width to actual number of samples seen in first chunk
535+
if ds.dims["samples"] > 0:
536+
chunk_width = min(chunk_width, ds.dims["samples"])
537+
534538
# ensure that booleans are not stored as int8 by xarray https://github.com/pydata/xarray/issues/4386
535539
for var in ds.data_vars:
536540
if ds[var].dtype.kind == "b":

sgkit/tests/io/vcf/test_vcf_reader.py

+5
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,11 @@ def test_vcf_to_zarr__large_vcf(shared_datadir, is_path, tmp_path):
168168
assert ds["variant_allele"].dtype == "O"
169169
assert ds["variant_id"].dtype == "O"
170170

171+
# check underlying zarr chunk size is 1 in samples dim
172+
za = zarr.open(output)
173+
assert za["sample_id"].chunks == (1,)
174+
assert za["call_genotype"].chunks == (5000, 1, 2)
175+
171176

172177
def test_vcf_to_zarr__plain_vcf_with_no_index(shared_datadir, tmp_path):
173178
path = path_for_test(

0 commit comments

Comments
 (0)