Skip to content

Commit bd84bfb

Browse files
committed
Add more default filters for VCF
Issue warning for VCF FORMAT float fields
1 parent 0efd28a commit bd84bfb

File tree

5 files changed

+98
-41
lines changed

5 files changed

+98
-41
lines changed

sgkit/io/vcf/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,15 @@
33
try:
44
from .vcf_partition import partition_into_regions
55
from .vcf_reader import (
6+
FloatFormatFieldWarning,
67
MaxAltAllelesExceededWarning,
78
concat_zarrs,
89
vcf_to_zarr,
910
vcf_to_zarrs,
1011
)
1112

1213
__all__ = [
14+
"FloatFormatFieldWarning",
1315
"MaxAltAllelesExceededWarning",
1416
"concat_zarrs",
1517
"partition_into_regions",

sgkit/io/vcf/utils.py

+32-1
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
import tempfile
44
import uuid
55
from contextlib import contextmanager
6-
from typing import IO, Any, Dict, Iterator, Optional, Sequence, TypeVar
6+
from typing import IO, Any, Dict, Hashable, Iterator, Optional, Sequence, TypeVar
77
from urllib.parse import urlparse
88

99
import fsspec
10+
from numcodecs import Delta, PackBits
1011
from yarl import URL
1112

1213
from sgkit.typing import PathType
@@ -170,6 +171,36 @@ def temporary_directory(
170171
fs.rm(tempdir, recursive=True)
171172

172173

174+
def get_default_vcf_encoding(
    ds: Any, chunk_length: int, chunk_width: int, compressor: Any
) -> Dict[str, Dict[str, Any]]:
    """Build the default Zarr encoding for every data variable in a VCF dataset.

    Parameters
    ----------
    ds
        Dataset whose ``data_vars`` are inspected; presumably an
        ``xarray.Dataset`` produced by the VCF reader — confirm at call site.
    chunk_length
        Chunk size to enforce along the ``variants`` dimension.
    chunk_width
        Chunk size to enforce along the ``samples`` dimension.
    compressor
        Numcodecs compressor applied to every variable (may be ``None``).

    Returns
    -------
    Mapping from variable name to its Zarr encoding dict
    (``chunks``, ``compressor``, and optionally ``filters``).
    """

    # Enforce uniform chunks in the variants dimension; also chunk in the
    # samples direction. Any other dimension keeps its full size as one chunk.
    def get_chunk_size(dim: Hashable, size: int) -> int:
        if dim == "variants":
            return chunk_length
        elif dim == "samples":
            return chunk_width
        else:
            return size

    default_encoding: Dict[str, Dict[str, Any]] = {}
    for var in ds.data_vars:
        var_chunks = tuple(
            get_chunk_size(dim, size)
            for (dim, size) in zip(ds[var].dims, ds[var].shape)
        )
        default_encoding[var] = dict(chunks=var_chunks, compressor=compressor)

        # Enable bit packing by default for boolean arrays
        if ds[var].dtype.kind == "b":
            default_encoding[var]["filters"] = [PackBits()]

        # Position is monotonically increasing (within a contig) so benefits
        # from delta encoding
        if var == "variant_position":
            default_encoding[var]["filters"] = [Delta(dtype="i4", astype="i4")]

    return default_encoding
202+
203+
173204
def merge_encodings(
174205
default_encoding: Dict[str, Dict[str, Any]], overrides: Dict[str, Dict[str, Any]]
175206
) -> Dict[str, Dict[str, Any]]:

sgkit/io/vcf/vcf_reader.py

+28-33
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,14 @@
55
from contextlib import contextmanager
66
from dataclasses import dataclass
77
from pathlib import Path
8-
from typing import (
9-
Any,
10-
Dict,
11-
Hashable,
12-
Iterator,
13-
MutableMapping,
14-
Optional,
15-
Sequence,
16-
Tuple,
17-
Union,
18-
)
8+
from typing import Any, Dict, Iterator, MutableMapping, Optional, Sequence, Tuple, Union
199

2010
import dask
2111
import fsspec
2212
import numpy as np
2313
import xarray as xr
2414
import zarr
2515
from cyvcf2 import VCF, Variant
26-
from numcodecs import PackBits
2716

2817
from sgkit import variables
2918
from sgkit.io.utils import (
@@ -40,6 +29,7 @@
4029
from sgkit.io.vcf.utils import (
4130
build_url,
4231
chunks,
32+
get_default_vcf_encoding,
4333
merge_encodings,
4434
temporary_directory,
4535
url_filename,
@@ -71,6 +61,12 @@
7161
DEFAULT_COMPRESSOR = None
7262

7363

64+
class FloatFormatFieldWarning(UserWarning):
    """Warning for VCF FORMAT float fields, which can use a lot of storage."""
68+
69+
7470
class MaxAltAllelesExceededWarning(UserWarning):
7571
"""Warning when the number of alt alleles exceeds the maximum specified."""
7672

@@ -535,35 +531,34 @@ def vcf_to_zarr_sequential(
535531
ds.attrs["max_alt_alleles_seen"] = max_alt_alleles_seen
536532

537533
if first_variants_chunk:
538-
# Enforce uniform chunks in the variants dimension
539-
# Also chunk in the samples direction
540-
541-
def get_chunk_size(dim: Hashable, size: int) -> int:
542-
if dim == "variants":
543-
return chunk_length
544-
elif dim == "samples":
545-
return chunk_width
546-
else:
547-
return size
548-
549-
default_encoding = {}
534+
# ensure that booleans are not stored as int8 by xarray https://github.com/pydata/xarray/issues/4386
550535
for var in ds.data_vars:
551-
var_chunks = tuple(
552-
get_chunk_size(dim, size)
553-
for (dim, size) in zip(ds[var].dims, ds[var].shape)
554-
)
555-
default_encoding[var] = dict(
556-
chunks=var_chunks, compressor=compressor
557-
)
558536
if ds[var].dtype.kind == "b":
559-
# ensure that booleans are not stored as int8 by xarray https://github.com/pydata/xarray/issues/4386
560537
ds[var].attrs["dtype"] = "bool"
561-
default_encoding[var]["filters"] = [PackBits()]
562538

563539
# values from function args (encoding) take precedence over default_encoding
540+
default_encoding = get_default_vcf_encoding(
541+
ds, chunk_length, chunk_width, compressor
542+
)
564543
encoding = encoding or {}
565544
merged_encoding = merge_encodings(default_encoding, encoding)
566545

546+
for var in ds.data_vars:
547+
# Issue warning for VCF FORMAT float fields with no filter
548+
if (
549+
var.startswith("call_")
550+
and ds[var].dtype == np.float32
551+
and (
552+
var not in merged_encoding
553+
or "filters" not in merged_encoding[var]
554+
)
555+
):
556+
warnings.warn(
557+
f"Storing call variable {var} (FORMAT field) as a float can result in large file sizes. "
558+
f"Consider setting the encoding filters for this variable to FixedScaleOffset or similar.",
559+
FloatFormatFieldWarning,
560+
)
561+
567562
ds.to_zarr(output, mode="w", encoding=merged_encoding)
568563
first_variants_chunk = False
569564
else:

sgkit/tests/io/vcf/test_vcf_lossless_conversion.py

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
],
1818
)
1919
@pytest.mark.filterwarnings(
20+
"ignore::sgkit.io.vcf.FloatFormatFieldWarning",
2021
"ignore::sgkit.io.vcfzarr_reader.DimensionNameForFixedFormatFieldWarning",
2122
)
2223
def test_lossless_conversion(shared_datadir, tmp_path, vcf_file):

sgkit/tests/io/vcf/test_vcf_reader.py

+35-7
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import pytest
55
import xarray as xr
66
import zarr
7-
from numcodecs import Blosc, PackBits, VLenUTF8
7+
from numcodecs import Blosc, Delta, FixedScaleOffset, PackBits, VLenUTF8
88
from numpy.testing import assert_allclose, assert_array_equal
99

1010
from sgkit import load_dataset, save_dataset
@@ -246,6 +246,10 @@ def test_vcf_to_zarr__compressor_and_filters(shared_datadir, is_path, tmp_path):
246246
assert z["variant_id_mask"].filters is None
247247
assert z["variant_id_mask"].chunks == (5,)
248248

249+
assert z["variant_position"].filters == [
250+
Delta(dtype="i4", astype="i4")
251+
] # sgkit default
252+
249253

250254
@pytest.mark.parametrize(
251255
"is_path",
@@ -259,7 +263,7 @@ def test_vcf_to_zarr__parallel_compressor_and_filters(
259263
output = tmp_path.joinpath("vcf_concat.zarr").as_posix()
260264
regions = ["20", "21"]
261265

262-
default_compressor = Blosc("zlib", 1, Blosc.NOSHUFFLE)
266+
compressor = Blosc("zlib", 1, Blosc.NOSHUFFLE)
263267
variant_id_compressor = Blosc("zlib", 2, Blosc.NOSHUFFLE)
264268
encoding = dict(
265269
variant_id=dict(compressor=variant_id_compressor),
@@ -270,18 +274,29 @@ def test_vcf_to_zarr__parallel_compressor_and_filters(
270274
output,
271275
regions=regions,
272276
chunk_length=5_000,
273-
compressor=default_compressor,
277+
compressor=compressor,
274278
encoding=encoding,
275279
)
276280

277281
# look at actual Zarr store to check compressor and filters
278282
z = zarr.open(output)
279-
assert z["call_genotype"].compressor == default_compressor
280-
assert z["call_genotype"].filters is None
281-
assert z["call_genotype_mask"].filters == [PackBits()]
283+
assert z["call_genotype"].compressor == compressor
284+
assert z["call_genotype"].filters is None # sgkit default
285+
assert z["call_genotype"].chunks == (5000, 1, 2)
286+
assert z["call_genotype_mask"].compressor == compressor
287+
assert z["call_genotype_mask"].filters == [PackBits()] # sgkit default
288+
assert z["call_genotype_mask"].chunks == (5000, 1, 2)
282289

283290
assert z["variant_id"].compressor == variant_id_compressor
291+
assert z["variant_id"].filters == [VLenUTF8()] # sgkit default
292+
assert z["variant_id"].chunks == (5000,)
293+
assert z["variant_id_mask"].compressor == compressor
284294
assert z["variant_id_mask"].filters is None
295+
assert z["variant_id_mask"].chunks == (5000,)
296+
297+
assert z["variant_position"].filters == [
298+
Delta(dtype="i4", astype="i4")
299+
] # sgkit default
285300

286301

287302
@pytest.mark.parametrize(
@@ -992,7 +1007,20 @@ def test_vcf_to_zarr__field_number_G_non_diploid(shared_datadir, tmp_path):
9921007
path = path_for_test(shared_datadir, "simple.output.mixed_depth.likelihoods.vcf")
9931008
output = tmp_path.joinpath("vcf.zarr").as_posix()
9941009

995-
vcf_to_zarr(path, output, ploidy=4, max_alt_alleles=3, fields=["FORMAT/GL"])
1010+
# store GL field as 2dp
1011+
encoding = {
1012+
"call_GL": {
1013+
"filters": [FixedScaleOffset(offset=0, scale=100, dtype="f4", astype="u1")]
1014+
}
1015+
}
1016+
vcf_to_zarr(
1017+
path,
1018+
output,
1019+
ploidy=4,
1020+
max_alt_alleles=3,
1021+
fields=["FORMAT/GL"],
1022+
encoding=encoding,
1023+
)
9961024
ds = xr.open_zarr(output)
9971025

9981026
# comb(n_alleles + ploidy - 1, ploidy) = comb(4 + 4 - 1, 4) = comb(7, 4) = 35

0 commit comments

Comments
 (0)