4
4
import pytest
5
5
import xarray as xr
6
6
import zarr
7
- from numcodecs import Blosc , PackBits , VLenUTF8
7
+ from numcodecs import Blosc , Delta , FixedScaleOffset , PackBits , VLenUTF8
8
8
from numpy .testing import assert_allclose , assert_array_equal
9
9
10
10
from sgkit import load_dataset , save_dataset
@@ -246,6 +246,10 @@ def test_vcf_to_zarr__compressor_and_filters(shared_datadir, is_path, tmp_path):
246
246
assert z ["variant_id_mask" ].filters is None
247
247
assert z ["variant_id_mask" ].chunks == (5 ,)
248
248
249
+ assert z ["variant_position" ].filters == [
250
+ Delta (dtype = "i4" , astype = "i4" )
251
+ ] # sgkit default
252
+
249
253
250
254
@pytest .mark .parametrize (
251
255
"is_path" ,
@@ -259,7 +263,7 @@ def test_vcf_to_zarr__parallel_compressor_and_filters(
259
263
output = tmp_path .joinpath ("vcf_concat.zarr" ).as_posix ()
260
264
regions = ["20" , "21" ]
261
265
262
- default_compressor = Blosc ("zlib" , 1 , Blosc .NOSHUFFLE )
266
+ compressor = Blosc ("zlib" , 1 , Blosc .NOSHUFFLE )
263
267
variant_id_compressor = Blosc ("zlib" , 2 , Blosc .NOSHUFFLE )
264
268
encoding = dict (
265
269
variant_id = dict (compressor = variant_id_compressor ),
@@ -270,18 +274,29 @@ def test_vcf_to_zarr__parallel_compressor_and_filters(
270
274
output ,
271
275
regions = regions ,
272
276
chunk_length = 5_000 ,
273
- compressor = default_compressor ,
277
+ compressor = compressor ,
274
278
encoding = encoding ,
275
279
)
276
280
277
281
# look at actual Zarr store to check compressor and filters
278
282
z = zarr .open (output )
279
- assert z ["call_genotype" ].compressor == default_compressor
280
- assert z ["call_genotype" ].filters is None
281
- assert z ["call_genotype_mask" ].filters == [PackBits ()]
283
+ assert z ["call_genotype" ].compressor == compressor
284
+ assert z ["call_genotype" ].filters is None # sgkit default
285
+ assert z ["call_genotype" ].chunks == (5000 , 1 , 2 )
286
+ assert z ["call_genotype_mask" ].compressor == compressor
287
+ assert z ["call_genotype_mask" ].filters == [PackBits ()] # sgkit default
288
+ assert z ["call_genotype_mask" ].chunks == (5000 , 1 , 2 )
282
289
283
290
assert z ["variant_id" ].compressor == variant_id_compressor
291
+ assert z ["variant_id" ].filters == [VLenUTF8 ()] # sgkit default
292
+ assert z ["variant_id" ].chunks == (5000 ,)
293
+ assert z ["variant_id_mask" ].compressor == compressor
284
294
assert z ["variant_id_mask" ].filters is None
295
+ assert z ["variant_id_mask" ].chunks == (5000 ,)
296
+
297
+ assert z ["variant_position" ].filters == [
298
+ Delta (dtype = "i4" , astype = "i4" )
299
+ ] # sgkit default
285
300
286
301
287
302
@pytest .mark .parametrize (
@@ -992,7 +1007,20 @@ def test_vcf_to_zarr__field_number_G_non_diploid(shared_datadir, tmp_path):
992
1007
path = path_for_test (shared_datadir , "simple.output.mixed_depth.likelihoods.vcf" )
993
1008
output = tmp_path .joinpath ("vcf.zarr" ).as_posix ()
994
1009
995
- vcf_to_zarr (path , output , ploidy = 4 , max_alt_alleles = 3 , fields = ["FORMAT/GL" ])
1010
+ # store GL field rounded to 2 decimal places
1011
+ encoding = {
1012
+ "call_GL" : {
1013
+ "filters" : [FixedScaleOffset (offset = 0 , scale = 100 , dtype = "f4" , astype = "u1" )]
1014
+ }
1015
+ }
1016
+ vcf_to_zarr (
1017
+ path ,
1018
+ output ,
1019
+ ploidy = 4 ,
1020
+ max_alt_alleles = 3 ,
1021
+ fields = ["FORMAT/GL" ],
1022
+ encoding = encoding ,
1023
+ )
996
1024
ds = xr .open_zarr (output )
997
1025
998
1026
# comb(n_alleles + ploidy - 1, ploidy) = comb(4 + 4 - 1, 4) = comb(7, 4) = 35
0 commit comments