Skip to content

Commit 5c3ff56

Browse files
committed
VCF compression size test
1 parent f7d44a8 commit 5c3ff56

File tree

3 files changed

+77
-0
lines changed

3 files changed

+77
-0
lines changed
Binary file not shown.
Binary file not shown.
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,77 @@
1+
import pytest
2+
from numcodecs import FixedScaleOffset
3+
4+
from sgkit.io.vcf.vcf_reader import vcf_to_zarr, zarr_array_sizes
5+
6+
from .utils import path_for_test
7+
8+
9+
@pytest.mark.parametrize(
    "vcf_file, encoding, compression_factor",
    [
        (
            "1kg_target_chr20_38_imputed_chr20.vcf.bgz",
            {
                "variant_AF": {
                    "filters": [
                        FixedScaleOffset(offset=0, scale=10000, dtype="f4", astype="u2")
                    ],
                },
                "call_DS": {
                    "filters": [
                        FixedScaleOffset(offset=0, scale=100, dtype="f4", astype="u1")
                    ],
                },
                "variant_DR2": {
                    "filters": [
                        FixedScaleOffset(offset=0, scale=100, dtype="f4", astype="u1")
                    ],
                },
            },
            0.75,
        ),
    ],
)
@pytest.mark.filterwarnings(
    "ignore::sgkit.io.vcf.FloatFormatFieldWarning",
    "ignore::sgkit.io.vcfzarr_reader.DimensionNameForFixedFormatFieldWarning",
)
def test_compression_size(
    shared_datadir, tmp_path, vcf_file, encoding, compression_factor
):
    """Check that the Zarr store written from a VCF is small enough on disk.

    The VCF named by ``vcf_file`` is converted with ``vcf_to_zarr`` using the
    given per-variable ``encoding``. The test passes when the size of the
    resulting store is strictly less than ``compression_factor`` times the
    size of the input file, both measured with ``du``.
    """
    source_vcf = path_for_test(shared_datadir, vcf_file)
    zarr_store = tmp_path / "output.zarr"

    # Sizing keyword arguments are derived from the input file itself and
    # forwarded unchanged to the conversion call.
    size_kwargs = zarr_array_sizes(source_vcf)
    print(f"running vcf_to_zarr with kwargs {size_kwargs}")

    vcf_to_zarr(
        source_vcf,
        zarr_store,
        fields=["INFO/*", "FORMAT/*"],
        chunk_length=500_000,
        encoding=encoding,
        **size_kwargs,
    )

    original_size = du(source_vcf)
    zarr_size = du(zarr_store)

    # Printed so the sizes show up in the captured output of a failing run.
    print(f"original size: {original_size}")
    print(f"zarr size: {zarr_size}")

    assert zarr_size < original_size * compression_factor
64+
65+
66+
def get_file_size(file):
    """Return the ``st_size`` value reported by ``file.stat()``.

    ``file`` must provide a ``stat()`` method (for example a
    ``pathlib.Path``).
    """
    stat_result = file.stat()
    return stat_result.st_size
68+
69+
70+
def get_dir_size(dir):
    """Return the summed ``st_size`` of every regular file under ``dir``.

    Entries are discovered with ``dir.glob("**/*")``; anything for which
    ``is_file()`` is false (such as sub-directories) contributes nothing.
    The result is ``0`` when no files are found.
    """
    total = 0
    for entry in dir.glob("**/*"):
        if entry.is_file():
            total += entry.stat().st_size
    return total
72+
73+
74+
def du(file):
    """Return the on-disk size of ``file``.

    When ``file.is_file()`` is true the size comes from ``get_file_size``;
    otherwise the path is handed to ``get_dir_size``.
    """
    measure = get_file_size if file.is_file() else get_dir_size
    return measure(file)

0 commit comments

Comments
 (0)