Skip to content

Commit ea0c130

Browse files
committed
Benchmark speed of writing uncompressed VCF.
1 parent 99196a7 commit ea0c130

File tree

1 file changed

+32
-0
lines changed

1 file changed

+32
-0
lines changed

sgkit/tests/io/vcf/test_vcf_benchmark.py

+32
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import time
55

66
from sgkit.io.vcf.vcf_reader import vcf_to_zarr
7+
from sgkit.io.vcf.vcf_writer import zarr_to_vcf
78
from sgkit.tests.io.vcf.utils import path_for_test
89

910

@@ -39,6 +40,37 @@ def test_vcf_read_speed(shared_datadir, tmp_path):
3940
print(f"speed: {speed:.1f} MB/s")
4041

4142

43+
def test_vcf_write_speed(shared_datadir, tmp_path):
    """Benchmark how quickly ``zarr_to_vcf`` writes a VCF file.

    The bundled 1000 Genomes chr20 test VCF is first converted to a Zarr
    store under ``tmp_path``. That store is then written back out as
    ``1000G.vcf`` twice; only the timing of the second write is reported.
    The size of the written file, the duration and the resulting
    throughput (bytes / 1,000,000 per second) are printed.
    """
    source_vcf = path_for_test(
        shared_datadir,
        "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz",
    )
    zarr_store = (tmp_path / "1000G.zarr").as_posix()
    vcf_out = (tmp_path / "1000G.vcf").as_posix()

    # Build the Zarr input that the write benchmark will read from.
    vcf_to_zarr(
        source_vcf,
        zarr_store,
        fields=["INFO/*", "FORMAT/*"],
        field_defs={"FORMAT/AD": {"Number": "R"}},
        chunk_length=1_000,
    )

    # throw away first run due to numba jit compilation
    time_func(zarr_to_vcf, zarr_store, vcf_out)
    duration = time_func(zarr_to_vcf, zarr_store, vcf_out)

    bytes_written = os.path.getsize(vcf_out)
    speed = bytes_written / (1_000_000 * duration)

    print(f"bytes written: {bytes_written}")
    print(f"duration: {duration:.2f} s")
    print(f"speed: {speed:.1f} MB/s")
72+
73+
4274
def time_func(func, *args, **kwargs):
4375
start = time.time()
4476
func(*args, **kwargs)

0 commit comments

Comments
 (0)