Skip to content

Commit 99196a7

Browse files
committed
Benchmark speed of reading uncompressed VCF.
1 parent 4ce7e55 commit 99196a7

File tree

1 file changed

+52
-0
lines changed

1 file changed

+52
-0
lines changed
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import gzip
2+
import os
3+
import shutil
4+
import time
5+
6+
from sgkit.io.vcf.vcf_reader import vcf_to_zarr
7+
from sgkit.tests.io.vcf.utils import path_for_test
8+
9+
10+
def test_vcf_read_speed(shared_datadir, tmp_path):
    """Benchmark ``vcf_to_zarr`` on an uncompressed VCF and print the result.

    The gzipped 1000 Genomes fixture is decompressed into ``tmp_path`` first,
    so the timed conversion reads a plain-text VCF. Nothing is asserted; the
    test only prints the size of the input, the elapsed time, and the
    resulting throughput (MB here means 1_000_000 bytes).
    """
    compressed_vcf = path_for_test(
        shared_datadir,
        "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz",
    )
    vcf_file = (tmp_path / "1000G.vcf").as_posix()
    zarr_store = (tmp_path / "1000G.zarr").as_posix()

    # Decompress up front so gzip overhead is excluded from the timing.
    gunzip(compressed_vcf, vcf_file)

    elapsed = time_func(
        vcf_to_zarr,
        vcf_file,
        zarr_store,
        fields=["INFO/*", "FORMAT/*"],
        field_defs={"FORMAT/AD": {"Number": "R"}},
        chunk_length=1_000,
        target_part_size=None,
    )

    # Throughput is based on the size of the uncompressed file on disk.
    n_bytes = os.path.getsize(vcf_file)
    mb_per_s = n_bytes / (1_000_000 * elapsed)

    print(f"bytes read: {n_bytes}")
    print(f"duration: {elapsed:.2f} s")
    print(f"speed: {mb_per_s:.1f} MB/s")
40+
41+
42+
def time_func(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` once and return the elapsed seconds.

    Args:
        func: The callable to time.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The elapsed time in fractional seconds as a float. The return value
        of ``func`` is discarded; any exception it raises propagates.
    """
    # perf_counter() is monotonic and high-resolution; time.time() is a
    # wall-clock that can jump (e.g. on clock adjustment) mid-measurement.
    start = time.perf_counter()
    func(*args, **kwargs)
    return time.perf_counter() - start
47+
48+
49+
def gunzip(input, output):
    """Decompress the gzip file at ``input`` into a new file at ``output``.

    Both files are handled in binary mode and the data is streamed with
    ``shutil.copyfileobj``. An existing ``output`` file is overwritten.
    """
    with gzip.open(input, "rb") as compressed, open(output, "wb") as plain:
        shutil.copyfileobj(compressed, plain)

0 commit comments

Comments
 (0)