File tree 1 file changed +52
-0
lines changed
1 file changed +52
-0
lines changed Original file line number Diff line number Diff line change
1
+ import gzip
2
+ import os
3
+ import shutil
4
+ import time
5
+
6
+ from sgkit .io .vcf .vcf_reader import vcf_to_zarr
7
+ from sgkit .tests .io .vcf .utils import path_for_test
8
+
9
+
10
def test_vcf_read_speed(shared_datadir, tmp_path):
    """Time one ``vcf_to_zarr`` call on an uncompressed VCF and print the throughput.

    The gzipped test VCF is first decompressed into ``tmp_path`` so that the
    byte count used for the speed figure is the size of the plain-text file
    that ``vcf_to_zarr`` is given. The test makes no assertions; it only
    prints the number of bytes, the elapsed seconds, and the resulting speed
    in MB/s (1 MB = 1,000,000 bytes).
    """
    compressed_vcf = path_for_test(
        shared_datadir,
        "1000G.phase3.broad.withGenotypes.chr20.10100000.vcf.gz",
    )
    input_vcf = (tmp_path / "1000G.vcf").as_posix()
    output_zarr = (tmp_path / "1000G.zarr").as_posix()

    # Decompress outside the timed region so only vcf_to_zarr is measured.
    gunzip(compressed_vcf, input_vcf)

    conversion_options = {
        "fields": ["INFO/*", "FORMAT/*"],
        "field_defs": {"FORMAT/AD": {"Number": "R"}},
        "chunk_length": 1_000,
        "target_part_size": None,
    }
    duration = time_func(vcf_to_zarr, input_vcf, output_zarr, **conversion_options)

    bytes_read = os.path.getsize(input_vcf)
    speed = bytes_read / (duration * 1_000_000)

    print(f"bytes read: {bytes_read} ")
    print(f"duration: {duration:.2f} s")
    print(f"speed: {speed:.1f} MB/s")
40
+
41
+
42
def time_func(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` once and return the elapsed time.

    Args:
        func: The callable to time.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The elapsed time of the call in (fractional) seconds. The return
        value of ``func`` itself is discarded.

    Any exception raised by ``func`` propagates to the caller.
    """
    # perf_counter is monotonic and high-resolution, so the measurement is
    # not skewed by system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    func(*args, **kwargs)
    return time.perf_counter() - start
47
+
48
+
49
def gunzip(input, output):
    """Decompress the gzip file at ``input`` and write the raw bytes to ``output``.

    Args:
        input: Path of the gzip-compressed file to read.
        output: Path of the file to create (or overwrite) with the
            decompressed contents.
    """
    # Both files are handled in binary mode; copyfileobj streams the data
    # in chunks rather than loading the whole file into memory.
    with gzip.open(input, "rb") as compressed, open(output, "wb") as plain:
        shutil.copyfileobj(compressed, plain)
You can’t perform that action at this time.
0 commit comments