1
1
import os
2
2
import shutil
3
3
import signal
4
+ import tarfile
4
5
import traceback
5
6
import uuid
6
7
from pathlib import Path
7
- from typing import Dict , List , Set
8
+ from typing import Dict , List , Set , Union
8
9
9
10
from ..common import jsonpickle
10
11
from ..config import config
11
12
from ..utils import (artifact_classes , assemble_path_object ,
12
13
catalog_of_local_artifact , convert_dflow_list , copy_file ,
13
14
expand , flatten , randstr , remove_empty_dir_tag )
14
- from .opio import Artifact , BigParameter , NestedDict , Parameter
15
+ from .opio import (Artifact , BigParameter , HDF5Dataset , HDF5Datasets ,
16
+ NestedDict , Parameter )
15
17
16
18
17
19
def get_slices (path_object , slices ):
@@ -78,7 +80,35 @@ def handle_input_artifact(name, sign, slices=None, data_root="/tmp",
78
80
79
81
path_object = get_slices (path_object , slices )
80
82
81
- if sign .type in [str , Path ]:
83
+ sign_type = sign .type
84
+ if getattr (sign_type , "__origin__" , None ) == Union :
85
+ args = sign_type .__args__
86
+ if HDF5Datasets in args :
87
+ if isinstance (path_object , list ) and all ([isinstance (
88
+ p , str ) and p .endswith (".h5" ) for p in path_object ]):
89
+ sign_type = HDF5Datasets
90
+ elif args [0 ] == HDF5Datasets :
91
+ sign_type = args [1 ]
92
+ elif args [1 ] == HDF5Datasets :
93
+ sign_type = args [0 ]
94
+
95
+ if sign_type == HDF5Datasets :
96
+ import h5py
97
+ assert isinstance (path_object , list )
98
+ res = None
99
+ for path in path_object :
100
+ f = h5py .File (path , "r" )
101
+ datasets = {k : HDF5Dataset (f [k ]) for k in f .keys ()}
102
+ datasets = expand (datasets )
103
+ if isinstance (datasets , list ):
104
+ if res is None :
105
+ res = []
106
+ res += datasets
107
+ elif isinstance (datasets , dict ):
108
+ if res is None :
109
+ res = {}
110
+ res .update (datasets )
111
+ if sign_type in [str , Path ]:
82
112
if path_object is None or isinstance (path_object , str ):
83
113
res = path_object
84
114
elif isinstance (path_object , list ) and len (path_object ) == 1 and (
@@ -87,8 +117,8 @@ def handle_input_artifact(name, sign, slices=None, data_root="/tmp",
87
117
res = path_object [0 ]
88
118
else :
89
119
res = art_path
90
- res = path_or_none (res ) if sign . type == Path else res
91
- elif sign . type in [List [str ], List [Path ], Set [str ], Set [Path ]]:
120
+ res = path_or_none (res ) if sign_type == Path else res
121
+ elif sign_type in [List [str ], List [Path ], Set [str ], Set [Path ]]:
92
122
if path_object is None :
93
123
return None
94
124
elif isinstance (path_object , str ):
@@ -99,17 +129,17 @@ def handle_input_artifact(name, sign, slices=None, data_root="/tmp",
99
129
else :
100
130
res = list (flatten (path_object ).values ())
101
131
102
- if sign . type == List [str ]:
132
+ if sign_type == List [str ]:
103
133
pass
104
- elif sign . type == List [Path ]:
134
+ elif sign_type == List [Path ]:
105
135
res = path_or_none (res )
106
- elif sign . type == Set [str ]:
136
+ elif sign_type == Set [str ]:
107
137
res = set (res )
108
138
else :
109
139
res = set (path_or_none (res ))
110
- elif sign . type in [Dict [str , str ], NestedDict [str ]]:
140
+ elif sign_type in [Dict [str , str ], NestedDict [str ]]:
111
141
res = path_object
112
- elif sign . type in [Dict [str , Path ], NestedDict [Path ]]:
142
+ elif sign_type in [Dict [str , Path ], NestedDict [Path ]]:
113
143
res = path_or_none (path_object )
114
144
115
145
if res is None :
@@ -169,6 +199,41 @@ def slice_to_dir(slice):
169
199
def handle_output_artifact (name , value , sign , slices = None , data_root = "/tmp" ,
170
200
create_dir = False ):
171
201
path_list = []
202
+ if sign .type == HDF5Datasets :
203
+ import h5py
204
+ os .makedirs (data_root + '/outputs/artifacts/' + name , exist_ok = True )
205
+ h5_name = "%s.h5" % uuid .uuid4 ()
206
+ h5_path = '%s/outputs/artifacts/%s/%s' % (data_root , name , h5_name )
207
+ with h5py .File (h5_path , "w" ) as f :
208
+ for s , v in flatten (value ).items ():
209
+ if isinstance (v , Path ):
210
+ if v .is_file ():
211
+ try :
212
+ data = v .read_text (encoding = "utf-8" )
213
+ dtype = "utf-8"
214
+ except Exception :
215
+ import numpy as np
216
+ data = np .void (v .read_bytes ())
217
+ dtype = "binary"
218
+ d = f .create_dataset (s , data = data )
219
+ d .attrs ["type" ] = "file"
220
+ d .attrs ["path" ] = str (v )
221
+ d .attrs ["dtype" ] = dtype
222
+ elif v .is_dir ():
223
+ tgz_path = Path ("%s.tgz" % v )
224
+ tf = tarfile .open (tgz_path , "w:gz" , dereference = True )
225
+ tf .add (v )
226
+ tf .close ()
227
+ import numpy as np
228
+ d = f .create_dataset (s , data = np .void (
229
+ tgz_path .read_bytes ()))
230
+ d .attrs ["type" ] = "dir"
231
+ d .attrs ["path" ] = str (v )
232
+ d .attrs ["dtype" ] = "binary"
233
+ else :
234
+ d = f .create_dataset (s , data = v )
235
+ d .attrs ["type" ] = "data"
236
+ path_list .append ({"dflow_list_item" : h5_name , "order" : slices or 0 })
172
237
if sign .type in [str , Path ]:
173
238
os .makedirs (data_root + '/outputs/artifacts/' + name , exist_ok = True )
174
239
if slices is None :
0 commit comments