Skip to content

Commit ac532d4

Browse files
committed
working on load listing recognition for files and provenance
1 parent d3048af commit ac532d4

File tree

5 files changed

+67
-29
lines changed

5 files changed

+67
-29
lines changed

cwltool/job.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,7 @@ def _execute(
299299
and isinstance(job_order, (list, dict))
300300
):
301301
runtimeContext.prov_obj.used_artefacts(
302-
job_order, runtimeContext.process_run_id, str(self.name)
302+
job_order, runtimeContext.process_run_id, str(self.name), load_listing=self.builder.loadListing
303303
)
304304
else:
305305
_logger.warning(
@@ -426,6 +426,7 @@ def stderr_stdout_log_path(
426426
runtimeContext.process_run_id,
427427
outputs,
428428
datetime.datetime.now(),
429+
builder.loadListing # TODO FIX THIS
429430
)
430431
if processStatus != "success":
431432
_logger.warning("[job %s] completed %s", self.name, processStatus)

cwltool/provenance_profile.py

+48-20
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,15 @@ def copy_job_order(
8989
iid = shortname(i["id"])
9090
# If iid is listed in load_listing as "no_listing", its directory listing is skipped.
9191
if iid in job_order_object:
92-
if iid in load_listing and load_listing[iid] != "no_listing":
93-
customised_job[iid] = copy.deepcopy(job_order_object[iid])
92+
if iid in load_listing:
93+
if load_listing[iid] == "no_listing":
94+
_logger.warning("Skip listing of " + iid)
95+
job_order_object[iid]['loadListing'] = 'no_listing'
96+
job_order_object[iid]['listing'] = []
97+
customised_job[iid] = job_order_object[iid]
98+
else:
99+
# Normal deep copy
100+
customised_job[iid] = copy.deepcopy(job_order_object[iid])
94101
# TODO Other listing options here?
95102
else:
96103
# add the input element in dictionary for provenance
@@ -270,13 +277,14 @@ def evaluate(
270277
# record provenance of independent commandline tool executions
271278
self.prospective_prov(job)
272279
customised_job = copy_job_order(job, job_order_object, process)
273-
self.used_artefacts(customised_job, self.workflow_run_uri)
280+
self.used_artefacts(customised_job, self.workflow_run_uri, job.builder.loadListing)
274281
research_obj.create_job(customised_job)
275282
elif hasattr(job, "workflow"):
276283
# record provenance of workflow executions
277284
self.prospective_prov(job)
278285
customised_job = copy_job_order(job, job_order_object, process)
279-
self.used_artefacts(customised_job, self.workflow_run_uri)
286+
self.used_artefacts(customised_job, self.workflow_run_uri, schema=process.inputs_record_schema)
287+
280288

281289
def record_process_start(
282290
self, process: Process, job: JobsType, process_run_id: Optional[str] = None
@@ -355,11 +363,12 @@ def record_process_end(
355363
process_run_id: str,
356364
outputs: Union[CWLObjectType, MutableSequence[CWLObjectType], None],
357365
when: datetime.datetime,
366+
load_listing: str = "deep_listing",
358367
) -> None:
359-
self.generate_output_prov(outputs, process_run_id, process_name)
368+
self.generate_output_prov(outputs, process_run_id, process_name, load_listing)
360369
self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when)
361370

362-
def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, str]:
371+
def declare_file(self, value: CWLObjectType, load_listing: str = "deep_listing") -> Tuple[ProvEntity, ProvEntity, str]:
363372
if value["class"] != "File":
364373
raise ValueError("Must have class:File: %s" % value)
365374
# Need to determine file hash aka RO filename
@@ -436,9 +445,9 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st
436445
):
437446
# TODO: Record these in a specializationOf entity with UUID?
438447
if sec["class"] == "File":
439-
(sec_entity, _, _) = self.declare_file(sec)
448+
(sec_entity, _, _) = self.declare_file(sec, load_listing)
440449
elif sec["class"] == "Directory":
441-
sec_entity = self.declare_directory(sec)
450+
sec_entity = self.declare_directory(sec, load_listing)
442451
else:
443452
raise ValueError(f"Got unexpected secondaryFiles value: {sec}")
444453
# We don't know how/when/where the secondary file was generated,
@@ -453,7 +462,7 @@ def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, st
453462

454463
return file_entity, entity, checksum
455464

456-
def declare_directory(self, value: CWLObjectType) -> ProvEntity:
465+
def declare_directory(self, value: CWLObjectType, load_listing: str = "deep_listing") -> ProvEntity:
457466
"""Register any nested files/directories."""
458467
# FIXME: Calculate a hash-like identifier for directory
459468
# so we get same value if it's the same filenames/hashes
@@ -498,12 +507,19 @@ def declare_directory(self, value: CWLObjectType) -> ProvEntity:
498507
# if value['basename'] == "dirIgnore":
499508
# pass
500509
if "listing" not in value:
501-
get_listing(self.fsaccess, value)
510+
if load_listing == "no_listing":
511+
pass
512+
elif load_listing == "deep_listing":
513+
get_listing(self.fsaccess, value)
514+
elif load_listing == "shallow_listing":
515+
get_listing(self.fsaccess, value, False)
516+
else:
517+
raise ValueError("Invalid listing value: %s", load_listing)
502518

503519
for entry in cast(MutableSequence[CWLObjectType], value.get("listing", [])):
504520
is_empty = False
505521
# Declare child-artifacts
506-
entity = self.declare_artefact(entry)
522+
entity = self.declare_artefact(entry, load_listing)
507523
self.document.membership(coll, entity)
508524
# Membership relation aka our ORE Proxy
509525
m_id = uuid.uuid4().urn
@@ -573,7 +589,7 @@ def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
573589
)
574590
return entity, checksum
575591

576-
def declare_artefact(self, value: Any) -> ProvEntity:
592+
def declare_artefact(self, value: Any, load_listing: str = "deep_listing") -> ProvEntity:
577593
"""Create data artefact entities for all file objects."""
578594
if value is None:
579595
# FIXME: If this can happen in CWL, we'll
@@ -615,12 +631,12 @@ def declare_artefact(self, value: Any) -> ProvEntity:
615631

616632
# Base case - we found a File we need to update
617633
if value.get("class") == "File":
618-
(entity, _, _) = self.declare_file(value)
634+
(entity, _, _) = self.declare_file(value, load_listing)
619635
value["@id"] = entity.identifier.uri
620636
return entity
621637

622638
if value.get("class") == "Directory":
623-
entity = self.declare_directory(value)
639+
entity = self.declare_directory(value, load_listing)
624640
value["@id"] = entity.identifier.uri
625641
return entity
626642
coll_id = value.setdefault("@id", uuid.uuid4().urn)
@@ -643,7 +659,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
643659
# Let's iterate and recurse
644660
coll_attribs: List[Tuple[Union[str, Identifier], Any]] = []
645661
for (key, val) in value.items():
646-
v_ent = self.declare_artefact(val)
662+
v_ent = self.declare_artefact(val, load_listing)
647663
self.document.membership(coll, v_ent)
648664
m_entity = self.document.entity(uuid.uuid4().urn)
649665
# Note: only support PROV-O style dictionary
@@ -664,7 +680,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
664680
members = []
665681
for each_input_obj in iter(value):
666682
# Recurse and register any nested objects
667-
e = self.declare_artefact(each_input_obj)
683+
e = self.declare_artefact(each_input_obj, load_listing)
668684
members.append(e)
669685

670686
# If we reached this, then we were allowed to iterate
@@ -698,20 +714,31 @@ def used_artefacts(
698714
job_order: Union[CWLObjectType, List[CWLObjectType]],
699715
process_run_id: str,
700716
name: Optional[str] = None,
717+
schema: Any = None,
718+
load_listing: Optional[str] = None,
701719
) -> None:
702720
"""Add used() for each data artefact."""
703721
if isinstance(job_order, list):
704722
for entry in job_order:
705-
self.used_artefacts(entry, process_run_id, name)
723+
# for field in schema.fields:
724+
# if field['name'] == entry.
725+
# load_listing = schema.fields
726+
self.used_artefacts(entry, process_run_id, name, load_listing)
706727
else:
707728
# FIXME: Use workflow name in packed.cwl, "main" is wrong for nested workflows
708729
base = "main"
709730
if name is not None:
710731
base += "/" + name
711732
for key, value in job_order.items():
712733
prov_role = self.wf_ns[f"{base}/{key}"]
734+
if not load_listing:
735+
load_listing = "deep_listing"
736+
for field in schema['fields']:
737+
if field['name'] == key:
738+
load_listing = field['loadListing']
739+
break
713740
try:
714-
entity = self.declare_artefact(value)
741+
entity = self.declare_artefact(value, load_listing)
715742
self.document.used(
716743
process_run_id,
717744
entity,
@@ -727,11 +754,12 @@ def generate_output_prov(
727754
final_output: Union[CWLObjectType, MutableSequence[CWLObjectType], None],
728755
process_run_id: Optional[str],
729756
name: Optional[str],
757+
load_listing: str = "deep_listing"
730758
) -> None:
731759
"""Call wasGeneratedBy() for each output,copy the files into the RO."""
732760
if isinstance(final_output, MutableSequence):
733761
for entry in final_output:
734-
self.generate_output_prov(entry, process_run_id, name)
762+
self.generate_output_prov(entry, process_run_id, name, load_listing)
735763
elif final_output is not None:
736764
# Timestamp should be created at the earliest
737765
timestamp = datetime.datetime.now()
@@ -740,7 +768,7 @@ def generate_output_prov(
740768
# entity (UUID) and document it as generated in
741769
# a role corresponding to the output
742770
for output, value in final_output.items():
743-
entity = self.declare_artefact(value)
771+
entity = self.declare_artefact(value, load_listing)
744772
if name is not None:
745773
name = urllib.parse.quote(str(name), safe=":/,#")
746774
# FIXME: Probably not "main" in nested workflows

cwltool/utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@
9797
ScatterOutputCallbackType = Callable[[Optional[ScatterDestinationsType], str], None]
9898
SinkType = Union[CWLOutputType, CWLObjectType]
9999
DirectoryType = TypedDict(
100-
"DirectoryType", {"class": str, "listing": List[CWLObjectType], "basename": str}
100+
"DirectoryType", {"class": str, "listing": List[CWLObjectType], "basename": str, "loadListing": str}
101101
)
102102
JSONAtomType = Union[Dict[str, Any], List[Any], str, int, float, bool, None]
103103
JSONType = Union[

tests/test_provenance.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,10 @@ def test_directory_workflow(tmp_path: Path) -> None:
208208

209209
@needs_docker
210210
def test_directory_workflow_no_listing(tmp_path: Path) -> None:
211+
"""
212+
This test will check for 3 files that should be there and 3 files that should not be there.
213+
@param tmp_path:
214+
"""
211215
dir2 = tmp_path / "dir2"
212216
dir2.mkdir()
213217
sha1 = {
@@ -253,8 +257,8 @@ def test_directory_workflow_no_listing(tmp_path: Path) -> None:
253257
file_list = (
254258
folder
255259
/ "data"
256-
/ "3c"
257-
/ "3c363836cf4e16666669a25da280a1865c2d2874"
260+
/ "84"
261+
/ "84a516841ba77a5b4648de2cd0dfcb30ea46dbb4"
258262
# checksum as returned from:
259263
# echo -e "a\nb\nc" | sha1sum
260264
# 3ca69e8d6c234a469d16ac28a4a658c92267c423 -
@@ -273,8 +277,9 @@ def test_directory_workflow_no_listing(tmp_path: Path) -> None:
273277
# File should be empty and, in the future, should not exist at all...
274278
# assert os.path.getsize(p.absolute()) == 0
275279
# To be discarded once the file really no longer exists
276-
assert p.is_file(), f"Could not find {l} as {p}"
277-
280+
if l not in ['d', 'e', 'f']:
281+
print("Analysing file %s", l)
282+
assert p.is_file(), f"Could not find {l} as {p}"
278283

279284
@needs_docker
280285
def test_no_data_files(tmp_path: Path) -> None:

tests/wf/directory_no_listing.cwl

+7-3
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ hints:
1313
inputs:
1414
dir:
1515
type: Directory
16+
loadListing: deep_listing
1617
ignore:
1718
type: Directory
1819
loadListing: no_listing
@@ -47,7 +48,10 @@ steps:
4748
run:
4849
class: CommandLineTool
4950
requirements:
50-
- class: ShellCommandRequirement
51+
ShellCommandRequirement: {}
52+
LoadListingRequirement:
53+
loadListing: deep_listing
54+
5155
arguments:
5256
- shellQuote: false
5357
valueFrom: >
@@ -64,10 +68,10 @@ steps:
6468
glob: "dir1"
6569

6670
outputs:
67-
listing:
71+
output_1:
6872
type: File
6973
outputSource: ls/listing
70-
dir1:
74+
output_2:
7175
type: Directory
7276
outputSource: generate/dir1
7377

0 commit comments

Comments
 (0)