1- import copy
2- import pdb
31import datetime
4- import logging
52import urllib
63import uuid
74import json
8- from io import BytesIO
95from pathlib import PurePath , PurePosixPath
10- from socket import getfqdn
116from typing import (
127 Any ,
138 Dict ,
14- Iterable ,
159 List ,
16- MutableMapping ,
1710 MutableSequence ,
1811 Optional ,
1912 Tuple ,
2316
2417from prov .identifier import Identifier
2518from prov .model import PROV , PROV_LABEL , PROV_TYPE , PROV_VALUE , ProvDocument , ProvEntity
26- from schema_salad .sourceline import SourceLine
27- from typing_extensions import TYPE_CHECKING
28- from tools .load_ga_export import load_ga_history_export , GalaxyJob , GalaxyDataset
19+ from tools .load_ga_export import load_ga_history_export , GalaxyJob
2920from ast import literal_eval
3021import os
3122
3627from rocrate .provenance_constants import (
3728 ACCOUNT_UUID ,
3829 CWLPROV ,
39- ENCODING ,
40- FOAF ,
4130 METADATA ,
4231 ORE ,
4332 PROVENANCE ,
4433 RO ,
4534 SCHEMA ,
4635 SHA1 ,
47- SHA256 ,
48- TEXT_PLAIN ,
4936 UUID ,
5037 WF4EVER ,
5138 WFDESC ,
5946# from rocrate.provenance import ResearchObject
6047
6148from pathlib import Path
62- import rocrate . rocrate as roc
49+
6350
def posix_path(local_path: str) -> str:
    """Return *local_path* rewritten as a POSIX-style (forward-slash) path string."""
    # Normalise through the native Path first, then reinterpret its parts
    # with POSIX separators.
    native = Path(local_path)
    return str(PurePosixPath(native))
6653
54+
def remove_escapes(s: str) -> str:
    """Remove ASCII control characters (0x01-0x1F) from ``s``.

    :param s: input string possibly containing control/escape characters.
    :return: a copy of ``s`` with characters ``chr(1)``..``chr(31)`` stripped.
    """
    # NOTE: range starts at 1 on purpose, so NUL (chr(0)) is left untouched —
    # preserved from the original implementation; confirm if intentional.
    escapes = ''.join(chr(char) for char in range(1, 32))
    translator = str.maketrans('', '', escapes)
    # BUG FIX: the previous version discarded the translated result and
    # implicitly returned None; return the cleaned string instead.
    return s.translate(translator)
7160
7261def reassign (d ):
7362 for k , v in d .items ():
@@ -78,16 +67,17 @@ def reassign(d):
7867 except ValueError :
7968 pass
8069
70+
8171class ProvenanceProfile :
82- """
72+ """\
8373 Provenance profile.
8474
8575 Populated from a galaxy workflow export.
8676 """
8777
8878 def __init__ (
8979 self ,
90- ga_export : Dict ,
80+ ga_export : Dict ,
9181 full_name : str = None ,
9282 orcid : str = None ,
9383 # prov_name: str = None,
@@ -112,12 +102,11 @@ def __init__(
112102 self .base_uri = "arcp://uuid,%s/" % self .ro_uuid
113103 self .document = ProvDocument ()
114104 # TODO extract engine_uuid from galaxy, type: str
115- self .engine_uuid = "urn:uuid:%s" % uuid .uuid4 () #type: str
105+ self .engine_uuid = "urn:uuid:%s" % uuid .uuid4 () # type: str
116106 self .full_name = full_name
117107 self .workflow_run_uuid = run_uuid or uuid .uuid4 ()
118108 self .workflow_run_uri = self .workflow_run_uuid .urn # type: str
119-
120- # move to separate function
109+ # move to separate function
121110 metadata_export = load_ga_history_export (ga_export )
122111 self .generate_prov_doc ()
123112 self .jobs = []
@@ -153,7 +142,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
153142 # PROV_TYPE: FOAF["OnlineAccount"],
154143 # TODO: change how we register galaxy version, probably a declare_version func
155144 # self.galaxy_version = self.ga_export["jobs_attrs"][0]["galaxy_version"]
156- # TODO: change notation to already imported namespaces?
145+ # TODO: change notation to already imported namespaces?
157146 self .document .add_namespace ("wfprov" , "http://purl.org/wf4ever/wfprov#" )
158147 # document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
159148 self .document .add_namespace ("wfdesc" , "http://purl.org/wf4ever/wfdesc#" )
@@ -176,7 +165,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
176165 "provenance" , self .base_uri + posix_path (PROVENANCE ) + "/"
177166 )
178167 # TODO: use appropriate refs for ga_export and related inputs
179- ro_identifier_workflow = self .base_uri + "ga_export" + "/"
168+ ro_identifier_workflow = self .base_uri + "ga_export" + "/"
180169 self .wf_ns = self .document .add_namespace ("wf" , ro_identifier_workflow )
181170 ro_identifier_input = (
182171 self .base_uri + "ga_export/datasets#"
@@ -240,15 +229,15 @@ def declare_process(
240229 """Record the start of each Process."""
241230 if process_run_id is None :
242231 process_run_id = uuid .uuid4 ().urn
243-
244- cmd = ga_export_jobs_attrs ["command_line" ]
232+
233+ # cmd = ga_export_jobs_attrs["command_line"]
245234 process_name = ga_export_jobs_attrs ["tool_id" ]
246235 tool_version = ga_export_jobs_attrs ["tool_version" ]
247236 prov_label = "Run of " + process_name
248237 start_time = ga_export_jobs_attrs ["create_time" ]
249238 end_time = ga_export_jobs_attrs ["update_time" ]
250239
251- #TODO: Find out how to include commandline as a string
240+ # TODO: Find out how to include commandline as a string
252241 # cmd = self.document.entity(
253242 # uuid.uuid4().urn,
254243 # {PROV_TYPE: WFPROV["Artifact"], PROV_LABEL: ga_export_jobs_attrs["command_line"]}
@@ -259,9 +248,9 @@ def declare_process(
259248 start_time ,
260249 end_time ,
261250 {
262- PROV_TYPE : WFPROV ["ProcessRun" ],
263- PROV_LABEL : prov_label ,
264- #TODO: Find out how to include commandline as a string
251+ PROV_TYPE : WFPROV ["ProcessRun" ],
252+ PROV_LABEL : prov_label ,
253+ # TODO: Find out how to include commandline as a string
265254 # PROV_LABEL: cmd
266255 },
267256 )
@@ -289,7 +278,7 @@ def used_artefacts(
289278 base += "/" + process_name
290279 tool_id = process_metadata ["tool_id" ]
291280 base += "/" + tool_id
292- items = ["inputs" ,"outputs" ,"parameters" ]
281+ items = ["inputs" , "outputs" , "parameters" ]
293282 # print(process_metadata["params"])
294283 for item in items :
295284 # print(item)
@@ -317,7 +306,6 @@ def used_artefacts(
317306
318307 # for artefact in value:
319308 try :
320- # pdb.set_trace()
321309 entity = self .declare_artefact (value )
322310 self .document .used (
323311 process_run_id ,
@@ -356,7 +344,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
356344 # byte_s = BytesIO(value)
357345 # data_file = self.research_object.add_data_file(byte_s)
358346 # FIXME: Don't naively assume add_data_file uses hash in filename!
359- data_id = "data:%s" % str (value ) # PurePosixPath(data_file).stem
347+ data_id = "data:%s" % str (value ) # PurePosixPath(data_file).stem
360348 return self .document .entity (
361349 data_id ,
362350 {PROV_TYPE : WFPROV ["Artifact" ], PROV_VALUE : str (value )},
@@ -394,7 +382,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
394382 )
395383
396384 if value .get ("class" ):
397- #_logger.warning("Unknown data class %s.", value["class"])
385+ # _logger.warning("Unknown data class %s.", value["class"])
398386 # FIXME: The class might be "http://example.com/somethingelse"
399387 coll .add_asserted_type (CWLPROV [value ["class" ]])
400388
@@ -404,7 +392,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
404392 # clean up unwanted characters
405393 if isinstance (key , str ):
406394 key = key .replace ("|" , "_" )
407- if isinstance (val , str ):
395+ if isinstance (val , str ):
408396 val = val .replace ("|" , "_" )
409397
410398 v_ent = self .declare_artefact (val )
@@ -451,7 +439,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
451439 # FIXME: list value does not support adding "@id"
452440 return coll
453441 except TypeError :
454- #_logger.warning("Unrecognized type %s of %r", type(value), value)
442+ # _logger.warning("Unrecognized type %s of %r", type(value), value)
455443 # Let's just fall back to Python repr()
456444 entity = self .document .entity (uuid .uuid4 ().urn , {PROV_LABEL : repr (value )})
457445 # self.research_object.add_uri(entity.identifier.uri)
@@ -466,7 +454,7 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
466454 if "checksum" in value :
467455 csum = cast (str , value ["checksum" ])
468456 (method , checksum ) = csum .split ("$" , 1 )
469- if method == SHA1 : # and self.research_object.has_data_file(checksum):
457+ if method == SHA1 : # and self.research_object.has_data_file(checksum):
470458 entity = self .document .entity ("data:" + checksum )
471459
472460 if not entity and "location" in value :
@@ -513,8 +501,8 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
513501
514502 # Check for secondaries
515503 for sec in cast (
516- # MutableSequence[CWLObjectType],
517- value .get ("secondaryFiles" , [])
504+ # MutableSequence[CWLObjectType],
505+ value .get ("secondaryFiles" , []) # noqa
518506 ):
519507 # TODO: Record these in a specializationOf entity with UUID?
520508 if sec ["class" ] == "File" :
@@ -535,8 +523,10 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
535523
536524 return file_entity , entity , checksum
537525
538- def declare_directory (self
539- # , value: CWLObjectType
526+ def declare_directory (
527+ self ,
528+ # value: CWLObjectType
529+ value
540530 ) -> ProvEntity :
541531 """Register any nested files/directories."""
542532 # FIXME: Calculate a hash-like identifier for directory
@@ -647,12 +637,11 @@ def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
647637 # checksum = PurePosixPath(data_file).name
648638 # FIXME: Don't naively assume add_data_file uses hash in filename!
649639 value = str (value ).replace ("|" , "_" )
650- data_id = "data:%s" % str (value ) # PurePosixPath(data_file).stem
640+ data_id = "data:%s" % str (value ) # PurePosixPath(data_file).stem
651641 entity = self .document .entity (
652642 data_id , {PROV_TYPE : WFPROV ["Artifact" ], PROV_VALUE : str (value )}
653643 ) # type: ProvEntity
654- return entity #, checksum
655-
644+ return entity # , checksum
656645
657646 def generate_output_prov (
658647 self ,
@@ -735,7 +724,7 @@ def activity_has_provenance(self, activity, prov_ids):
735724 self .document .activity (activity , other_attributes = attribs )
736725 # Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention
737726 # as prov:mentionOf() is only for entities, not activities
738- uris = [i .uri for i in prov_ids ]
727+ # uris = [i.uri for i in prov_ids]
739728 # self.research_object.add_annotation(activity, uris, PROV["has_provenance"].uri)
740729
741730 def finalize_prov_profile (self , name = None , out_path = None ):
@@ -770,7 +759,7 @@ def finalize_prov_profile(self, name=None, out_path=None):
770759
771760 # https://www.w3.org/TR/prov-xml/
772761 # serialized_prov_docs["xml"] = self.document.serialize(format="xml", indent=4)
773- prov_ids .append (self .provenance_ns [filename + ".xml" ])
762+ prov_ids .append (self .provenance_ns [filename + ".xml" ])
774763 with open (basename + ".xml" , "w" ) as provenance_file :
775764 self .document .serialize (provenance_file , format = "xml" , indent = 4 )
776765
@@ -779,7 +768,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
779768 prov_ids .append (self .provenance_ns [filename + ".provn" ])
780769 with open (basename + ".provn" , "w" ) as provenance_file :
781770 self .document .serialize (provenance_file , format = "provn" , indent = 2 )
782-
783771
784772 # https://www.w3.org/Submission/prov-json/
785773 # serialized_prov_docs["json"] = self.document.serialize(format="json", indent=2)
@@ -810,7 +798,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
810798 prov_ids .append (self .provenance_ns [filename + ".jsonld" ])
811799 with open (basename + ".jsonld" , "w" ) as provenance_file :
812800 self .document .serialize (provenance_file , format = "rdf" , rdf_format = "json-ld" )
813-
814801
815- #_logger.debug("[provenance] added provenance: %s", prov_ids)
802+ # _logger.debug("[provenance] added provenance: %s", prov_ids)
816803 return (serialized_prov_docs , prov_ids )
0 commit comments