Coverage for changes_metadata_manager / generate_provenance.py: 98%

49 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-03-21 12:19 +0000

1# SPDX-FileCopyrightText: 2025-2026 Arcangelo Massari <arcangelomas@gmail.com> 

2# 

3# SPDX-License-Identifier: ISC 

4 

5""" 

6Module to generate provenance snapshots from RDF data. 

7Loads RDF data in various formats from all files in a directory, 

8extracts all subjects, and creates provenance snapshots as named graphs 

9with type prov:Entity. 

10""" 

11 

12import os 

13import datetime 

14from rdflib import Dataset, URIRef, Namespace, Literal 

15from rdflib.namespace import RDF, XSD, DCTERMS 

16 

# CC0 1.0 public-domain dedication URI; attached below as the dcterms:license
# of the generated provenance document itself.
CC0 = URIRef("https://creativecommons.org/publicdomain/zero/1.0/")

18 

def generate_provenance_snapshots(input_directory: str, output_file: str, input_format: str | None = None, output_format: str = 'json-ld', agent_orcid: str = '', primary_source: str = ''):
    """
    Generate provenance snapshots from RDF data.

    Loads every recognised RDF file from *input_directory* into one graph,
    then creates one provenance snapshot (a named graph whose snapshot is
    typed prov:Entity) per distinct URIRef subject found in the data.

    Args:
        input_directory: Path to directory containing RDF files.
        output_file: Path to the output file with provenance snapshots.
        input_format: Optional format to use for all input files
            (overrides extension-based auto-detection).
        output_format: Serialization format for the output file
            (default: 'json-ld').
        agent_orcid: ORCID URI of the responsible agent; when empty,
            no prov:wasAttributedTo statement is emitted.
        primary_source: URI of the primary source for the data; when
            empty, no prov:hadPrimarySource statement is emitted.
    """
    input_graph = Dataset()
    default_graph = input_graph.graph()

    # Map file extensions to rdflib parser format names for auto-detection.
    rdf_extensions = {
        '.ttl': 'turtle',
        '.nt': 'nt',
        '.n3': 'n3',
        '.xml': 'xml',
        '.rdf': 'xml',
        '.jsonld': 'json-ld',
        '.nq': 'nquads',
        '.trig': 'trig'
    }

    file_count = 0
    for filename in os.listdir(input_directory):
        file_path = os.path.join(input_directory, filename)

        # Fix: os.listdir also yields subdirectories and other non-file
        # entries, which would make parse() fail — skip them.
        if not os.path.isfile(file_path):
            continue

        if input_format:
            format_name = input_format
        else:
            _, ext = os.path.splitext(filename.lower())
            if ext not in rdf_extensions:
                continue  # not an extension we know how to parse
            format_name = rdf_extensions[ext]

        default_graph.parse(file_path, format=format_name)
        file_count += 1

    # Nothing to do when the directory held no parseable RDF files.
    if file_count == 0:
        return

    dataset = Dataset()

    PROV = Namespace('http://www.w3.org/ns/prov#')
    dataset.namespace_manager.bind('prov', PROV)
    dataset.namespace_manager.bind('dcterms', DCTERMS)

    # License statement about the output document itself: the empty URIRef
    # resolves to the document's own base URI on serialization.
    dataset.default_graph.add((URIRef(""), DCTERMS.license, CC0))

    # Carry over the namespace prefixes declared in the input data.
    for prefix, namespace in input_graph.namespace_manager.namespaces():
        dataset.namespace_manager.bind(prefix, namespace)

    # Subjects that only carry a license statement describe the document,
    # not entities that need provenance — exclude them.
    license_subjects = {s for s, p, _ in default_graph if p == DCTERMS.license}
    subjects = {
        s for s, _, _ in default_graph
        if isinstance(s, URIRef) and s not in license_subjects
    }

    # One shared, second-precision UTC timestamp for every snapshot.
    generation_time = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0).isoformat()

    # Fix: with the empty-string defaults the original emitted URIRef(""),
    # i.e. invalid <> references, for these two properties. Only build the
    # URIRefs (and emit the statements below) when a value was supplied.
    responsible_agent = URIRef(agent_orcid) if agent_orcid else None
    primary_source_uri = URIRef(primary_source) if primary_source else None

    for subject in subjects:
        # Each subject gets its own provenance named graph holding a first
        # ("creation") snapshot numbered se/1.
        prov_graph_uri = URIRef(f"{subject}/prov/")
        snapshot_uri = URIRef(f"{subject}/prov/se/1")

        prov_graph = dataset.graph(identifier=prov_graph_uri)

        prov_graph.add((snapshot_uri, RDF.type, PROV.Entity))
        prov_graph.add((snapshot_uri, PROV.specializationOf, subject))
        prov_graph.add((snapshot_uri, PROV.generatedAtTime, Literal(generation_time, datatype=XSD.dateTime)))

        if responsible_agent is not None:
            prov_graph.add((snapshot_uri, PROV.wasAttributedTo, responsible_agent))
        if primary_source_uri is not None:
            prov_graph.add((snapshot_uri, PROV.hadPrimarySource, primary_source_uri))

        description = f"Entity <{str(subject)}> was created"
        prov_graph.add((snapshot_uri, DCTERMS.description, Literal(description, lang="en")))

    dataset.serialize(destination=output_file, format=output_format)