Coverage for changes_metadata_manager / generate_provenance.py: 98%
49 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-03-21 12:19 +0000
1# SPDX-FileCopyrightText: 2025-2026 Arcangelo Massari <arcangelomas@gmail.com>
2#
3# SPDX-License-Identifier: ISC
5"""
6Module to generate provenance snapshots from RDF data.
7Loads RDF data in various formats from all files in a directory,
8extracts all subjects, and creates provenance snapshots as named graphs
9with type prov:Entity.
10"""
12import os
13import datetime
14from rdflib import Dataset, URIRef, Namespace, Literal
15from rdflib.namespace import RDF, XSD, DCTERMS
# License URI attached to the generated provenance dataset:
# CC0 1.0 Universal (public domain dedication).
17CC0 = URIRef("https://creativecommons.org/publicdomain/zero/1.0/")
def generate_provenance_snapshots(input_directory: str, output_file: str, input_format: str | None = None, output_format: str = 'json-ld', agent_orcid: str = '', primary_source: str = ''):
    """
    Generate provenance snapshots from RDF data.

    Loads RDF data from every recognised file in ``input_directory``,
    collects all URIRef subjects, and writes one provenance snapshot
    (a named graph holding a ``prov:Entity``) per subject.

    Args:
        input_directory: Path to directory containing RDF files
        output_file: Path to output file with provenance snapshots
        input_format: Optional format to use for all input files (overrides
            extension-based auto-detection)
        output_format: Format to use for the output file (default: 'json-ld')
        agent_orcid: ORCID of the responsible agent; when empty, no
            prov:wasAttributedTo triple is emitted
        primary_source: URI of the primary source for the data; when empty,
            no prov:hadPrimarySource triple is emitted
    """
    input_graph = Dataset()
    default_graph = input_graph.graph()

    file_count = 0

    # Map file extensions to rdflib parser format names.
    rdf_extensions = {
        '.ttl': 'turtle',
        '.nt': 'nt',
        '.n3': 'n3',
        '.xml': 'xml',
        '.rdf': 'xml',
        '.jsonld': 'json-ld',
        '.nq': 'nquads',
        '.trig': 'trig'
    }

    for filename in os.listdir(input_directory):
        file_path = os.path.join(input_directory, filename)

        # Skip sub-directories and other non-file entries instead of
        # letting the RDF parser fail on them.
        if not os.path.isfile(file_path):
            continue

        if input_format:
            format_name = input_format
        else:
            _, ext = os.path.splitext(filename.lower())
            if ext not in rdf_extensions:
                continue
            format_name = rdf_extensions[ext]

        default_graph.parse(file_path, format=format_name)
        file_count += 1

    # Nothing parsed: write no output rather than an empty dataset.
    if file_count == 0:
        return

    dataset = Dataset()

    PROV = Namespace('http://www.w3.org/ns/prov#')
    dataset.namespace_manager.bind('prov', PROV)
    dataset.namespace_manager.bind('dcterms', DCTERMS)

    # License statement for the output document itself (empty URIRef resolves
    # to the document's own base URI).
    dataset.default_graph.add((URIRef(""), DCTERMS.license, CC0))

    # Carry over the prefixes declared in the input files.
    for prefix, namespace in input_graph.namespace_manager.namespaces():
        dataset.namespace_manager.bind(prefix, namespace)

    # Subjects that carry a dcterms:license statement are dataset metadata,
    # not entities to be described with provenance.
    license_subjects = {s for s, p, _ in default_graph if p == DCTERMS.license}
    subjects = set()
    for s, _, _ in default_graph:
        if isinstance(s, URIRef) and s not in license_subjects:
            subjects.add(s)

    # One shared, second-precision UTC timestamp for every snapshot.
    generation_time = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0).isoformat()

    # Guard against empty strings, which would otherwise become invalid
    # empty URIRefs in the emitted triples.
    responsible_agent = URIRef(agent_orcid) if agent_orcid else None
    primary_source_uri = URIRef(primary_source) if primary_source else None

    for subject in subjects:
        prov_graph_uri = URIRef(f"{subject}/prov/")

        snapshot_uri = URIRef(f"{subject}/prov/se/1")

        prov_graph = dataset.graph(identifier=prov_graph_uri)

        prov_graph.add((snapshot_uri, RDF.type, PROV.Entity))
        prov_graph.add((snapshot_uri, PROV.specializationOf, subject))

        prov_graph.add((snapshot_uri, PROV.generatedAtTime, Literal(generation_time, datatype=XSD.dateTime)))

        if responsible_agent is not None:
            prov_graph.add((snapshot_uri, PROV.wasAttributedTo, responsible_agent))

        if primary_source_uri is not None:
            prov_graph.add((snapshot_uri, PROV.hadPrimarySource, primary_source_uri))

        description = f"Entity <{str(subject)}> was created"
        prov_graph.add((snapshot_uri, DCTERMS.description, Literal(description, lang="en")))

    dataset.serialize(destination=output_file, format=output_format)