Coverage for tests/test_provenance.py: 100%

1#!/usr/bin/env python3

3# SPDX-FileCopyrightText: 2025-2026 Arcangelo Massari <arcangelomas@gmail.com>

5# SPDX-License-Identifier: ISC

7"""

8Tests for the provenance generator script.

9"""

11import os

12import sys

13import tempfile

14import shutil

15import pytest

16from rdflib import Dataset, URIRef, Namespace

17from rdflib.namespace import RDF

19# Add the src directory to the path so we can import the module

20sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

21from changes_metadata_manager.generate_provenance import generate_provenance_snapshots

@pytest.fixture
def test_environment():
    """Provide a scratch input directory with sample Turtle data and an output path.

    Yields:
        dict with three string paths:
            ``test_dir``    - temporary directory created next to this test module,
            ``test_ttl``    - Turtle file written into ``test_dir``,
            ``test_output`` - path for the generated ``.nq`` output; the file itself
                              is NOT created here, so tests can assert on its existence.

    Both temporary directories are removed on teardown, including when the
    setup itself fails part-way through.
    """
    # Anchor on this file's directory instead of './tests/' so the fixture
    # works regardless of the directory pytest is launched from.
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    test_dir = tempfile.mkdtemp(dir=tests_dir)

    # tempfile.mktemp() is deprecated and race-prone: another process may claim
    # the returned name before it is used. Reserve a private directory and
    # name the (not yet existing) output file inside it instead.
    output_dir = tempfile.mkdtemp()
    test_output = os.path.join(output_dir, 'provenance.nq')
    test_ttl = os.path.join(test_dir, 'test_data.ttl')

    try:
        # Create test data file
        with open(test_ttl, 'w', encoding='utf-8') as f:
            f.write("""
@prefix ex: <http://example.org/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .

ex:item1 a crm:E22_Human-Made_Object ;
    rdfs:label "Test Manuscript" .

ex:item2 a crm:E21_Person ;
    rdfs:label "John Doe" .
        """)

        yield {"test_dir": test_dir, "test_ttl": test_ttl, "test_output": test_output}
    finally:
        # Clean up; removing output_dir also removes any generated output file.
        for scratch in (test_dir, output_dir):
            if os.path.exists(scratch):
                shutil.rmtree(scratch)

def test_provenance_generation(test_environment):
    """Check the provenance graphs and snapshots produced for the sample data.

    For both sample entities (item1, item2) the TriG output must contain a
    ``<entity>/prov/`` named graph whose ``<entity>/prov/se/1`` snapshot is a
    prov:Entity, is a specialization of the entity, and carries the primary
    source that was passed in.
    """
    source_dir = test_environment["test_dir"]
    output_path = test_environment["test_output"]

    orcid = "https://orcid.org/0000-0002-8420-0696"
    source_uri = "https://example.org/primary-source"
    generate_provenance_snapshots(
        source_dir,
        output_path,
        output_format='trig',
        agent_orcid=orcid,
        primary_source=source_uri,
    )

    # The generator must have written something before we try to parse it.
    assert os.path.exists(output_path), "Output file was not created"

    dataset = Dataset()
    dataset.parse(output_path, format='trig')

    prov_ns = Namespace('http://www.w3.org/ns/prov#')

    # Entity label -> entity IRI (kept as plain strings for easy derivation).
    entity_iris = {
        'item1': 'http://example.org/item1',
        'item2': 'http://example.org/item2',
    }
    graph_ids = {label: URIRef(iri + '/prov/') for label, iri in entity_iris.items()}

    # Stage 1: every expected named graph is present.
    found_ids = [g.identifier for g in dataset.graphs()]
    for graph_id in graph_ids.values():
        assert graph_id in found_ids, f"Expected graph {graph_id} not found"

    prov_graphs = {label: dataset.graph(graph_id) for label, graph_id in graph_ids.items()}
    snapshots = {label: URIRef(iri + '/prov/se/1') for label, iri in entity_iris.items()}

    # Stage 2: snapshots are typed as prov:Entity.
    for label in entity_iris:
        typed = (snapshots[label], RDF.type, prov_ns.Entity)
        assert typed in prov_graphs[label], f"{label} snapshot is not typed as prov:Entity"

    # Stage 3: snapshots specialize their entity.
    for label, iri in entity_iris.items():
        assert (snapshots[label], prov_ns.specializationOf, URIRef(iri)) in prov_graphs[label]

    # Stage 4: snapshots point at the primary source.
    for label in entity_iris:
        sourced = (snapshots[label], prov_ns.hadPrimarySource, URIRef(source_uri))
        assert sourced in prov_graphs[label], f"{label} snapshot missing primary source"

101

def test_input_format_parameter(test_environment):
    """Check that an explicit ``input_format`` is honoured for odd file extensions.

    A Turtle document is stored under a ``.xyz`` extension; with
    ``input_format='turtle'`` the generator must still emit a provenance graph
    and a prov:Entity snapshot for the entity it describes (item3).
    """
    source_dir = test_environment["test_dir"]
    output_path = test_environment["test_output"]

    # Turtle content hidden behind an extension no RDF tool would recognise.
    turtle_payload = """
@prefix ex: <http://example.org/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .

ex:item3 a crm:E22_Human-Made_Object ;
    rdfs:label "Test Object with Unknown Format" .
        """
    odd_file = os.path.join(source_dir, 'unknown_format.xyz')
    with open(odd_file, 'w') as handle:
        handle.write(turtle_payload)

    # Run the generator with the input format spelled out explicitly.
    generate_provenance_snapshots(
        source_dir,
        output_path,
        input_format='turtle',
        output_format='trig',
        agent_orcid="https://orcid.org/0000-0002-8420-0696",
        primary_source="https://example.org/primary-source",
    )

    assert os.path.exists(output_path), "Output file was not created"

    dataset = Dataset()
    dataset.parse(output_path, format='trig')

    prov_ns = Namespace('http://www.w3.org/ns/prov#')

    # The named graph for item3 must be among the parsed graphs.
    item3_graph = URIRef('http://example.org/item3/prov/')
    found_ids = [g.identifier for g in dataset.graphs()]
    assert item3_graph in found_ids, f"Expected graph {item3_graph} not found"

    # ...and its first snapshot must be a prov:Entity.
    snapshot = URIRef('http://example.org/item3/prov/se/1')
    typed = (snapshot, RDF.type, prov_ns.Entity)
    assert typed in dataset.graph(item3_graph), "item3 snapshot is not typed as prov:Entity"

144

def test_empty_directory(test_environment):
    """Check that running the generator on an empty directory writes no output.

    Only the ``test_output`` path of the fixture is used; the input is a
    fresh, empty directory created (and removed) by this test.
    """
    # Create the empty directory next to this test module rather than under
    # './tests/', so the test does not depend on the current working directory.
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    empty_dir = tempfile.mkdtemp(dir=tests_dir)
    test_output = test_environment["test_output"]

    try:
        # Generate provenance snapshots for the empty directory
        agent_orcid = "https://orcid.org/0000-0002-8420-0696"
        primary_source = "https://example.org/primary-source"
        generate_provenance_snapshots(
            empty_dir,
            test_output,
            agent_orcid=agent_orcid,
            primary_source=primary_source,
        )

        # Check that the output file was not created
        assert not os.path.exists(test_output), "Output file should not be created for empty directory"
    finally:
        # Clean up
        if os.path.exists(empty_dir):
            shutil.rmtree(empty_dir)

163

if __name__ == '__main__':
    # Propagate pytest's return code so that running this file as a script
    # exits non-zero when tests fail (the code was previously discarded).
    raise SystemExit(pytest.main())

Coverage for tests/test_provenance.py: 100%

74 statements