Coverage for changes_metadata_manager / folder_metadata_builder.py: 100%

78 statements  

coverage.py v7.12.0, created at 2025-12-11 13:06 +0000

import argparse
import json
import re
from pathlib import Path

from rdflib import Graph, URIRef

from changes_metadata_manager.generate_provenance import generate_provenance_snapshots


BASE_URI = "https://w3id.org/changes/4/aldrovandi"
STRUCTURE_PATH = Path("data/sharepoint_structure.json")
KG_PATH = Path("data/kg.ttl")
RESP_AGENT = "https://orcid.org/0000-0000-0000-0000"  # TODO: replace with actual URI
PRIMARY_SOURCE = "https://example.org/primary-source"  # TODO: replace with actual URI

# Processing-step codes included in the metadata extracted for each stage folder
# (folder names are matched case-insensitively); later stages extend the earlier ones.
STAGE_STEPS = {
    "raw": ["00"],
    "rawp": ["00", "01"],
    "dcho": ["00", "01", "02"],
    "dchoo": ["00", "01", "02", "03", "04", "05", "06"],
}


def load_kg(path: Path) -> Graph:
    graph = Graph()
    graph.parse(path, format="turtle")
    return graph


def extract_nr_from_folder_name(folder_name: str) -> int:
    match = re.match(r"S\d+-(\d+)-", folder_name)
    if not match:
        raise ValueError(f"Cannot extract NR from folder name: {folder_name}")
    return int(match.group(1))


def extract_metadata_for_stage(graph: Graph, nr: int, stage: str) -> Graph:
    result = Graph()
    for prefix, namespace in graph.namespace_manager.namespaces():
        result.namespace_manager.bind(prefix, namespace)

    steps = STAGE_STEPS[stage]

    for s, p, o in graph:
        s_str = str(s)
        step_match = re.search(rf"/{nr}/(\d{{2}})/1$", s_str)
        if step_match:
            step = step_match.group(1)
            if step in steps:
                result.add((s, p, o))
                if isinstance(o, URIRef):
                    for s2, p2, o2 in graph.triples((o, None, None)):
                        result.add((s2, p2, o2))
            continue

        ob_match = re.search(rf"/{nr}/ob\d+/1$", s_str)
        if ob_match:
            result.add((s, p, o))
            if isinstance(o, URIRef):
                for s2, p2, o2 in graph.triples((o, None, None)):
                    result.add((s2, p2, o2))

    return result

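# extract_metadata_for_stage() keeps triples whose subject IRI ends in one of two
# patterns (NR=12 below is illustrative):
#   .../12/03/1  - a processing-step snapshot, kept only if "03" is in the stage's step list
#   .../12/ob7/1 - an "ob" snapshot, always kept for the item
# For every kept triple whose object is a URIRef, the object's own triples are copied as well.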

def load_sharepoint_structure(structure_path: Path) -> dict:
    with open(structure_path) as f:
        return json.load(f)


def scan_folder_structure(root_path: Path) -> dict:
    structure = {}
    for sala_dir in root_path.iterdir():
        sala_name = sala_dir.name
        structure[sala_name] = {}
        for folder_dir in sala_dir.iterdir():
            folder_name = folder_dir.name
            structure[sala_name][folder_name] = {}
            for stage_dir in folder_dir.iterdir():
                stage_name = stage_dir.name
                files = [f.name for f in stage_dir.iterdir() if f.is_file()]
                structure[sala_name][folder_name][stage_name] = {"_files": files}
    return {"structure": structure}


def process_all_folders(
    root: Path,
    kg_path: Path = KG_PATH,
    structure_path: Path | None = None,
) -> None:
    if structure_path is not None:
        structure = load_sharepoint_structure(structure_path)
    else:
        structure = scan_folder_structure(root)
    kg = load_kg(kg_path)

    for sala_name, sala_items in structure["structure"].items():
        for folder_name, subfolders in sala_items.items():
            nr = extract_nr_from_folder_name(folder_name)

            existing_stages = [
                s for s in subfolders.keys()
                if s.lower() in STAGE_STEPS
            ]

            for stage_name in existing_stages:
                stage_key = stage_name.lower()
                stage_dir = root / sala_name / folder_name / stage_name

                metadata = extract_metadata_for_stage(kg, nr, stage_key)

                meta_path = stage_dir / "meta.ttl"
                metadata.serialize(destination=str(meta_path), format="turtle")

                prov_path = stage_dir / "prov.nq"
                generate_provenance_snapshots(
                    input_directory=str(stage_dir),
                    output_file=str(prov_path),
                    output_format="nquads",
                    agent_orcid=RESP_AGENT,
                    primary_source=PRIMARY_SOURCE,
                )

            print(f"Processed {folder_name} (NR={nr}): {len(existing_stages)} stages")

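# Net effect: every recognised stage folder ends up with two sidecar files,
# meta.ttl (that stage's slice of the knowledge graph) and prov.nq (provenance
# snapshots produced by generate_provenance_snapshots from the stage directory).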

def parse_arguments():  # pragma: no cover
    parser = argparse.ArgumentParser(
        description="Generate metadata and provenance files for folder structure"
    )
    parser.add_argument(
        "root",
        type=Path,
        help="Root directory containing Sala/Folder/Stage structure",
    )
    parser.add_argument(
        "--structure",
        "-s",
        type=Path,
        default=None,
        help="SharePoint JSON structure file (optional, for development)",
    )
    return parser.parse_args()


def main():  # pragma: no cover
    args = parse_arguments()
    process_all_folders(root=args.root, structure_path=args.structure)
    print("\nProcessing complete")


if __name__ == "__main__":  # pragma: no cover
    main()
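

# Usage sketch (the root path below is illustrative): besides the CLI defined in
# parse_arguments(), the pipeline can be driven programmatically, e.g.
#
#     from pathlib import Path
#     process_all_folders(
#         root=Path("/path/to/aldrovandi_root"),
#         structure_path=Path("data/sharepoint_structure.json"),
#     )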