Coverage for changes_metadata_manager / folder_metadata_builder.py: 99%

82 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-12 19:28 +0000

1import argparse 

2import json 

3import re 

4from pathlib import Path 

5 

6from rdflib import Graph, URIRef 

7 

8from changes_metadata_manager.generate_provenance import generate_provenance_snapshots 

9 

10 

# Base IRI for entities in this project; not referenced in this module —
# presumably consumed by mapping templates elsewhere (TODO confirm).
BASE_URI = "https://w3id.org/changes/4/aldrovandi"
# Default location of the SharePoint structure dump (JSON); only used when a
# caller passes it explicitly — process_all_folders defaults structure_path to None.
STRUCTURE_PATH = Path("data/sharepoint_structure.json")
# Default knowledge-graph file (Turtle) that metadata is extracted from.
KG_PATH = Path("data/kg.ttl")
# Responsible agent and primary source recorded in generated provenance.
RESP_AGENT = "https://w3id.org/changes/4/agent/morph-kgc-changes-metadata/1.0.1"
PRIMARY_SOURCE = "https://doi.org/10.5281/zenodo.18190642"

# Maps a stage folder name (lowercased) to the cumulative list of two-digit
# process-step codes whose triples belong to that stage. Each later stage is a
# superset of the previous one.
STAGE_STEPS = {
    "raw": ["00"],
    "rawp": ["00", "01"],
    "dcho": ["00", "01", "02"],
    "dchoo": ["00", "01", "02", "03", "04", "05", "06"],
}

# Folder names excluded from processing entirely (their names do not match the
# NR-extraction pattern and/or they need no generated metadata — TODO confirm why).
SKIP_FOLDERS = {
    "S1-CNR_SoffittoSala1",
}

27 

28 

def load_kg(path: Path) -> Graph:
    """Parse the Turtle file at *path* and return it as an rdflib Graph."""
    kg = Graph()
    kg.parse(path, format="turtle")
    return kg

33 

34 

def extract_nr_from_folder_name(folder_name: str) -> int:
    """Extract the numeric item identifier (NR) from a folder name.

    Expected shapes are like ``S1-23-Name``, ``S2-105a_Name`` or
    ``S1-7 - Name``: sala prefix, then the NR, an optional lowercase
    letter, an optional space, and a ``-`` or ``_`` separator.

    Raises:
        ValueError: if *folder_name* does not match the expected pattern.
    """
    pattern = re.compile(r"S\d+-(\d+)[a-z]? ?[-_]")
    found = pattern.match(folder_name)
    if found is None:
        raise ValueError(f"Cannot extract NR from folder name: {folder_name}")
    return int(found.group(1))

40 

41 

def extract_metadata_for_stage(graph: Graph, nr: int, stage: str) -> Graph:
    """Return the subgraph of *graph* describing item *nr* at *stage*.

    Copies two families of triples into a new graph:
    - triples whose subject IRI ends in ``/{nr}/{step}/1`` where ``step`` is
      one of the cumulative step codes for *stage* (see ``STAGE_STEPS``);
    - triples whose subject IRI ends in ``/{nr}/ob<digits>/1`` — these
      "ob" entities are included for every stage.
    For every copied triple whose object is a URIRef, the object's own
    outgoing triples are copied too (one hop), so linked descriptions
    stay self-contained.

    Raises:
        KeyError: if *stage* is not a key of ``STAGE_STEPS``.
    """
    result = Graph()
    # Preserve the source graph's prefix bindings so serialization stays readable.
    for prefix, namespace in graph.namespace_manager.namespaces():
        result.namespace_manager.bind(prefix, namespace)

    steps = STAGE_STEPS[stage]

    # Hoist the loop-invariant patterns: the original rebuilt both regexes
    # for every triple in the graph.
    step_pattern = re.compile(rf"/{nr}/(\d{{2}})/1$")
    ob_pattern = re.compile(rf"/{nr}/ob\d+/1$")

    for s, p, o in graph:
        s_str = str(s)
        step_match = step_pattern.search(s_str)
        if step_match:
            if step_match.group(1) in steps:
                _copy_triple_with_object(graph, result, s, p, o)
            # A step-shaped subject can never also be "ob"-shaped; skip the
            # second check (matches the original control flow).
            continue

        if ob_pattern.search(s_str):
            _copy_triple_with_object(graph, result, s, p, o)

    return result


def _copy_triple_with_object(source: Graph, target: Graph, s, p, o) -> None:
    """Copy (s, p, o) into *target*; if *o* is a URIRef, also copy every
    outgoing triple of *o* from *source* (one-hop closure)."""
    target.add((s, p, o))
    if isinstance(o, URIRef):
        for s2, p2, o2 in source.triples((o, None, None)):
            target.add((s2, p2, o2))

69 

70 

def load_sharepoint_structure(structure_path: Path) -> dict:
    """Deserialize the SharePoint folder-structure JSON dump at *structure_path*."""
    return json.loads(structure_path.read_text())

74 

75 

def scan_folder_structure(root_path: Path) -> dict:
    """Walk ``root/Sala/Folder/Stage`` on disk and mirror the SharePoint JSON shape.

    Returns ``{"structure": {sala: {folder: {stage: {"_files": [names]}}}}}``,
    the same shape that ``load_sharepoint_structure`` produces.

    Fix: the original called ``iterdir()`` on every child without checking
    ``is_dir()``, so a stray file at any level (e.g. ``.DS_Store``) raised
    ``NotADirectoryError`` or was mis-treated as a folder. Non-directories
    are now skipped, and entries are sorted for deterministic output.
    """
    structure: dict = {}
    for sala_dir in sorted(root_path.iterdir()):
        if not sala_dir.is_dir():
            continue  # ignore loose files at the root
        salas = structure.setdefault(sala_dir.name, {})
        for folder_dir in sorted(sala_dir.iterdir()):
            if not folder_dir.is_dir():
                continue
            folder = salas.setdefault(folder_dir.name, {})
            for stage_dir in sorted(folder_dir.iterdir()):
                if not stage_dir.is_dir():
                    continue
                files = sorted(f.name for f in stage_dir.iterdir() if f.is_file())
                folder[stage_dir.name] = {"_files": files}
    return {"structure": structure}

89 

90 

def process_all_folders(
    root: Path,
    kg_path: Path = KG_PATH,
    structure_path: Path | None = None,
) -> None:
    """Generate ``meta.ttl`` and ``prov.nq`` for every stage folder under *root*.

    The folder layout comes either from a SharePoint JSON dump
    (*structure_path*, when given) or from scanning *root* directly. For each
    folder not listed in ``SKIP_FOLDERS``, the item number (NR) is parsed
    from its name, stage-specific metadata is extracted from the KG at
    *kg_path* and serialized as Turtle, and provenance snapshots are written
    alongside it.
    """
    if structure_path is None:
        structure = scan_folder_structure(root)
    else:
        structure = load_sharepoint_structure(structure_path)
    kg = load_kg(kg_path)

    for sala_name, folders in structure["structure"].items():
        for folder_name, stages in folders.items():
            if folder_name in SKIP_FOLDERS:
                continue
            nr = extract_nr_from_folder_name(folder_name)

            # Only stage subfolders whose (lowercased) name is a known stage.
            known_stages = [name for name in stages if name.lower() in STAGE_STEPS]

            for stage_name in known_stages:
                stage_dir = root / sala_name / folder_name / stage_name
                stage_dir.mkdir(parents=True, exist_ok=True)

                metadata = extract_metadata_for_stage(kg, nr, stage_name.lower())
                metadata.serialize(
                    destination=str(stage_dir / "meta.ttl"), format="turtle"
                )

                generate_provenance_snapshots(
                    input_directory=str(stage_dir),
                    output_file=str(stage_dir / "prov.nq"),
                    output_format="nquads",
                    agent_orcid=RESP_AGENT,
                    primary_source=PRIMARY_SOURCE,
                )

            print(f"Processed {folder_name} (NR={nr}): {len(known_stages)} stages")

133 

134 

135def parse_arguments(): # pragma: no cover 

136 parser = argparse.ArgumentParser( 

137 description="Generate metadata and provenance files for folder structure" 

138 ) 

139 parser.add_argument( 

140 "root", 

141 type=Path, 

142 help="Root directory containing Sala/Folder/Stage structure", 

143 ) 

144 parser.add_argument( 

145 "--structure", 

146 "-s", 

147 type=Path, 

148 default=None, 

149 help="SharePoint JSON structure file (optional, for development)", 

150 ) 

151 return parser.parse_args() 

152 

153 

def main():  # pragma: no cover
    """CLI entry point: parse arguments and process the whole folder tree."""
    cli_args = parse_arguments()
    process_all_folders(root=cli_args.root, structure_path=cli_args.structure)
    print("\nProcessing complete")

158 

159 

# Script entry point (excluded from coverage like the other CLI glue).
if __name__ == "__main__":  # pragma: no cover
    main()