Coverage for changes_metadata_manager/folder_metadata_builder.py: 99%
82 statements
coverage.py v7.12.0, created at 2026-01-12 19:28 +0000
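
# Builds per-stage metadata (meta.ttl) and provenance (prov.nq) files for a
# Sala/Folder/Stage directory tree, extracting each folder's triples from the
# project knowledge graph.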
import argparse
import json
import re
from pathlib import Path

from rdflib import Graph, URIRef

from changes_metadata_manager.generate_provenance import generate_provenance_snapshots
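
# Dataset and provenance constants: base URI of the Aldrovandi dataset, default
# input paths, and the responsible agent / primary source recorded in provenance.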
BASE_URI = "https://w3id.org/changes/4/aldrovandi"
STRUCTURE_PATH = Path("data/sharepoint_structure.json")
KG_PATH = Path("data/kg.ttl")
RESP_AGENT = "https://w3id.org/changes/4/agent/morph-kgc-changes-metadata/1.0.1"
PRIMARY_SOURCE = "https://doi.org/10.5281/zenodo.18190642"
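
# Two-digit processing-step codes whose metadata belongs to each stage; later
# stages cumulatively include the steps of the earlier ones.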
STAGE_STEPS = {
    "raw": ["00"],
    "rawp": ["00", "01"],
    "dcho": ["00", "01", "02"],
    "dchoo": ["00", "01", "02", "03", "04", "05", "06"],
}
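
# Folders that appear in the structure but must not be processed.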
SKIP_FOLDERS = {
    "S1-CNR_SoffittoSala1",
}
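
# Parse the Turtle knowledge graph at the given path into an rdflib Graph.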
def load_kg(path: Path) -> Graph:
    graph = Graph()
    graph.parse(path, format="turtle")
    return graph
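
# Pull the numeric identifier (NR) out of folder names such as "S1-123_Name" or
# "S2-45a - Name"; raises ValueError when the name does not match the pattern.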
def extract_nr_from_folder_name(folder_name: str) -> int:
    match = re.match(r"S\d+-(\d+)[a-z]? ?[-_]", folder_name)
    if not match:
        raise ValueError(f"Cannot extract NR from folder name: {folder_name}")
    return int(match.group(1))
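
# Build the subgraph for one folder (nr) and stage: copy the namespace bindings,
# keep triples whose subject ends in /<nr>/<step>/1 when <step> belongs to the
# stage, always keep triples whose subject ends in /<nr>/ob<N>/1, and for every
# kept triple also pull in the triples describing its URIRef object (one hop).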
def extract_metadata_for_stage(graph: Graph, nr: int, stage: str) -> Graph:
    result = Graph()
    for prefix, namespace in graph.namespace_manager.namespaces():
        result.namespace_manager.bind(prefix, namespace)

    steps = STAGE_STEPS[stage]

    for s, p, o in graph:
        s_str = str(s)
        step_match = re.search(rf"/{nr}/(\d{{2}})/1$", s_str)
        if step_match:
            step = step_match.group(1)
            if step in steps:
                result.add((s, p, o))
                if isinstance(o, URIRef):
                    for s2, p2, o2 in graph.triples((o, None, None)):
                        result.add((s2, p2, o2))
            continue

        ob_match = re.search(rf"/{nr}/ob\d+/1$", s_str)
        if ob_match:
            result.add((s, p, o))
            if isinstance(o, URIRef):
                for s2, p2, o2 in graph.triples((o, None, None)):
                    result.add((s2, p2, o2))

    return result
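
# Read the folder structure from a SharePoint JSON export.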
def load_sharepoint_structure(structure_path: Path) -> dict:
    with open(structure_path) as f:
        return json.load(f)
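
# Walk the directory tree on disk and rebuild the same nested mapping
# (Sala -> Folder -> Stage -> file list) that the SharePoint JSON provides.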
def scan_folder_structure(root_path: Path) -> dict:
    structure = {}
    for sala_dir in root_path.iterdir():
        sala_name = sala_dir.name
        structure[sala_name] = {}
        for folder_dir in sala_dir.iterdir():
            folder_name = folder_dir.name
            structure[sala_name][folder_name] = {}
            for stage_dir in folder_dir.iterdir():
                stage_name = stage_dir.name
                files = [f.name for f in stage_dir.iterdir() if f.is_file()]
                structure[sala_name][folder_name][stage_name] = {"_files": files}
    return {"structure": structure}
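
# For every folder (except those in SKIP_FOLDERS): extract its stage-specific
# subgraph from the knowledge graph, serialize it as meta.ttl inside each stage
# directory, and generate the corresponding provenance snapshots as prov.nq.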
def process_all_folders(
    root: Path,
    kg_path: Path = KG_PATH,
    structure_path: Path | None = None,
) -> None:
    if structure_path is not None:
        structure = load_sharepoint_structure(structure_path)
    else:
        structure = scan_folder_structure(root)
    kg = load_kg(kg_path)

    for sala_name, sala_items in structure["structure"].items():
        for folder_name, subfolders in sala_items.items():
            if folder_name in SKIP_FOLDERS:
                continue
            nr = extract_nr_from_folder_name(folder_name)

            existing_stages = [
                s for s in subfolders.keys()
                if s.lower() in STAGE_STEPS
            ]

            for stage_name in existing_stages:
                stage_key = stage_name.lower()
                stage_dir = root / sala_name / folder_name / stage_name
                stage_dir.mkdir(parents=True, exist_ok=True)

                metadata = extract_metadata_for_stage(kg, nr, stage_key)

                meta_path = stage_dir / "meta.ttl"
                metadata.serialize(destination=str(meta_path), format="turtle")

                prov_path = stage_dir / "prov.nq"
                generate_provenance_snapshots(
                    input_directory=str(stage_dir),
                    output_file=str(prov_path),
                    output_format="nquads",
                    agent_orcid=RESP_AGENT,
                    primary_source=PRIMARY_SOURCE,
                )

            print(f"Processed {folder_name} (NR={nr}): {len(existing_stages)} stages")
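
# Command-line interface. An illustrative invocation (paths are examples only):
#   python folder_metadata_builder.py /path/to/root --structure data/sharepoint_structure.json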
def parse_arguments(): # pragma: no cover
    parser = argparse.ArgumentParser(
        description="Generate metadata and provenance files for folder structure"
    )
    parser.add_argument(
        "root",
        type=Path,
        help="Root directory containing Sala/Folder/Stage structure",
    )
    parser.add_argument(
        "--structure",
        "-s",
        type=Path,
        default=None,
        help="SharePoint JSON structure file (optional, for development)",
    )
    return parser.parse_args()


def main(): # pragma: no cover
    args = parse_arguments()
    process_all_folders(root=args.root, structure_path=args.structure)
    print("\nProcessing complete")


if __name__ == "__main__": # pragma: no cover
    main()