Coverage for changes_metadata_manager / folder_metadata_builder.py: 100%
78 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-11 13:06 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-11 13:06 +0000
1import argparse
2import json
3import re
4from pathlib import Path
6from rdflib import Graph, URIRef
8from changes_metadata_manager.generate_provenance import generate_provenance_snapshots
# Base URI string (presumably for the dataset's resources -- TODO confirm);
# it is not referenced in the visible part of this module.
BASE_URI = "https://w3id.org/changes/4/aldrovandi"
# Path to a SharePoint structure JSON file. process_all_folders does not use
# it as a default: its structure_path parameter defaults to None.
STRUCTURE_PATH = Path("data/sharepoint_structure.json")
# Default Turtle knowledge-graph file read by process_all_folders.
KG_PATH = Path("data/kg.ttl")
# Passed as agent_orcid to generate_provenance_snapshots.
RESP_AGENT = "https://orcid.org/0000-0000-0000-0000"  # TODO: replace with actual URI
# Passed as primary_source to generate_provenance_snapshots.
PRIMARY_SOURCE = "https://example.org/primary-source"  # TODO: replace with actual URI

# Maps a lowercase stage folder name to the two-digit step identifiers whose
# subjects extract_metadata_for_stage copies into that stage's metadata.
# process_all_folders also uses the keys to decide which subfolders are stages.
STAGE_STEPS = {
    "raw": ["00"],
    "rawp": ["00", "01"],
    "dcho": ["00", "01", "02"],
    "dchoo": ["00", "01", "02", "03", "04", "05", "06"],
}
def load_kg(path: Path) -> Graph:
    """Read the Turtle file at *path* into a new rdflib ``Graph``.

    Args:
        path: Location of the Turtle-serialized knowledge graph.

    Returns:
        A freshly created graph holding every triple parsed from the file.
    """
    knowledge_graph = Graph()
    knowledge_graph.parse(source=path, format="turtle")
    return knowledge_graph
def extract_nr_from_folder_name(folder_name: str) -> int:
    """Return the numeric NR embedded in a folder name.

    The name must start with ``S<digits>-<digits>-``; the second run of
    digits is the NR.

    Args:
        folder_name: Folder name such as ``"S1-12-Something"``.

    Returns:
        The NR as an integer (leading zeros are dropped by ``int``).

    Raises:
        ValueError: If the name does not start with the expected pattern.
    """
    found = re.match(r"S\d+-(\d+)-", folder_name)
    if found is None:
        raise ValueError(f"Cannot extract NR from folder name: {folder_name}")
    nr_digits = found[1]
    return int(nr_digits)
def extract_metadata_for_stage(graph: Graph, nr: int, stage: str) -> Graph:
    """Build the sub-graph of *graph* that describes item *nr* at *stage*.

    A triple is selected when its subject URI ends with either

    * ``/<nr>/<step>/1`` where ``<step>`` is a two-digit identifier listed in
      ``STAGE_STEPS[stage]``, or
    * ``/<nr>/ob<digits>/1``.

    Every selected triple is copied. When its object is a ``URIRef``, all
    triples having that object as subject are copied too (one level only,
    no recursion). Namespace bindings of *graph* are carried over.

    Args:
        graph: Source knowledge graph.
        nr: Item number appearing in the subject URIs.
        stage: Lowercase stage name; must be a key of ``STAGE_STEPS``.

    Returns:
        A new graph holding the selected triples.

    Raises:
        KeyError: If *stage* is not a key of ``STAGE_STEPS``.
    """
    result = Graph()
    for prefix, namespace in graph.namespace_manager.namespaces():
        result.namespace_manager.bind(prefix, namespace)

    steps = STAGE_STEPS[stage]

    # Both patterns depend only on ``nr``, so build them once instead of once
    # per triple.
    step_pattern = re.compile(rf"/{nr}/(\d{{2}})/1$")
    ob_pattern = re.compile(rf"/{nr}/ob\d+/1$")

    # Objects whose outgoing triples were already copied. The same object is
    # often referenced by many selected triples; the result is a set of
    # triples, so expanding it again would add nothing.
    expanded = set()

    for s, p, o in graph:
        s_str = str(s)
        step_match = step_pattern.search(s_str)
        if step_match:
            # A step subject is selected only if its step belongs to the stage.
            selected = step_match.group(1) in steps
        else:
            # The two patterns cannot match the same URI, so the "ob" check is
            # only needed for non-step subjects.
            selected = ob_pattern.search(s_str) is not None
        if not selected:
            continue

        result.add((s, p, o))
        if isinstance(o, URIRef) and o not in expanded:
            expanded.add(o)
            for linked_triple in graph.triples((o, None, None)):
                result.add(linked_triple)

    return result
def load_sharepoint_structure(structure_path: Path) -> dict:
    """Load the SharePoint folder-structure description from a JSON file.

    Args:
        structure_path: Path of the JSON file.

    Returns:
        The decoded JSON document. ``process_all_folders`` expects a dict
        with a top-level ``"structure"`` key.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it ``open`` falls back to the
    # locale encoding, which breaks non-ASCII folder names on some platforms.
    with open(structure_path, encoding="utf-8") as f:
        return json.load(f)
def _sorted_subdirectories(parent: Path) -> list:
    """Return the directories directly inside *parent*, sorted by path."""
    return sorted(entry for entry in parent.iterdir() if entry.is_dir())


def scan_folder_structure(root_path: Path) -> dict:
    """Describe the ``Sala/Folder/Stage`` tree found under *root_path*.

    Only directories are treated as sala, folder and stage levels. Stray
    files at those levels (for example OS metadata files) are ignored instead
    of aborting the scan with ``NotADirectoryError``. Entries are visited in
    sorted order so the result does not depend on filesystem listing order.

    Args:
        root_path: Directory whose children are the sala directories.

    Returns:
        ``{"structure": {sala: {folder: {stage: {"_files": [names]}}}}}``
        where ``names`` are the sorted names of the regular files directly
        inside each stage directory.
    """
    structure = {}
    for sala_dir in _sorted_subdirectories(root_path):
        folders = structure[sala_dir.name] = {}
        for folder_dir in _sorted_subdirectories(sala_dir):
            stages = folders[folder_dir.name] = {}
            for stage_dir in _sorted_subdirectories(folder_dir):
                files = sorted(f.name for f in stage_dir.iterdir() if f.is_file())
                stages[stage_dir.name] = {"_files": files}
    return {"structure": structure}
def process_all_folders(
    root: Path,
    kg_path: Path = KG_PATH,
    structure_path: Path | None = None,
) -> None:
    """Write ``meta.ttl`` and ``prov.nq`` into every stage directory.

    The folder layout comes from the JSON file at *structure_path* when one
    is given, otherwise from scanning *root*. For each folder the NR is taken
    from the folder name. For each subfolder whose lowercased name is a key
    of ``STAGE_STEPS``, the stage metadata is extracted from the knowledge
    graph and serialized as Turtle to ``meta.ttl``. Then
    ``generate_provenance_snapshots`` is called on the stage directory with
    ``prov.nq`` as its N-Quads output file.

    Args:
        root: Root directory containing the ``Sala/Folder/Stage`` tree.
        kg_path: Turtle file holding the knowledge graph.
        structure_path: Optional JSON description of the folder layout.

    Raises:
        ValueError: If a folder name does not contain an NR.
    """
    layout = (
        scan_folder_structure(root)
        if structure_path is None
        else load_sharepoint_structure(structure_path)
    )
    knowledge_graph = load_kg(kg_path)

    for sala_name, folders in layout["structure"].items():
        for folder_name, stage_entries in folders.items():
            nr = extract_nr_from_folder_name(folder_name)

            # Keep only subfolders that are known stages (case-insensitive).
            stage_names = [
                entry for entry in stage_entries
                if entry.lower() in STAGE_STEPS
            ]

            for stage_name in stage_names:
                target_dir = root / sala_name / folder_name / stage_name

                stage_graph = extract_metadata_for_stage(
                    knowledge_graph, nr, stage_name.lower()
                )
                stage_graph.serialize(
                    destination=str(target_dir / "meta.ttl"),
                    format="turtle",
                )

                generate_provenance_snapshots(
                    input_directory=str(target_dir),
                    output_file=str(target_dir / "prov.nq"),
                    output_format="nquads",
                    agent_orcid=RESP_AGENT,
                    primary_source=PRIMARY_SOURCE,
                )

            print(f"Processed {folder_name} (NR={nr}): {len(stage_names)} stages")
def parse_arguments():  # pragma: no cover
    """Parse the command-line arguments of the script.

    Returns:
        An ``argparse.Namespace`` with ``root`` (a ``Path``) and
        ``structure`` (a ``Path`` or ``None`` when the option is omitted).
    """
    cli = argparse.ArgumentParser(
        description="Generate metadata and provenance files for folder structure"
    )

    # (flags, keyword options) for each argument, registered in this order.
    argument_specs = (
        (
            ("root",),
            {
                "type": Path,
                "help": "Root directory containing Sala/Folder/Stage structure",
            },
        ),
        (
            ("--structure", "-s"),
            {
                "type": Path,
                "default": None,
                "help": "SharePoint JSON structure file (optional, for development)",
            },
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    return cli.parse_args()
def main():  # pragma: no cover
    """Command-line entry point.

    Parses the arguments, processes every folder under the given root with
    the default knowledge-graph path, then prints a completion message.
    """
    cli_args = parse_arguments()
    process_all_folders(
        root=cli_args.root,
        structure_path=cli_args.structure,
    )
    print("\nProcessing complete")
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":  # pragma: no cover
    main()