Coverage for changes_metadata_manager / zenodo_upload.py: 82%

348 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-03-04 14:41 +0000

1import argparse 

2import csv 

3import re 

4import unicodedata 

5import zipfile 

6from collections import defaultdict 

7from datetime import date 

8from pathlib import Path 

9 

10import yaml 

11from rdflib import Graph, URIRef 

12from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn 

13 

14from piccione.upload.on_zenodo import main as piccione_upload 

15 

16from changes_metadata_manager.folder_metadata_builder import ( 

17 BASE_URI, 

18 FOLDER_TO_ID, 

19 KG_PATH, 

20 SKIP_FOLDERS, 

21 STAGE_STEPS, 

22 extract_id_from_folder_name, 

23 load_kg, 

24 scan_folder_structure, 

25) 

26 

27 

class LiteralBlockDumper(yaml.SafeDumper):
    # Dedicated SafeDumper subclass used purely as a registration target for a
    # custom str representer, so the global yaml.SafeDumper stays unmodified.
    pass

30 

31 

def _literal_str_representer(dumper: yaml.SafeDumper, data):
    """Represent multi-line strings with YAML literal block ("|") style.

    Single-line strings keep the dumper's default scalar style.
    """
    style = "|" if "\n" in data else None
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)

36 

37 

# Register the representer so LiteralBlockDumper emits multi-line strings as
# literal blocks (readable descriptions in the generated YAML configs).
LiteralBlockDumper.add_representer(str, _literal_str_representer)

# YAML file mapping RDF author names to Zenodo creator metadata, shipped in
# the package's data directory.
CREATORS_LOOKUP_PATH = Path(__file__).parent.parent / "data" / "creators_lookup.yaml"

41 

42 

def slugify(text: str) -> str:
    """Turn *text* into a lowercase ASCII slug with hyphen separators.

    Accents are stripped via NFKD normalization, punctuation is removed, and
    runs of whitespace/hyphens collapse to a single hyphen.
    """
    ascii_text = (
        unicodedata.normalize("NFKD", text)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    cleaned = re.sub(r"[^\w\s-]", "", ascii_text.lower())
    hyphenated = re.sub(r"[-\s]+", "-", cleaned)
    return hyphenated.strip("-")

48 

# Maps a two-digit workflow step number to its pipeline stage name.
# Steps 03-06 all belong to the optimized ("dchoo") stage.
STEP_TO_STAGE = {
    "00": "raw",
    "01": "rawp",
    "02": "dcho",
    "03": "dchoo",
    "04": "dchoo",
    "05": "dchoo",
    "06": "dchoo",
}

58 

# Namespace prefixes: CIDOC-CRM, its digital-provenance extension, Getty AAT.
CRM = "http://www.cidoc-crm.org/cidoc-crm/"
CRMDIG = "http://www.cidoc-crm.org/extensions/crmdig/"
AAT = "http://vocab.getty.edu/aat/"

# Pre-built predicate/class URIRefs used by the KG queries in this module.
P70I = URIRef(f"{CRM}P70i_is_documented_in")
P3_HAS_NOTE = URIRef(f"{CRM}P3_has_note")
P14_CARRIED_OUT_BY = URIRef(f"{CRM}P14_carried_out_by")
P1_IS_IDENTIFIED_BY = URIRef(f"{CRM}P1_is_identified_by")
P190_HAS_SYMBOLIC_CONTENT = URIRef(f"{CRM}P190_has_symbolic_content")
P74_HAS_RESIDENCE = URIRef(f"{CRM}P74_has_current_or_former_residence")
P32_USED_GENERAL_TECHNIQUE = URIRef(f"{CRM}P32_used_general_technique")
P16_USED_SPECIFIC_OBJECT = URIRef(f"{CRM}P16_used_specific_object")
E21_PERSON = URIRef(f"{CRM}E21_Person")
RDF_TYPE = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
L23_USED_SOFTWARE = URIRef(f"{CRMDIG}L23_used_software_or_firmware")

74 

# English labels for the Getty AAT technique URIs recorded in acquisition acts.
AAT_TECHNIQUE_LABELS: dict[str, str] = {
    f"{AAT}300266792": "digital photography",
    f"{AAT}300429747": "optical scanning",
}

79 

# Human-readable display names for the device/software slugs found in KG URIs
# (the segment after /dev/ or /sfw/). Keys must match the slugs exactly.
SLUG_LABELS: dict[str, str] = {
    "3df_zephyr": "3DF Zephyr",
    "adobe_photoshop_2023": "Adobe Photoshop 2023",
    "agisoft_metashape": "Agisoft Metashape",
    "artec_eva": "Artec Eva",
    "artec_leo": "Artec Leo",
    "artec_spider": "Artec Spider",
    "artec_studio_14": "Artec Studio 14",
    "artec_studio_15": "Artec Studio 15",
    "artec_studio_16": "Artec Studio 16",
    "artec_studio_19": "Artec Studio 19",
    "aton": "ATON",
    "blender": "Blender",
    "canon_eos_6d": "Canon EOS 6D",
    "chad-ap": "CHAD-AP",
    "cloudcompare": "CloudCompare",
    "gestaltor": "Gestaltor",
    "gimp": "GIMP",
    "heritrace": "HERITRACE",
    "instalod": "InstaLOD",
    "instant_meshes": "Instant Meshes",
    "lente_24-70_f2_8_l": "Canon EF 24-70mm f/2.8L",
    "meshlab": "MeshLab",
    # NOTE(review): "metashape" and "agisoft_metashape" both map to the same
    # label — presumably two slug spellings exist in the KG; confirm.
    "metashape": "Agisoft Metashape",
    "microscopio_digitale_bresciani": "Microscopio Digitale Bresciani",
    "modo": "Modo",
    "morph-kgc": "Morph-KGC",
    "nextcloud": "Nextcloud",
    "nikkor_35mm": "Nikkor 35mm",
    "nikkor_50mm": "Nikkor 50mm",
    "nikon_d3300": "Nikon D3300",
    "nikon_d5200": "Nikon D5200",
    "nikon_d5600": "Nikon D5600",
    "nikon_d7200": "Nikon D7200",
    "nikon_d750": "Nikon D750",
    "panasonic_dmc-lx100": "Panasonic DMC-LX100",
    "sony_alpha_6100": "Sony Alpha 6100",
    "sony_alpha_7_i": "Sony Alpha 7 I",
    "substance_3d_painter": "Substance 3D Painter",
    "substance_painter": "Substance Painter",
    "zbrush": "ZBrush",
}

122 

123 

def _format_slug(slug: str) -> str:
    """Return the human-readable label for a device/software slug.

    Falls back to a humanized form of the slug (underscores replaced by
    spaces) when the slug is missing from SLUG_LABELS, instead of raising
    KeyError and aborting the whole packaging run on a single unknown entry.
    """
    return SLUG_LABELS.get(slug, slug.replace("_", " "))

126 

127 

def load_creators_lookup(path: Path) -> dict[str, dict]:
    """Load the creators YAML file and index its records by RDF author name.

    Each value holds the family name, given name, affiliation and ORCID
    needed to build a Zenodo creator entry.
    """
    with open(path) as handle:
        data = yaml.safe_load(handle)
    lookup: dict[str, dict] = {}
    for creator in data["creators"]:
        lookup[creator["name_in_rdf"]] = {
            "family_name": creator["family_name"],
            "given_name": creator["given_name"],
            "affiliation": creator["affiliation"],
            "orcid": creator["orcid"],
        }
    return lookup

140 

141 

142def _format_creator(creator_data: dict, role: str) -> dict: 

143 return { 

144 "person_or_org": { 

145 "type": "personal", 

146 "family_name": creator_data["family_name"], 

147 "given_name": creator_data["given_name"], 

148 "identifiers": [{"scheme": "orcid", "identifier": creator_data["orcid"]}], 

149 }, 

150 "role": {"id": role}, 

151 "affiliations": [{"name": creator_data["affiliation"]}], 

152 } 

153 

154 

# Step "05" is the metadata-authoring step; its actors are credited as data
# curators rather than digitization researchers.
METADATA_STEP = "05"

156 

157 

def _extract_actor_names(graph: Graph, act_uri: URIRef) -> set[str]:
    """Collect the names of all E21 Person actors that carried out *act_uri*.

    Follows P14_carried_out_by -> P1_is_identified_by ->
    P190_has_symbolic_content and gathers the symbolic contents as strings.

    Raises:
        ValueError: if an actor referenced by the activity is not typed as
            E21_Person. (Previously enforced with ``assert``, which is
            silently stripped when Python runs with ``-O``.)
    """
    names: set[str] = set()
    for _, _, actor_uri in graph.triples((act_uri, P14_CARRIED_OUT_BY, None)):
        if (actor_uri, RDF_TYPE, E21_PERSON) not in graph:
            raise ValueError(
                f"Actor {actor_uri} of activity {act_uri} is not typed as E21_Person"
            )
        for _, _, apl_uri in graph.triples((actor_uri, P1_IS_IDENTIFIED_BY, None)):
            for _, _, name in graph.triples((apl_uri, P190_HAS_SYMBOLIC_CONTENT, None)):
                names.add(str(name))
    return names

166 

167 

def extract_authors_for_entity_stage(graph: Graph, entity_id: str, stage: str) -> set[str]:
    """Union of actor names across the stage's steps, metadata step excluded."""
    authors: set[str] = set()
    for step in STAGE_STEPS[stage]:
        if step == METADATA_STEP:
            continue
        act_uri = URIRef(f"{BASE_URI}/act/{entity_id}/{step}/1")
        authors |= _extract_actor_names(graph, act_uri)
    return authors

174 

175 

def extract_metadata_authors(graph: Graph, entity_id: str) -> set[str]:
    """Names of the actors who carried out the metadata-authoring step.

    Uses METADATA_STEP rather than a hard-coded "05" so the step number is
    defined in exactly one place (it was duplicated here before).
    """
    act_uri = URIRef(f"{BASE_URI}/act/{entity_id}/{METADATA_STEP}/1")
    return _extract_actor_names(graph, act_uri)

178 

179 

def build_creators_for_entity_stage(
    graph: Graph, entity_id: str, stage: str, creators_lookup: dict[str, dict]
) -> list[dict]:
    """Zenodo 'researcher' creator entries for a stage's digitization authors.

    Authors missing from the lookup are silently skipped; output is sorted by
    RDF author name.
    """
    names = extract_authors_for_entity_stage(graph, entity_id, stage)
    known = [name for name in sorted(names) if name in creators_lookup]
    return [_format_creator(creators_lookup[name], "researcher") for name in known]

189 

190 

def build_metadata_creators(
    graph: Graph, entity_id: str, creators_lookup: dict[str, dict]
) -> list[dict]:
    """Zenodo 'datacurator' creator entries for the entity's metadata authors.

    Authors missing from the lookup are silently skipped; output is sorted by
    RDF author name.
    """
    names = extract_metadata_authors(graph, entity_id)
    known = [name for name in sorted(names) if name in creators_lookup]
    return [_format_creator(creators_lookup[name], "datacurator") for name in known]

200 

201 

def merge_creators(digitization_creators: list[dict], metadata_creators: list[dict]) -> list[dict]:
    """Concatenate creator lists, dropping metadata creators whose ORCID
    already appears; digitization creators always take precedence."""
    def _orcid(creator: dict) -> str:
        return creator["person_or_org"]["identifiers"][0]["identifier"]

    merged = list(digitization_creators)
    seen = {_orcid(creator) for creator in digitization_creators}
    for candidate in metadata_creators:
        candidate_orcid = _orcid(candidate)
        if candidate_orcid in seen:
            continue
        seen.add(candidate_orcid)
        merged.append(candidate)
    return merged

215 

216 

def extract_licensed_entity_stages(graph: Graph) -> set[tuple[str, str]]:
    """Return (entity_id, stage) pairs whose license act is documented in the KG.

    License activity URIs look like {BASE_URI}/lic/<entity>/<step>/1; the step
    is mapped to its stage via STEP_TO_STAGE (unknown steps are ignored).
    """
    lic_pattern = re.compile(rf"^{re.escape(BASE_URI)}/lic/([^/]+)/(\d{{2}})/1$")
    licensed: set[tuple[str, str]] = set()
    for subject, _, _ in graph.triples((None, P70I, None)):
        match = lic_pattern.match(str(subject))
        if match is None:
            continue
        entity_id, step = match.groups()
        stage = STEP_TO_STAGE.get(step)
        if stage is not None:
            licensed.add((entity_id, stage))
    return licensed

228 

229 

def group_folders_by_entity(structure: dict) -> dict[str, list[tuple[str, str, dict]]]:
    """Group (sala, folder, subfolders) triples by base entity id.

    Folder ids carrying a trailing lowercase-letter suffix (e.g. several parts
    of one physical object) are grouped under the un-suffixed id — unless the
    suffixed id itself appears verbatim in FOLDER_TO_ID.
    """
    groups = defaultdict(list)
    known_ids = set(FOLDER_TO_ID.values())
    for sala_name, sala_items in structure["structure"].items():
        for folder_name, subfolders in sala_items.items():
            if folder_name in SKIP_FOLDERS:
                continue
            entity_id = extract_id_from_folder_name(folder_name)
            if entity_id not in known_ids:
                entity_id = entity_id.rstrip("abcdefghijklmnopqrstuvwxyz")
            groups[entity_id].append((sala_name, folder_name, subfolders))
    return dict(groups)

243 

244 

# Pipeline stages packaged (and uploaded) per entity, in order.
STAGES = ("raw", "rawp", "dcho", "dchoo")

246 

247 

def create_stage_zip(
    entity_id: str,
    stage: str,
    folders: list[tuple[str, str, dict]],
    root: Path,
    licensed_stages: set[tuple[str, str]],
    output_dir: Path,
    title: str,
) -> tuple[Path, bool] | None:
    """Zip one entity's files for *stage* into output_dir.

    Without a license for (entity_id, stage), only the metadata files
    (meta.ttl, prov.trig) are packaged; with one, every regular file in the
    stage directory is included. Subdirectories are not descended into.

    Returns:
        (zip_path, has_license), or None when no file matched (the empty
        archive is deleted in that case).
    """
    # Archive name derives from the first folder's sala plus the entity title.
    sala_name = folders[0][0]
    sala_slug = slugify(sala_name)
    title_slug = slugify(title)
    zip_path = output_dir / f"{sala_slug}-{title_slug}-{stage}.zip"
    has_files = False
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for sala_name, folder_name, stages_dict in folders:
            # Stage directory names on disk may differ in case from the
            # canonical stage id; match case-insensitively.
            stage_name_in_folder = None
            for name in stages_dict:
                if name.lower() == stage:
                    stage_name_in_folder = name
                    break
            if stage_name_in_folder is None:
                continue
            stage_dir = root / sala_name / folder_name / stage_name_in_folder
            has_license = (entity_id, stage) in licensed_stages
            for file_path in stage_dir.iterdir():
                if not file_path.is_file():
                    continue
                if has_license or file_path.name in ("meta.ttl", "prov.trig"):
                    # Keep the folder/stage layout inside the archive.
                    arc_name = f"{folder_name}/{stage_name_in_folder}/{file_path.name}"
                    zf.write(file_path, arc_name)
                    has_files = True
    if not has_files:
        # Nothing matched: remove the empty zip created above.
        zip_path.unlink()
        return None
    return zip_path, (entity_id, stage) in licensed_stages

284 

285 

def _get_label(graph: Graph, uri: URIRef) -> str | None:
    """First P190 symbolic content reachable from *uri* via P1, or None."""
    for _, _, appellation in graph.triples((uri, P1_IS_IDENTIFIED_BY, None)):
        for _, _, content in graph.triples((appellation, P190_HAS_SYMBOLIC_CONTENT, None)):
            return str(content)
    return None

291 

292 

def extract_keeper_info(graph: Graph, entity_id: str) -> tuple[str | None, str | None]:
    """Return (keeper_name, location_name) for the entity's custody act (ob08).

    Only the first keeper found is considered; the location is the label of
    the keeper's (last listed) residence. Returns (None, None) when no
    custody activity is recorded.
    """
    custody_uri = URIRef(f"{BASE_URI}/act/{entity_id}/ob08/1")
    for _, _, keeper_uri in graph.triples((custody_uri, P14_CARRIED_OUT_BY, None)):
        assert isinstance(keeper_uri, URIRef)
        keeper_name = _get_label(graph, keeper_uri)
        location_name = None
        for _, _, place_uri in graph.triples((keeper_uri, P74_HAS_RESIDENCE, None)):
            assert isinstance(place_uri, URIRef)
            location_name = _get_label(graph, place_uri)
        # Return after the first keeper — additional keepers are ignored.
        return keeper_name, location_name
    return None, None

304 

305 

def extract_entity_title(graph: Graph, entity_id: str) -> str:
    """First line of the entity's ob00 item note, or a placeholder title."""
    item_uri = URIRef(f"{BASE_URI}/itm/{entity_id}/ob00/1")
    for _, _, note in graph.triples((item_uri, P3_HAS_NOTE, None)):
        first_line, _, _ = str(note).partition("\n")
        return first_line.strip()
    return f"Entity {entity_id}"

312 

313 

def extract_acquisition_technique(graph: Graph, entity_id: str) -> str | None:
    """Label of the AAT technique used in the acquisition step (00), or None.

    Returns None both when the step records no technique and when the
    technique URI is not a known AAT_TECHNIQUE_LABELS entry (previously an
    unknown URI raised KeyError and aborted the whole run, despite the
    declared `str | None` return type).
    """
    act_uri = URIRef(f"{BASE_URI}/act/{entity_id}/00/1")
    for _, _, technique_uri in graph.triples((act_uri, P32_USED_GENERAL_TECHNIQUE, None)):
        return AAT_TECHNIQUE_LABELS.get(str(technique_uri))
    return None

319 

320 

def extract_devices(graph: Graph, entity_id: str) -> list[str]:
    """Sorted labels of the /dev/ objects used by the acquisition step (00)."""
    act_uri = URIRef(f"{BASE_URI}/act/{entity_id}/00/1")
    labels = []
    for _, _, obj_uri in graph.triples((act_uri, P16_USED_SPECIFIC_OBJECT, None)):
        uri = str(obj_uri)
        if "/dev/" not in uri:
            continue
        slug = uri.split("/dev/", 1)[1].split("/", 1)[0]
        labels.append(_format_slug(slug))
    return sorted(labels)

330 

331 

def extract_software_for_stage(graph: Graph, entity_id: str, stage: str) -> list[str]:
    """Sorted labels of software used across a stage's steps (metadata step excluded).

    Skips used-object URIs that do not contain "/sfw/", mirroring the
    "/dev/" guard in extract_devices; previously such a URI raised
    IndexError on the split.
    """
    steps = [s for s in STAGE_STEPS[stage] if s != METADATA_STEP]
    software: set[str] = set()
    for step in steps:
        act_uri = URIRef(f"{BASE_URI}/act/{entity_id}/{step}/1")
        for _, _, sfw_uri in graph.triples((act_uri, L23_USED_SOFTWARE, None)):
            uri_str = str(sfw_uri)
            if "/sfw/" not in uri_str:
                continue
            slug = uri_str.split("/sfw/")[1].split("/")[0]
            software.add(_format_slug(slug))
    return sorted(software)

342 

343 

# Creative Commons license URLs (as stored in the KG) -> Zenodo license ids.
LICENSE_URI_TO_ZENODO = {
    "https://creativecommons.org/publicdomain/zero/1.0/": "cc0-1.0",
    "https://creativecommons.org/licenses/by/4.0/": "cc-by-4.0",
    "https://creativecommons.org/licenses/by-nc/4.0/": "cc-by-nc-4.0",
    "https://creativecommons.org/licenses/by-sa/4.0/": "cc-by-sa-4.0",
    "https://creativecommons.org/licenses/by-nc-sa/4.0/": "cc-by-nc-sa-4.0",
}

# Stage names as used in the generated record titles.
STAGE_TITLE_NAMES = {
    "raw": "Raw",
    "rawp": "Processed raw model",
    "dcho": "Digital Cultural Heritage Object",
    "dchoo": "Optimized Digital Cultural Heritage Object",
}

# Stage names as used at the start of the record descriptions.
STAGE_DESCRIPTION_NAMES = {
    "raw": "Raw acquisition data",
    "rawp": "Processed raw model",
    "dcho": "Digital Cultural Heritage Object",
    "dchoo": "Optimized Digital Cultural Heritage Object",
}

# One-sentence body text for each stage's dataset description.
STAGE_DESCRIPTIONS = {
    "raw": "This dataset contains the raw material generated during the acquisition phase.",
    "rawp": "This dataset contains the preliminary output from the photogrammetry or scanner software after initial data processing but without any interpolation or geometry corrections.",
    "dcho": "This dataset contains the version that includes interpolation, gap filling, and resolution of geometric issues, resulting in a refined and improved model.",
    "dchoo": "This dataset contains the version optimised for real-time online interaction.",
}

# Keys copied verbatim from the base config into each generated record config
# (only when the generated config does not already define them).
PROPAGATED_FIELDS = (
    "zenodo_url", "access_token", "user_agent",
    "subjects", "publication_date",
    "version", "community",
    "contributors", "funding",
    "references", "dates",
)

380 

381 

def extract_license_for_entity_stage(graph: Graph, entity_id: str, stage: str) -> str | None:
    """First Zenodo license id documented for any step of the stage, or None.

    Steps are inspected in STAGE_STEPS order; unknown license URLs are
    skipped.
    """
    for step in STAGE_STEPS[stage]:
        lic_uri = URIRef(f"{BASE_URI}/lic/{entity_id}/{step}/1")
        for _, _, license_url in graph.triples((lic_uri, P70I, None)):
            mapped = LICENSE_URI_TO_ZENODO.get(str(license_url))
            if mapped is not None:
                return mapped
    return None

391 

392 

# Extra note appended to CC0 records: Italian cultural-heritage law may still
# restrict certain (especially commercial) uses of the reproductions.
CC0_DISCLAIMER = (
    "No copyright or related rights are claimed in these digital reproductions. "
    "The files are released under CC0 1.0 Universal (Public Domain Dedication).\n"
    "\n"
    "Please note that the original works may qualify as cultural heritage assets "
    "under Italian law (D. Lgs. 42/2004). Consequently, although the digital "
    "reproductions are released under CC0, certain uses — and in particular "
    "commercial uses — may be subject to specific authorisations, restrictions, "
    "or fees pursuant to the applicable provisions governing the reproduction "
    "and publication of cultural heritage assets. Users are therefore responsible "
    "for ensuring compliance with Italian cultural heritage regulations before "
    "undertaking any commercial exploitation of the images."
)


# Persistent identifier of the CHAD-AP application profile linked from the
# record descriptions.
CHAD_AP_URL = "https://w3id.org/dharc/ontology/chad-ap"

# Note appended to records for stages the holding institution did not license:
# such packages carry only meta.ttl and prov.trig.
RESTRICTED_NOTICE = (
    "The digital object files are not included in this dataset "
    "because the holding institution did not grant permission for their publication. "
    "Only metadata and provenance files are provided."
)

415 

416 

def build_enhanced_description(
    stage: str,
    title: str,
    keeper_name: str | None = None,
    keeper_location: str | None = None,
) -> str:
    """Assemble the main Zenodo description for one stage package.

    Mentions the holding institution (with optional location) when known and
    ends with a pointer to the CHAD-AP metadata/provenance files.
    """
    sentences = [
        f'{STAGE_DESCRIPTION_NAMES[stage]} of "{title}" from the Aldrovandi Digital Twin.',
    ]
    if keeper_name:
        location_suffix = f" ({keeper_location})" if keeper_location else ""
        sentences.append(f"The original object is held by {keeper_name}{location_suffix}.")
    sentences.append(STAGE_DESCRIPTIONS[stage])
    sentences.append(
        f"Includes metadata (meta.ttl) and provenance (prov.trig) files following the <a href=\"{CHAD_AP_URL}\">CHAD-AP</a> ontology.",
    )
    return " ".join(sentences) + "\n"

437 

438 

# DOI of the published paper documenting the digitization workflow.
WORKFLOW_DOI_URL = "https://doi.org/10.46298/transformations.14773"

440 

441 

def build_methods_description(
    graph: Graph,
    entity_id: str,
    stage: str,
) -> str:
    """Assemble the 'methods' additional description for one stage package.

    Paragraphs: workflow reference, acquisition technique/devices (if
    recorded), processing software for the stage (if any), CHAD-AP pointer.
    """
    paragraphs = [
        f'Acquisition and digitization followed the reproducible workflow documented in '
        f'<a href="{WORKFLOW_DOI_URL}">doi:10.46298/transformations.14773</a>.',
    ]
    technique = extract_acquisition_technique(graph, entity_id)
    devices = extract_devices(graph, entity_id)
    if technique:
        device_suffix = f" ({', '.join(devices)})" if devices else ""
        paragraphs.append(f"Data was acquired using {technique}{device_suffix}.")
    software = extract_software_for_stage(graph, entity_id, stage)
    if software:
        paragraphs.append(f"Processing software: {', '.join(software)}.")
    paragraphs.append(
        f'Metadata follows the <a href="{CHAD_AP_URL}">Cultural Heritage Acquisition and '
        f"Digitisation Application Profile (CHAD-AP)</a> based on CIDOC-CRM.",
    )
    return "\n\n".join(paragraphs) + "\n"

467 

468 

def build_entity_uri(entity_id: str) -> str:
    """Canonical ob00 item URI for an entity, used as the record's url identifier."""
    return f"{BASE_URI}/itm/{entity_id}/ob00/1"

471 

472 

# Zenodo license id -> full title and canonical link, used to build the
# per-record "rights" entries.
LICENSE_INFO = {
    "cc0-1.0": {
        "title": "Creative Commons Zero v1.0 Universal",
        "link": "https://creativecommons.org/publicdomain/zero/1.0/",
    },
    "cc-by-4.0": {
        "title": "Creative Commons Attribution 4.0 International",
        "link": "https://creativecommons.org/licenses/by/4.0/",
    },
    "cc-by-nc-4.0": {
        "title": "Creative Commons Attribution Non Commercial 4.0 International",
        "link": "https://creativecommons.org/licenses/by-nc/4.0/",
    },
    "cc-by-sa-4.0": {
        "title": "Creative Commons Attribution Share Alike 4.0 International",
        "link": "https://creativecommons.org/licenses/by-sa/4.0/",
    },
    "cc-by-nc-sa-4.0": {
        "title": "Creative Commons Attribution Non Commercial Share Alike 4.0 International",
        "link": "https://creativecommons.org/licenses/by-nc-sa/4.0/",
    },
}

495 

496 

def build_rights(content_license: str | None) -> list[dict]:
    """Zenodo rights entries: CC0 for metadata files, plus the content license.

    The content entry is only added when *content_license* is a known
    LICENSE_INFO id.
    """
    def _entry(license_id: str, context: str, files_note: str) -> dict:
        info = LICENSE_INFO[license_id]
        return {
            "title": {"en": f"{info['title']} ({context})"},
            "description": {"en": files_note},
            "link": info["link"],
        }

    rights = [
        _entry("cc0-1.0", "Metadata license", "Applies to metadata files: meta.ttl, prov.trig"),
    ]
    if content_license in LICENSE_INFO:
        rights.append(
            _entry(
                content_license,
                "Content license",
                "Applies to all data files except meta.ttl and prov.trig",
            )
        )
    return rights

512 

513 

def generate_zenodo_config(
    stage: str,
    zip_path: Path,
    title: str,
    base_config: dict,
    creators: list[dict],
    methods_description: str,
    license: str | None = None,
    entity_uri: str | None = None,
    keeper_name: str | None = None,
    keeper_location: str | None = None,
    has_license: bool = True,
) -> dict:
    """Build the full Zenodo deposition config for one stage package.

    Args:
        stage: one of STAGES.
        zip_path: archive produced by create_stage_zip (stored as an
            absolute path in the config).
        base_config: base YAML config; supplies notes, locations, propagated
            fields, and optional related identifiers.
        license: Zenodo content-license id, or None when none is known.
        entity_uri: canonical item URI recorded as a "url" identifier.
        has_license: False when only metadata files were packaged; adds the
            RESTRICTED_NOTICE to the record.

    Returns:
        A dict ready to be dumped as a piccione upload YAML config.
    """
    description = build_enhanced_description(stage, title, keeper_name, keeper_location)

    config: dict = {
        "title": f"{title} - {STAGE_TITLE_NAMES[stage]} - Aldrovandi Digital Twin",
        "description": description,
        "resource_type": {"id": "dataset"},
        "publisher": "Zenodo",
        "access": {"record": "public", "files": "public"},
        "files": [str(zip_path.absolute())],
        "creators": creators,
        "publication_date": date.today().isoformat(),
        "rights": build_rights(license),
    }

    # Methods and general notes always present; extra notes appended below.
    additional_descriptions: list[dict] = [
        {
            "description": methods_description,
            "type": {"id": "methods"},
        },
        {
            "description": base_config["notes"],
            "type": {"id": "notes"},
        },
    ]
    if not has_license:
        # Content files were withheld — explain why only metadata is present.
        additional_descriptions.append({
            "description": RESTRICTED_NOTICE,
            "type": {"id": "notes"},
        })
    if license == "cc0-1.0":
        # CC0 records additionally carry the Italian heritage-law disclaimer.
        additional_descriptions.append({
            "description": CC0_DISCLAIMER,
            "type": {"id": "notes"},
        })
    config["additional_descriptions"] = additional_descriptions

    # Convert the flat base-config locations into GeoJSON-style features.
    config["locations"] = {
        "features": [
            {
                "geometry": {
                    "type": "Point",
                    "coordinates": [loc["lon"], loc["lat"]],
                },
                "place": loc["place"],
                "description": loc["description"],
            }
            for loc in base_config["locations"]
        ]
    }

    # Copy shared fields from the base config without overwriting anything
    # already set above (e.g. publication_date from the base config wins only
    # if... it cannot: config already defines it, so the generated date stays).
    for field in PROPAGATED_FIELDS:
        if field in base_config and field not in config:
            config[field] = base_config[field]

    # Base-config related identifiers use a flat schema; convert to the
    # nested InvenioRDM form.
    if "related_identifiers" in base_config:
        converted = []
        for ri in base_config["related_identifiers"]:
            entry: dict = {
                "identifier": ri["identifier"],
                "relation_type": {"id": ri["relation"]},
            }
            if "resource_type" in ri:
                entry["resource_type"] = {"id": ri["resource_type"]}
            if "scheme" in ri:
                entry["scheme"] = ri["scheme"]
            converted.append(entry)
        config["related_identifiers"] = converted

    if entity_uri:
        config["identifiers"] = [{"identifier": entity_uri, "scheme": "url"}]

    return config

599 

600 

def prepare_all(
    root: Path,
    zenodo_base_config_path: Path,
    output_dir: Path,
    kg_path: Path = KG_PATH,
) -> None:
    """Create one zip + YAML upload config per (entity, stage) under output_dir.

    Scans the Sala/Folder/Stage tree at *root*, reads the knowledge graph for
    titles, creators, keepers and licenses, and writes results into
    output_dir/zips and output_dir/configs. Stages without any packagable
    file are skipped.
    """
    structure = scan_folder_structure(root)

    kg = load_kg(kg_path)
    licensed_stages = extract_licensed_entity_stages(kg)
    entity_groups = group_folders_by_entity(structure)

    with open(zenodo_base_config_path) as f:
        base_config = yaml.safe_load(f)

    creators_lookup = load_creators_lookup(CREATORS_LOOKUP_PATH)

    zips_dir = output_dir / "zips"
    configs_dir = output_dir / "configs"
    zips_dir.mkdir(parents=True, exist_ok=True)
    configs_dir.mkdir(parents=True, exist_ok=True)

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
    ) as progress:
        # One task tick per (entity, stage) pair, even for skipped stages.
        task = progress.add_task("Creating stage packages", total=len(entity_groups) * len(STAGES))

        for entity_id, folders in entity_groups.items():
            # Entity-level data is computed once and reused across stages.
            title = extract_entity_title(kg, entity_id)
            keeper_name, keeper_location = extract_keeper_info(kg, entity_id)
            sala_slug = slugify(folders[0][0])
            title_slug = slugify(title)
            metadata_creators = build_metadata_creators(kg, entity_id, creators_lookup)
            for stage in STAGES:
                progress.update(task, description=f"Entity {entity_id} - {stage}")
                result = create_stage_zip(entity_id, stage, folders, root, licensed_stages, zips_dir, title)
                if result is None:
                    # No files for this stage: nothing to upload.
                    progress.advance(task)
                    continue
                zip_path, has_license = result
                digitization_creators = build_creators_for_entity_stage(kg, entity_id, stage, creators_lookup)
                creators = merge_creators(digitization_creators, metadata_creators)
                license = extract_license_for_entity_stage(kg, entity_id, stage)
                entity_uri = build_entity_uri(entity_id)
                methods_description = build_methods_description(kg, entity_id, stage)
                config = generate_zenodo_config(stage, zip_path, title, base_config, creators, methods_description, license, entity_uri, keeper_name, keeper_location, has_license)
                # Config filename mirrors the zip filename (different suffix).
                config_path = configs_dir / f"{sala_slug}-{title_slug}-{stage}.yaml"
                with open(config_path, "w") as f:
                    yaml.dump(config, f, Dumper=LiteralBlockDumper, default_flow_style=False, allow_unicode=True, sort_keys=False)
                progress.advance(task)

654 

655 

656def _extract_doi(record: dict) -> str: 

657 pids = record.get("pids", {}) 

658 doi_info = pids.get("doi", {}) 

659 return doi_info.get("identifier", "") 

660 

661 

662def _extract_record_url(record: dict) -> str: 

663 return record["links"]["self_html"] 

664 

665 

# Inverse of LICENSE_INFO titles: full license title -> short Zenodo id, used
# when rendering the DOI table's license column.
LICENSE_TITLE_TO_SHORT: dict[str, str] = {
    info["title"]: short_name for short_name, info in LICENSE_INFO.items()
}

669 

670 

671def _format_creators_for_table(config: dict) -> str: 

672 creators = config["creators"] 

673 parts: list[str] = [] 

674 for c in creators: 

675 org = c["person_or_org"] 

676 orcid = org["identifiers"][0]["identifier"] 

677 parts.append(f"{org['family_name']}, {org['given_name']} [orcid:{orcid}]") 

678 return "; ".join(parts) 

679 

680 

def _format_licenses_for_table(config: dict) -> str:
    """Render rights entries as 'short-id (context)' joined by '; '.

    Each rights title looks like "<full license title> (<context>)"; the full
    title is matched back to its short Zenodo id. Unknown titles are skipped.
    """
    rendered: list[str] = []
    for right in config["rights"]:
        title_en = right["title"]["en"]
        for full_name, short_name in LICENSE_TITLE_TO_SHORT.items():
            if not title_en.startswith(full_name):
                continue
            context = title_en.removeprefix(full_name).strip(" ()")
            rendered.append(f"{short_name} ({context})")
            break
    return "; ".join(rendered)

691 

692 

# Column order for the generated doi_table.csv (Italian headers required by
# the project's data-management-plan reporting template).
DOI_TABLE_FIELDNAMES = [
    "Numero su DMP",
    "Caso di studio",
    "Autore/i",
    "Tipo",
    "Titolo",
    "Data pubblicazione",
    "DOI",
    "URL",
    "Repository",
    "Licenza",
    "Note",
]

706 

707 

def upload_all(configs_dir: Path, publish: bool = False) -> Path:
    """Upload every YAML config in *configs_dir* to Zenodo and write a DOI table.

    Args:
        configs_dir: directory containing the per-stage upload configs.
        publish: when True, records are published after upload; otherwise
            they remain drafts.

    Returns:
        Path to the generated doi_table.csv, written next to configs_dir.
    """
    config_files = sorted(configs_dir.glob("*.yaml"))
    doi_table: list[dict[str, str]] = []

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
    ) as progress:
        task = progress.add_task("Uploading to Zenodo", total=len(config_files))
        for config_file in config_files:
            progress.update(task, description=f"Uploading {config_file.stem}")
            record = piccione_upload(str(config_file), publish=publish)
            # Re-read the config so the table row reflects exactly what was
            # uploaded. Explicit UTF-8: configs contain non-ASCII creator
            # names, which would break under a non-UTF-8 locale encoding.
            with open(config_file, encoding="utf-8") as f:
                config = yaml.safe_load(f)
            row: dict[str, str] = {
                "Numero su DMP": "",
                "Caso di studio": "Aldrovandi",
                "Autore/i": _format_creators_for_table(config),
                "Tipo": "Dataset",
                "Titolo": config["title"],
                "Data pubblicazione": config["publication_date"],
                "DOI": _extract_doi(record),
                "URL": _extract_record_url(record),
                "Repository": "Zenodo",
                "Licenza": _format_licenses_for_table(config),
                "Note": "",
            }
            doi_table.append(row)
            progress.advance(task)

    csv_path = configs_dir.parent / "doi_table.csv"
    # Explicit UTF-8 for the same reason as above (e.g. Windows cp1252 would
    # raise UnicodeEncodeError on accented names).
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=DOI_TABLE_FIELDNAMES)
        writer.writeheader()
        writer.writerows(doi_table)
    print(f"DOI table written to {csv_path}")
    return csv_path

747 

748 

def parse_arguments():  # pragma: no cover
    """Build and evaluate the CLI: 'prepare' and 'upload' subcommands."""
    parser = argparse.ArgumentParser(description="Prepare and upload Zenodo packages")
    subparsers = parser.add_subparsers(dest="command", required=True)

    prepare = subparsers.add_parser("prepare", help="Create zips and YAML configs")
    prepare.add_argument("root", type=Path, help="Root directory with Sala/Folder/Stage structure")
    prepare.add_argument("zenodo_config", type=Path, help="Base Zenodo configuration YAML")
    prepare.add_argument("--output", "-o", type=Path, default=Path("zenodo_output"), help="Output directory")

    upload = subparsers.add_parser("upload", help="Upload to Zenodo")
    upload.add_argument("configs_dir", type=Path, help="Directory containing YAML configs")
    upload.add_argument("--publish", action="store_true", help="Publish after upload")

    return parser.parse_args()

762 

763 

764def main(): # pragma: no cover 

765 args = parse_arguments() 

766 if args.command == "prepare": 

767 prepare_all( 

768 root=args.root, 

769 zenodo_base_config_path=args.zenodo_config, 

770 output_dir=args.output, 

771 ) 

772 elif args.command == "upload": 

773 upload_all(configs_dir=args.configs_dir, publish=args.publish) 

774 

775 

# Script entry point.
if __name__ == "__main__":  # pragma: no cover
    main()