Coverage for changes_metadata_manager / zenodo_upload.py: 82%
348 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-03-04 14:41 +0000
1import argparse
2import csv
3import re
4import unicodedata
5import zipfile
6from collections import defaultdict
7from datetime import date
8from pathlib import Path
10import yaml
11from rdflib import Graph, URIRef
12from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, MofNCompleteColumn
14from piccione.upload.on_zenodo import main as piccione_upload
16from changes_metadata_manager.folder_metadata_builder import (
17 BASE_URI,
18 FOLDER_TO_ID,
19 KG_PATH,
20 SKIP_FOLDERS,
21 STAGE_STEPS,
22 extract_id_from_folder_name,
23 load_kg,
24 scan_folder_structure,
25)
class LiteralBlockDumper(yaml.SafeDumper):
    """SafeDumper subclass used as an attachment point for the custom
    string representer registered below, so multi-line strings are emitted
    as YAML literal blocks ("|") without altering yaml.SafeDumper globally."""
    pass
def _literal_str_representer(dumper: yaml.SafeDumper, data):
    """Represent multi-line strings with YAML literal block style ("|")."""
    block_style = "|" if "\n" in data else None
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=block_style)
# Register the literal-block representer on the custom dumper only.
LiteralBlockDumper.add_representer(str, _literal_str_representer)

# YAML file mapping RDF actor names to Zenodo creator metadata
# (family/given name, affiliation, ORCID); parsed by load_creators_lookup().
CREATORS_LOOKUP_PATH = Path(__file__).parent.parent / "data" / "creators_lookup.yaml"
def slugify(text: str) -> str:
    """Convert *text* to a lowercase, hyphen-separated ASCII slug."""
    ascii_text = (
        unicodedata.normalize("NFKD", text)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    cleaned = re.sub(r"[^\w\s-]", "", ascii_text.lower())
    return re.sub(r"[-\s]+", "-", cleaned).strip("-")
# Maps two-digit workflow step numbers to the dataset stage each step
# contributes to (several dchoo optimisation steps share one stage).
STEP_TO_STAGE = {
    "00": "raw",
    "01": "rawp",
    "02": "dcho",
    "03": "dchoo",
    "04": "dchoo",
    "05": "dchoo",
    "06": "dchoo",
}

# Vocabulary namespaces: CIDOC-CRM, its CRMdig extension, and the Getty AAT.
CRM = "http://www.cidoc-crm.org/cidoc-crm/"
CRMDIG = "http://www.cidoc-crm.org/extensions/crmdig/"
AAT = "http://vocab.getty.edu/aat/"

# Pre-built URIRefs for the CIDOC-CRM/CRMdig predicates and classes
# queried throughout this module.
P70I = URIRef(f"{CRM}P70i_is_documented_in")
P3_HAS_NOTE = URIRef(f"{CRM}P3_has_note")
P14_CARRIED_OUT_BY = URIRef(f"{CRM}P14_carried_out_by")
P1_IS_IDENTIFIED_BY = URIRef(f"{CRM}P1_is_identified_by")
P190_HAS_SYMBOLIC_CONTENT = URIRef(f"{CRM}P190_has_symbolic_content")
P74_HAS_RESIDENCE = URIRef(f"{CRM}P74_has_current_or_former_residence")
P32_USED_GENERAL_TECHNIQUE = URIRef(f"{CRM}P32_used_general_technique")
P16_USED_SPECIFIC_OBJECT = URIRef(f"{CRM}P16_used_specific_object")
E21_PERSON = URIRef(f"{CRM}E21_Person")
RDF_TYPE = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
L23_USED_SOFTWARE = URIRef(f"{CRMDIG}L23_used_software_or_firmware")

# Human-readable labels for the AAT technique URIs encountered in the KG;
# extract_acquisition_technique() raises KeyError for any other URI.
AAT_TECHNIQUE_LABELS: dict[str, str] = {
    f"{AAT}300266792": "digital photography",
    f"{AAT}300429747": "optical scanning",
}
# Display labels for device (/dev/) and software (/sfw/) URI slugs found in
# the knowledge graph; _format_slug() raises KeyError for unknown slugs so
# new tools surface loudly instead of being silently mislabelled.
SLUG_LABELS: dict[str, str] = {
    "3df_zephyr": "3DF Zephyr",
    "adobe_photoshop_2023": "Adobe Photoshop 2023",
    "agisoft_metashape": "Agisoft Metashape",
    "artec_eva": "Artec Eva",
    "artec_leo": "Artec Leo",
    "artec_spider": "Artec Spider",
    "artec_studio_14": "Artec Studio 14",
    "artec_studio_15": "Artec Studio 15",
    "artec_studio_16": "Artec Studio 16",
    "artec_studio_19": "Artec Studio 19",
    "aton": "ATON",
    "blender": "Blender",
    "canon_eos_6d": "Canon EOS 6D",
    "chad-ap": "CHAD-AP",
    "cloudcompare": "CloudCompare",
    "gestaltor": "Gestaltor",
    "gimp": "GIMP",
    "heritrace": "HERITRACE",
    "instalod": "InstaLOD",
    "instant_meshes": "Instant Meshes",
    "lente_24-70_f2_8_l": "Canon EF 24-70mm f/2.8L",
    "meshlab": "MeshLab",
    "metashape": "Agisoft Metashape",
    "microscopio_digitale_bresciani": "Microscopio Digitale Bresciani",
    "modo": "Modo",
    "morph-kgc": "Morph-KGC",
    "nextcloud": "Nextcloud",
    "nikkor_35mm": "Nikkor 35mm",
    "nikkor_50mm": "Nikkor 50mm",
    "nikon_d3300": "Nikon D3300",
    "nikon_d5200": "Nikon D5200",
    "nikon_d5600": "Nikon D5600",
    "nikon_d7200": "Nikon D7200",
    "nikon_d750": "Nikon D750",
    "panasonic_dmc-lx100": "Panasonic DMC-LX100",
    "sony_alpha_6100": "Sony Alpha 6100",
    "sony_alpha_7_i": "Sony Alpha 7 I",
    "substance_3d_painter": "Substance 3D Painter",
    "substance_painter": "Substance Painter",
    "zbrush": "ZBrush",
}
def _format_slug(slug: str) -> str:
    """Return the human-readable label for a device/software *slug*.

    Raises KeyError when the slug is missing from SLUG_LABELS.
    """
    label = SLUG_LABELS[slug]
    return label
def load_creators_lookup(path: Path) -> dict[str, dict]:
    """Load the creators lookup YAML and index entries by RDF actor name.

    Each value carries the family/given name, affiliation and ORCID needed
    to build Zenodo creator records.
    """
    with open(path) as fh:
        payload = yaml.safe_load(fh)
    lookup: dict[str, dict] = {}
    for entry in payload["creators"]:
        lookup[entry["name_in_rdf"]] = {
            "family_name": entry["family_name"],
            "given_name": entry["given_name"],
            "affiliation": entry["affiliation"],
            "orcid": entry["orcid"],
        }
    return lookup
142def _format_creator(creator_data: dict, role: str) -> dict:
143 return {
144 "person_or_org": {
145 "type": "personal",
146 "family_name": creator_data["family_name"],
147 "given_name": creator_data["given_name"],
148 "identifiers": [{"scheme": "orcid", "identifier": creator_data["orcid"]}],
149 },
150 "role": {"id": role},
151 "affiliations": [{"name": creator_data["affiliation"]}],
152 }
155METADATA_STEP = "05"
def _extract_actor_names(graph: Graph, act_uri: URIRef) -> set[str]:
    """Collect the appellation names of every person who carried out *act_uri*."""
    collected: set[str] = set()
    for _s, _p, actor in graph.triples((act_uri, P14_CARRIED_OUT_BY, None)):
        # Every carried-out-by target is expected to be typed E21_Person.
        assert (actor, RDF_TYPE, E21_PERSON) in graph
        for _s2, _p2, appellation in graph.triples((actor, P1_IS_IDENTIFIED_BY, None)):
            for _s3, _p3, literal in graph.triples((appellation, P190_HAS_SYMBOLIC_CONTENT, None)):
                collected.add(str(literal))
    return collected
def extract_authors_for_entity_stage(graph: Graph, entity_id: str, stage: str) -> set[str]:
    """Union of digitization actor names across a stage's steps.

    The metadata step is skipped; its actors are credited separately
    as data curators.
    """
    names: set[str] = set()
    for step in STAGE_STEPS[stage]:
        if step == METADATA_STEP:
            continue
        names |= _extract_actor_names(graph, URIRef(f"{BASE_URI}/act/{entity_id}/{step}/1"))
    return names
def extract_metadata_authors(graph: Graph, entity_id: str) -> set[str]:
    """Return the names of the actors who carried out the metadata step.

    Fix: use the METADATA_STEP constant instead of the hard-coded "05",
    matching extract_authors_for_entity_stage() so the step number is
    defined in exactly one place.
    """
    return _extract_actor_names(graph, URIRef(f"{BASE_URI}/act/{entity_id}/{METADATA_STEP}/1"))
def build_creators_for_entity_stage(
    graph: Graph, entity_id: str, stage: str, creators_lookup: dict[str, dict]
) -> list[dict]:
    """Zenodo "researcher" creators for one entity/stage, sorted by RDF name.

    Names missing from the lookup are silently skipped.
    """
    creators: list[dict] = []
    for name in sorted(extract_authors_for_entity_stage(graph, entity_id, stage)):
        data = creators_lookup.get(name)
        if data is not None:
            creators.append(_format_creator(data, "researcher"))
    return creators
def build_metadata_creators(
    graph: Graph, entity_id: str, creators_lookup: dict[str, dict]
) -> list[dict]:
    """Zenodo "datacurator" creators for the metadata step, sorted by RDF name.

    Names missing from the lookup are silently skipped.
    """
    creators: list[dict] = []
    for name in sorted(extract_metadata_authors(graph, entity_id)):
        data = creators_lookup.get(name)
        if data is not None:
            creators.append(_format_creator(data, "datacurator"))
    return creators
def merge_creators(digitization_creators: list[dict], metadata_creators: list[dict]) -> list[dict]:
    """Concatenate the two creator lists, keeping order and dropping
    metadata creators whose ORCID already appears among the digitization
    creators (a person gets the "researcher" role when they did both)."""
    def _orcid(creator: dict) -> str:
        return creator["person_or_org"]["identifiers"][0]["identifier"]

    merged = list(digitization_creators)
    known = {_orcid(c) for c in merged}
    for creator in metadata_creators:
        orcid = _orcid(creator)
        if orcid not in known:
            known.add(orcid)
            merged.append(creator)
    return merged
def extract_licensed_entity_stages(graph: Graph) -> set[tuple[str, str]]:
    """Find all (entity_id, stage) pairs with a license document attached.

    License URIs look like {BASE_URI}/lic/<entity>/<step>/1; the step
    number is translated to a stage via STEP_TO_STAGE.
    """
    lic_pattern = re.compile(rf"^{re.escape(BASE_URI)}/lic/([^/]+)/(\d{{2}})/1$")
    result: set[tuple[str, str]] = set()
    for subject, _pred, _obj in graph.triples((None, P70I, None)):
        m = lic_pattern.match(str(subject))
        if not m:
            continue
        entity_id, step = m.group(1), m.group(2)
        stage = STEP_TO_STAGE.get(step)
        if stage is not None:
            result.add((entity_id, stage))
    return result
def group_folders_by_entity(structure: dict) -> dict[str, list[tuple[str, str, dict]]]:
    """Group scanned folders by base entity id.

    Folder ids not known verbatim in FOLDER_TO_ID have trailing lowercase
    letters stripped, so variant folders (e.g. "12a", "12b") collapse onto
    one entity. Folders listed in SKIP_FOLDERS are ignored.
    """
    grouped: defaultdict[str, list[tuple[str, str, dict]]] = defaultdict(list)
    for sala_name, sala_items in structure["structure"].items():
        for folder_name, subfolders in sala_items.items():
            if folder_name in SKIP_FOLDERS:
                continue
            entity_id = extract_id_from_folder_name(folder_name)
            base_id = (
                entity_id
                if entity_id in FOLDER_TO_ID.values()
                else entity_id.rstrip("abcdefghijklmnopqrstuvwxyz")
            )
            grouped[base_id].append((sala_name, folder_name, subfolders))
    return dict(grouped)
245STAGES = ("raw", "rawp", "dcho", "dchoo")
def create_stage_zip(
    entity_id: str,
    stage: str,
    folders: list[tuple[str, str, dict]],
    root: Path,
    licensed_stages: set[tuple[str, str]],
    output_dir: Path,
    title: str,
) -> tuple[Path, bool] | None:
    """Package one entity/stage into a zip archive under *output_dir*.

    Without a license for (entity_id, stage), only the metadata files
    meta.ttl and prov.trig are included; with one, every regular file in
    the stage directory goes in. Returns (zip_path, has_license), or None
    when nothing qualified (the empty zip is deleted).
    """
    # The first folder's sala determines the archive name prefix.
    sala_name = folders[0][0]
    sala_slug = slugify(sala_name)
    title_slug = slugify(title)
    zip_path = output_dir / f"{sala_slug}-{title_slug}-{stage}.zip"
    has_files = False
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for sala_name, folder_name, stages_dict in folders:
            # Stage directory names on disk may differ in case from the
            # canonical stage id; match case-insensitively.
            stage_name_in_folder = None
            for name in stages_dict:
                if name.lower() == stage:
                    stage_name_in_folder = name
                    break
            if stage_name_in_folder is None:
                continue
            stage_dir = root / sala_name / folder_name / stage_name_in_folder
            has_license = (entity_id, stage) in licensed_stages
            for file_path in stage_dir.iterdir():
                # Only top-level regular files are archived; subdirectories
                # are skipped.
                if not file_path.is_file():
                    continue
                if has_license or file_path.name in ("meta.ttl", "prov.trig"):
                    arc_name = f"{folder_name}/{stage_name_in_folder}/{file_path.name}"
                    zf.write(file_path, arc_name)
                    has_files = True
    if not has_files:
        # Nothing was written: remove the empty archive and signal "skip".
        zip_path.unlink()
        return None
    return zip_path, (entity_id, stage) in licensed_stages
def _get_label(graph: Graph, uri: URIRef) -> str | None:
    """Return the first appellation label attached to *uri*, or None."""
    labels = (
        str(content)
        for _s, _p, apl in graph.triples((uri, P1_IS_IDENTIFIED_BY, None))
        for _s2, _p2, content in graph.triples((apl, P190_HAS_SYMBOLIC_CONTENT, None))
    )
    return next(labels, None)
def extract_keeper_info(graph: Graph, entity_id: str) -> tuple[str | None, str | None]:
    """Return (keeper name, keeper residence) from the ob08 custody activity.

    Only the first keeper is considered; (None, None) when there is none.
    """
    custody_uri = URIRef(f"{BASE_URI}/act/{entity_id}/ob08/1")
    for _s, _p, keeper in graph.triples((custody_uri, P14_CARRIED_OUT_BY, None)):
        assert isinstance(keeper, URIRef)
        location: str | None = None
        for _s2, _p2, place in graph.triples((keeper, P74_HAS_RESIDENCE, None)):
            assert isinstance(place, URIRef)
            location = _get_label(graph, place)
        return _get_label(graph, keeper), location
    return None, None
def extract_entity_title(graph: Graph, entity_id: str) -> str:
    """First line of the item's P3 note, or a generic fallback title."""
    item_uri = URIRef(f"{BASE_URI}/itm/{entity_id}/ob00/1")
    for _s, _p, note in graph.triples((item_uri, P3_HAS_NOTE, None)):
        first_line, _, _rest = str(note).partition("\n")
        return first_line.strip()
    return f"Entity {entity_id}"
def extract_acquisition_technique(graph: Graph, entity_id: str) -> str | None:
    """Label of the general technique used in the acquisition step (00).

    Returns None when no technique triple exists; an AAT URI missing from
    AAT_TECHNIQUE_LABELS raises KeyError.
    """
    act_uri = URIRef(f"{BASE_URI}/act/{entity_id}/00/1")
    for _s, _p, technique in graph.triples((act_uri, P32_USED_GENERAL_TECHNIQUE, None)):
        return AAT_TECHNIQUE_LABELS[str(technique)]
    return None
def extract_devices(graph: Graph, entity_id: str) -> list[str]:
    """Sorted labels of the /dev/ objects used in the acquisition step."""
    act_uri = URIRef(f"{BASE_URI}/act/{entity_id}/00/1")
    labels: list[str] = []
    for _s, _p, used in graph.triples((act_uri, P16_USED_SPECIFIC_OBJECT, None)):
        uri = str(used)
        if "/dev/" not in uri:
            continue
        # The device slug is the path segment right after "/dev/".
        slug = uri.partition("/dev/")[2].split("/", 1)[0]
        labels.append(_format_slug(slug))
    return sorted(labels)
def extract_software_for_stage(graph: Graph, entity_id: str, stage: str) -> list[str]:
    """Sorted labels of software used across a stage's steps.

    The metadata step is excluded, matching the author extraction.
    """
    labels: set[str] = set()
    for step in STAGE_STEPS[stage]:
        if step == METADATA_STEP:
            continue
        act_uri = URIRef(f"{BASE_URI}/act/{entity_id}/{step}/1")
        for _s, _p, sfw in graph.triples((act_uri, L23_USED_SOFTWARE, None)):
            # The software slug is the path segment right after "/sfw/".
            slug = str(sfw).split("/sfw/")[1].split("/")[0]
            labels.add(_format_slug(slug))
    return sorted(labels)
# Creative Commons license URLs (as stored in the KG) -> Zenodo license ids.
LICENSE_URI_TO_ZENODO = {
    "https://creativecommons.org/publicdomain/zero/1.0/": "cc0-1.0",
    "https://creativecommons.org/licenses/by/4.0/": "cc-by-4.0",
    "https://creativecommons.org/licenses/by-nc/4.0/": "cc-by-nc-4.0",
    "https://creativecommons.org/licenses/by-sa/4.0/": "cc-by-sa-4.0",
    "https://creativecommons.org/licenses/by-nc-sa/4.0/": "cc-by-nc-sa-4.0",
}

# Stage names used in record titles.
STAGE_TITLE_NAMES = {
    "raw": "Raw",
    "rawp": "Processed raw model",
    "dcho": "Digital Cultural Heritage Object",
    "dchoo": "Optimized Digital Cultural Heritage Object",
}

# Stage names used in the first sentence of record descriptions.
STAGE_DESCRIPTION_NAMES = {
    "raw": "Raw acquisition data",
    "rawp": "Processed raw model",
    "dcho": "Digital Cultural Heritage Object",
    "dchoo": "Optimized Digital Cultural Heritage Object",
}

# One-sentence summary appended to each stage's description.
STAGE_DESCRIPTIONS = {
    "raw": "This dataset contains the raw material generated during the acquisition phase.",
    "rawp": "This dataset contains the preliminary output from the photogrammetry or scanner software after initial data processing but without any interpolation or geometry corrections.",
    "dcho": "This dataset contains the version that includes interpolation, gap filling, and resolution of geometric issues, resulting in a refined and improved model.",
    "dchoo": "This dataset contains the version optimised for real-time online interaction.",
}

# Base-config keys copied verbatim into each generated record config when
# not already set by generate_zenodo_config().
PROPAGATED_FIELDS = (
    "zenodo_url", "access_token", "user_agent",
    "subjects", "publication_date",
    "version", "community",
    "contributors", "funding",
    "references", "dates",
)
def extract_license_for_entity_stage(graph: Graph, entity_id: str, stage: str) -> str | None:
    """First Zenodo license id found among the stage's step license documents.

    Steps are checked in STAGE_STEPS order; None when no recognised
    license URL is attached to any step.
    """
    for step in STAGE_STEPS[stage]:
        lic_uri = URIRef(f"{BASE_URI}/lic/{entity_id}/{step}/1")
        for _s, _p, url in graph.triples((lic_uri, P70I, None)):
            short_id = LICENSE_URI_TO_ZENODO.get(str(url))
            if short_id is not None:
                return short_id
    return None
# Legal note appended (as an extra "notes" description) to records whose
# content license is CC0, warning about Italian cultural-heritage law.
CC0_DISCLAIMER = (
    "No copyright or related rights are claimed in these digital reproductions. "
    "The files are released under CC0 1.0 Universal (Public Domain Dedication).\n"
    "\n"
    "Please note that the original works may qualify as cultural heritage assets "
    "under Italian law (D. Lgs. 42/2004). Consequently, although the digital "
    "reproductions are released under CC0, certain uses — and in particular "
    "commercial uses — may be subject to specific authorisations, restrictions, "
    "or fees pursuant to the applicable provisions governing the reproduction "
    "and publication of cultural heritage assets. Users are therefore responsible "
    "for ensuring compliance with Italian cultural heritage regulations before "
    "undertaking any commercial exploitation of the images."
)

# Persistent identifier of the CHAD-AP application profile, linked from
# record descriptions.
CHAD_AP_URL = "https://w3id.org/dharc/ontology/chad-ap"

# Note added to records that ship only meta.ttl/prov.trig because the
# holding institution did not license the content files.
RESTRICTED_NOTICE = (
    "The digital object files are not included in this dataset "
    "because the holding institution did not grant permission for their publication. "
    "Only metadata and provenance files are provided."
)
def build_enhanced_description(
    stage: str,
    title: str,
    keeper_name: str | None = None,
    keeper_location: str | None = None,
) -> str:
    """Compose the main Zenodo description for one entity/stage dataset.

    Sentences: stage summary headline, optional keeper/location line,
    the stage's boilerplate description, and a CHAD-AP metadata note.
    """
    sentences = [
        f'{STAGE_DESCRIPTION_NAMES[stage]} of "{title}" from the Aldrovandi Digital Twin.',
    ]
    if keeper_name:
        location_suffix = f" ({keeper_location})" if keeper_location else ""
        sentences.append(f"The original object is held by {keeper_name}{location_suffix}.")
    sentences.append(STAGE_DESCRIPTIONS[stage])
    sentences.append(
        f"Includes metadata (meta.ttl) and provenance (prov.trig) files following the <a href=\"{CHAD_AP_URL}\">CHAD-AP</a> ontology.",
    )
    return " ".join(sentences) + "\n"
439WORKFLOW_DOI_URL = "https://doi.org/10.46298/transformations.14773"
def build_methods_description(
    graph: Graph,
    entity_id: str,
    stage: str,
) -> str:
    """Compose the "methods" additional description for one entity/stage.

    Paragraphs: workflow citation, acquisition technique/devices (when
    recorded in the KG), processing software (when any), CHAD-AP note.
    """
    paragraphs = [
        f'Acquisition and digitization followed the reproducible workflow documented in '
        f'<a href="{WORKFLOW_DOI_URL}">doi:10.46298/transformations.14773</a>.',
    ]
    technique = extract_acquisition_technique(graph, entity_id)
    devices = extract_devices(graph, entity_id)
    if technique:
        device_suffix = f" ({', '.join(devices)})" if devices else ""
        paragraphs.append(f"Data was acquired using {technique}{device_suffix}.")
    software = extract_software_for_stage(graph, entity_id, stage)
    if software:
        paragraphs.append(f"Processing software: {', '.join(software)}.")
    paragraphs.append(
        f'Metadata follows the <a href="{CHAD_AP_URL}">Cultural Heritage Acquisition and '
        f"Digitisation Application Profile (CHAD-AP)</a> based on CIDOC-CRM.",
    )
    return "\n\n".join(paragraphs) + "\n"
def build_entity_uri(entity_id: str) -> str:
    """Canonical item URI for an entity (version 1 of object ob00)."""
    return "/".join((BASE_URI, "itm", entity_id, "ob00", "1"))
# Zenodo license id -> human title and canonical URL, used to build the
# "rights" field and the reverse LICENSE_TITLE_TO_SHORT lookup.
LICENSE_INFO = {
    "cc0-1.0": {
        "title": "Creative Commons Zero v1.0 Universal",
        "link": "https://creativecommons.org/publicdomain/zero/1.0/",
    },
    "cc-by-4.0": {
        "title": "Creative Commons Attribution 4.0 International",
        "link": "https://creativecommons.org/licenses/by/4.0/",
    },
    "cc-by-nc-4.0": {
        "title": "Creative Commons Attribution Non Commercial 4.0 International",
        "link": "https://creativecommons.org/licenses/by-nc/4.0/",
    },
    "cc-by-sa-4.0": {
        "title": "Creative Commons Attribution Share Alike 4.0 International",
        "link": "https://creativecommons.org/licenses/by-sa/4.0/",
    },
    "cc-by-nc-sa-4.0": {
        "title": "Creative Commons Attribution Non Commercial Share Alike 4.0 International",
        "link": "https://creativecommons.org/licenses/by-nc-sa/4.0/",
    },
}
def build_rights(content_license: str | None) -> list[dict]:
    """Build the Zenodo "rights" list.

    The metadata license (always CC0) comes first; the content license is
    appended only when provided and known to LICENSE_INFO.
    """
    cc0 = LICENSE_INFO["cc0-1.0"]
    rights: list[dict] = [
        {
            "title": {"en": f"{cc0['title']} (Metadata license)"},
            "description": {"en": "Applies to metadata files: meta.ttl, prov.trig"},
            "link": cc0["link"],
        }
    ]
    content_info = LICENSE_INFO.get(content_license) if content_license else None
    if content_info is not None:
        rights.append(
            {
                "title": {"en": f"{content_info['title']} (Content license)"},
                "description": {"en": "Applies to all data files except meta.ttl and prov.trig"},
                "link": content_info["link"],
            }
        )
    return rights
def generate_zenodo_config(
    stage: str,
    zip_path: Path,
    title: str,
    base_config: dict,
    creators: list[dict],
    methods_description: str,
    license: str | None = None,  # NOTE: shadows the builtin; kept for caller compatibility
    entity_uri: str | None = None,
    keeper_name: str | None = None,
    keeper_location: str | None = None,
    has_license: bool = True,
) -> dict:
    """Assemble the full Zenodo record configuration for one stage zip.

    Combines the generated description, creators and rights with fields
    propagated from *base_config*. ``license`` is the content license id
    (metadata is always CC0 via build_rights); ``has_license`` False adds
    the restricted-access notice.
    """
    description = build_enhanced_description(stage, title, keeper_name, keeper_location)
    # Core record fields; publication_date here takes precedence over the
    # base config's because PROPAGATED_FIELDS skips keys already set.
    config: dict = {
        "title": f"{title} - {STAGE_TITLE_NAMES[stage]} - Aldrovandi Digital Twin",
        "description": description,
        "resource_type": {"id": "dataset"},
        "publisher": "Zenodo",
        "access": {"record": "public", "files": "public"},
        "files": [str(zip_path.absolute())],
        "creators": creators,
        "publication_date": date.today().isoformat(),
        "rights": build_rights(license),
    }
    # Extra descriptions: methods, base-config notes, then conditional notes.
    additional_descriptions: list[dict] = [
        {
            "description": methods_description,
            "type": {"id": "methods"},
        },
        {
            "description": base_config["notes"],
            "type": {"id": "notes"},
        },
    ]
    if not has_license:
        additional_descriptions.append({
            "description": RESTRICTED_NOTICE,
            "type": {"id": "notes"},
        })
    if license == "cc0-1.0":
        additional_descriptions.append({
            "description": CC0_DISCLAIMER,
            "type": {"id": "notes"},
        })
    config["additional_descriptions"] = additional_descriptions
    # Convert base-config locations to GeoJSON-style point features.
    config["locations"] = {
        "features": [
            {
                "geometry": {
                    "type": "Point",
                    "coordinates": [loc["lon"], loc["lat"]],
                },
                "place": loc["place"],
                "description": loc["description"],
            }
            for loc in base_config["locations"]
        ]
    }
    # Copy whitelisted base-config fields that are not already set above.
    for field in PROPAGATED_FIELDS:
        if field in base_config and field not in config:
            config[field] = base_config[field]
    # Re-shape related identifiers into the nested Zenodo schema.
    if "related_identifiers" in base_config:
        converted = []
        for ri in base_config["related_identifiers"]:
            entry: dict = {
                "identifier": ri["identifier"],
                "relation_type": {"id": ri["relation"]},
            }
            if "resource_type" in ri:
                entry["resource_type"] = {"id": ri["resource_type"]}
            if "scheme" in ri:
                entry["scheme"] = ri["scheme"]
            converted.append(entry)
        config["related_identifiers"] = converted
    if entity_uri:
        config["identifiers"] = [{"identifier": entity_uri, "scheme": "url"}]
    return config
def prepare_all(
    root: Path,
    zenodo_base_config_path: Path,
    output_dir: Path,
    kg_path: Path = KG_PATH,
) -> None:
    """Create stage zips and per-record YAML configs for every entity.

    Scans *root*, loads the knowledge graph and base config, then for each
    (entity, stage) writes a zip under output_dir/zips and a matching
    Zenodo config under output_dir/configs. Stages with no eligible files
    are skipped.
    """
    structure = scan_folder_structure(root)
    kg = load_kg(kg_path)
    licensed_stages = extract_licensed_entity_stages(kg)
    entity_groups = group_folders_by_entity(structure)
    with open(zenodo_base_config_path) as f:
        base_config = yaml.safe_load(f)
    creators_lookup = load_creators_lookup(CREATORS_LOOKUP_PATH)
    zips_dir = output_dir / "zips"
    configs_dir = output_dir / "configs"
    zips_dir.mkdir(parents=True, exist_ok=True)
    configs_dir.mkdir(parents=True, exist_ok=True)
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
    ) as progress:
        task = progress.add_task("Creating stage packages", total=len(entity_groups) * len(STAGES))
        for entity_id, folders in entity_groups.items():
            # Entity-level metadata shared by all of its stages.
            title = extract_entity_title(kg, entity_id)
            keeper_name, keeper_location = extract_keeper_info(kg, entity_id)
            sala_slug = slugify(folders[0][0])
            title_slug = slugify(title)
            metadata_creators = build_metadata_creators(kg, entity_id, creators_lookup)
            for stage in STAGES:
                progress.update(task, description=f"Entity {entity_id} - {stage}")
                result = create_stage_zip(entity_id, stage, folders, root, licensed_stages, zips_dir, title)
                if result is None:
                    # No files for this stage: nothing to upload.
                    progress.advance(task)
                    continue
                zip_path, has_license = result
                digitization_creators = build_creators_for_entity_stage(kg, entity_id, stage, creators_lookup)
                creators = merge_creators(digitization_creators, metadata_creators)
                license = extract_license_for_entity_stage(kg, entity_id, stage)
                entity_uri = build_entity_uri(entity_id)
                methods_description = build_methods_description(kg, entity_id, stage)
                config = generate_zenodo_config(stage, zip_path, title, base_config, creators, methods_description, license, entity_uri, keeper_name, keeper_location, has_license)
                config_path = configs_dir / f"{sala_slug}-{title_slug}-{stage}.yaml"
                with open(config_path, "w") as f:
                    yaml.dump(config, f, Dumper=LiteralBlockDumper, default_flow_style=False, allow_unicode=True, sort_keys=False)
                progress.advance(task)
656def _extract_doi(record: dict) -> str:
657 pids = record.get("pids", {})
658 doi_info = pids.get("doi", {})
659 return doi_info.get("identifier", "")
662def _extract_record_url(record: dict) -> str:
663 return record["links"]["self_html"]
# Reverse lookup of LICENSE_INFO: full license title -> short Zenodo id,
# used when rendering the DOI summary table.
LICENSE_TITLE_TO_SHORT: dict[str, str] = {
    info["title"]: short_name for short_name, info in LICENSE_INFO.items()
}
671def _format_creators_for_table(config: dict) -> str:
672 creators = config["creators"]
673 parts: list[str] = []
674 for c in creators:
675 org = c["person_or_org"]
676 orcid = org["identifiers"][0]["identifier"]
677 parts.append(f"{org['family_name']}, {org['given_name']} [orcid:{orcid}]")
678 return "; ".join(parts)
def _format_licenses_for_table(config: dict) -> str:
    """Render rights entries as "short-id (context)" joined by "; ".

    The context (e.g. "Metadata license") is whatever follows the full
    license title in the rights entry; entries whose title matches no
    known license are omitted.
    """
    rendered: list[str] = []
    for right in config["rights"]:
        title_en = right["title"]["en"]
        for full_title, short_id in LICENSE_TITLE_TO_SHORT.items():
            if not title_en.startswith(full_title):
                continue
            context = title_en.removeprefix(full_title).strip(" ()")
            rendered.append(f"{short_id} ({context})")
            break
    return "; ".join(rendered)
# Column headers (Italian) of the doi_table.csv written by upload_all().
DOI_TABLE_FIELDNAMES = [
    "Numero su DMP",
    "Caso di studio",
    "Autore/i",
    "Tipo",
    "Titolo",
    "Data pubblicazione",
    "DOI",
    "URL",
    "Repository",
    "Licenza",
    "Note",
]
def upload_all(configs_dir: Path, publish: bool = False) -> Path:
    """Upload every YAML config in *configs_dir* to Zenodo and write a
    CSV summary table of the resulting DOIs.

    Records are published only when *publish* is True. Returns the path
    of the written doi_table.csv (placed next to configs_dir).
    """
    config_files = sorted(configs_dir.glob("*.yaml"))
    doi_table: list[dict[str, str]] = []
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
    ) as progress:
        task = progress.add_task("Uploading to Zenodo", total=len(config_files))
        for config_file in config_files:
            progress.update(task, description=f"Uploading {config_file.stem}")
            record = piccione_upload(str(config_file), publish=publish)
            # Re-read the config to fill table columns from what was uploaded.
            with open(config_file) as f:
                config = yaml.safe_load(f)
            row: dict[str, str] = {
                "Numero su DMP": "",
                "Caso di studio": "Aldrovandi",
                "Autore/i": _format_creators_for_table(config),
                "Tipo": "Dataset",
                "Titolo": config["title"],
                "Data pubblicazione": config["publication_date"],
                "DOI": _extract_doi(record),
                "URL": _extract_record_url(record),
                "Repository": "Zenodo",
                "Licenza": _format_licenses_for_table(config),
                "Note": "",
            }
            doi_table.append(row)
            progress.advance(task)
    csv_path = configs_dir.parent / "doi_table.csv"
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=DOI_TABLE_FIELDNAMES)
        writer.writeheader()
        writer.writerows(doi_table)
    print(f"DOI table written to {csv_path}")
    return csv_path
def parse_arguments():  # pragma: no cover
    """Build the CLI: "prepare" creates zips/configs, "upload" pushes them."""
    parser = argparse.ArgumentParser(description="Prepare and upload Zenodo packages")
    commands = parser.add_subparsers(dest="command", required=True)

    prepare = commands.add_parser("prepare", help="Create zips and YAML configs")
    prepare.add_argument("root", type=Path, help="Root directory with Sala/Folder/Stage structure")
    prepare.add_argument("zenodo_config", type=Path, help="Base Zenodo configuration YAML")
    prepare.add_argument("--output", "-o", type=Path, default=Path("zenodo_output"), help="Output directory")

    upload = commands.add_parser("upload", help="Upload to Zenodo")
    upload.add_argument("configs_dir", type=Path, help="Directory containing YAML configs")
    upload.add_argument("--publish", action="store_true", help="Publish after upload")
    return parser.parse_args()
def main():  # pragma: no cover
    """CLI entry point: dispatch to prepare_all or upload_all."""
    args = parse_arguments()
    if args.command == "prepare":
        prepare_all(
            root=args.root,
            zenodo_base_config_path=args.zenodo_config,
            output_dir=args.output,
        )
    elif args.command == "upload":
        upload_all(configs_dir=args.configs_dir, publish=args.publish)
# Allow running this module directly as a script.
if __name__ == "__main__":  # pragma: no cover
    main()