From bf4a2232f49e46582ea5f81f8a96cfb83394d869 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thibault=20Cl=C3=A9rice?=
Date: Fri, 23 Aug 2024 17:28:16 +0200
Subject: [PATCH] [WIP] Adding a catalog solution

---
 dapitains/local/collections.py | 69 ++++++++++++++++++++++++++++++++++
 dapitains/local/ingester.py    | 88 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 157 insertions(+)
 create mode 100644 dapitains/local/collections.py
 create mode 100644 dapitains/local/ingester.py

diff --git a/dapitains/local/collections.py b/dapitains/local/collections.py
new file mode 100644
index 0000000..dfc1a51
--- /dev/null
+++ b/dapitains/local/collections.py
@@ -0,0 +1,69 @@
+"""Data model of the local catalog: metadata statements, collections and citable units."""
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+
+@dataclass
+class DublinCore:
+    """A single Dublin Core metadata statement.
+
+    :param term: Name of the term, appended to the DC Terms namespace by ``json()``.
+    :param value: Value of the statement.
+    :param language: Optional language of the value.
+    """
+    term: str
+    value: str
+    language: Optional[str] = None
+
+    def json(self) -> Dict[str, Any]:
+        """Serialize the statement, expanding ``term`` into a DC Terms URI."""
+        return {"property": f"http://purl.org/dc/terms/{self.term}", "value": self.value, "language": self.language}
+
+
+@dataclass
+class Extension(DublinCore):
+    """A metadata statement outside Dublin Core: ``term`` is serialized verbatim."""
+
+    def json(self) -> Dict[str, Any]:
+        """Serialize the statement, using ``term`` unchanged as the property."""
+        return {"property": self.term, "value": self.value, "language": self.language}
+
+
+@dataclass
+class Collection:
+    """An entry of the catalog, with its metadata and its parent/child links."""
+    identifier: str
+    title: str
+    description: Optional[str] = None
+    parents: List[str] = field(default_factory=list)
+    children: List[str] = field(default_factory=list)
+    dublin_core: List[DublinCore] = field(default_factory=list)
+    extension: List[Extension] = field(default_factory=list)
+    resource: bool = False
+    filepath: Optional[str] = None
+
+    def json(self) -> Dict[str, Any]:
+        """Return a dictionary made of plain, JSON-serializable values only."""
+        return {
+            "identifier": self.identifier,
+            "title": self.title,
+            "description": self.description,
+            "parents": self.parents,
+            "children": self.children,
+            # Statements are dataclass instances: serialize them as well, otherwise
+            # the result cannot be handed to json.dumps().
+            "dublin_core": [statement.json() for statement in self.dublin_core],
+            "extension": [statement.json() for statement in self.extension],
+            "resource": self.resource,
+            "filepath": self.filepath
+        }
+
+
+@dataclass
+class CitableUnit:
+    """A citable unit of a resource, with its own metadata statements."""
+    resource: str
+    reference: str
+    children: List[str] = field(default_factory=list)
+    dublin_core: List[DublinCore] = field(default_factory=list)
+    extension: List[Extension] = field(default_factory=list)
diff --git a/dapitains/local/ingester.py b/dapitains/local/ingester.py
new file mode 100644
index 0000000..e485fc0
--- /dev/null
+++ b/dapitains/local/ingester.py
@@ -0,0 +1,88 @@
+"""Ingestion of XML catalog files into an index of Collection objects."""
+import os.path
+import re
+from typing import Any, Dict, Optional
+import lxml.etree as ET
+from dapitains.local.collections import DublinCore, Extension, Collection, CitableUnit
+
+
+_re_tag = re.compile(r"[{}]")
+_xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"
+
+
+def parse_metadata(xml: ET.Element) -> Dict[str, Any]:
+    """Read the metadata of a catalog node as keyword arguments for ``Collection``.
+
+    :param xml: Node with an ``identifier`` attribute and a ``title`` child.
+    :return: ``identifier``, ``title`` and ``description``, plus ``dublin_core``
+        and ``extension`` when the node declares such statements.
+    :raises KeyError: If the node has no ``identifier`` attribute.
+    :raises IndexError: If the node has no ``title`` text.
+    """
+    obj = {
+        "identifier": xml.attrib["identifier"],
+        "title": xml.xpath("./title[1]/text()")[0],
+        "description": (xml.xpath("./description[1]/text()") or [None])[0]
+    }
+    # Treat Dublin Core: only the local name of each tag is kept
+    dublin_core = [
+        DublinCore(node.tag.split("}")[-1], node.text, node.attrib.get(_xml_lang))
+        for node in xml.xpath("./dublinCore/*")
+    ]
+    if dublin_core:
+        obj["dublin_core"] = dublin_core
+
+    # Treat Extension: "{namespace}name" tags become "namespacename"
+    extensions = [
+        Extension(_re_tag.sub("", node.tag), node.text, node.attrib.get(_xml_lang))
+        for node in xml.xpath("./extension/*")
+    ]
+    if extensions:
+        # The key has to match the ``extension`` field of Collection, which is
+        # built from this dictionary with ``Collection(**obj)``.
+        obj["extension"] = extensions
+
+    return obj
+
+
+def parse_collection(xml: ET.Element, basedir: str, tree: Dict[str, Collection]) -> Collection:
+    """Register the collection described by ``xml``, and all its members, in ``tree``.
+
+    Members with a ``title`` child are parsed in place; the others are read from
+    the catalog file named by their ``filepath`` attribute, relative to ``basedir``.
+
+    :param xml: Node describing the collection.
+    :param basedir: Directory against which member file paths are resolved.
+    :param tree: Index of collections by identifier, updated in place.
+    :return: The collection built from ``xml``.
+    """
+    obj = Collection(**parse_metadata(xml), resource=xml.tag == "resource")
+    tree[obj.identifier] = obj
+    for member in xml.xpath("./members/*"):
+        if member.xpath("./title"):
+            parse_collection(member, basedir, tree)
+            # ToDo: deal with children ?
+        else:
+            ingest_catalog(os.path.join(basedir, member.attrib["filepath"]), tree)
+    return obj
+
+
+def ingest_catalog(path: str, tree: Optional[Dict[str, Collection]] = None) -> Dict[str, Collection]:
+    """Parse the catalog file at ``path`` and index its collections by identifier.
+
+    :param path: Path to the XML catalog file.
+    :param tree: Index to populate; a new one is created when omitted.
+    :return: The populated index.
+
+    >>> ingest_catalog("../../tests/catalog/example-collection.xml")  # doctest: +SKIP
+    """
+    xml = ET.parse(path)
+    current_dir = os.path.abspath(os.path.dirname(path))
+
+    root: ET.Element = xml.getroot()
+    # Test against None rather than truthiness, so that an empty dictionary given
+    # by the caller is filled in place instead of being replaced by a new one.
+    if tree is None:
+        tree = {}
+    parse_collection(root, basedir=current_dir, tree=tree)
+    return tree