Skip to content

Commit

Permalink
[WIP] Adding a catalog solution
Browse files Browse the repository at this point in the history
  • Loading branch information
PonteIneptique committed Aug 23, 2024
1 parent e42d4c2 commit bf4a223
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 0 deletions.
55 changes: 55 additions & 0 deletions dapitains/local/collections.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class DublinCore:
    """One metadata statement expressed with a Dublin Core term.

    :param term: Local name of the term (e.g. ``title``), without namespace.
    :param value: Value attached to the term.
    :param language: Optional language tag of the value.
    """

    term: str
    value: str
    language: Optional[str] = None

    def json(self):
        """Return the statement as a plain dict.

        The ``property`` entry is the term prefixed with the Dublin Core
        terms namespace.
        """
        serialized = {}
        serialized["property"] = f"http://purl.org/dc/terms/{self.term}"
        serialized["value"] = self.value
        serialized["language"] = self.language
        return serialized


class Extension(DublinCore):
    """Metadata statement whose property is used verbatim.

    Same fields as :class:`DublinCore`; only the serialization differs, as
    ``term`` is emitted as-is instead of being prefixed with a namespace.
    """

    term: str
    value: str
    language: Optional[str] = None

    def json(self):
        """Return the statement as a plain dict, with ``term`` as property."""
        serialized = {}
        serialized["property"] = self.term
        serialized["value"] = self.value
        serialized["language"] = self.language
        return serialized


@dataclass
class Collection:
    """Catalog entry describing a collection or a resource.

    :param identifier: Identifier of the entry.
    :param title: Title of the entry.
    :param description: Optional description.
    :param parents: Identifiers of parent entries.
    :param children: Identifiers of child entries.
    :param dublin_core: Dublin Core statements attached to the entry.
    :param extension: Non Dublin Core statements attached to the entry.
    :param resource: Whether the entry is a resource rather than a collection.
    :param filepath: Optional path associated with the entry.
    """

    identifier: str
    title: str
    description: Optional[str] = None
    parents: List[str] = field(default_factory=list)
    children: List[str] = field(default_factory=list)
    dublin_core: List[DublinCore] = field(default_factory=list)
    extension: List[Extension] = field(default_factory=list)
    resource: bool = False
    filepath: Optional[str] = None

    def json(self):
        """Return the entry as a dict made of JSON-compatible values only.

        Nested metadata statements are serialized through their own
        ``json()`` method; returning the dataclass instances themselves would
        make the result impossible to pass to ``json.dumps``.
        """
        return {
            "identifier": self.identifier,
            "title": self.title,
            "description": self.description,
            "parents": self.parents,
            "children": self.children,
            "dublin_core": [statement.json() for statement in self.dublin_core],
            "extension": [statement.json() for statement in self.extension],
            "resource": self.resource,
            "filepath": self.filepath,
        }

@dataclass
class CitableUnit:
    """Data holder for a citable unit and its attached metadata."""

    # Presumably the identifier of the resource the unit belongs to -- TODO confirm.
    resource: str
    # Presumably the reference of the unit inside that resource -- TODO confirm.
    reference: str
    # Child entries, stored as strings; each instance gets its own list.
    children: List[str] = field(default_factory=list)
    # Dublin Core statements attached to the unit.
    dublin_core: List[DublinCore] = field(default_factory=list)
    # Non Dublin Core statements attached to the unit.
    extension: List[Extension] = field(default_factory=list)
67 changes: 67 additions & 0 deletions dapitains/local/ingester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import os.path
import re
from typing import Dict, Optional
import lxml.etree as ET
from dapitains.local.collections import DublinCore, Extension, Collection, CitableUnit


# Matches literal curly braces; parse_metadata uses it to remove them from the
# tag of extension nodes (turning "{namespace}name" into "namespacename").
_re_tag = re.compile(r"[{}]")


def parse_metadata(xml: ET.Element):
    """Extract the metadata of a catalog node as ``Collection`` keyword arguments.

    :param xml: Element carrying an ``identifier`` attribute, a ``title``
        child and, optionally, ``description``, ``dublinCore`` and
        ``extension`` children.
    :return: Dict with the keys ``identifier``, ``title`` and ``description``,
        plus ``dublin_core`` and/or ``extension`` when matching nodes exist.
    :raises KeyError: If the ``identifier`` attribute is missing.
    :raises IndexError: If the node has no ``title`` text.
    """
    xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"

    obj = {
        "identifier": xml.attrib["identifier"],
        "title": xml.xpath("./title[1]/text()")[0],
        "description": (xml.xpath("./description[1]/text()") or [None])[0],
    }

    # Dublin Core: only the local name is kept, the namespace being implied.
    dublin_core = [
        DublinCore(node.tag.split("}")[-1], node.text, node.attrib.get(xml_lang))
        for node in xml.xpath("./dublinCore/*")
    ]
    if dublin_core:
        obj["dublin_core"] = dublin_core

    # Extensions: namespace and local name are kept, only the braces go away.
    extensions = [
        Extension(_re_tag.sub("", node.tag), node.text, node.attrib.get(xml_lang))
        for node in xml.xpath("./extension/*")
    ]
    if extensions:
        # The key must match the ``extension`` field of Collection, since the
        # returned dict is unpacked as keyword arguments by parse_collection.
        obj["extension"] = extensions

    return obj


def parse_collection(xml: ET.Element, basedir: str, tree: Dict[str, Collection]) -> Collection:
    """Register the collection described by ``xml`` and all of its members.

    :param xml: Element describing the collection or resource.
    :param basedir: Directory against which member ``filepath`` attributes
        are resolved.
    :param tree: Mapping of identifiers to collections, filled in place.
    :return: The collection built from ``xml``.
    """
    metadata = parse_metadata(xml)
    is_resource = xml.tag == "resource"
    collection = Collection(**metadata, resource=is_resource)
    tree[collection.identifier] = collection

    for member in xml.xpath("./members/*"):
        if not member.xpath("./title"):
            # No inline metadata: the member points to another catalog file.
            member_path = os.path.join(basedir, member.attrib["filepath"])
            ingest_catalog(member_path, tree)
            continue
        # Member described inline.
        parse_collection(member, basedir, tree)
        # ToDo: deal with children ?

    return collection


def ingest_catalog(path: str, tree: Optional[Dict[str, Collection]] = None) -> Dict[str, Collection]:
    """Parse a catalog file and register every collection it describes.

    :param path: Path to the catalog XML file.
    :param tree: Existing mapping of identifiers to collections to fill in
        place; a new one is created when omitted.
    :return: The mapping of identifiers to collections.

    Example::

        ingest_catalog("../../tests/catalog/example-collection.xml")
    """
    # Test against None rather than truthiness: an empty dict handed over by
    # the caller must be filled, not silently swapped for a new one.
    if tree is None:
        tree = {}

    xml = ET.parse(path)
    # Members referenced by the catalog are resolved relative to its directory.
    current_dir = os.path.abspath(os.path.dirname(path))

    root: ET.Element = xml.getroot()
    parse_collection(root, basedir=current_dir, tree=tree)
    return tree

0 comments on commit bf4a223

Please sign in to comment.