Skip to content

Commit

Permalink
[WIP] Heavy changes to package structure, and the processor can now u…
Browse files Browse the repository at this point in the history
…se different version
  • Loading branch information
PonteIneptique committed Aug 24, 2024
1 parent 6c33088 commit b389e9f
Show file tree
Hide file tree
Showing 12 changed files with 63 additions and 35 deletions.
24 changes: 20 additions & 4 deletions dapitains/constants.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,28 @@
import logging
import os

# Select the SaxonC edition once, at import time.
#
# Environment variables read here:
#   pysaxon          -- edition to load: "HE" (default), "PE" or "EE"
#   pysaxon_license  -- value forwarded as ``license=`` to the PE/EE processor
#
# After this block, ``saxonlib`` is the imported SaxonC module and ``PROCESSOR``
# is the single shared ``PySaxonProcessor`` instance used by the package.
saxon_version = os.getenv("pysaxon", "HE")
saxon_license = os.getenv("pysaxon_license", "")
logging.info(f"Using SaxonLib {saxon_version}")

try:
    if saxon_version == "PE":
        import saxoncpe as saxonlib
        # NOTE(review): the SaxonC documentation describes ``license`` as a
        # boolean flag -- confirm that passing the environment string is intended.
        PROCESSOR = saxonlib.PySaxonProcessor(license=saxon_license)
    elif saxon_version == "EE":
        # This branch previously tested "PE" a second time and was unreachable.
        import saxoncee as saxonlib
        PROCESSOR = saxonlib.PySaxonProcessor(license=saxon_license)
    else:
        if saxon_version != "HE":
            # An unrecognised value used to leave ``saxonlib`` and ``PROCESSOR``
            # undefined; fall back to the free edition instead.
            logging.warning(
                "Unknown SaxonLib version %r, resorting to PySaxonC-HE", saxon_version
            )
        import saxonche as saxonlib
        PROCESSOR = saxonlib.PySaxonProcessor()
except ImportError:
    # The requested edition is not installed: use the Home Edition instead.
    print("Unable to import the required PySaxonC version, resorting to PySaxonC-HE")
    import saxonche as saxonlib
    PROCESSOR = saxonlib.PySaxonProcessor()


def get_xpath_proc(elem: PyXdmNode) -> PyXPathProcessor:
def get_xpath_proc(elem: saxonlib.PyXdmNode) -> saxonlib.PyXPathProcessor:
""" Builds an XPath processor around a given element, with the default TEI namespace
:param elem: An XML node, root or not
Expand Down
File renamed without changes.
File renamed without changes.
28 changes: 20 additions & 8 deletions dapitains/local/ingester.py → dapitains/metadata/xml_parser.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import os.path
import re
from typing import Dict, Optional, List, Tuple
from typing import Dict, Optional, List, Tuple, Any
from dataclasses import dataclass, field
import lxml.etree as ET
from dapitains.local.collection import DublinCore, Extension, Collection, CitableUnit
from dapitains.metadata.classes import DublinCore, Extension, Collection


_re_tag = re.compile(r"[{}]")
Expand All @@ -15,7 +15,12 @@ class Catalog:
objects: Dict[str, Collection] = field(default_factory=dict)


def parse_metadata(xml: ET.Element):
def parse_metadata(xml: ET.Element) -> Dict[str, Any]:
""" Parse Metadata
:param xml: Collection/Resource tag
:returns: Dictionary holding the main metadata of the Collection or Resource
"""
obj = {
"identifier": xml.attrib["identifier"],
"title": xml.xpath("./title[1]/text()")[0],
Expand Down Expand Up @@ -51,11 +56,17 @@ def parse_metadata(xml: ET.Element):


def parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection:
""" Parse a Collection or Resource object
:param xml: Parsed Collection or Resource by LXML
:param basedir: Directory used to resolve filepaths, which are relative to the main object
:param tree: Catalog that is updated with objects.
"""
obj = parse_metadata(xml)
obj = Collection(**obj, resource=xml.tag == "resource")
tree.objects[obj.identifier] = obj
if xml.attrib.get("filepath"):
obj.filepath = os.path.join(basedir, xml.attrib["filepath"])
if xml.attrib.get("filepath") and obj.resource:
obj.filepath = os.path.normpath(os.path.join(basedir, xml.attrib["filepath"]))
for member in xml.xpath("./members/*"):
if member.xpath("./title"):
child = parse_collection(member, basedir, tree)
Expand All @@ -69,10 +80,11 @@ def parse_collection(xml: ET.Element, basedir: str, tree: Catalog) -> Collection


def ingest_catalog(path: str, tree: Optional[Catalog] = None) -> Tuple[Catalog, Collection]:
"""
""" Ingest a collection description file.
:param path:
:return:
:param path: Path to a Collection XML File, see the schema at tests/catalog/schema.rng
:param tree: Current catalog, which is either updated or created
:return: Catalog and root collection found at path.
>>> ingest_catalog("../../tests/catalog/example-collection.xml")
"""
Expand Down
Empty file added dapitains/tei/__init__.py
Empty file.
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import re
from typing import Dict, List, Optional
from dataclasses import dataclass, field
from saxonche import PyXdmNode, PyXPathProcessor
from collections import namedtuple, defaultdict
from functools import cmp_to_key
from dapitains.constants import PROCESSOR, get_xpath_proc
from dapitains.constants import get_xpath_proc, saxonlib


@dataclass
Expand Down Expand Up @@ -45,7 +44,7 @@ class CitableUnit:
citeType: str
ref: str
children: List["CitableUnit"] = field(default_factory=list)
node: Optional[PyXdmNode] = None
node: Optional[saxonlib.PyXdmNode] = None
dublinCore: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list))
extension: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list))

Expand All @@ -69,7 +68,7 @@ def to_dts(self):
_simple_node = namedtuple("SimpleNode", ["citation", "xpath", "struct"])


def get_children_cite_structures(elem: PyXdmNode) -> List[PyXdmNode]:
def get_children_cite_structures(elem: saxonlib.PyXdmNode) -> List[saxonlib.PyXdmNode]:
xpath = get_xpath_proc(elem=elem).evaluate("./citeStructure")
if xpath is not None:
return list(iter(xpath))
Expand All @@ -82,7 +81,7 @@ class CiteStructureParser:
ToDo: Add the ability to use CiteData. This will mean moving from len(element) to len(element.xpath("./citeStructure"))
ToDo: Add the ability to use citationTree labels
"""
def __init__(self, root: PyXdmNode):
def __init__(self, root: saxonlib.PyXdmNode):
self.root = root
self.xpath_matcher: Dict[str, str] = {}
self.regex_pattern, cite_structure = self.build_regex_and_xpath(
Expand Down Expand Up @@ -189,7 +188,7 @@ def _dispatch(
self,
child_xpath: str,
structure: CitableStructure,
xpath_processor: PyXPathProcessor,
xpath_processor: saxonlib.PyXPathProcessor,
unit: CitableUnit):
# target = self.generate_xpath(child.ref)
if len(structure.children) == 1:
Expand All @@ -207,7 +206,7 @@ def _dispatch(

def find_refs(
self,
root: PyXdmNode,
root: saxonlib.PyXdmNode,
structure: CitableStructure = None,
unit: Optional[CitableUnit] = None
) -> List[CitableUnit]:
Expand Down Expand Up @@ -245,7 +244,7 @@ def find_refs(

def find_refs_from_branches(
self,
root: PyXdmNode,
root: saxonlib.PyXdmNode,
structure: List[CitableStructure],
unit: Optional[CitableUnit] = None
) -> List[CitableUnit]:
Expand Down
13 changes: 6 additions & 7 deletions dapitains/local/tei.py → dapitains/tei/tei.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
from dapitains.local.citeStructure import CiteStructureParser
from dapitains.constants import PROCESSOR, get_xpath_proc
from dapitains.tei.citeStructure import CiteStructureParser
from dapitains.constants import PROCESSOR, get_xpath_proc, saxonlib
from typing import Optional, List, Tuple, Dict
from lxml.etree import fromstring
from lxml.objectify import Element, SubElement
from lxml import objectify
from saxonche import PyXdmNode, PyXPathProcessor
import re
from dapitains.errors import UnknownTreeName

Expand All @@ -31,7 +30,7 @@ def xpath_walk(xpath: List[str]) -> Tuple[str, List[str]]:
return current, queue


def is_traversing_xpath(parent: PyXdmNode, xpath: str) -> bool:
def is_traversing_xpath(parent: saxonlib.PyXdmNode, xpath: str) -> bool:
""" Check if an XPath is traversing more than one level
:param parent:
Expand All @@ -49,7 +48,7 @@ def is_traversing_xpath(parent: PyXdmNode, xpath: str) -> bool:
return False


def xpath_walk_step(parent: PyXdmNode, xpath: str) -> Tuple[PyXdmNode, bool]:
def xpath_walk_step(parent: saxonlib.PyXdmNode, xpath: str) -> Tuple[saxonlib.PyXdmNode, bool]:
""" Perform an XPath on an element to find a child that is part of the XPath.
If the child is a direct member of the path, returns a False boolean indicating to move
onto the next element.
Expand All @@ -71,7 +70,7 @@ def xpath_walk_step(parent: PyXdmNode, xpath: str) -> Tuple[PyXdmNode, bool]:
return xpath_proc.evaluate_single(xpath), False


def copy_node(node: PyXdmNode, include_children=False, parent: Optional[Element] = None):
def copy_node(node: saxonlib.PyXdmNode, include_children=False, parent: Optional[Element] = None):
""" Copy an XML Node
:param node: Etree Node
Expand Down Expand Up @@ -124,7 +123,7 @@ def normalize_xpath(xpath: List[str]) -> List[str]:


def reconstruct_doc(
root: PyXdmNode,
root: saxonlib.PyXdmNode,
start_xpath: List[str],
new_tree: Optional[Element] = None,
end_xpath: Optional[List[str]] = None
Expand Down
2 changes: 1 addition & 1 deletion tests/catalog/example-collection.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
</dublinCore>
<members>
<collection filepath="./example-sub-collection.xml"/>
<resource identifier="https://foo.bar/text" path="../tei/base_tei.xml">
<resource identifier="https://foo.bar/text" filepath="../tei/base_tei.xml">
<title>A simple resource</title>
<description>With a description</description>
<dublinCore>
Expand Down
2 changes: 1 addition & 1 deletion tests/catalog/example-sub-collection.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<date xmlns="http://purl.org/dc/terms/">2023-08-24</date>
</dublinCore>
<members>
<resource identifier="https://example.org/resource1" path="../data/resource1.xml">
<resource identifier="https://example.org/resource1" filepath="../tei/multiple_tree.xml">
<title>Historical Document</title>
<description>A document about historical events.</description>
<dublinCore>
Expand Down
10 changes: 6 additions & 4 deletions tests/test_catalog.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from dapitains.local.ingester import ingest_catalog
from dapitains.local.collection import *
import os.path

from dapitains.metadata.xml_parser import ingest_catalog
from dapitains.metadata.classes import *


def test_ingestion():
Expand Down Expand Up @@ -39,7 +41,7 @@ def test_ingestion():
DublinCore(term='language', value='en', language=None)
],
extension=[], resource=True,
filepath=None
filepath=os.path.abspath("tei/multiple_tree.xml")
),
"https://foo.bar/text": Collection(
identifier='https://foo.bar/text',
Expand All @@ -52,6 +54,6 @@ def test_ingestion():
],
extension=[],
resource=True,
filepath=None
filepath=os.path.abspath("tei/base_tei.xml")
)
}
2 changes: 1 addition & 1 deletion tests/test_citeStructure.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from dapitains.local.citeStructure import CiteStructureParser
from dapitains.tei.citeStructure import CiteStructureParser
from dapitains.constants import PROCESSOR, get_xpath_proc
import os.path
import pytest
Expand Down
2 changes: 1 addition & 1 deletion tests/test_tei.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os.path

import pytest
from dapitains.local.tei import Document
from dapitains.tei.tei import Document
from lxml.etree import tostring

local_dir = os.path.join(os.path.dirname(__file__), "tei")
Expand Down

0 comments on commit b389e9f

Please sign in to comment.