Reference parsing and ingestion
PonteIneptique committed Aug 24, 2024
1 parent 2fd5fca commit 2c618eb
Showing 7 changed files with 162 additions and 32 deletions.
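
For orientation, the ingestion flow added by this commit can be driven roughly as follows. This is a minimal sketch based on the __main__ block of dapitains/app/ingest.py shown below, assuming that module's API; the database URI and catalog path are placeholders.

```python
import flask

from dapitains.app.database import db
from dapitains.app.ingest import store_catalog
from dapitains.metadata.xml_parser import ingest_catalog

app = flask.Flask(__name__)
app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///app.db"  # placeholder URI

db.init_app(app)
with app.app_context():
    db.create_all()
    # Parse the XML catalog, then store its collections; resources additionally
    # have their citation trees parsed via Document / citeStructure.
    catalog, _ = ingest_catalog("path/to/example-collection.xml")  # placeholder path
    store_catalog(catalog)
```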
40 changes: 14 additions & 26 deletions dapitains/app/database.py
@@ -1,15 +1,18 @@
try:
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy.ext.mutable import MutableDict
from sqlalchemy.ext.mutable import MutableDict, Mutable
from sqlalchemy.types import TypeDecorator, TEXT
import click
except ImportError:
print("This part of the package can only be imported with the web requirements.")
raise

import dapitains.metadata.classes as abstracts
from dapitains.metadata.xml_parser import Catalog
from dapitains.tei.document import Document
import json


db = SQLAlchemy()

parent_child_association = db.Table('parent_child_association',
@@ -18,20 +21,19 @@
)


class JSONEncodedDict(TypeDecorator):
class JSONEncoded(TypeDecorator):
"""Enables JSON storage by encoding and decoding on the fly."""
impl = TEXT

def process_bind_param(self, value, dialect):
if value is None:
return ''
elif isinstance(value, dict):
return None
else:
return json.dumps(value)
return value

def process_result_value(self, value, dialect):
if value is None:
return '""'
return None
return json.loads(value)

class Collection(db.Model):
@@ -43,8 +45,8 @@ class Collection(db.Model):
description = db.Column(db.String, nullable=True)
resource = db.Column(db.Boolean, default=False)
filepath = db.Column(db.String, nullable=True)
dublin_core = db.Column(MutableDict.as_mutable(JSONEncodedDict), nullable=True)
extensions = db.Column(MutableDict.as_mutable(JSONEncodedDict), nullable=True)
dublin_core = db.Column(JSONEncoded, nullable=True)
extensions = db.Column(JSONEncoded, nullable=True)

# One-to-one relationship with Navigation
navigation = db.relationship('Navigation', uselist=False, backref='collection', lazy='noload')
@@ -67,8 +69,8 @@ def from_class(cls, obj: abstracts.Collection) -> "Collection":
resource=obj.resource,
filepath=obj.filepath,
# We are dumping because it's not read or accessible
dublin_core=json.dumps([dub.json() for dub in obj.dublin_core]),
extensions=json.dumps([ext.json() for ext in obj.extension])
dublin_core=[dub.json() for dub in obj.dublin_core],
extensions=[ext.json() for ext in obj.extension]
)

class Navigation(db.Model):
@@ -79,19 +81,5 @@ class Navigation(db.Model):
default_tree = db.Column(db.String, nullable=True)

# JSON fields stored as TEXT
paths = db.Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False, default={})
references = db.Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False, default={})

if __name__ == "__main__":
import flask
import os
app = flask.Flask(__name__)

basedir = os.path.abspath(os.path.dirname(__file__))
db_path = os.path.join(basedir, 'app.db')
app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{db_path}'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False

db.init_app(app)
with app.app_context():
db.create_all()
paths = db.Column(JSONEncoded, nullable=False, default={})
references = db.Column(JSONEncoded, nullable=False, default={})
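
Aside: the new JSONEncoded type follows the standard SQLAlchemy TypeDecorator pattern, serializing values to JSON text on write and parsing them back on read. The following is a self-contained sketch of that pattern for readers unfamiliar with it (plain SQLAlchemy 1.4+, a hypothetical JSONEncodedDemo type, not the application's actual models):

```python
import json

from sqlalchemy import Column, Integer, Text, create_engine
from sqlalchemy.orm import Session, declarative_base
from sqlalchemy.types import TypeDecorator

Base = declarative_base()


class JSONEncodedDemo(TypeDecorator):
    """Store Python dicts/lists as JSON text; decode them back when loading."""
    impl = Text
    cache_ok = True

    def process_bind_param(self, value, dialect):
        return None if value is None else json.dumps(value)

    def process_result_value(self, value, dialect):
        return None if value is None else json.loads(value)


class Example(Base):
    __tablename__ = "example"
    id = Column(Integer, primary_key=True)
    data = Column(JSONEncodedDemo, nullable=True)


engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Example(data={"http://purl.org/dc/terms/title": ["Introduction"]}))
    session.commit()
    # The TEXT column comes back as a regular Python dict.
    assert session.query(Example).one().data == {"http://purl.org/dc/terms/title": ["Introduction"]}
```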
90 changes: 90 additions & 0 deletions dapitains/app/ingest.py
@@ -0,0 +1,90 @@
from typing import Dict, List, Optional, Any
from dapitains.app.database import Collection, Navigation, db
from dapitains.metadata.xml_parser import Catalog
from dapitains.tei.document import Document

def store_catalog(catalog: Catalog):
for identifier, collection in catalog.objects.items():
db.session.add(Collection.from_class(collection))
if collection.resource:
doc = Document(collection.filepath)
references = {
key: struct.find_refs(root=doc.xml, structure=struct.units) for key, struct in doc.citeStructure.items()
}


def get_member_by_path(data: List[Dict[str, Any]], path: List[int]) -> Optional[Dict[str, Any]]:
"""
Retrieve the member at the specified path in the nested data structure.
:param data: The nested data structure (list of dictionaries).
:param path: A list of indices that represent the path to the desired member.
:return: The member at the specified path, or None if the path is invalid.
"""
current_level = data

for index in path:
try:
current_level = current_level[index]
if 'members' in current_level:
current_level = current_level['members']
except (IndexError, KeyError):
return None

return current_level


def generate_paths(data: List[Dict[str, Any]], path: Optional[List[int]] = None) -> Dict[str, List[int]]:
"""
Generate a dictionary mapping each 'ref' in a nested data structure to its path.
The path is represented as a list of indices that show how to access each 'ref'
in the nested structure.
:param data: The nested data structure (list of dictionaries). Each dictionary
can have a 'ref' and/or 'members' key.
:param path: A list of indices representing the current path in the nested data
structure. Used internally for recursion. Defaults to None for the
initial call.
:return: A dictionary where each key is a 'ref' and each value is a list of indices
representing the path to that 'ref' in the nested structure.
"""
if path is None:
path = []

paths = {}

def recurse(items, current_path):
for index, item in enumerate(items):
ref = item.get('ref')
if ref:
# Record the path for the current reference
paths[ref] = current_path + [index]

members = item.get('members')
if members:
# Recurse into the 'members' list
recurse(members, current_path + [index])

recurse(data, [])
return paths

if __name__ == "__main__":
import flask
import os
from dapitains.metadata.xml_parser import ingest_catalog
app = flask.Flask(__name__)

basedir = os.path.abspath(os.path.dirname(__file__))
db_path = os.path.join(basedir, 'app.db')
app.config['SQLALCHEMY_DATABASE_URI'] = f'sqlite:///{db_path}'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False

db.init_app(app)
with app.app_context():
db.drop_all()
db.create_all()

catalog, _ = ingest_catalog("/home/thibault/dev/MyDapytains/tests/catalog/example-collection.xml")

store_catalog(catalog)
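
To make the two helpers above concrete, here is a small illustrative round trip on a hand-written reference tree. The data is invented; in practice the trees come from CiteStructureParser.find_refs, as in the tests further down.

```python
from dapitains.app.ingest import generate_paths, get_member_by_path

# A hand-written tree in the same shape as CitableUnit.json() output.
refs = [
    {"citeType": "book", "ref": "Luke", "members": [
        {"citeType": "chapter", "ref": "Luke 1", "members": [
            {"citeType": "verse", "ref": "Luke 1:1"},
            {"citeType": "verse", "ref": "Luke 1:2"},
        ]},
    ]},
]

paths = generate_paths(refs)
# Every ref maps to the list of indices needed to reach it:
# {'Luke': [0], 'Luke 1': [0, 0], 'Luke 1:1': [0, 0, 0], 'Luke 1:2': [0, 0, 1]}
assert paths["Luke 1:2"] == [0, 0, 1]

# get_member_by_path walks those indices back down the tree.
assert get_member_by_path(refs, paths["Luke 1:2"]) == {"citeType": "verse", "ref": "Luke 1:2"}
```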
4 changes: 2 additions & 2 deletions dapitains/tei/citeStructure.py
@@ -48,14 +48,14 @@ class CitableUnit:
dublinCore: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list))
extension: Dict[str, List[str]] = field(default_factory=lambda: defaultdict(list))

def to_dts(self):
def json(self):
out = {
"citeType": self.citeType,
"ref": self.ref
}
if self.children:
out["members"] = [
member.to_dts()
member.json()
for member in self.children
]
if self.dublinCore:
File renamed without changes: dapitains/tei/tei.py → dapitains/tei/document.py
6 changes: 3 additions & 3 deletions tests/test_citeStructure.py
@@ -58,7 +58,7 @@ def test_parsing():
# Generate XPath for "Luke 1" (partial match)
assert parser.generate_xpath("Luke") == "//body/div[@n='Luke']"

assert [root.to_dts() for root in parser.find_refs(root=TEI, structure=parser.units)] == [
assert [root.json() for root in parser.find_refs(root=TEI, structure=parser.units)] == [
{'citeType': 'book', 'ref': 'Luke', 'members': [
{'citeType': 'chapter', 'ref': 'Luke 1', 'members': [
{'citeType': 'verse', 'ref': 'Luke 1:1'},
@@ -82,7 +82,7 @@ def test_cite_data():
citeStructure = xpath.evaluate_single("/TEI/teiHeader/refsDecl[1]")
parser = CiteStructureParser(citeStructure)
refs = parser.find_refs(root=TEI, structure=parser.units)
refs = [ref.to_dts() for ref in refs]
refs = [ref.json() for ref in refs]
assert refs == [
{'citeType': 'book', 'ref': '1', 'dublinCore': {
'http://purl.org/dc/terms/title': ['Introduction', 'Introduction'],
@@ -106,7 +106,7 @@ def test_advanced_cite_data():
citeStructure = xpath.evaluate_single("/TEI/teiHeader/refsDecl[1]")
parser = CiteStructureParser(citeStructure)
refs = parser.find_refs(root=TEI, structure=parser.units)
refs = [ref.to_dts() for ref in refs]
refs = [ref.json() for ref in refs]
assert refs == [
{'citeType': 'part', 'ref': 'part-1', 'members': [
{'citeType': 'book', 'ref': 'part-1.1', 'dublinCore': {
52 changes: 52 additions & 0 deletions tests/test_db_create.py
@@ -0,0 +1,52 @@
import flask
from dapitains.app.ingest import generate_paths, get_member_by_path
from dapitains.tei.document import Document
import os


local_dir = os.path.join(os.path.dirname(__file__))


def test_simple_path():
"""Check that a document can be parsed and that path are corrects"""
doc = Document(f"{local_dir}/tei/multiple_tree.xml")
refs = {
tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.units)]
for tree, obj in doc.citeStructure.items()
}
paths = {tree: generate_paths(ref) for tree, ref in refs.items()}
assert paths == {
'nums': {
'I': [0], '1': [1], 'A': [2], '4': [3], 'V': [4]
},
None: {
'I': [0], '1': [1], 'A': [2], '4': [3], 'V': [4]
},
'alpha': {
'div-a1': [0], 'div-002': [1], 'div-xyz': [2], 'div-004': [3], 'div-v5': [4]
}
}
# Second part of the test
doc = Document(f"{local_dir}/tei/base_tei.xml")
refs = {
tree: [ref.json() for ref in obj.find_refs(doc.xml, structure=obj.units)]
for tree, obj in doc.citeStructure.items()
}
paths = {tree: generate_paths(ref) for tree, ref in refs.items()}
assert paths == {
None: {
"Luke": [0],
"Luke 1": [0, 0],
"Luke 1:1": [0, 0, 0],
"Luke 1:2": [0, 0, 1],
"Luke 1#1": [0, 0, 2],
"Mark": [1],
"Mark 1": [1, 0],
"Mark 1:1": [1, 0, 0],
"Mark 1:2": [1, 0, 1],
"Mark 1#1": [1, 0, 2],
"Mark 1:3": [1, 0, 3]
}
}
assert get_member_by_path(refs[None], paths[None]["Mark 1:3"]) == {'citeType': 'verse', 'ref': 'Mark 1:3'}

2 changes: 1 addition & 1 deletion tests/test_tei.py
@@ -1,7 +1,7 @@
import os.path

import pytest
from dapitains.tei.tei import Document
from dapitains.tei.document import Document
from lxml.etree import tostring

local_dir = os.path.join(os.path.dirname(__file__), "tei")
