Adds verification of TagNode.location_path to the integration tests #87

Closed
wants to merge 10 commits into from
6 changes: 6 additions & 0 deletions .editorconfig
@@ -4,12 +4,18 @@ end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true

[*.md]
indent_style = space
indent_size = 4
max_line_length = 80

[*.py]
indent_style = space
indent_size = 4
max_line_length = 88

[*.rst]
indent_style = space
max_line_length = 80

[*.yml]
2 changes: 1 addition & 1 deletion Justfile
@@ -10,7 +10,7 @@ benchmarks:

# normalize Python code
black:
black benchmarks _delb delb tests
black benchmarks _delb delb integration-tests tests

# runs tests (except loaders) and reports uncovered lines
coverage-report:
18 changes: 15 additions & 3 deletions integration-tests/README.md
@@ -1,20 +1,27 @@
# Integration tests against corpora

This folder serves as a playground for tests of basic functionality against many
XML documents, mostly TEI-encodings. They are supposed to be executed with
XML documents, mostly TEI encodings. They are supposed to be executed with
major code change proposals and before releases.

The `requirements.txt` should contain a list of all libraries needed to run the
scripts besides `delb`.

## Test corpus

Place document collections into the `corpora` folder. The `fetch-corpora.py`
script helps to get going with the minimal requirement (~3GB) of tests.
script helps to get going with the minimal requirement (~6GB) of data.
Set the environment variable `SKIP_EXISTING` to any non-empty string to skip
downloading a corpus whose target folder already exists.
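
A hypothetical sketch of that check (the actual logic in `fetch-corpora.py`
may differ):

```python
import os
from pathlib import Path


def skip_corpus(target_directory: Path) -> bool:
    # any non-empty value of SKIP_EXISTING skips corpora whose target folder
    # already exists
    return bool(os.environ.get("SKIP_EXISTING")) and target_directory.exists()
```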

Due to the `lb` tag [issue](https://github.com/deutschestextarchiv/dtabf/issues/33)
with the DTABf, the DTA corpus isn't considered. It could be an experiment to
use *delb* for transformations with regard to the conclusions of that issue.

The `normalize-corpora.py` script addresses issues that were found in the text
encodings and must be run before the tests.
encodings and must be run after fetching test data.
The corpus folder names can be passed as arguments to the script in order to
process only those contents.

## Tests

@@ -25,3 +32,8 @@ reserves a `report.txt` for messages redirected from *stdout*.

When problems occur, carefully investigate whether they are caused by the
source data; if not, extract sufficiently simple cases for the unit tests.

## TODO

After adding the third kind of test, wrap all scripts here into a
[textual](https://textual.textualize.io) app.
7 changes: 5 additions & 2 deletions integration-tests/fetch-corpora.py
@@ -30,6 +30,7 @@
from typing import Final, NamedTuple

from httpx import AsyncClient, HTTPError
from tenacity import retry, wait_random_exponential

import delb

@@ -49,7 +50,8 @@ class Archive(NamedTuple):

ARCHIVE_DESCRIPTIONS: Final = (
Archive(
url="https://github.com/Brown-University-Library/atalanta-texts/archive/master.tar.gz",
url="https://github.com/Brown-University-Library/atalanta-texts/archive"
"/master.tar.gz",
archive_documents_root="atalanta-texts-master/",
target_directory="atalanta",
),
@@ -221,7 +223,7 @@ class Archive(NamedTuple):
url="https://github.com/funkyfuture/idp.data/archive/master.tar.gz",
# TODO use this URL when https://github.com/papyri/idp.data/pull/391
# was merged:
# url="https://github.com/papyri/idp.data/archive/master.tar.gz",
# url="https://github.com/papyri/idp.data/archive/master.tar.gz", # noqa: E800
archive_documents_root="idp.data-master/",
target_directory="papyri",
),
@@ -243,6 +245,7 @@ class Archive(NamedTuple):
http_client: Final = AsyncClient()


@retry(wait=wait_random_exponential(multiplier=1, max=120))
async def fetch_resource(url: str, destination: io.BufferedWriter) -> bool:
async with http_client.stream("GET", url, follow_redirects=True) as response:
try:
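
A minimal sketch of the retry behaviour that the added tenacity decorator
provides; the stop condition here is an illustrative assumption, whereas the
decorator above retries indefinitely:

import httpx
from tenacity import retry, stop_after_attempt, wait_random_exponential


@retry(
    wait=wait_random_exponential(multiplier=1, max=120),
    stop=stop_after_attempt(5),  # assumed for illustration, not part of this change
)
async def fetch_bytes(client: httpx.AsyncClient, url: str) -> bytes:
    # each failed attempt waits a randomly jittered, exponentially growing
    # interval of at most 120 seconds before the coroutine is called again
    response = await client.get(url, follow_redirects=True)
    response.raise_for_status()
    return response.content
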
27 changes: 16 additions & 11 deletions integration-tests/normalize-corpora.py
@@ -5,9 +5,12 @@
from functools import partial
from pathlib import Path
from typing import Final
from sys import argv, stderr
from sys import argv

from tqdm import tqdm

CORPORA_PATH: Final = Path(__file__).parent.resolve() / "corpora"
RELEVANT_CORPORA: Final = ("casebooks", "papyri")

# the casebooks corpus uses an external, local .dtd file whose reference needs to be
# adjusted
@@ -31,28 +34,30 @@
cr_ent_to_lf = partial(re.compile(re.escape(b"
"), flags=re.IGNORECASE).subn, b"\n")


async def normalize_file(file: Path):
async def normalize_file(file: Path, indicate_progress: callable):
match file.parent.name:
case "casebooks":
contents, subs = adjust_casebooks_dtd_path(file.read_bytes())
case "papyri":
contents, subs = cr_ent_to_lf(file.read_bytes())
case _:
return
raise RuntimeError

if subs:
file.write_bytes(contents)
stderr.write("✓")
indicate_progress()


async def main():
root = CORPORA_PATH
if len(argv) > 1:
root /= argv[1]
print(root)
async with asyncio.TaskGroup() as tasks:
for file in root.rglob("*.xml"):
tasks.create_task(normalize_file(file))
corpora = [x for x in RELEVANT_CORPORA if x in argv[1:]] or RELEVANT_CORPORA
for folder in (CORPORA_PATH / x for x in corpora):
print(f"Normalizing contents of {folder}")
files = tuple(folder.glob("*.xml"))
progressbar = tqdm(total=len(files), mininterval=0.5, unit_scale=True)
async with asyncio.TaskGroup() as tasks:
for file in files:
tasks.create_task(normalize_file(file, progressbar.update))
progressbar.close()


if __name__ == "__main__":
3 changes: 3 additions & 0 deletions integration-tests/requirements.txt
@@ -0,0 +1,3 @@
httpx
tenacity
tqdm
93 changes: 93 additions & 0 deletions integration-tests/test-location-paths.py
@@ -0,0 +1,93 @@
#!/usr/bin/env python3

from __future__ import annotations

import multiprocessing as mp
import random
from itertools import batched, chain
from pathlib import Path
from typing import TYPE_CHECKING, Final

from tqdm import tqdm

from _delb.plugins.core_loaders import path_loader
from delb import is_tag_node, Document, FailedDocumentLoading, ParserOptions

if TYPE_CHECKING:
from collections.abc import Iterable

BATCH_SIZE: Final = 64
CPU_COUNT: Final = mp.cpu_count()

CORPORA_PATH: Final = Path(__file__).parent.resolve() / "corpora"

DOCUMENT_SAMPLES_PERCENT: Final = 25
LOCATIONS_PATHS_SAMPLES_PERCENT: Final = 25


def verify_location_paths(file: Path):
try:
document = Document(
file,
parser_options=ParserOptions(
collapse_whitespace=False, resolve_entities=False, unplugged=True
),
)
except FailedDocumentLoading as exc:
print(
f"\nFailed to load {file.name}: {exc.excuses[path_loader]}",
end="",
)
return

root = document.root
for node in chain((root,), root.iterate_descendants(is_tag_node)):
if random.randint(1, 100) > LOCATIONS_PATHS_SAMPLES_PERCENT:
continue

query_results = document.xpath(node.location_path)
if not (query_results.size == 1 and query_results.first is node):
print(
f"\nXPath query `{node.location_path}` in {file} yielded unexpected "
"results."
)


def dispatch_batch(files: Iterable[Path]):
for file in files:
try:
verify_location_paths(file)
except Exception as e:
print(f"\nUnhandled exception while testing {file}: {e}")


def main():
mp.set_start_method("forkserver")

all_files = tuple(CORPORA_PATH.rglob("*.xml"))
all_files_size = len(all_files)
sample_size = int(all_files_size / 100 * DOCUMENT_SAMPLES_PERCENT)
selected_files = random.choices(all_files, k=sample_size)
del all_files

dispatched_tasks = []
progressbar = tqdm(total=sample_size, mininterval=0.5, unit_scale=True)

with mp.Pool(CPU_COUNT) as pool:
for batch in batched(selected_files, n=BATCH_SIZE):
dispatched_tasks.append(pool.apply_async(dispatch_batch, (batch,)))
while len(dispatched_tasks) >= CPU_COUNT:
for task in (t for t in dispatched_tasks if t.ready()):
dispatched_tasks.remove(task)
progressbar.update(n=BATCH_SIZE)

print(
f"\n\nTested against {sample_size} *randomly* selected out of "
f"{len(all_files_size)} documents."
f"\n{LOCATIONS_PATHS_SAMPLES_PERCENT}% of the tag nodes' `location_path` "
f"attribute were verified per document."
)


if __name__ == "__main__":
main()
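
A minimal sketch of the invariant that this script samples on the corpora,
demonstrated against a tiny in-memory document; the API calls are the same
ones used above:

from delb import Document, is_tag_node

document = Document("<root><a/><a><b/></a></root>")
for node in document.root.iterate_descendants(is_tag_node):
    results = document.xpath(node.location_path)
    assert results.size == 1 and results.first is node
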
28 changes: 13 additions & 15 deletions integration-tests/test-parse-serialize-equality.py
@@ -3,9 +3,10 @@
import multiprocessing as mp
from itertools import batched
from pathlib import Path
from sys import stderr
from typing import Final

from tqdm import tqdm

from _delb.plugins.core_loaders import path_loader
from delb import compare_trees, Document, FailedDocumentLoading, ParserOptions

@@ -26,7 +27,8 @@ def parse_serialize_compare(file: Path):

try:
document = Document(
file, parser_options=ParserOptions(collapse_whitespace=False)
file,
parser_options=ParserOptions(collapse_whitespace=False, unplugged=True),
)
except FailedDocumentLoading as exc:
print(
@@ -53,45 +55,41 @@ def parse_serialize_compare(file: Path):
or not compare_trees(document.root, result_document.root)
):
print(f"\nUnequal document produced from: {file.name}", end="")
stderr.write("🕱")
# TODO? compare with lxml as well
else:
result_file.unlink()
stderr.write("✓")


def dispatch_batch(files_list: list[Path]):
for file in files_list:
try:
parse_serialize_compare(file)
except Exception as e:
print(f"\nUnhandled exception while testing {file}: {e}", end="")
print(f"\nUnhandled exception while testing {file}: {e}")


def main():
counter = 0
mp.set_start_method("forkserver")

all_files = tuple(CORPORA_PATH.rglob("*.xml"))
dispatched_tasks = []
progressbar = tqdm(total=len(all_files), mininterval=0.5, unit_scale=True)

with mp.Pool(CPU_COUNT) as pool:
for file_list in batched(CORPORA_PATH.rglob("*.xml"), n=BATCH_SIZE):
for file_list in batched(all_files, n=BATCH_SIZE):
dispatched_tasks.append(
pool.apply_async(
dispatch_batch,
(file_list,),
)
)

counter += 1

while len(dispatched_tasks) >= CPU_COUNT:
for task in dispatched_tasks:
if task.ready():
dispatched_tasks.remove(task)

stderr.flush()
for task in (t for t in dispatched_tasks if t.ready()):
dispatched_tasks.remove(task)
progressbar.update(n=BATCH_SIZE)

print(f"\n\nTested against ~{counter*BATCH_SIZE} documents.")
print(f"\n\nTested against {len(all_files)} documents.")


if __name__ == "__main__":
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -167,7 +167,7 @@ dependencies = [
]
detached = true
[tool.hatch.envs.linting.scripts]
check = "flake8 benchmarks _delb delb tests"
check = "flake8 benchmarks _delb delb integration-tests tests"

[tool.hatch.envs.mypy]
dependencies = [