Adds verification of TagNode.location_path to the integration tests #87

Closed
wants to merge 10 commits into from
6 changes: 6 additions & 0 deletions .editorconfig
@@ -4,12 +4,18 @@ end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true

[*.md]
indent_style = space
indent_size = 4
max_line_length = 80

[*.py]
indent_style = space
indent_size = 4
max_line_length = 88

[*.rst]
indent_style = space
max_line_length = 80

[*.yml]
2 changes: 1 addition & 1 deletion Justfile
@@ -10,7 +10,7 @@ benchmarks:

# normalize Python code
black:
black benchmarks _delb delb tests
black benchmarks _delb delb integration-tests tests

# runs tests (except loaders) and reports uncovered lines
coverage-report:
18 changes: 15 additions & 3 deletions integration-tests/README.md
@@ -1,20 +1,27 @@
# Integration tests against corpora

This folder serves as a playground for tests of basic functionality against many
XML documents, mostly TEI-encodings. They are supposed to be executed with
XML documents, mostly TEI encodings. They are supposed to be executed with
major code change proposals and before releases.

The `requirements.txt` should contain a list of all libraries needed to run the
scripts besides `delb`.

## Test corpus

Place document collections into the `corpora` folder. The `fetch-corpora.py`
script helps to get going with the minimal requirement (~3GB) of tests.
script helps to get going with the minimal requirement (~6GB) of data.
Set the environment variable `SKIP_EXISTING` to any non-empty string to skip
downloading a corpus whose target folder already exists.
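
A hypothetical sketch of that check (the actual logic in `fetch-corpora.py`
may differ):

```python
import os
from pathlib import Path


def skip_corpus(target_directory: Path) -> bool:
    # any non-empty value of SKIP_EXISTING skips corpora whose target folder
    # already exists
    return bool(os.environ.get("SKIP_EXISTING")) and target_directory.exists()
```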

Due to the `lb` tag [issue](https://github.com/deutschestextarchiv/dtabf/issues/33)
with the DTABf, the DTA corpus isn't considered. It could be an experiment to
use *delb* for transformations with regard to the conclusions of that issue.

The `normalize-corpora.py` script addresses issues that were found in the text
encodings and must be run before the tests.
encodings and must be run after fetching test data.
The corpus folder names can be passed as arguments to the script in order to
process only those contents.

## Tests

@@ -25,3 +32,8 @@ reserves a `report.txt` for messages redirected from *stdout*.

When problems occur, carefully investigate whether they are caused by the
source data; if not, extract sufficiently simple cases for the unit tests.

## TODO

After adding the third kind of test, wrap all scripts here into a
[textual](https://textual.textualize.io) app.
7 changes: 5 additions & 2 deletions integration-tests/fetch-corpora.py
@@ -30,6 +30,7 @@
from typing import Final, NamedTuple

from httpx import AsyncClient, HTTPError
from tenacity import retry, wait_random_exponential

import delb

@@ -49,7 +50,8 @@ class Archive(NamedTuple):

ARCHIVE_DESCRIPTIONS: Final = (
Archive(
url="https://github.com/Brown-University-Library/atalanta-texts/archive/master.tar.gz",
url="https://github.com/Brown-University-Library/atalanta-texts/archive"
"/master.tar.gz",
archive_documents_root="atalanta-texts-master/",
target_directory="atalanta",
),
@@ -221,7 +223,7 @@ class Archive(NamedTuple):
url="https://github.com/funkyfuture/idp.data/archive/master.tar.gz",
# TODO use this URL when https://github.com/papyri/idp.data/pull/391
# was merged:
# url="https://github.com/papyri/idp.data/archive/master.tar.gz",
# url="https://github.com/papyri/idp.data/archive/master.tar.gz", # noqa: E800
archive_documents_root="idp.data-master/",
target_directory="papyri",
),
@@ -243,6 +245,7 @@ class Archive(NamedTuple):
http_client: Final = AsyncClient()


@retry(wait=wait_random_exponential(multiplier=1, max=120))
async def fetch_resource(url: str, destination: io.BufferedWriter) -> bool:
async with http_client.stream("GET", url, follow_redirects=True) as response:
try:
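
A minimal sketch of the retry behaviour that the added tenacity decorator
provides; the stop condition here is an illustrative assumption, whereas the
decorator above retries indefinitely:

import httpx
from tenacity import retry, stop_after_attempt, wait_random_exponential


@retry(
    wait=wait_random_exponential(multiplier=1, max=120),
    stop=stop_after_attempt(5),  # assumed for illustration, not part of this change
)
async def fetch_bytes(client: httpx.AsyncClient, url: str) -> bytes:
    # each failed attempt waits a randomly jittered, exponentially growing
    # interval of at most 120 seconds before the coroutine is called again
    response = await client.get(url, follow_redirects=True)
    response.raise_for_status()
    return response.content
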
27 changes: 16 additions & 11 deletions integration-tests/normalize-corpora.py
@@ -5,9 +5,12 @@
from functools import partial
from pathlib import Path
from typing import Final
from sys import argv, stderr
from sys import argv

from tqdm import tqdm

CORPORA_PATH: Final = Path(__file__).parent.resolve() / "corpora"
RELEVANT_CORPORA: Final = ("casebooks", "papyri")

# the casebooks corpus uses an external, local .dtd file whose reference needs to be
# adjusted
@@ -31,28 +34,30 @@
cr_ent_to_lf = partial(re.compile(re.escape(b"
"), flags=re.IGNORECASE).subn, b"\n")


async def normalize_file(file: Path):
async def normalize_file(file: Path, indicate_progress: callable):
match file.parent.name:
case "casebooks":
contents, subs = adjust_casebooks_dtd_path(file.read_bytes())
case "papyri":
contents, subs = cr_ent_to_lf(file.read_bytes())
case _:
return
raise RuntimeError

if subs:
file.write_bytes(contents)
stderr.write("✓")
indicate_progress()


async def main():
root = CORPORA_PATH
if len(argv) > 1:
root /= argv[1]
print(root)
async with asyncio.TaskGroup() as tasks:
for file in root.rglob("*.xml"):
tasks.create_task(normalize_file(file))
corpora = [x for x in RELEVANT_CORPORA if x in argv[1:]] or RELEVANT_CORPORA
for folder in (CORPORA_PATH / x for x in corpora):
print(f"Normalizing contents of {folder}")
files = tuple(folder.glob("*.xml"))
progressbar = tqdm(total=len(files), mininterval=0.5, unit_scale=True)
async with asyncio.TaskGroup() as tasks:
for file in files:
tasks.create_task(normalize_file(file, progressbar.update))
progressbar.close()


if __name__ == "__main__":
3 changes: 3 additions & 0 deletions integration-tests/requirements.txt
@@ -0,0 +1,3 @@
httpx
tenacity
tqdm
93 changes: 93 additions & 0 deletions integration-tests/test-location-paths.py
@@ -0,0 +1,93 @@
#!/usr/bin/env python3

from __future__ import annotations

import multiprocessing as mp
import random
from itertools import batched, chain
from pathlib import Path
from typing import TYPE_CHECKING, Final

from tqdm import tqdm

from _delb.plugins.core_loaders import path_loader
from delb import is_tag_node, Document, FailedDocumentLoading, ParserOptions

if TYPE_CHECKING:
from collections.abc import Iterable

BATCH_SIZE: Final = 64
CPU_COUNT: Final = mp.cpu_count()

CORPORA_PATH: Final = Path(__file__).parent.resolve() / "corpora"

DOCUMENT_SAMPLES_PERCENT: Final = 25
LOCATIONS_PATHS_SAMPLES_PERCENT: Final = 25


def verify_location_paths(file: Path):
try:
document = Document(
file,
parser_options=ParserOptions(
collapse_whitespace=False, resolve_entities=False, unplugged=True
),
)
except FailedDocumentLoading as exc:
print(
f"\nFailed to load {file.name}: {exc.excuses[path_loader]}",
end="",
)
return

root = document.root
for node in chain((root,), root.iterate_descendants(is_tag_node)):
if random.randint(1, 100) > LOCATIONS_PATHS_SAMPLES_PERCENT:
continue

query_results = document.xpath(node.location_path)
if not (query_results.size == 1 and query_results.first is node):
print(
f"\nXPath query `{node.location_path}` in {file} yielded unexpected "
"results."
)


def dispatch_batch(files: Iterable[Path]):
for file in files:
try:
verify_location_paths(file)
except Exception as e:
print(f"\nUnhandled exception while testing {file}: {e}")


def main():
mp.set_start_method("forkserver")

all_files = tuple(CORPORA_PATH.rglob("*.xml"))
all_files_size = len(all_files)
sample_size = int(all_files_size / 100 * DOCUMENT_SAMPLES_PERCENT)
selected_files = random.choices(all_files, k=sample_size)
del all_files

dispatched_tasks = []
progressbar = tqdm(total=sample_size, mininterval=0.5, unit_scale=True)

with mp.Pool(CPU_COUNT) as pool:
for batch in batched(selected_files, n=BATCH_SIZE):
dispatched_tasks.append(pool.apply_async(dispatch_batch, (batch,)))
while len(dispatched_tasks) >= CPU_COUNT:
for task in (t for t in dispatched_tasks if t.ready()):
dispatched_tasks.remove(task)
progressbar.update(n=BATCH_SIZE)

print(
f"\n\nTested against {sample_size} *randomly* selected out of "
f"{len(all_files_size)} documents."
f"\n{LOCATIONS_PATHS_SAMPLES_PERCENT}% of the tag nodes' `location_path` "
f"attribute were verified per document."
)


if __name__ == "__main__":
main()
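
A minimal sketch of the invariant that this script samples on the corpora,
demonstrated against a tiny in-memory document; the API calls are the same
ones used above:

from delb import Document, is_tag_node

document = Document("<root><a/><a><b/></a></root>")
for node in document.root.iterate_descendants(is_tag_node):
    results = document.xpath(node.location_path)
    assert results.size == 1 and results.first is node
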
28 changes: 13 additions & 15 deletions integration-tests/test-parse-serialize-equality.py
@@ -3,9 +3,10 @@
import multiprocessing as mp
from itertools import batched
from pathlib import Path
from sys import stderr
from typing import Final

from tqdm import tqdm

from _delb.plugins.core_loaders import path_loader
from delb import compare_trees, Document, FailedDocumentLoading, ParserOptions

@@ -26,7 +27,8 @@ def parse_serialize_compare(file: Path):

try:
document = Document(
file, parser_options=ParserOptions(collapse_whitespace=False)
file,
parser_options=ParserOptions(collapse_whitespace=False, unplugged=True),
)
except FailedDocumentLoading as exc:
print(
@@ -53,45 +55,41 @@ def parse_serialize_compare(file: Path):
or not compare_trees(document.root, result_document.root)
):
print(f"\nUnequal document produced from: {file.name}", end="")
stderr.write("🕱")
# TODO? compare with lxml as well
else:
result_file.unlink()
stderr.write("✓")


def dispatch_batch(files_list: list[Path]):
for file in files_list:
try:
parse_serialize_compare(file)
except Exception as e:
print(f"\nUnhandled exception while testing {file}: {e}", end="")
print(f"\nUnhandled exception while testing {file}: {e}")


def main():
counter = 0
mp.set_start_method("forkserver")

all_files = tuple(CORPORA_PATH.rglob("*.xml"))
dispatched_tasks = []
progressbar = tqdm(total=len(all_files), mininterval=0.5, unit_scale=True)

with mp.Pool(CPU_COUNT) as pool:
for file_list in batched(CORPORA_PATH.rglob("*.xml"), n=BATCH_SIZE):
for file_list in batched(all_files, n=BATCH_SIZE):
dispatched_tasks.append(
pool.apply_async(
dispatch_batch,
(file_list,),
)
)

counter += 1

while len(dispatched_tasks) >= CPU_COUNT:
for task in dispatched_tasks:
if task.ready():
dispatched_tasks.remove(task)

stderr.flush()
for task in (t for t in dispatched_tasks if t.ready()):
dispatched_tasks.remove(task)
progressbar.update(n=BATCH_SIZE)

print(f"\n\nTested against ~{counter*BATCH_SIZE} documents.")
print(f"\n\nTested against {len(all_files)} documents.")


if __name__ == "__main__":
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -167,7 +167,7 @@ dependencies = [
]
detached = true
[tool.hatch.envs.linting.scripts]
check = "flake8 benchmarks _delb delb tests"
check = "flake8 benchmarks _delb delb integration-tests tests"

[tool.hatch.envs.mypy]
dependencies = [