Skip to content

Commit

Permalink
Merge pull request #5 from alexandrainst/domsdatabasen
Browse files Browse the repository at this point in the history
Domsdatabasen API
  • Loading branch information
saattrupdan authored Jul 17, 2024
2 parents 972c8bb + a87d4b5 commit 33edefd
Show file tree
Hide file tree
Showing 9 changed files with 2,834 additions and 35 deletions.
18 changes: 12 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,34 @@ name: CI

on:
pull_request:
types:
- opened
- synchronize
- reopened
- ready_for_review
branches:
- main

jobs:

lint:
if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: jpetrucciani/black-check@master
- uses: actions/checkout@v4
- uses: jpetrucciani/ruff-check@main

pytest:
if: github.event.pull_request.draft == false
strategy:
matrix:
os: [ubuntu-latest]
python-version: ["3.11"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Install Poetry
run: pip3 install poetry==1.4.0
run: pipx install poetry==1.8.2

- name: Set up Python
uses: actions/setup-python@v4
Expand All @@ -37,4 +43,4 @@ jobs:
poetry install --no-interaction --no-cache
- name: Test with pytest
run: poetry run pytest --dist no -n 0
run: poetry run pytest
4 changes: 2 additions & 2 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Install Poetry
run: pip3 install poetry==1.4.0
run: pipx install poetry==1.8.2

- name: Set up Python
uses: actions/setup-python@v4
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,6 @@ models/*

# Dotenv file with name and email
.name_and_email


znotes.md
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,27 @@ To install the package simply write the following command in your favorite termi
pip install alexandra-ai-data
```

### Domsdatabasen
The processing part of the Domsdatabasen API depends on [poppler](https://poppler.freedesktop.org/). To install it on macOS, run the following command:

```
brew install poppler
```

This is only necessary if you use the API to get cases that are not in [the cached dataset](https://huggingface.co/datasets/alexandrainst/domsdatabasen).

## Quickstart

TODO
### Domsdatabasen usage example
```python
from alexandra_ai_data.domsdatabasen import Domsdatabasen

domsdatabasen = Domsdatabasen()

case = domsdatabasen.get_case(case_id="100")

print(case)
```


## Contributors
Expand Down
14 changes: 11 additions & 3 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,16 @@ help:
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' makefile | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

install: ## Install dependencies
@echo "Installing the 'ScandEval' project..."
@echo "Installing the 'AlexandraAI-data' project..."
@$(MAKE) --quiet install-brew
@$(MAKE) --quiet install-pipx
@$(MAKE) --quiet install-poetry
@$(MAKE) --quiet setup-poetry
@$(MAKE) --quiet setup-environment-variables
@echo "Installed the 'ScandEval' project."
@echo "Installed the 'AlexandraAI-data' project. If you want to use pre-commit hooks, run 'make install-pre-commit'."

install-pre-commit: ## Install pre-commit hooks
@poetry run pre-commit install

install-brew:
@if [ $$(uname) = "Darwin" ] && [ "$(shell which brew)" = "" ]; then \
Expand Down Expand Up @@ -72,7 +75,6 @@ install-poetry:
setup-poetry:
@poetry env use python3.11
@poetry install
@poetry run pre-commit install

setup-environment-variables:
@poetry run python src/scripts/fix_dot_env_file.py
Expand Down Expand Up @@ -123,6 +125,12 @@ publish-minor: bump-minor publish ## Publish a minor version

publish-patch: bump-patch publish ## Publish a patch version

lint: ## Lint code
@poetry run ruff check src --fix

type-check: ## Run type checking
@poetry run mypy src --install-types --non-interactive --ignore-missing-imports --show-error-codes --check-untyped-defs

test: ## Run tests
@poetry run pytest && poetry run readme-cov

Expand Down
2,615 changes: 2,592 additions & 23 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ license = "MIT"

[tool.poetry.dependencies]
python = ">=3.11,<3.13"
domsdatabasen = "^0.1.4"

[tool.poetry.group.dev.dependencies]
pytest = ">=7.4.2"
Expand Down
106 changes: 106 additions & 0 deletions src/alexandra_ai_data/config_domsdatabasen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""Configuration for the domsdatabasen package."""

from omegaconf import DictConfig, OmegaConf

# Static configuration for the ``domsdatabasen`` scrape/process/finalize
# pipeline. This mirrors the Hydra-style config object that the upstream
# ``domsdatabasen`` package (Scraper/Processor/DatasetBuilder) expects.
config: DictConfig = OmegaConf.create(
    {
        # Settings for the web-scraping stage (fetching cases from the
        # Domsdatabasen website).
        "scrape": {
            "paths": {"download_dir": "download_tmp/", "test_dir": "tmp/"},
            "force": False,
            "case_id": "1",
            "all": False,
            "start_case_id": "3962",
            # User-facing messages emitted by the scraper.
            "messages": {
                "give_correct_input": "Please specify either a 'case_id' or use 'all' to scrape all cases.\n",
                "done": "Scraping done!\n",
            },
            "test_case_name": "test_case",
            "test_case_id": "1",
            # Pause between requests — presumably rate limiting in seconds;
            # TODO confirm units against the domsdatabasen package.
            "sleep": 5,
            # Stop scraping after this many consecutive missing case pages.
            "max_consecutive_nonexistent_page_count": 100,
            "timeout_pdf_download": 10,
        },
        # Settings for the PDF-processing stage (anonymization-box and
        # underline detection, table handling, binarization). The numeric
        # values below are pixel-level tuning constants whose exact
        # semantics are defined by the upstream ``domsdatabasen``
        # processing code, not here — treat them as opaque tuning knobs.
        "process": {
            "paths": {
                "test_data_raw_dir": "tests/data/processor/raw/",
                "test_data_processed_dir": "tests/data/processor/processed",
                "blacklist": "data/blacklists/process.jsonl",
            },
            "force": False,
            "case_id": "1",
            "all": False,
            "start_case_id": "2732",
            "blacklist_flag": False,
            "test_case_id": "1",
            "page_number": False,
            # Whether to run processing on a GPU.
            "gpu": False,
            # Geometric constraints on candidate boxes (pixels).
            "max_y_difference": 25,
            "neighbor_distance_max": 1,
            "box_area_min": 2500,
            "box_height_min": 35,
            "box_height_upper": 110,
            "box_width_min": 35,
            "box_accept_ratio": 0.6,
            "box_split_white_space": 7,
            "shift_up": 0,
            "iou_overlap_threshold": 0.5,
            "indices_to_split_edge_min_length": 5,
            "edge_accept_ratio": 0.95,
            "indices_to_split_row_diff": 45,
            "max_scale": 1.5,
            "anonymized_box_crop_padding": 3,
            "make_split_between_overlapping_box_and_line_height_max": 30,
            "box_split_delta": 2,
            # Grayscale binarization thresholds (0-255) for the various
            # image-processing passes.
            "threshold_binarize_process_image": 50,
            "threshold_binarize_anonymized_boxes": 75,
            "threshold_binarize_empty_box": 100,
            "threshold_binarize_top_page": 230,
            "threshold_binarize_process_crop": 200,
            "threshold_gap": 11,
            "threshold_box_confidence": 0.3,
            "threshold_remove_boundary_height": 20,
            "threshold_remove_boundary_length": 50,
            "threshold_remove_boundary_closely_square": 3,
            "threshold_remove_boundary_too_few_pixels": 10,
            "threshold_footnote_height": 30,
            "invert_find_anonymized_boxes": False,
            "invert_find_underline_anonymizations": True,
            # Parameters for detecting underline-style anonymizations.
            "underline_length_min": 26,
            "underline_height_lower_bound": 2,
            "underline_height_upper_bound": 7,
            "underline_remove_pad": 1,
            "underline_box_height": 50,
            "underline_box_height_min": 32,
            "underline_box_expand": 3,
            "threshold_binarize_line_anonymization": 255,
            "line_start_ignore_col": 1250,
            "line_start_ignore_row": 3000,
            # Parameters for table-cell extraction.
            "remove_cell_border": 5,
            "cell_box_shrink": 5,
            "cell_box_crop_padding": 3,
            "cell_multiple_lines_gap_threshold": 10,
            "remove_table_border": 7,
            "threshold_binarize_process_before_table_search": 1,
            # Labels recording how an anonymization was detected.
            "origin_box": "box",
            "origin_underline": "underline",
            "page_from_top_to_this_row": 500,
            "logo_bbox_area_threshold": 50000,
        },
        # Settings for the dataset-finalization stage.
        "finalize": {"force": False},
        "domsdatabasen": {"url": "https://domsdatabasen.dk/#sag"},
        # Local data directories and the Hugging Face Hub dataset ID used
        # as the cache of already-processed cases.
        "paths": {
            "hf_hub": "alexandrainst/domsdatabasen",
            "data_raw_dir": "data/raw/",
            "data_processed_dir": "data/processed",
            "data_final_dir": "data/final",
        },
        "file_names": {
            "tabular_data": "tabular_data.json",
            "pdf_document": "document.pdf",
            "processed_data": "processed_data.json",
            "dataset": "dataset.jsonl",
        },
        # Canonical names for the supported anonymization methods.
        "anon_method": {"underline": "underline", "box": "box", "none": "none"},
        "testing": False,
    }
)
89 changes: 89 additions & 0 deletions src/alexandra_ai_data/domsdatabasen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""API for accessing processed data from Domsdatabasen."""

from logging import getLogger
from typing import Union

from datasets import load_dataset
from domsdatabasen import DatasetBuilder, Processor, Scraper
from omegaconf import DictConfig

from .config_domsdatabasen import config

logger = getLogger(__name__)


class Domsdatabasen:
"""API for accessing processed data from Domsdatabasen.
Attributes:
config (DictConfig):
Configuration settings object.
scraper (Scraper):
Scraper object for scraping data from Domsdatabasen.
processor (Processor):
Processor object for processing scraped data.
dataset_builder (DatasetBuilder):
DatasetBuilder object for building dataset samples.
dataset (Dataset):
Dataset of processed data from Domsdatabasen.
"""

def __init__(self):
"""Initialize."""
self.config: DictConfig = config
self.dataset = load_dataset("alexandrainst/domsdatabasen", split="train")

# The following objects will not be initialized until
# the first time they are needed.
self.scraper = None
self.processor = None
self.dataset_builder = None

def get_case(self, case_id: Union[str, int]) -> dict:
"""Get processed data for a case from Domsdatabasen.
If the case_id is already in the dataset, the data is returned from the dataset.
Else, the case will be scraped and processed.
Args:
case_id (str, int):
The case_id of the case to get data for.
Returns:
dataset_sample (dict):
Processed data for the case.
"""
if isinstance(case_id, int):
case_id = str(case_id)

# Check if case_id is already in dataset
for dataset_sample in self.dataset:
if dataset_sample["case_id"] == case_id:
logger.info(f"Found case_id {case_id} in cached dataset.")
return dataset_sample

# If case_id is not in dataset, scrape and process the case
logger.info(
f"Case_id {case_id} not found in cached dataset. "
"Scraping and processing the case..."
)
self._initialize_objects()
self.scraper.scrape(case_id=case_id)
processed_data = self.processor.process(case_id=case_id)
dataset_sample = self.dataset_builder.make_dataset_sample(
processed_data=processed_data
)

return dataset_sample

def _initialize_objects(self):
"""Initialize Scraper, Processor and DatasetBuilder objects.
We don't want to initialize these objects before they are needed.
"""
if self.scraper is not None:
return

self.scraper = Scraper(config=self.config)
self.processor = Processor(config=self.config)
self.dataset_builder = DatasetBuilder(config=self.config)

0 comments on commit 33edefd

Please sign in to comment.