Skip to content

Commit

Permalink
Merge pull request #5 from alexandrainst/domsdatabasen
Browse files Browse the repository at this point in the history
Domsdatabasen API
  • Loading branch information
saattrupdan authored Jul 17, 2024
2 parents 972c8bb + a87d4b5 commit 33edefd
Show file tree
Hide file tree
Showing 9 changed files with 2,834 additions and 35 deletions.
18 changes: 12 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,34 @@ name: CI

on:
pull_request:
types:
- opened
- synchronize
- reopened
- ready_for_review
branches:
- main

jobs:

lint:
if: github.event.pull_request.draft == false
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: jpetrucciani/black-check@master
- uses: actions/checkout@v4
- uses: jpetrucciani/ruff-check@main

pytest:
if: github.event.pull_request.draft == false
strategy:
matrix:
os: [ubuntu-latest]
python-version: ["3.11"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Install Poetry
run: pip3 install poetry==1.4.0
run: pipx install poetry==1.8.2

- name: Set up Python
uses: actions/setup-python@v4
Expand All @@ -37,4 +43,4 @@ jobs:
poetry install --no-interaction --no-cache
- name: Test with pytest
run: poetry run pytest --dist no -n 0
run: poetry run pytest
4 changes: 2 additions & 2 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Install Poetry
run: pip3 install poetry==1.4.0
run: pipx install poetry==1.8.2

- name: Set up Python
uses: actions/setup-python@v4
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,6 @@ models/*

# Dotenv file with name and email
.name_and_email


znotes.md
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,27 @@ To install the package simply write the following command in your favorite termi
pip install alexandra-ai-data
```

### Domsdatabasen
The processing part of the Domsdatabasen API depends on [poppler](https://poppler.freedesktop.org/). To install it on macOS, run the following command:

```
brew install poppler
```

This is only necessary if you use the API to get cases that are not in [the cached dataset](https://huggingface.co/datasets/alexandrainst/domsdatabasen).

## Quickstart

TODO
### Domsdatabasen usage example
```python
from alexandra_ai_data.domsdatabasen import Domsdatabasen

domsdatabasen = Domsdatabasen()

case = domsdatabasen.get_case(case_id="100")

print(case)
```


## Contributors
Expand Down
14 changes: 11 additions & 3 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,16 @@ help:
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' makefile | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

install: ## Install dependencies
@echo "Installing the 'ScandEval' project..."
@echo "Installing the 'AlexandraAI-data' project..."
@$(MAKE) --quiet install-brew
@$(MAKE) --quiet install-pipx
@$(MAKE) --quiet install-poetry
@$(MAKE) --quiet setup-poetry
@$(MAKE) --quiet setup-environment-variables
@echo "Installed the 'ScandEval' project."
@echo "Installed the 'AlexandraAI-data' project. If you want to use pre-commit hooks, run 'make install-pre-commit'."

install-pre-commit: ## Install pre-commit hooks
@poetry run pre-commit install

install-brew:
@if [ $$(uname) = "Darwin" ] && [ "$(shell which brew)" = "" ]; then \
Expand Down Expand Up @@ -72,7 +75,6 @@ install-poetry:
setup-poetry:
@poetry env use python3.11
@poetry install
@poetry run pre-commit install

setup-environment-variables:
@poetry run python src/scripts/fix_dot_env_file.py
Expand Down Expand Up @@ -123,6 +125,12 @@ publish-minor: bump-minor publish ## Publish a minor version

publish-patch: bump-patch publish ## Publish a patch version

lint: ## Lint code
@poetry run ruff check src --fix

type-check: ## Run type checking
@poetry run mypy src --install-types --non-interactive --ignore-missing-imports --show-error-codes --check-untyped-defs

test: ## Run tests
@poetry run pytest && poetry run readme-cov

Expand Down
2,615 changes: 2,592 additions & 23 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ license = "MIT"

[tool.poetry.dependencies]
python = ">=3.11,<3.13"
domsdatabasen = "^0.1.4"

[tool.poetry.group.dev.dependencies]
pytest = ">=7.4.2"
Expand Down
106 changes: 106 additions & 0 deletions src/alexandra_ai_data/config_domsdatabasen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""Configuration for the domsdatabasen package."""

from omegaconf import DictConfig, OmegaConf

# Static configuration for the ``domsdatabasen`` scrape/process/finalize
# pipeline. This mirrors the Hydra-style config object that the upstream
# ``domsdatabasen`` package (Scraper/Processor/DatasetBuilder) expects.
config: DictConfig = OmegaConf.create(
    {
        # Settings for the web-scraping stage (fetching cases from the
        # Domsdatabasen website).
        "scrape": {
            "paths": {"download_dir": "download_tmp/", "test_dir": "tmp/"},
            "force": False,
            "case_id": "1",
            "all": False,
            "start_case_id": "3962",
            # User-facing messages emitted by the scraper.
            "messages": {
                "give_correct_input": "Please specify either a 'case_id' or use 'all' to scrape all cases.\n",
                "done": "Scraping done!\n",
            },
            "test_case_name": "test_case",
            "test_case_id": "1",
            # Pause between requests — presumably rate limiting in seconds;
            # TODO confirm units against the domsdatabasen package.
            "sleep": 5,
            # Stop scraping after this many consecutive missing case pages.
            "max_consecutive_nonexistent_page_count": 100,
            "timeout_pdf_download": 10,
        },
        # Settings for the PDF-processing stage (anonymization-box and
        # underline detection, table handling, binarization). The numeric
        # values below are pixel-level tuning constants whose exact
        # semantics are defined by the upstream ``domsdatabasen``
        # processing code, not here — treat them as opaque tuning knobs.
        "process": {
            "paths": {
                "test_data_raw_dir": "tests/data/processor/raw/",
                "test_data_processed_dir": "tests/data/processor/processed",
                "blacklist": "data/blacklists/process.jsonl",
            },
            "force": False,
            "case_id": "1",
            "all": False,
            "start_case_id": "2732",
            "blacklist_flag": False,
            "test_case_id": "1",
            "page_number": False,
            # Whether to run processing on a GPU.
            "gpu": False,
            # Geometric constraints on candidate boxes (pixels).
            "max_y_difference": 25,
            "neighbor_distance_max": 1,
            "box_area_min": 2500,
            "box_height_min": 35,
            "box_height_upper": 110,
            "box_width_min": 35,
            "box_accept_ratio": 0.6,
            "box_split_white_space": 7,
            "shift_up": 0,
            "iou_overlap_threshold": 0.5,
            "indices_to_split_edge_min_length": 5,
            "edge_accept_ratio": 0.95,
            "indices_to_split_row_diff": 45,
            "max_scale": 1.5,
            "anonymized_box_crop_padding": 3,
            "make_split_between_overlapping_box_and_line_height_max": 30,
            "box_split_delta": 2,
            # Grayscale binarization thresholds (0-255) for the various
            # image-processing passes.
            "threshold_binarize_process_image": 50,
            "threshold_binarize_anonymized_boxes": 75,
            "threshold_binarize_empty_box": 100,
            "threshold_binarize_top_page": 230,
            "threshold_binarize_process_crop": 200,
            "threshold_gap": 11,
            "threshold_box_confidence": 0.3,
            "threshold_remove_boundary_height": 20,
            "threshold_remove_boundary_length": 50,
            "threshold_remove_boundary_closely_square": 3,
            "threshold_remove_boundary_too_few_pixels": 10,
            "threshold_footnote_height": 30,
            "invert_find_anonymized_boxes": False,
            "invert_find_underline_anonymizations": True,
            # Parameters for detecting underline-style anonymizations.
            "underline_length_min": 26,
            "underline_height_lower_bound": 2,
            "underline_height_upper_bound": 7,
            "underline_remove_pad": 1,
            "underline_box_height": 50,
            "underline_box_height_min": 32,
            "underline_box_expand": 3,
            "threshold_binarize_line_anonymization": 255,
            "line_start_ignore_col": 1250,
            "line_start_ignore_row": 3000,
            # Parameters for table-cell extraction.
            "remove_cell_border": 5,
            "cell_box_shrink": 5,
            "cell_box_crop_padding": 3,
            "cell_multiple_lines_gap_threshold": 10,
            "remove_table_border": 7,
            "threshold_binarize_process_before_table_search": 1,
            # Labels recording how an anonymization was detected.
            "origin_box": "box",
            "origin_underline": "underline",
            "page_from_top_to_this_row": 500,
            "logo_bbox_area_threshold": 50000,
        },
        # Settings for the dataset-finalization stage.
        "finalize": {"force": False},
        "domsdatabasen": {"url": "https://domsdatabasen.dk/#sag"},
        # Local data directories and the Hugging Face Hub dataset ID used
        # as the cache of already-processed cases.
        "paths": {
            "hf_hub": "alexandrainst/domsdatabasen",
            "data_raw_dir": "data/raw/",
            "data_processed_dir": "data/processed",
            "data_final_dir": "data/final",
        },
        "file_names": {
            "tabular_data": "tabular_data.json",
            "pdf_document": "document.pdf",
            "processed_data": "processed_data.json",
            "dataset": "dataset.jsonl",
        },
        # Canonical names for the supported anonymization methods.
        "anon_method": {"underline": "underline", "box": "box", "none": "none"},
        "testing": False,
    }
)
89 changes: 89 additions & 0 deletions src/alexandra_ai_data/domsdatabasen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""API for accessing processed data from Domsdatabasen."""

from logging import getLogger
from typing import Union

from datasets import load_dataset
from domsdatabasen import DatasetBuilder, Processor, Scraper
from omegaconf import DictConfig

from .config_domsdatabasen import config

logger = getLogger(__name__)


class Domsdatabasen:
"""API for accessing processed data from Domsdatabasen.
Attributes:
config (DictConfig):
Configuration settings object.
scraper (Scraper):
Scraper object for scraping data from Domsdatabasen.
processor (Processor):
Processor object for processing scraped data.
dataset_builder (DatasetBuilder):
DatasetBuilder object for building dataset samples.
dataset (Dataset):
Dataset of processed data from Domsdatabasen.
"""

def __init__(self):
"""Initialize."""
self.config: DictConfig = config
self.dataset = load_dataset("alexandrainst/domsdatabasen", split="train")

# The following objects will not be initialized until
# the first time they are needed.
self.scraper = None
self.processor = None
self.dataset_builder = None

def get_case(self, case_id: Union[str, int]) -> dict:
"""Get processed data for a case from Domsdatabasen.
If the case_id is already in the dataset, the data is returned from the dataset.
Else, the case will be scraped and processed.
Args:
case_id (str, int):
The case_id of the case to get data for.
Returns:
dataset_sample (dict):
Processed data for the case.
"""
if isinstance(case_id, int):
case_id = str(case_id)

# Check if case_id is already in dataset
for dataset_sample in self.dataset:
if dataset_sample["case_id"] == case_id:
logger.info(f"Found case_id {case_id} in cached dataset.")
return dataset_sample

# If case_id is not in dataset, scrape and process the case
logger.info(
f"Case_id {case_id} not found in cached dataset. "
"Scraping and processing the case..."
)
self._initialize_objects()
self.scraper.scrape(case_id=case_id)
processed_data = self.processor.process(case_id=case_id)
dataset_sample = self.dataset_builder.make_dataset_sample(
processed_data=processed_data
)

return dataset_sample

def _initialize_objects(self):
"""Initialize Scraper, Processor and DatasetBuilder objects.
We don't want to initialize these objects before they are needed.
"""
if self.scraper is not None:
return

self.scraper = Scraper(config=self.config)
self.processor = Processor(config=self.config)
self.dataset_builder = DatasetBuilder(config=self.config)

0 comments on commit 33edefd

Please sign in to comment.