Skip to content

Commit

Permalink
Merge pull request #13 from alexandrainst/scrape-date
Browse files Browse the repository at this point in the history
Scrape date
  • Loading branch information
oliverkinch authored Mar 6, 2024
2 parents 3ed0083 + 4fc3a2a commit d41be79
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 0 deletions.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ extend-select = [
"I",
"D",
]
exclude = [
"src/doms_databasen/_xpaths.py",
]

[tool.ruff.pydocstyle]
convention = "google"
Expand Down
1 change: 1 addition & 0 deletions src/doms_databasen/_xpaths.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"Accept cookies": "//a[@id='CybotCookiebotDialogBodyLevelButtonLevelOptinAllowAll']",
"Øvrige sagsoplysninger": "//span[@class='accordion-title'][contains(text(), 'Øvrige sagsoplysninger')]",
"Sagen er ikke tilgængelig": "//h1[contains(text(), 'Sagen er ikke tilgængelig')]",
"Dato": "//tr[@tabindex='0']//td[1]",
}
XPATHS_TABULAR_DATA = {
"Overskrift": "//h4[contains(text(), 'Overskrift')]/following-sibling::span[1]/p",
Expand Down
20 changes: 20 additions & 0 deletions src/doms_databasen/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import logging
import os
import re
import shutil
import time
from pathlib import Path
Expand Down Expand Up @@ -255,8 +256,27 @@ def _get_tabular_data(self) -> dict:
element = self.driver.find_element(By.XPATH, xpath)
tabular_data[key] = element.text.strip()

# The date of the case is not part of the tabular data table,
# but we include it alongside the tabular data here.
tabular_data["Dato"] = self._get_date()

return tabular_data

def _get_date(self) -> str:
    """Extract the date of the case from the current page.

    Looks up the element addressed by ``XPATHS["Dato"]`` and searches its
    stripped text for the first substring shaped like ``dd-mm-yyyy``.

    Returns:
        The matched date string, or an empty string when the element's
        text contains no such pattern.
    """
    date_cell = self.driver.find_element(By.XPATH, XPATHS["Dato"])
    cell_text = date_cell.text.strip()
    # The page shows the date on the form "dd-mm-yyyy".
    match = re.search(r"\d{2}-\d{2}-\d{4}", cell_text)
    return match.group() if match else ""

def _accept_cookies(self) -> None:
"""Accepts cookies on the page."""
element = WebDriverWait(self.driver, self.config.scrape.sleep).until(
Expand Down

0 comments on commit d41be79

Please sign in to comment.