Skip to content

Commit

Permalink
TLDR-748 docs for classes
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget committed Aug 26, 2024
1 parent c505eaa commit 4ee4791
Show file tree
Hide file tree
Showing 19 changed files with 608 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N
Extract basic structure from the given document and add additional information to the lines' metadata.
To get the information about the method's parameters look at the documentation of the class \
:class:`~dedoc.structure_extractors.AbstractStructureExtractor`.
The ``parameters`` argument can contain patterns for configuring line types and their levels in the output document tree ("patterns" key).
Please see :ref:`dedoc_structure_extractors_patterns` and :ref:`using_patterns` for information on how to use patterns to build your custom structure.
"""
parameters = {} if parameters is None else parameters
patterns = self.__get_patterns(parameters)
Expand Down
33 changes: 31 additions & 2 deletions dedoc/structure_extractors/patterns/abstract_pattern.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,28 @@
from abc import ABC, abstractmethod
from typing import Optional
from typing import Optional, Union

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta


class AbstractPattern(ABC):
"""
Base class for all patterns to configure structure extraction by :class:`~dedoc.structure_extractors.DefaultStructureExtractor`.
"""
_name = ""

def __init__(self, line_type: Optional[str], level_1: Optional[int], level_2: Optional[int], can_be_multiline: Optional[bool or str]) -> None:
def __init__(self, line_type: Optional[str], level_1: Optional[int], level_2: Optional[int], can_be_multiline: Optional[Union[bool, str]]) -> None:
"""
Initialise pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
They can be used in :meth:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern.get_hierarchy_level`
according to specific pattern logic.
:param line_type: type of the line, e.g. "header", "bullet_list_item", "chapter", etc.
:param level_1: value of the line's primary importance
:param level_2: level of the line inside a specific class
:param can_be_multiline: is used by :class:`~dedoc.structure_constructors.TreeConstructor` to unify lines inside a tree node:
if a line can be multiline, it can be joined with another line. If ``None`` is given, can_be_multiline is set to ``True``.
"""
from dedoc.utils.parameter_utils import get_bool_value

self._line_type = line_type
Expand All @@ -18,12 +32,27 @@ def __init__(self, line_type: Optional[str], level_1: Optional[int], level_2: Op

@classmethod
def name(cls: "AbstractPattern") -> str:
"""
Returns the ``_name`` attribute, which is used in the parameters configuration to choose a specific pattern.
Each pattern has a unique non-empty name.
"""
return cls._name

@abstractmethod
def match(self, line: LineWithMeta) -> bool:
"""
Check if the given line satisfies the pattern requirements.
Line text, annotations or metadata (``metadata.tag_hierarchy_level``) can be used to decide whether the line matches the pattern or not.
"""
pass

@abstractmethod
def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
"""
This method should be applied only when :meth:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern.match`
returned ``True`` for the given line.
Get :class:`~dedoc.data_structures.HierarchyLevel` for initialising ``line.metadata.hierarchy_level`` attribute.
Please see :ref:`add_structure_type_hierarchy_level` to get more information about :class:`~dedoc.data_structures.HierarchyLevel`.
"""
pass
37 changes: 35 additions & 2 deletions dedoc/structure_extractors/patterns/bracket_list_pattern.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,44 @@
from typing import Optional
from typing import Optional, Union

from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern


class BracketListPattern(RegexpPattern):
"""
Pattern for matching numbered lists with brackets, e.g.
::
1) first element
2) second element
Example of library usage:
.. code-block:: python
from dedoc.structure_extractors import DefaultStructureExtractor
from dedoc.structure_extractors.patterns import BracketListPattern
reader = ...
structure_extractor = DefaultStructureExtractor()
patterns = [BracketListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)]
document = reader.read(file_path=file_path)
document = structure_extractor.extract(document=document, parameters={"patterns": patterns})
Example of API usage:
.. code-block:: python
import requests
patterns = [{"name": "bracket_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}]
parameters = {"patterns": str(patterns)}
with open(file_path, "rb") as file:
files = {"file": (file_name, file)}
r = requests.post("http://localhost:1231/upload", files=files, data=parameters)
"""
_name = "bracket_list"

def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None:
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None:
super().__init__(regexp=BracketPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
43 changes: 41 additions & 2 deletions dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,50 @@
from typing import Optional
from typing import Optional, Union

from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_roman_prefix import BracketRomanPrefix
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern


class BracketRomanListPattern(RegexpPattern):
"""
Pattern for matching roman lists with brackets, e.g.
::
i) first item
ii) second item
iii) third item
iv) fourth item
.. note::
The pattern is case-insensitive (lowercase and uppercase letters are not distinguished).
Example of library usage:
.. code-block:: python
from dedoc.structure_extractors import DefaultStructureExtractor
from dedoc.structure_extractors.patterns import BracketRomanListPattern
reader = ...
structure_extractor = DefaultStructureExtractor()
patterns = [BracketRomanListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)]
document = reader.read(file_path=file_path)
document = structure_extractor.extract(document=document, parameters={"patterns": patterns})
Example of API usage:
.. code-block:: python
import requests
patterns = [{"name": "bracket_roman_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}]
parameters = {"patterns": str(patterns)}
with open(file_path, "rb") as file:
files = {"file": (file_name, file)}
r = requests.post("http://localhost:1231/upload", files=files, data=parameters)
"""
_name = "bracket_roman_list"

def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None:
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None:
super().__init__(regexp=BracketRomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
39 changes: 37 additions & 2 deletions dedoc/structure_extractors/patterns/bullet_list_pattern.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,46 @@
from typing import Optional
from typing import Optional, Union

from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern


class BulletListPattern(RegexpPattern):
"""
Pattern for matching bulleted lists, e.g.
::
- first item
- second item
or with other bullet markers ``-, —, −, –, ®, ., •, ,, ‚, ©, ⎯, °, *, >, ●, ♣, ①, ▪, *, +``.
Example of library usage:
.. code-block:: python
from dedoc.structure_extractors import DefaultStructureExtractor
from dedoc.structure_extractors.patterns import BulletListPattern
reader = ...
structure_extractor = DefaultStructureExtractor()
patterns = [BulletListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)]
document = reader.read(file_path=file_path)
document = structure_extractor.extract(document=document, parameters={"patterns": patterns})
Example of API usage:
.. code-block:: python
import requests
patterns = [{"name": "bullet_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}]
parameters = {"patterns": str(patterns)}
with open(file_path, "rb") as file:
files = {"file": (file_name, file)}
r = requests.post("http://localhost:1231/upload", files=files, data=parameters)
"""
_name = "bullet_list"

def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None:
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None:
super().__init__(regexp=BulletPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
46 changes: 44 additions & 2 deletions dedoc/structure_extractors/patterns/dotted_list_pattern.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
from typing import Optional, Union

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
Expand All @@ -7,9 +7,51 @@


class DottedListPattern(RegexpPattern):
"""
Pattern for matching numbered lists with dots, e.g.
::
1. first element
1.1. first sub-element
1.2. second sub-element
2. second element
The number of dots is unlimited.
This pattern has no ``level_2`` parameter: ``level_2`` is calculated as the count of numbers separated by dots, e.g.
* ``1.`` → ``level_2=1``
* ``1.1`` or ``1.1.`` → ``level_2=2``
* ``1.2.3.4`` or ``1.2.3.4.`` → ``level_2=4``
Example of library usage:
.. code-block:: python
from dedoc.structure_extractors import DefaultStructureExtractor
from dedoc.structure_extractors.patterns import DottedListPattern
reader = ...
structure_extractor = DefaultStructureExtractor()
patterns = [DottedListPattern(line_type="list_item", level_1=1, can_be_multiline=False)]
document = reader.read(file_path=file_path)
document = structure_extractor.extract(document=document, parameters={"patterns": patterns})
Example of API usage:
.. code-block:: python
import requests
patterns = [{"name": "dotted_list", "line_type": "list_item", "level_1": 1, "can_be_multiline": "false"}]
parameters = {"patterns": str(patterns)}
with open(file_path, "rb") as file:
files = {"file": (file_name, file)}
r = requests.post("http://localhost:1231/upload", files=files, data=parameters)
"""
_name = "dotted_list"

def __init__(self, line_type: str, level_1: int, can_be_multiline: Optional[bool or str] = None) -> None:
def __init__(self, line_type: str, level_1: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None:
super().__init__(regexp=DottedPrefix.regexp, line_type=line_type, level_1=level_1, level_2=None, can_be_multiline=can_be_multiline)

def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
Expand Down
49 changes: 47 additions & 2 deletions dedoc/structure_extractors/patterns/letter_list_pattern.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,56 @@
from typing import Optional
from typing import Optional, Union

from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern


class LetterListPattern(RegexpPattern):
"""
Pattern for matching lists with letters and brackets, e.g.
::
a) first element
b) second element
or (example for Armenian language)
::
ա) տեղաբաշխել
բ) Հայաստանի Հանրապետության
գ) սահմանապահ վերակարգերի
.. note::
The pattern is case-insensitive (lowercase and uppercase letters are not distinguished).
Example of library usage:
.. code-block:: python
from dedoc.structure_extractors import DefaultStructureExtractor
from dedoc.structure_extractors.patterns import LetterListPattern
reader = ...
structure_extractor = DefaultStructureExtractor()
patterns = [LetterListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)]
document = reader.read(file_path=file_path)
document = structure_extractor.extract(document=document, parameters={"patterns": patterns})
Example of API usage:
.. code-block:: python
import requests
patterns = [{"name": "letter_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}]
parameters = {"patterns": str(patterns)}
with open(file_path, "rb") as file:
files = {"file": (file_name, file)}
r = requests.post("http://localhost:1231/upload", files=files, data=parameters)
"""
_name = "letter_list"

def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None:
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None:
super().__init__(regexp=AnyLetterPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
Loading

0 comments on commit 4ee4791

Please sign in to comment.