From 4ee4791e661f8872b229278d49c3b6a1efccb4cf Mon Sep 17 00:00:00 2001 From: Nasty Date: Mon, 26 Aug 2024 18:09:38 +0300 Subject: [PATCH] TLDR-748 docs for classes --- .../default_structure_extractor.py | 3 + .../patterns/abstract_pattern.py | 33 ++++++- .../patterns/bracket_list_pattern.py | 37 +++++++- .../patterns/bracket_roman_list_pattern.py | 43 ++++++++- .../patterns/bullet_list_pattern.py | 39 +++++++- .../patterns/dotted_list_pattern.py | 46 ++++++++- .../patterns/letter_list_pattern.py | 49 +++++++++- .../patterns/regexp_pattern.py | 67 ++++++++++++- .../patterns/roman_list_pattern.py | 43 ++++++++- .../patterns/start_word_pattern.py | 61 +++++++++++- .../patterns/tag_header_pattern.py | 4 +- .../patterns/tag_list_pattern.py | 4 +- .../patterns/tag_pattern.py | 80 +++++++++++++++- docs/source/dedoc_api_usage/api.rst | 6 +- docs/source/index.rst | 5 +- docs/source/modules/structure_extractors.rst | 94 +++++++++++++++++++ docs/source/readers_output/line_types.rst | 10 +- docs/source/structure_types/other.rst | 5 + docs/source/tutorials/using_patterns.rst | 12 +++ 19 files changed, 608 insertions(+), 33 deletions(-) create mode 100644 docs/source/tutorials/using_patterns.rst diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py index 911780fb..3037476b 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py @@ -20,6 +20,9 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N Extract basic structure from the given document and add additional information to the lines' metadata. To get the information about the method's parameters look at the documentation of the class \ :class:`~dedoc.structure_extractors.AbstractStructureExtractor`. 
+ + The ``parameters`` parameter can contain patterns for configuring line types and their levels in the output document tree ("patterns" key). + Please see :ref:`dedoc_structure_extractors_patterns` and :ref:`using_patterns` to get information on how to use patterns for making your custom structure. """ parameters = {} if parameters is None else parameters patterns = self.__get_patterns(parameters) diff --git a/dedoc/structure_extractors/patterns/abstract_pattern.py b/dedoc/structure_extractors/patterns/abstract_pattern.py index 199eec22..20ac26de 100644 --- a/dedoc/structure_extractors/patterns/abstract_pattern.py +++ b/dedoc/structure_extractors/patterns/abstract_pattern.py @@ -1,14 +1,28 @@ from abc import ABC, abstractmethod -from typing import Optional +from typing import Optional, Union from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta class AbstractPattern(ABC): + """ + Base class for all patterns to configure structure extraction by :class:`~dedoc.structure_extractors.DefaultStructureExtractor`. + """ _name = "" - def __init__(self, line_type: Optional[str], level_1: Optional[int], level_2: Optional[int], can_be_multiline: Optional[bool or str]) -> None: + def __init__(self, line_type: Optional[str], level_1: Optional[int], level_2: Optional[int], can_be_multiline: Optional[Union[bool, str]]) -> None: + """ + Initialise pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes. + They can be used in :meth:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern.get_hierarchy_level` + according to specific pattern logic. + + :param line_type: type of the line, e.g. "header", "bullet_list_item", "chapter", etc. 
+ :param level_1: value of a line primary importance + :param level_2: level of the line inside specific class + :param can_be_multiline: is used to unify lines inside tree node by :class:`~dedoc.structure_constructors.TreeConstructor`, + if line can be multiline, it can be joined with another line. If ``None`` is given, can_be_multiline is set to ``True``. + """ from dedoc.utils.parameter_utils import get_bool_value self._line_type = line_type @@ -18,12 +32,27 @@ def __init__(self, line_type: Optional[str], level_1: Optional[int], level_2: Op @classmethod def name(cls: "AbstractPattern") -> str: + """ + Returns the ``_name`` attribute, which is used in parameters configuration to choose a specific pattern. + Each pattern has a unique non-empty name. + """ return cls._name @abstractmethod def match(self, line: LineWithMeta) -> bool: + """ + Check if the given line satisfies the pattern requirements. + Line text, annotations or metadata (``metadata.tag_hierarchy_level``) can be used to decide whether the line matches the pattern or not. + """ pass @abstractmethod def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + """ + This method should be applied only when :meth:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern.match` + returned ``True`` for the given line. + + Get :class:`~dedoc.data_structures.HierarchyLevel` for initialising ``line.metadata.hierarchy_level`` attribute. + Please see :ref:`add_structure_type_hierarchy_level` to get more information about :class:`~dedoc.data_structures.HierarchyLevel`. 
+ """ pass diff --git a/dedoc/structure_extractors/patterns/bracket_list_pattern.py b/dedoc/structure_extractors/patterns/bracket_list_pattern.py index 524d2fe1..35a78d98 100644 --- a/dedoc/structure_extractors/patterns/bracket_list_pattern.py +++ b/dedoc/structure_extractors/patterns/bracket_list_pattern.py @@ -1,11 +1,44 @@ -from typing import Optional +from typing import Optional, Union from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern class BracketListPattern(RegexpPattern): + """ + Pattern for matching numbered lists with brackets, e.g. + + :: + + 1) first element + 2) second element + + Example of library usage: + + .. code-block:: python + + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import BracketListPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [BracketListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. 
code-block:: python + + import requests + + patterns = [{"name": "bracket_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ _name = "bracket_list" - def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None: + def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: super().__init__(regexp=BracketPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py b/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py index 35965cd6..281299e1 100644 --- a/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py +++ b/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py @@ -1,11 +1,50 @@ -from typing import Optional +from typing import Optional, Union from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_roman_prefix import BracketRomanPrefix from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern class BracketRomanListPattern(RegexpPattern): + """ + Pattern for matching roman lists with brackets, e.g. + + :: + + i) first item + ii) second item + iii) third item + iv) fourth item + + .. note:: + + The pattern is case-insensitive (lowercase and uppercase letters are not distinguished). + + Example of library usage: + + .. code-block:: python + + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import BracketRomanListPattern + + reader = ... 
+ structure_extractor = DefaultStructureExtractor() + patterns = [BracketRomanListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "bracket_roman_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ _name = "bracket_roman_list" - def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None: + def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: super().__init__(regexp=BracketRomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/bullet_list_pattern.py b/dedoc/structure_extractors/patterns/bullet_list_pattern.py index e0dacd4b..0695abc1 100644 --- a/dedoc/structure_extractors/patterns/bullet_list_pattern.py +++ b/dedoc/structure_extractors/patterns/bullet_list_pattern.py @@ -1,11 +1,46 @@ -from typing import Optional +from typing import Optional, Union from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern class BulletListPattern(RegexpPattern): + """ + Pattern for matching bulleted lists, e.g. + + :: + + - first item + - second item + + or with other bullet markers ``-, —, −, –, ®, ., •, ,, ‚, ©, ⎯, °, *, >, ●, ♣, ①, ▪, *, +``. + + Example of library usage: + + .. 
code-block:: python + + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import BulletListPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [BulletListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "bullet_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ _name = "bullet_list" - def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None: + def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: super().__init__(regexp=BulletPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/dotted_list_pattern.py b/dedoc/structure_extractors/patterns/dotted_list_pattern.py index ef4a071b..d085ddb2 100644 --- a/dedoc/structure_extractors/patterns/dotted_list_pattern.py +++ b/dedoc/structure_extractors/patterns/dotted_list_pattern.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Union from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta @@ -7,9 +7,51 @@ class DottedListPattern(RegexpPattern): + """ + Pattern for matching numbered lists with dots, e.g. + + :: + + 1. first element + 1.1. first sub-element + 1.2. second sub-element + 2. 
second element + + The number of dots is unlimited. + There is no ``level_2`` parameter in this pattern, ``level_2`` is calculated as the number of numbers between dots, e.g. + + * ``1.`` → ``level_2=1`` + * ``1.1`` or ``1.1.`` → ``level_2=2`` + * ``1.2.3.4`` or ``1.2.3.4.`` → ``level_2=4`` + + Example of library usage: + + .. code-block:: python + + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import DottedListPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [DottedListPattern(line_type="list_item", level_1=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "dotted_list", "line_type": "list_item", "level_1": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ _name = "dotted_list" - def __init__(self, line_type: str, level_1: int, can_be_multiline: Optional[bool or str] = None) -> None: + def __init__(self, line_type: str, level_1: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: super().__init__(regexp=DottedPrefix.regexp, line_type=line_type, level_1=level_1, level_2=None, can_be_multiline=can_be_multiline) def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: diff --git a/dedoc/structure_extractors/patterns/letter_list_pattern.py b/dedoc/structure_extractors/patterns/letter_list_pattern.py index ae24ba73..b9c39591 100644 --- a/dedoc/structure_extractors/patterns/letter_list_pattern.py +++ b/dedoc/structure_extractors/patterns/letter_list_pattern.py @@ -1,11 +1,56 @@ -from typing import Optional +from typing import Optional, Union 
from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern class LetterListPattern(RegexpPattern): + """ + Pattern for matching lists with letters and brackets, e.g. + + :: + + a) first element + b) second element + + or (an example for the Armenian language) + + :: + + ա) տեղաբաշխել + բ) Հայաստանի Հանրապետության + գ) սահմանապահ վերակարգերի + + .. note:: + + The pattern is case-insensitive (lowercase and uppercase letters are not distinguished). + + Example of library usage: + + .. code-block:: python + + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import LetterListPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [LetterListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. 
code-block:: python + + import requests + + patterns = [{"name": "letter_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ _name = "letter_list" - def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None: + def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: super().__init__(regexp=AnyLetterPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/regexp_pattern.py b/dedoc/structure_extractors/patterns/regexp_pattern.py index b8fc2276..a456983a 100644 --- a/dedoc/structure_extractors/patterns/regexp_pattern.py +++ b/dedoc/structure_extractors/patterns/regexp_pattern.py @@ -1,5 +1,5 @@ import re -from typing import Optional +from typing import Optional, Union from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta @@ -7,6 +7,47 @@ class RegexpPattern(AbstractPattern): + """ + Pattern for matching line text by a regular expression. + + .. note:: + + The pattern is case-insensitive (lowercase and uppercase letters are not distinguished). + Before regular expression matching, the line text is stripped (space symbols are deleted from both sides). + + .. seealso:: + + Syntax for writing regular expressions is described in the `Python documentation <https://docs.python.org/3/library/re.html#regular-expression-syntax>`_. + + Example of library usage: + + .. code-block:: python + + import re + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import RegexpPattern + + reader = ... 
+ structure_extractor = DefaultStructureExtractor() + patterns = [ + RegexpPattern(regexp="^chapter\s\d+\.", line_type="chapter", level_1=1, can_be_multiline=False), + RegexpPattern(regexp=re.compile(r"^part\s\d+\.\d+\."), line_type="part", level_1=2, can_be_multiline=False) + ] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "regexp", "regexp": "^chapter\s\d+\.", "line_type": "chapter", "level_1": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ # noqa _name = "regexp" def __init__(self, @@ -14,14 +55,36 @@ def __init__(self, line_type: str, level_1: Optional[int] = None, level_2: Optional[int] = None, - can_be_multiline: Optional[bool or str] = None) -> None: + can_be_multiline: Optional[Union[bool, str]] = None) -> None: + """ + Initialise pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes. + + :param regexp: regular expression for checking, if the line text matches the pattern. + Note that regular expression is used on the lowercase and stripped line. + :param line_type: type of the line, e.g. "header", "bullet_list_item", "chapter", etc. + :param level_1: value of a line primary importance + :param level_2: level of the line inside specific class + :param can_be_multiline: is used to unify lines inside tree node by :class:`~dedoc.structure_constructors.TreeConstructor`, + if line can be multiline, it can be joined with another line. If ``None`` is given, can_be_multiline is set to ``True``. 
+ """ super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) self._regexp = re.compile(regexp) if isinstance(regexp, str) else regexp def match(self, line: LineWithMeta) -> bool: + """ + Check if the pattern is suitable for the given line. + Line text is checked by applying pattern's regular expression, text is stripped and made lowercase beforehand. + """ text = line.line.strip().lower() match = self._regexp.match(text) return match is not None def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + """ + This method should be applied only when :meth:`~dedoc.structure_extractors.patterns.RegexpPattern.match` + returned ``True`` for the given line. + + Return :class:`~dedoc.data_structures.HierarchyLevel` for initialising ``line.metadata.hierarchy_level``. + The attributes ``line_type``, ``level_1``, ``level_2``, ``can_be_multiline`` are equal to values given during class initialisation. + """ return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/roman_list_pattern.py b/dedoc/structure_extractors/patterns/roman_list_pattern.py index 619ba420..03e8761a 100644 --- a/dedoc/structure_extractors/patterns/roman_list_pattern.py +++ b/dedoc/structure_extractors/patterns/roman_list_pattern.py @@ -1,11 +1,50 @@ -from typing import Optional +from typing import Optional, Union from dedoc.structure_extractors.feature_extractors.list_features.prefix.roman_prefix import RomanPrefix from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern class RomanListPattern(RegexpPattern): + """ + Pattern for matching roman lists with dots, e.g. + + :: + + I. first item + II. second item + III. third item + IV. fourth item + + .. note:: + + The pattern is case-insensitive (lowercase and uppercase letters are not distinguished). + + Example of library usage: + + .. 
code-block:: python + + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import RomanListPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [RomanListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "roman_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ _name = "roman_list" - def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None: + def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: super().__init__(regexp=RomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/start_word_pattern.py b/dedoc/structure_extractors/patterns/start_word_pattern.py index a7b9b46f..27bb71c8 100644 --- a/dedoc/structure_extractors/patterns/start_word_pattern.py +++ b/dedoc/structure_extractors/patterns/start_word_pattern.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Union from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta @@ -6,6 +6,41 @@ class StartWordPattern(AbstractPattern): + """ + Pattern for lines that begin with some specific text (e.g. Introduction, Chapter, etc.). + + .. 
note:: + + The pattern is case-insensitive (lowercase and uppercase letters are not distinguished). + Before matching, the line text is stripped (space symbols are deleted from both sides). + Start word for matching is also stripped and made lowercase. + + Example of library usage: + + .. code-block:: python + + import re + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import StartWordPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [StartWordPattern(start_word="chapter", line_type="chapter", level_1=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "start_word", "start_word": "chapter", "line_type": "chapter", "level_1": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ _name = "start_word" def __init__(self, @@ -13,13 +48,35 @@ def __init__(self, line_type: str, level_1: Optional[int] = None, level_2: Optional[int] = None, - can_be_multiline: Optional[bool or str] = None) -> None: + can_be_multiline: Optional[Union[bool, str]] = None) -> None: + """ + Initialise pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes. + + :param start_word: string for checking of line text beginning. + Note that start_word will be stripped and made lowercase, and will be used on the lowercase and stripped line. + :param line_type: type of the line, e.g. "header", "bullet_list_item", "chapter", etc. 
+ :param level_1: value of a line primary importance + :param level_2: level of the line inside specific class + :param can_be_multiline: is used to unify lines inside tree node by :class:`~dedoc.structure_constructors.TreeConstructor`, + if line can be multiline, it can be joined with another line. If ``None`` is given, can_be_multiline is set to ``True``. + """ super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) self.__start_word = start_word.strip().lower() def match(self, line: LineWithMeta) -> bool: + """ + Check if the pattern is suitable for the given line. + Line text is checked if it starts with the given ``start_word``, text is stripped and made lowercase beforehand. + """ text = line.line.strip().lower() return text.startswith(self.__start_word) def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + """ + This method should be applied only when :meth:`~dedoc.structure_extractors.patterns.StartWordPattern.match` + returned ``True`` for the given line. + + Return :class:`~dedoc.data_structures.HierarchyLevel` for initialising ``line.metadata.hierarchy_level``. + The attributes ``line_type``, ``level_1``, ``level_2``, ``can_be_multiline`` are equal to values given during class initialisation. 
+ """ return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/tag_header_pattern.py b/dedoc/structure_extractors/patterns/tag_header_pattern.py index 927e148d..e6639d04 100644 --- a/dedoc/structure_extractors/patterns/tag_header_pattern.py +++ b/dedoc/structure_extractors/patterns/tag_header_pattern.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Union from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta @@ -12,7 +12,7 @@ def __init__(self, line_type: Optional[str] = None, level_1: Optional[int] = None, level_2: Optional[int] = None, - can_be_multiline: Optional[bool or str] = None, + can_be_multiline: Optional[Union[bool, str]] = None, default_line_type: str = HierarchyLevel.header, default_level_1: int = 1, default_level_2: Optional[int] = None) -> None: diff --git a/dedoc/structure_extractors/patterns/tag_list_pattern.py b/dedoc/structure_extractors/patterns/tag_list_pattern.py index 44da867c..d237143d 100644 --- a/dedoc/structure_extractors/patterns/tag_list_pattern.py +++ b/dedoc/structure_extractors/patterns/tag_list_pattern.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Union from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta @@ -12,7 +12,7 @@ def __init__(self, line_type: Optional[str] = None, level_1: Optional[int] = None, level_2: Optional[int] = None, - can_be_multiline: Optional[bool or str] = None, + can_be_multiline: Optional[Union[bool, str]] = None, default_line_type: str = HierarchyLevel.list_item, default_level_1: int = 2, default_level_2: Optional[int] = None) -> None: diff --git a/dedoc/structure_extractors/patterns/tag_pattern.py b/dedoc/structure_extractors/patterns/tag_pattern.py index 88adf384..9e59437e 
100644 --- a/dedoc/structure_extractors/patterns/tag_pattern.py +++ b/dedoc/structure_extractors/patterns/tag_pattern.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Union from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta @@ -6,16 +6,69 @@ class TagPattern(AbstractPattern): + """ + Pattern for using information from readers saved in ``line.metadata.tag_hierarchy_level``. + Can be useful for paragraph extraction in PDF documents and images, + because PDF and image readers save information about paragraphs in ``line.metadata.tag_hierarchy_level.can_be_multiline``. + + .. seealso:: + + Please see :ref:`readers_line_types` if you need information, which line types can be extracted by each reader. + + Example of library usage: + + .. code-block:: python + + import re + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import TagPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [TagPattern(default_line_type="raw_text")] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. 
code-block:: python + + import requests + + patterns = [{"name": "tag", "default_line_type": "raw_text"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ _name = "tag" def __init__(self, line_type: Optional[str] = None, level_1: Optional[int] = None, level_2: Optional[int] = None, - can_be_multiline: Optional[bool or str] = None, + can_be_multiline: Optional[Union[bool, str]] = None, default_line_type: str = HierarchyLevel.raw_text, default_level_1: Optional[int] = None, default_level_2: Optional[int] = None) -> None: + """ + Initialise pattern for configuring values of :class:`~dedoc.data_structures.HierarchyLevel` attributes. + It is recommended to configure ``default_*`` values in case ``line.metadata.tag_hierarchy_level`` miss some values. + If you want to use values from ``line.metadata.tag_hierarchy_level``, it is recommended to leave + ``line_type``, ``level_1``, ``level_2``, ``can_be_multiline`` empty. + + ``can_be_multiline`` is filled in PDF and images readers during paragraph detection, so if you want to extract paragraphs, + you shouldn't set ``can_be_multiline`` during pattern initialization. + + :param line_type: type of the line, replaces line_type from tag_hierarchy_level if non-empty. + :param level_1: value of a line primary importance, replaces level_1 from tag_hierarchy_level if non-empty. + :param level_2: level of the line inside specific class, replaces level_2 from tag_hierarchy_level if non-empty. + :param can_be_multiline: is used to unify lines inside tree node by :class:`~dedoc.structure_constructors.TreeConstructor`, + if line can be multiline, it can be joined with another line. If not None, replaces can_be_multiline from tag_hierarchy_level. + :param default_line_type: type of the line, is used when tag_hierarchy_level.line_type == "unknown". 
+ :param default_level_1: value of a line primary importance, is used when tag_hierarchy_level.level_1 is None. + :param default_level_2: level of the line inside specific class, is used when tag_hierarchy_level.level_2 is None. + """ super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) self._can_be_multiline_none = can_be_multiline is None self._default_line_type = default_line_type @@ -23,9 +76,32 @@ def __init__(self, self._default_level_2 = default_level_2 def match(self, line: LineWithMeta) -> bool: + """ + Check if the pattern is suitable for the given line: ``line.metadata.tag_hierarchy_level`` should not be empty. + ``line.metadata.tag_hierarchy_level`` is filled during reading step, some readers can skip ``tag_hierarchy_level`` initialisation. + """ return line.metadata.tag_hierarchy_level is not None def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + """ + This method should be applied only when :meth:`~dedoc.structure_extractors.patterns.TagPattern.match` + returned ``True`` for the given line. + + Return :class:`~dedoc.data_structures.HierarchyLevel` for initialising ``line.metadata.hierarchy_level``. + The attribute ``line_type`` is initialised according to the following rules: + + * if non-empty ``line_type`` is given during pattern initialisation, then its value is used in the result; + * if ``line_type`` is not given (or ``None`` is given) and ``line.metadata.tag_hierarchy_level`` is not ``unknown``, \ + the ``line_type`` value from ``line.metadata.tag_hierarchy_level`` is used in the result; + * otherwise (``line_type`` is empty and ``line.metadata.tag_hierarchy_level`` is ``unknown``) ``default_line_type`` value is used in the result. + + Similar rules work for ``level_1`` and ``level_2`` with comparing with ``None`` instead of ``unknown``. 
+ + The ``can_be_multiline`` attribute is initialised according to the following rules: + + * if non-empty ``can_be_multiline`` is given during pattern initialisation, then its value is used in the result; + * otherwise ``can_be_multiline`` value from ``line.metadata.tag_hierarchy_level`` is used in the result. + """ return HierarchyLevel( line_type=self._get_line_type(line), level_1=self._get_level_1(line), diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index 13cd0eaf..96896125 100644 --- a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -42,10 +42,10 @@ Post-requests should be sent to ``http://localhost:1231/upload``. "is_one_column_document": "true", "return_format": 'html' } - with open(filename, 'rb') as file: - files = {'file': (filename, file)} + with open(filename, "rb") as file: + files = {"file": (filename, file)} r = requests.post("http://localhost:1231/upload", files=files, data=data) - result = r.content.decode('utf-8') + result = r.content.decode("utf-8") The ``data`` dictionary in the example contains some parameters to parse the given file. They are described in the section :ref:`api_parameters`. diff --git a/docs/source/index.rst b/docs/source/index.rst index 0f2aed1d..b9dc7e2e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -213,6 +213,7 @@ Currently the following domains can be handled: For a document of unknown or unsupported domain there is an option to use default structure extractor (``document_type=other`` at :ref:`api_parameters`), the default document structure described :ref:`here `. +This type of structure is configurable (see :ref:`using_patterns`). .. toctree:: @@ -223,14 +224,16 @@ For a document of unknown or unsupported domain there is an option to use defaul getting_started/usage parameters/parameters + .. 
toctree:: :maxdepth: 1 :caption: Tutorials tutorials/add_new_doc_format tutorials/add_new_structure_type - tutorials/creating_document_classes tutorials/add_new_language + tutorials/creating_document_classes + tutorials/using_patterns .. toctree:: diff --git a/docs/source/modules/structure_extractors.rst b/docs/source/modules/structure_extractors.rst index 86ded2c3..14313786 100644 --- a/docs/source/modules/structure_extractors.rst +++ b/docs/source/modules/structure_extractors.rst @@ -64,3 +64,97 @@ dedoc.structure_extractors :members: .. autoattribute:: document_type + + +.. _dedoc_structure_extractors_patterns: + +Patterns for :class:`~dedoc.structure_extractors.DefaultStructureExtractor` +--------------------------------------------------------------------------- + +Structure patterns are used for more flexible configuration of line types and levels during the structure extraction step. +They are useful only for :class:`~dedoc.structure_extractors.DefaultStructureExtractor` (in API when "document_type"="other"). +Please see :ref:`using_patterns` for examples of pattern usage. + + +.. autoclass:: dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.RegexpPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.StartWordPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.TagPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.BracketListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. 
autoclass:: dedoc.structure_extractors.patterns.BracketRomanListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.BulletListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.DottedListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.LetterListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.RomanListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.TagHeaderPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.TagListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name diff --git a/docs/source/readers_output/line_types.rst b/docs/source/readers_output/line_types.rst index 666a8d35..7dc696af 100644 --- a/docs/source/readers_output/line_types.rst +++ b/docs/source/readers_output/line_types.rst @@ -4,8 +4,8 @@ Types of textual lines ====================== Each reader returns :class:`~dedoc.data_structures.UnstructuredDocument` with textual lines. -Readers don't fill ``hierarchy_level`` metadata field (structure extractors do this), but they can fill ``hierarchy_level_tag`` with information about line types. -Below the readers are enlisted that can return non-empty ``hierarchy_level_tag`` in document lines metadata: +Readers don't fill ``hierarchy_level`` metadata field (structure extractors do this), but they can fill ``tag_hierarchy_level`` with information about line types. 
+The readers that can return non-empty ``tag_hierarchy_level`` in document lines metadata are listed below: * `+` means that the reader can return lines of this type. * `-` means that the reader doesn't return lines of this type due to complexity of the task or lack of information provided by the format. @@ -42,7 +42,7 @@ Below the readers are enlisted that can return non-empty ``hierarchy_level_tag`` * - :class:`~dedoc.readers.RawTextReader` - `-` - - `+` + - `-` - `+` - `-` @@ -54,7 +54,7 @@ Below the readers are enlisted that can return non-empty ``hierarchy_level_tag`` * - :class:`~dedoc.readers.PdfImageReader` - `-` - - `+` + - `-` - `+` - `-` @@ -66,6 +66,6 @@ Below the readers are enlisted that can return non-empty ``hierarchy_level_tag`` * - :class:`~dedoc.readers.PdfTxtlayerReader` - `-` - - `+` + - `-` - `+` - `-` diff --git a/docs/source/structure_types/other.rst b/docs/source/structure_types/other.rst index 13a4e716..022fafd6 100644 --- a/docs/source/structure_types/other.rst +++ b/docs/source/structure_types/other.rst @@ -3,6 +3,11 @@ Default document structure type =============================== +.. note:: + + This structure type is configurable: you can change line types and levels in the tree hierarchy. + Please see :ref:`using_patterns` for more details. + Below we will consider document lines as nodes of the document tree. In some cases document lines are paragraphs of the text (e.g. in docx). diff --git a/docs/source/tutorials/using_patterns.rst b/docs/source/tutorials/using_patterns.rst new file mode 100644 index 00000000..3b25a3db --- /dev/null +++ b/docs/source/tutorials/using_patterns.rst @@ -0,0 +1,12 @@ +.. _using_patterns: + +Configure structure extraction using patterns +============================================= + + +Use patterns in Dedoc library +----------------------------- + + +Use patterns in Dedoc API +-------------------------