TLDR-748 review fixes

ispras · Sep 2, 2024 · b6617d9 · b6617d9
1 parent 2b5e47c
commit b6617d9
Show file tree

Hide file tree

Showing 17 changed files with 122 additions and 53 deletions.
diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -54,7 +54,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
     def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """
         The method return document content with all document's lines, tables and attachments.
-        This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
+        This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`
+        (the ``can_be_multiline`` attribute is important for paragraph extraction).
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
 
         You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.

diff --git a/dedoc/readers/txt_reader/raw_text_reader.py b/dedoc/readers/txt_reader/raw_text_reader.py
@@ -33,7 +33,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,
 
     def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """
-        This method returns only document lines, some types of the lines (e.g. `list_item`) may be found using regular expressions.
+        This method returns only document lines.
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
         """
         parameters = {} if parameters is None else parameters

diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py
@@ -1,10 +1,11 @@
-from typing import List, Optional
+from typing import Optional
 
 from dedoc.common.exceptions.structure_extractor_error import StructureExtractorError
 from dedoc.data_structures.hierarchy_level import HierarchyLevel
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor
 from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern
+from dedoc.structure_extractors.patterns.pattern_composition import PatternComposition
 
 
 class DefaultStructureExtractor(AbstractStructureExtractor):
@@ -25,40 +26,36 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N
         Please see :ref:`dedoc_structure_extractors_patterns` and :ref:`using_patterns` to get information how to use patterns for making your custom structure.
         """
         parameters = {} if parameters is None else parameters
-        patterns = self.__get_patterns(parameters)
+        pattern_composition = self.__get_pattern_composition(parameters)
 
         for line in document.lines:
-            line_pattern = None
-            for pattern in patterns:
-                if pattern.match(line):
-                    line_pattern = pattern
-                    break
-
-            line.metadata.hierarchy_level = line_pattern.get_hierarchy_level(line) if line_pattern else HierarchyLevel.create_raw_text()
-            assert line.metadata.hierarchy_level is not None
-
+            line.metadata.hierarchy_level = pattern_composition.get_hierarchy_level(line=line)
         return document
 
-    def __get_patterns(self, parameters: dict) -> List[AbstractPattern]:
+    def __get_pattern_composition(self, parameters: dict) -> PatternComposition:
         patterns = parameters.get("patterns")
         if not patterns:
             from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern
             from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern
             from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern
             from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern
+            from dedoc.structure_extractors.patterns.roman_list_pattern import RomanListPattern
             from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern
             from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern
             from dedoc.structure_extractors.patterns.tag_pattern import TagPattern
 
-            return [
-                TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1, can_be_multiline=False),
-                TagListPattern(line_type=HierarchyLevel.list_item, default_level_1=2, can_be_multiline=False),
-                DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
-                BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
-                LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
-                BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
-                TagPattern(default_line_type=HierarchyLevel.raw_text)
-            ]
+            return PatternComposition(
+                patterns=[
+                    TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1, can_be_multiline=False),
+                    TagListPattern(line_type=HierarchyLevel.list_item, default_level_1=2, can_be_multiline=False),
+                    DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
+                    RomanListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
+                    BracketListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
+                    LetterListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
+                    BulletListPattern(line_type=HierarchyLevel.list_item, level_1=6, level_2=1, can_be_multiline=False),
+                    TagPattern(default_line_type=HierarchyLevel.raw_text)
+                ]
+            )
 
         import ast
         from dedoc.structure_extractors.patterns.utils import get_pattern
@@ -81,4 +78,4 @@ def __get_patterns(self, parameters: dict) -> List[AbstractPattern]:
             else:
                 raise StructureExtractorError(msg="Pattern should be dict or `AbstractPattern`")
 
-        return pattern_classes
+        return PatternComposition(patterns=pattern_classes)
diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py
@@ -15,7 +15,7 @@ class BracketRomanPrefix(LinePrefix):
     iv) forth item
     """
 
-    regexp = re.compile(r"^\s*[ivxl]\)")
+    regexp = re.compile(r"^\s*[ivxlcdm]\)")
     name = "roman"
 
     def __init__(self, prefix: str, indent: float) -> None:

diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py
@@ -15,7 +15,7 @@ class RomanPrefix(LinePrefix):
     IV. forth item
     """
 
-    regexp = re.compile(r"^\s*[ivxl]\.")
+    regexp = re.compile(r"^\s*[ivxlcdm]\.")
     name = "roman"
 
     def __init__(self, prefix: str, indent: float) -> None:

diff --git a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py
@@ -9,6 +9,7 @@
     AbstractBodyHierarchyLevelBuilder
 from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_digits_with_dots
 from dedoc.structure_extractors.patterns import BracketListPattern, BulletListPattern, DottedListPattern, LetterListPattern, TagListPattern, TagPattern
+from dedoc.structure_extractors.patterns.pattern_composition import PatternComposition
 
 
 class DiplomaBodyBuilder(AbstractHierarchyLevelBuilder):
@@ -17,14 +18,16 @@ class DiplomaBodyBuilder(AbstractHierarchyLevelBuilder):
     def __init__(self) -> None:
         super().__init__()
         self.digits_with_dots_regexp = regexps_digits_with_dots
-        self.patterns = [
-            TagListPattern(line_type=HierarchyLevel.list_item, default_level_1=2, can_be_multiline=False),
-            DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
-            BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
-            LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
-            BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
-            TagPattern(line_type=HierarchyLevel.raw_text)
-        ]
+        self.pattern_composition = PatternComposition(
+            [
+                TagListPattern(line_type=HierarchyLevel.list_item, default_level_1=2, can_be_multiline=False),
+                DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
+                BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
+                LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
+                BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
+                TagPattern(line_type=HierarchyLevel.raw_text)
+            ]
+        )
 
     def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]:
         if len(lines_with_labels) > 0:
@@ -51,7 +54,7 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s
             elif prediction == "raw_text":
                 line = self.__postprocess_raw_text(line, init_hl_depth)
                 if not (line.metadata.hierarchy_level is not None and line.metadata.hierarchy_level.line_type == "named_item"):
-                    line.metadata.hierarchy_level = self.__get_level_by_patterns(line)
+                    line.metadata.hierarchy_level = self.pattern_composition.get_hierarchy_level(line)
             else:
                 line.metadata.hierarchy_level = HierarchyLevel.create_raw_text()
                 line.metadata.hierarchy_level.line_type = prediction
@@ -75,15 +78,6 @@ def __handle_named_item(self, init_hl_depth: int, line: LineWithMeta, prediction
         line.metadata.hierarchy_level = hierarchy_level
         return line
 
-    def __get_level_by_patterns(self, line: LineWithMeta) -> HierarchyLevel:
-        line_pattern = None
-        for pattern in self.patterns:
-            if pattern.match(line):
-                line_pattern = pattern
-                break
-
-        return line_pattern.get_hierarchy_level(line) if line_pattern else HierarchyLevel.create_raw_text()
-
     def __postprocess_raw_text(self, line: LineWithMeta, init_hl_depth: int) -> LineWithMeta:
         text = line.line.strip().lower()
         if not text.startswith(self.named_item_keywords):

diff --git a/dedoc/structure_extractors/patterns/abstract_pattern.py b/dedoc/structure_extractors/patterns/abstract_pattern.py
@@ -13,7 +13,7 @@ class AbstractPattern(ABC):
 
     def __init__(self, line_type: Optional[str], level_1: Optional[int], level_2: Optional[int], can_be_multiline: Optional[Union[bool, str]]) -> None:
         """
-        Initialise pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
+        Initialize pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
         They can be used in :meth:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern.get_hierarchy_level`
         according to specific pattern logic.
 

diff --git a/dedoc/structure_extractors/patterns/pattern_composition.py b/dedoc/structure_extractors/patterns/pattern_composition.py
@@ -0,0 +1,56 @@
+from typing import List
+
+from dedoc.data_structures.hierarchy_level import HierarchyLevel
+from dedoc.data_structures.line_with_meta import LineWithMeta
+from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern
+
+
+class PatternComposition:
+    """
+    Class for applying patterns to get line's hierarchy level.
+
+    Example of usage:
+
+    .. code-block:: python
+
+        from dedoc.data_structures.line_with_meta import LineWithMeta
+        from dedoc.structure_extractors.patterns import TagListPattern, TagPattern
+        from dedoc.structure_extractors.patterns.pattern_composition import PatternComposition
+
+
+        pattern_composition = PatternComposition(
+            patterns=[
+                TagListPattern(line_type="list_item", default_level_1=2, can_be_multiline=False),
+                TagPattern(default_line_type="raw_text")
+            ]
+        )
+        line = LineWithMeta(line="Some text")
+        line.metadata.hierarchy_level = pattern_composition.get_hierarchy_level(line=line)
+    """
+    def __init__(self, patterns: List[AbstractPattern]) -> None:
+        """
+        Set the list of patterns to apply to lines.
+
+        **Note:** the order of the patterns is important. More specific patterns should go first.
+        Otherwise, they may be shadowed by earlier, more general patterns that also match the given line.
+
+        :param patterns: list of patterns to apply to lines.
+        """
+        self.patterns = patterns
+
+    def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
+        """
+        Choose the suitable pattern from the list of patterns for applying to the given line.
+        The first applicable pattern will be chosen.
+        If no applicable pattern is found, the default ``raw_text`` :class:`~dedoc.data_structures.HierarchyLevel` is returned.
+
+        :param line: line to get hierarchy level for.
+        """
+        line_pattern = None
+
+        for pattern in self.patterns:
+            if pattern.match(line):
+                line_pattern = pattern
+                break
+
+        return line_pattern.get_hierarchy_level(line) if line_pattern else HierarchyLevel.create_raw_text()
diff --git a/dedoc/structure_extractors/patterns/regexp_pattern.py b/dedoc/structure_extractors/patterns/regexp_pattern.py
@@ -57,7 +57,7 @@ def __init__(self,
                  level_2: Optional[int] = None,
                  can_be_multiline: Optional[Union[bool, str]] = None) -> None:
         """
-        Initialise pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
+        Initialize pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
 
         :param regexp: regular expression for checking, if the line text matches the pattern.
             Note that regular expression is used on the lowercase and stripped line.

diff --git a/dedoc/structure_extractors/patterns/start_word_pattern.py b/dedoc/structure_extractors/patterns/start_word_pattern.py
@@ -50,7 +50,7 @@ def __init__(self,
                  level_2: Optional[int] = None,
                  can_be_multiline: Optional[Union[bool, str]] = None) -> None:
         """
-        Initialise pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
+        Initialize pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
 
         :param start_word: string for checking of line text beginning.
             Note that start_word will be stripped and made lowercase, and will be used on the lowercase and stripped line.

diff --git a/dedoc/structure_extractors/patterns/tag_pattern.py b/dedoc/structure_extractors/patterns/tag_pattern.py
@@ -52,7 +52,7 @@ def __init__(self,
                  default_level_1: Optional[int] = None,
                  default_level_2: Optional[int] = None) -> None:
         """
-        Initialise pattern for configuring values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
+        Initialize pattern for configuring values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.
         It is recommended to configure ``default_*`` values in case ``line.metadata.tag_hierarchy_level`` miss some values.
         If you want to use values from ``line.metadata.tag_hierarchy_level``, it is recommended to leave
         ``line_type``, ``level_1``, ``level_2``, ``can_be_multiline`` empty.
@@ -88,7 +88,7 @@ def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
         returned ``True`` for the given line.
 
         Return :class:`~dedoc.data_structures.HierarchyLevel` for initialising ``line.metadata.hierarchy_level``.
-        The attribute ``line_type`` is initialised according to the following rules:
+        The attribute ``line_type`` is initialized according to the following rules:
 
         * if non-empty ``line_type`` is given during pattern initialisation, then its value is used in the result;
         * if ``line_type`` is not given (or ``None`` is given) and ``line.metadata.tag_hierarchy_level`` is not ``unknown``, \
@@ -97,7 +97,7 @@ def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
 
         Similar rules work for ``level_1`` and ``level_2`` with comparing with ``None`` instead of ``unknown``.
 
-        The ``can_be_multiline`` attribute is initialised according to the following rules:
+        The ``can_be_multiline`` attribute is initialized according to the following rules:
 
         * if non-empty ``can_be_multiline`` is given during pattern initialisation, then its value is used in the result;
         * otherwise ``can_be_multiline`` value from ``line.metadata.tag_hierarchy_level`` is used in the result.

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -46,7 +46,7 @@
     ("py:class", "train_dataset.data_structures.line_with_label.LineWithLabel"),
     ("py:class", "xgboost.sklearn.XGBClassifier"),
     ("py:class", "collections.Counter"),
-
+    ("py:obj", "typing.Pattern")
 ]
 
 # -- Options for HTML output -------------------------------------------------

diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst
@@ -85,6 +85,12 @@ Api parameters description
 
         This type is used for choosing a specific structure extractor (and, in some cases, a specific reader).
 
+    * - patterns
+      - list of pattern dictionaries converted to a string
+      - None
+      - This parameter is used only when ``document_type="other"``.
+        It configures the default document structure; please see :ref:`using_patterns` for more details.
+
     * - structure_type
       - tree, linear
       - tree

diff --git a/docs/source/modules/structure_extractors.rst b/docs/source/modules/structure_extractors.rst
@@ -82,6 +82,10 @@ Please see :ref:`using_patterns` to get examples of patterns usage.
 
     .. autoattribute:: _name
 
+.. autoclass:: dedoc.structure_extractors.patterns.pattern_composition.PatternComposition
+    :special-members: __init__
+    :members:
+
 .. autoclass:: dedoc.structure_extractors.patterns.RegexpPattern
     :show-inheritance:
     :special-members: __init__

diff --git a/docs/source/parameters/structure_type.rst b/docs/source/parameters/structure_type.rst
@@ -37,6 +37,16 @@ Structure type configuring
 
         If you use your custom configuration, look to the documentation of :class:`~dedoc.structure_extractors.StructureExtractorComposition`
 
+    * - patterns
+      - list of patterns based on :class:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern`,
+        or list of pattern dictionaries, or list of pattern dictionaries converted to a string
+      - None
+      - * :meth:`dedoc.DedocManager.parse`
+        * :meth:`dedoc.structure_extractors.StructureExtractorComposition.extract`
+        * :meth:`dedoc.structure_extractors.DefaultStructureExtractor.extract`
+      - This parameter is used only by :class:`~dedoc.structure_extractors.DefaultStructureExtractor` (``document_type="other"``).
+        It configures the default document structure; please see :ref:`using_patterns` for more details.
+
     * - structure_type
       - tree, linear
       - tree

diff --git a/docs/source/structure_types/other.rst b/docs/source/structure_types/other.rst
@@ -55,7 +55,7 @@ The detailed description of each line type:
 
         Its text is an empty string.
         This type of node is optional, it occurs only if lists are found in the given document.
-        For each list type (dotted, bracket, bullet) the new list node is created.
+        For each list type (dotted, roman, bracket, bullet) a new list node is created.
         This type of node is more important than list_item and raw_text.
         List nodes for less important lists are are nested into list items of more important list types.
         For example, list node for bullet list beginning is less important than a list item of a dotted list.

diff --git a/docs/source/tutorials/using_patterns.rst b/docs/source/tutorials/using_patterns.rst
@@ -237,7 +237,8 @@ Let's read the document using :class:`~dedoc.readers.PdfTabbyReader` and see the
 
 
 Here we consider class ``LineWithLocation`` almost the same as :class:`~dedoc.data_structures.LineWithMeta`.
-As we see, the reader couldn't extract any useful information about lines types and levels.
+As we see, ``tagHL=(None, None, 'unknown')`` for each line:
+this means that the reader couldn't extract any useful information about line types and levels.
 So, :class:`~dedoc.structure_extractors.patterns.TagHeaderPattern` and
 :class:`~dedoc.structure_extractors.patterns.TagListPattern` are useless in this case.