From da4ae1e71fa958d3a194820efafff2a99900dcc7 Mon Sep 17 00:00:00 2001 From: Nasty Date: Tue, 13 Aug 2024 17:11:52 +0300 Subject: [PATCH 1/9] TLDR-748 add patterns for structure extraction, delete regexps from readers --- .flake8 | 1 + .pre-commit-config.yaml | 2 +- dedoc/readers/pdf_reader/pdf_base_reader.py | 6 +- .../pdf_txtlayer_reader/pdf_tabby_reader.py | 9 +- dedoc/readers/txt_reader/raw_text_reader.py | 11 +- .../default_structure_extractor.py | 105 +++++++----------- .../diploma_builder/body_builder.py | 23 +++- .../structure_extractors/patterns/__init__.py | 8 ++ .../patterns/abstract_pattern.py | 27 +++++ .../patterns/bracket_list_pattern.py | 11 ++ .../patterns/bracket_roman_list_pattern.py | 11 ++ .../patterns/bullet_list_pattern.py | 11 ++ .../patterns/dotted_list_pattern.py | 26 +++++ .../patterns/letter_list_pattern.py | 11 ++ .../patterns/regexp_pattern.py | 22 ++++ .../patterns/roman_list_pattern.py | 11 ++ .../patterns/start_word_pattern.py | 20 ++++ .../patterns/tag_header_pattern.py | 18 +++ .../patterns/tag_list_pattern.py | 18 +++ dedoc/structure_extractors/patterns/utils.py | 18 +++ tests/api_tests/test_api_doctype_diploma.py | 3 + tests/api_tests/test_api_format_email.py | 3 + tests/api_tests/test_api_format_json.py | 6 + tests/api_tests/test_api_format_pdf.py | 6 + .../test_api_format_pdf_tabby_reader.py | 1 + .../test_api_format_pdf_with_text.py | 6 + tests/api_tests/test_api_format_txt.py | 3 + 27 files changed, 309 insertions(+), 88 deletions(-) create mode 100644 dedoc/structure_extractors/patterns/__init__.py create mode 100644 dedoc/structure_extractors/patterns/abstract_pattern.py create mode 100644 dedoc/structure_extractors/patterns/bracket_list_pattern.py create mode 100644 dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py create mode 100644 dedoc/structure_extractors/patterns/bullet_list_pattern.py create mode 100644 dedoc/structure_extractors/patterns/dotted_list_pattern.py create mode 100644 dedoc/structure_extractors/patterns/letter_list_pattern.py create mode 100644 dedoc/structure_extractors/patterns/regexp_pattern.py create mode 100644 dedoc/structure_extractors/patterns/roman_list_pattern.py create mode 100644 dedoc/structure_extractors/patterns/start_word_pattern.py create mode 100644 dedoc/structure_extractors/patterns/tag_header_pattern.py create mode 100644 dedoc/structure_extractors/patterns/tag_list_pattern.py create mode 100644 dedoc/structure_extractors/patterns/utils.py diff --git a/.flake8 b/.flake8 index 401f544b..0dfafc2b 100644 --- a/.flake8 +++ b/.flake8 @@ -28,6 +28,7 @@ exclude = *__init__.py, resources, venv, + .venv, build, dedoc.egg-info, docs/_build, diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 09231202..2b5eae7a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: 5.0.4 hooks: - id: flake8 - exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py + exclude: \.github|.*__init__\.py|resources|docs|venv|\.venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py args: - "--config=.flake8" additional_dependencies: [ diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 839b5006..a55f210b 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -94,8 +94,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( Tuple)[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]: import math from joblib import Parallel, delayed + from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis - from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor from dedoc.utils.pdf_utils import get_pdf_page_count from dedoc.utils.utils import flatten @@ -129,10 +129,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines) all_lines_with_links = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=attachments) - prev_line = None for line in all_lines_with_links: - line.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line) - prev_line = line + line.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown() all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links) if page_angles: diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 4eaed54c..c7316036 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -215,7 +215,6 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith lines = [] page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"]) - prev_line = None labeling_mode = self.config.get("labeling_mode", False) for block in page["blocks"]: @@ -261,15 +260,13 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith uid=uid, location=Location(bbox=bbox, page_number=page_number), order=order) - line_with_location.metadata.tag_hierarchy_level = self.__get_tag(line_with_location, prev_line, meta) - prev_line = line_with_location + line_with_location.metadata.tag_hierarchy_level = self.__get_tag(line_with_location, meta) lines.append(line_with_location) return lines - def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_type: str) -> HierarchyLevel: - from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor + def __get_tag(self, line: LineWithMeta, line_type: str) -> HierarchyLevel: from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth if line_type == HierarchyLevel.header: @@ -278,7 +275,7 @@ def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_ return HierarchyLevel(1, header_level, False, line_type) if line_type == "litem": # TODO automatic list depth and merge list items from multiple lines - return DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line) + return HierarchyLevel(None, None, False, HierarchyLevel.list_item) return HierarchyLevel(None, None, True, line_type) diff --git a/dedoc/readers/txt_reader/raw_text_reader.py b/dedoc/readers/txt_reader/raw_text_reader.py index 2cb13f6d..87453e5c 100644 --- a/dedoc/readers/txt_reader/raw_text_reader.py +++ b/dedoc/readers/txt_reader/raw_text_reader.py @@ -54,15 +54,14 @@ def __get_encoding(self, path: str, parameters: dict) -> str: def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]: import time from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation + from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata - from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor from dedoc.utils.utils import calculate_file_hash lines = [] file_hash = calculate_file_hash(path=path) number_of_empty_lines = 0 previous_log_time = time.time() - prev_line = None for line_id, line in self.__get_lines(path=path, encoding=encoding): if time.time() - previous_log_time > 5: @@ -76,14 +75,10 @@ def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]: indent_annotation = self.__get_indent_annotation(line) line_with_meta = LineWithMeta(line=line, metadata=metadata, annotations=[spacing_annotation, indent_annotation], uid=uid) - line_with_meta.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line_with_meta, prev_line) - prev_line = line_with_meta + line_with_meta.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown() lines.append(line_with_meta) - if line.isspace(): - number_of_empty_lines += 1 - else: - number_of_empty_lines = 0 + number_of_empty_lines = number_of_empty_lines + 1 if line.isspace() else 0 return lines diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py index 3bfaeb21..7607c5cf 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py @@ -1,9 +1,9 @@ from typing import List, Optional from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern class DefaultStructureExtractor(AbstractStructureExtractor): @@ -12,82 +12,57 @@ class DefaultStructureExtractor(AbstractStructureExtractor): You can find the description of this type of structure in the section :ref:`other_structure`. """ - from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.prefix import LinePrefix - document_type = "other" - prefix_list: List[LinePrefix] = [DottedPrefix, BracketPrefix, AnyLetterPrefix, BulletPrefix] - def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Extract basic structure from the given document and add additional information to the lines' metadata. To get the information about the method's parameters look at the documentation of the class \ :class:`~dedoc.structure_extractors.AbstractStructureExtractor`. """ - previous_line = None + parameters = {} if parameters is None else parameters + patterns = self.__get_patterns(parameters) for line in document.lines: - if line.metadata.tag_hierarchy_level is None: - line.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown() - - if line.metadata.tag_hierarchy_level.line_type == HierarchyLevel.unknown: - line.metadata.hierarchy_level = self.get_hl_list_using_regexp(line, previous_line) - else: - line.metadata.hierarchy_level = self.__get_hl_with_tag(line) + line_pattern = None + for pattern in patterns: + if pattern.match(line): + line_pattern = pattern + break + line.metadata.hierarchy_level = line_pattern.get_hierarchy_level(line) if line_pattern else HierarchyLevel.create_raw_text() assert line.metadata.hierarchy_level is not None - if line.metadata.hierarchy_level.line_type != HierarchyLevel.raw_text: - previous_line = line return document - def __get_hl_with_tag(self, line: LineWithMeta) -> HierarchyLevel: - assert line.metadata.tag_hierarchy_level is not None - level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2 - - if level_1 is None or level_2 is None: - return line.metadata.tag_hierarchy_level - - if line.metadata.tag_hierarchy_level.line_type == HierarchyLevel.header: - return HierarchyLevel(level_1=1, level_2=level_2, can_be_multiline=False, line_type=HierarchyLevel.header) - - if line.metadata.tag_hierarchy_level.line_type == HierarchyLevel.list_item: - return HierarchyLevel(level_1=level_1, level_2=level_2, can_be_multiline=False, line_type=HierarchyLevel.list_item) - - return line.metadata.tag_hierarchy_level - - @staticmethod - def get_hl_list_using_regexp(line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> HierarchyLevel: - from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_prefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix - - prefix = get_prefix(DefaultStructureExtractor.prefix_list, line) - - # TODO dotted list without space after numbering, like "1.Some text" - if prefix.name == DottedPrefix.name: # list like 1.1.1 - depth = len(prefix.numbers) - if all((n <= 1900 for n in prefix.numbers)) and depth <= 9: - return HierarchyLevel(2, depth, False, line_type=HierarchyLevel.list_item) - return HierarchyLevel.create_raw_text() - - if prefix.name == BracketPrefix.name: # list like 1) - # check if tesseract recognize russian б as 6 (bi as six) - if prefix.prefix_num == 6 and previous_line is not None and previous_line.line.lower().strip().startswith(("a)", "а)")): - return HierarchyLevel(4, 1, False, line_type=HierarchyLevel.list_item) # here is russian and english letters - return HierarchyLevel(3, 1, False, line_type=HierarchyLevel.list_item) - - if prefix.name == AnyLetterPrefix.name: # list like a) - return HierarchyLevel(4, 1, False, line_type=HierarchyLevel.list_item) - - if prefix.name == BulletPrefix.name: # bullet list - return HierarchyLevel(5, 1, False, line_type=HierarchyLevel.list_item) # TODO make bullet list - - # no match for any list has been found - return HierarchyLevel(None, None, line.metadata.tag_hierarchy_level.can_be_multiline, HierarchyLevel.raw_text) + def __get_patterns(self, parameters: dict) -> List[AbstractPattern]: + if "patterns" not in parameters: + from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern + from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern + from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern + from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern + from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern + from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern + + patterns = [ + TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1), + TagListPattern(line_type=HierarchyLevel.list_item, level_1=2), + DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2), + BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1), + LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1), + BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1), + ] + else: + import json + from dedoc.structure_extractors.patterns.utils import get_pattern + + patterns = parameters["patterns"] + if isinstance(patterns, str): + patterns = json.loads(patterns) + assert isinstance(patterns, list) + assert len(patterns) > 0 + if isinstance(patterns[0], dict): + patterns = [get_pattern(pattern) for pattern in patterns] + + assert isinstance(patterns[0], AbstractPattern) + return patterns diff --git a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py index 91b67965..98ae1ec6 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py @@ -3,12 +3,12 @@ from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth from dedoc.structure_extractors.hierarchy_level_builders.abstract_hierarchy_level_builder import AbstractHierarchyLevelBuilder from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.abstract_body_hierarchy_level_builder import \ AbstractBodyHierarchyLevelBuilder from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_digits_with_dots +from dedoc.structure_extractors.patterns import BracketListPattern, BulletListPattern, DottedListPattern, LetterListPattern, TagHeaderPattern, TagListPattern class DiplomaBodyBuilder(AbstractHierarchyLevelBuilder): @@ -17,6 +17,14 @@ class DiplomaBodyBuilder(AbstractHierarchyLevelBuilder): def __int__(self) -> None: super().__init__() self.digits_with_dots_regexp = regexps_digits_with_dots + self.patterns = [ + TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1), + TagListPattern(line_type=HierarchyLevel.list_item, level_1=2), + DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2), + BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1), + LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1), + BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1), + ] def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]: if len(lines_with_labels) > 0: @@ -27,7 +35,6 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s result = [body_line] else: result = [AbstractBodyHierarchyLevelBuilder.get_body_line(init_hl_depth=init_hl_depth)] - previous_raw_text_line = None previous_named_item_line = None for line, prediction in lines_with_labels: @@ -44,8 +51,7 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s elif prediction == "raw_text": line = self.__postprocess_raw_text(line, init_hl_depth) if not (line.metadata.hierarchy_level is not None and line.metadata.hierarchy_level.line_type == "named_item"): - line.metadata.hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line, previous_raw_text_line) - previous_raw_text_line = line + line.metadata.hierarchy_level = self.__get_level_by_patterns(line) else: line.metadata.hierarchy_level = HierarchyLevel.create_raw_text() line.metadata.hierarchy_level.line_type = prediction @@ -69,6 +75,15 @@ def __handle_named_item(self, init_hl_depth: int, line: LineWithMeta, prediction line.metadata.hierarchy_level = hierarchy_level return line + def __get_level_by_patterns(self, line: LineWithMeta) -> HierarchyLevel: + line_pattern = None + for pattern in self.patterns: + if pattern.match(line): + line_pattern = pattern + break + + return line_pattern.get_hierarchy_level(line) if line_pattern else HierarchyLevel.create_raw_text() + def __postprocess_raw_text(self, line: LineWithMeta, init_hl_depth: int) -> LineWithMeta: text = line.line.strip().lower() if not text.startswith(self.named_item_keywords): diff --git a/dedoc/structure_extractors/patterns/__init__.py b/dedoc/structure_extractors/patterns/__init__.py new file mode 100644 index 00000000..dcda4109 --- /dev/null +++ b/dedoc/structure_extractors/patterns/__init__.py @@ -0,0 +1,8 @@ +from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern +from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern +from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern +from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern +from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern +from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern + +__all__ = ["BracketListPattern", "BulletListPattern", "DottedListPattern", "LetterListPattern", "TagHeaderPattern", "TagListPattern"] diff --git a/dedoc/structure_extractors/patterns/abstract_pattern.py b/dedoc/structure_extractors/patterns/abstract_pattern.py new file mode 100644 index 00000000..b987ce6e --- /dev/null +++ b/dedoc/structure_extractors/patterns/abstract_pattern.py @@ -0,0 +1,27 @@ +from abc import ABC, abstractmethod +from typing import Optional + +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta + + +class AbstractPattern(ABC): + __name = "" + + def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + self._line_type = line_type + self._level_1 = level_1 + self._level_2 = level_2 if level_2 else 1 + self._can_be_multiline = can_be_multiline + + @classmethod + def name(cls: "AbstractPattern") -> str: + return cls.__name + + @abstractmethod + def match(self, line: LineWithMeta) -> bool: + pass + + @abstractmethod + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + pass diff --git a/dedoc/structure_extractors/patterns/bracket_list_pattern.py b/dedoc/structure_extractors/patterns/bracket_list_pattern.py new file mode 100644 index 00000000..496dd8c7 --- /dev/null +++ b/dedoc/structure_extractors/patterns/bracket_list_pattern.py @@ -0,0 +1,11 @@ +from typing import Optional + +from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern + + +class BracketListPattern(RegexpPattern): + __name = "bracket_list" + + def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + super().__init__(regexp=BracketPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py b/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py new file mode 100644 index 00000000..effacb46 --- /dev/null +++ b/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py @@ -0,0 +1,11 @@ +from typing import Optional + +from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_roman_prefix import BracketRomanPrefix +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern + + +class BracketRomanListPattern(RegexpPattern): + __name = "bracket_roman_list" + + def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + super().__init__(regexp=BracketRomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/bullet_list_pattern.py b/dedoc/structure_extractors/patterns/bullet_list_pattern.py new file mode 100644 index 00000000..10f5f319 --- /dev/null +++ b/dedoc/structure_extractors/patterns/bullet_list_pattern.py @@ -0,0 +1,11 @@ +from typing import Optional + +from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern + + +class BulletListPattern(RegexpPattern): + __name = "bullet_list" + + def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + super().__init__(regexp=BulletPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/dotted_list_pattern.py b/dedoc/structure_extractors/patterns/dotted_list_pattern.py new file mode 100644 index 00000000..34ec0222 --- /dev/null +++ b/dedoc/structure_extractors/patterns/dotted_list_pattern.py @@ -0,0 +1,26 @@ +from typing import Optional + +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern + + +class DottedListPattern(RegexpPattern): + __name = "dotted_list" + + def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + super().__init__(regexp=DottedPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + level_2 = self.__get_list_depth(line=line) + return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=level_2, can_be_multiline=self._can_be_multiline) + + def __get_list_depth(self, line: LineWithMeta) -> int: + text = line.line.strip().lower() + match = self._regexp.match(text) + if match is None: + raise ValueError(f'Line text "{text}" does not match dotted list pattern regexp') + + prefix = match.group().strip() + return len([number for number in prefix.split(".") if len(number) > 0]) diff --git a/dedoc/structure_extractors/patterns/letter_list_pattern.py b/dedoc/structure_extractors/patterns/letter_list_pattern.py new file mode 100644 index 00000000..e700ffe8 --- /dev/null +++ b/dedoc/structure_extractors/patterns/letter_list_pattern.py @@ -0,0 +1,11 @@ +from typing import Optional + +from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern + + +class LetterListPattern(RegexpPattern): + __name = "letter_list" + + def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + super().__init__(regexp=AnyLetterPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/regexp_pattern.py b/dedoc/structure_extractors/patterns/regexp_pattern.py new file mode 100644 index 00000000..d7013b9b --- /dev/null +++ b/dedoc/structure_extractors/patterns/regexp_pattern.py @@ -0,0 +1,22 @@ +import re +from typing import Optional + +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +class RegexpPattern(AbstractPattern): + __name = "regexp" + + def __init__(self, regexp: str or re.Pattern, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) + self._regexp = re.compile(regexp) if isinstance(regexp, str) else regexp + + def match(self, line: LineWithMeta) -> bool: + text = line.line.strip().lower() + match = self._regexp.match(text) + return match is not None + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/roman_list_pattern.py b/dedoc/structure_extractors/patterns/roman_list_pattern.py new file mode 100644 index 00000000..c267d371 --- /dev/null +++ b/dedoc/structure_extractors/patterns/roman_list_pattern.py @@ -0,0 +1,11 @@ +from typing import Optional + +from dedoc.structure_extractors.feature_extractors.list_features.prefix.roman_prefix import RomanPrefix +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern + + +class RomanListPattern(RegexpPattern): + __name = "roman_list" + + def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + super().__init__(regexp=RomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/start_word_pattern.py b/dedoc/structure_extractors/patterns/start_word_pattern.py new file mode 100644 index 00000000..8258122c --- /dev/null +++ b/dedoc/structure_extractors/patterns/start_word_pattern.py @@ -0,0 +1,20 @@ +from typing import Optional + +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +class StartWordPattern(AbstractPattern): + __name = "start_word" + + def __init__(self, start_word: str, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) + self.__start_word = start_word.strip().lower() + + def match(self, line: LineWithMeta) -> bool: + text = line.line.strip().lower() + return text.startswith(self.__start_word) + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/tag_header_pattern.py b/dedoc/structure_extractors/patterns/tag_header_pattern.py new file mode 100644 index 00000000..a730b451 --- /dev/null +++ b/dedoc/structure_extractors/patterns/tag_header_pattern.py @@ -0,0 +1,18 @@ +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +class TagHeaderPattern(AbstractPattern): + __name = "tag_header" + + def match(self, line: LineWithMeta) -> bool: + if line.metadata.tag_hierarchy_level is None or line.metadata.tag_hierarchy_level.line_type != HierarchyLevel.header: + return False + + level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2 + return level_1 is not None and level_2 is not None + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + level_2 = line.metadata.tag_hierarchy_level.level_2 + return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=level_2, can_be_multiline=self._can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/tag_list_pattern.py b/dedoc/structure_extractors/patterns/tag_list_pattern.py new file mode 100644 index 00000000..01647896 --- /dev/null +++ b/dedoc/structure_extractors/patterns/tag_list_pattern.py @@ -0,0 +1,18 @@ +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +class TagListPattern(AbstractPattern): + __name = "tag_list" + + def match(self, line: LineWithMeta) -> bool: + if line.metadata.tag_hierarchy_level is None or line.metadata.tag_hierarchy_level.line_type != HierarchyLevel.list_item: + return False + + level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2 + return level_1 is not None and level_2 is not None + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2 + return HierarchyLevel(line_type=self._line_type, level_1=level_1, level_2=level_2, can_be_multiline=self._can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/utils.py b/dedoc/structure_extractors/patterns/utils.py new file mode 100644 index 00000000..4d8a618a --- /dev/null +++ b/dedoc/structure_extractors/patterns/utils.py @@ -0,0 +1,18 @@ +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +def get_pattern(pattern_parameters: dict) -> AbstractPattern: + import dedoc.structure_extractors.patterns as patterns_module + + assert isinstance(pattern_parameters, dict) + assert "name" in pattern_parameters, "Pattern parameter missing 'name'" + assert "line_type" in pattern_parameters, "Pattern parameter missing 'line_type'" + assert "level_1" in pattern_parameters, "Pattern parameter missing 'level_1'" + + supported_patterns = {pattern.name: pattern for pattern in patterns_module.__all__} + pattern_class = supported_patterns.get(pattern_parameters["name"]) + if pattern_class is None: + raise ValueError(f"Pattern {pattern_parameters['name']} is not found in supported patterns: {supported_patterns.keys()}") + + pattern = pattern_class(**pattern_parameters) + return pattern diff --git a/tests/api_tests/test_api_doctype_diploma.py b/tests/api_tests/test_api_doctype_diploma.py index 24ed6aff..dd767dfe 100644 --- a/tests/api_tests/test_api_doctype_diploma.py +++ b/tests/api_tests/test_api_doctype_diploma.py @@ -1,4 +1,5 @@ import os +import unittest from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -8,6 +9,7 @@ class TestApiDiploma(AbstractTestApiDocReader): def _get_abs_path(self, file_name: str) -> str: return os.path.join(self.data_directory_path, "diplomas", file_name) + @unittest.skip("TLDR-748") def test_diploma_pdf(self) -> None: file_name = "diploma.pdf" result = self._send_request(file_name, dict(document_type="diploma", pdf_with_text_layer="tabby")) @@ -51,6 +53,7 @@ def test_diploma_pdf(self) -> None: self.assertEqual("БИБЛИОГРАФИЧЕСКИЙ СПИСОК", node["text"].strip()) self.assertEqual("named_item", node["metadata"]["paragraph_type"]) + @unittest.skip("TLDR-748") def test_diploma_docx(self) -> None: file_name = "diploma.docx" result = self._send_request(file_name, dict(document_type="diploma")) diff --git a/tests/api_tests/test_api_format_email.py b/tests/api_tests/test_api_format_email.py index 0846538f..02e0e66b 100644 --- a/tests/api_tests/test_api_format_email.py +++ b/tests/api_tests/test_api_format_email.py @@ -1,4 +1,5 @@ import os +import unittest from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -8,6 +9,7 @@ class TestApiEmailReader(AbstractTestApiDocReader): def _get_abs_path(self, file_name: str) -> str: return os.path.join(self.data_directory_path, "eml", file_name) + @unittest.skip("TLDR-748") def test_email_file(self) -> None: file_name = "spam_mail.eml" result = self._send_request(file_name, data={"with_attachments": "true"}) @@ -29,6 +31,7 @@ def test_email_file(self) -> None: self.assertEqual('"sunny_goldensun@126.com" ', from_message["text"]) self.assertEqual("from", from_message["metadata"]["paragraph_type"]) + @unittest.skip("TLDR-748") def test_email_with_attachments(self) -> None: file_name = "message.eml" result = self._send_request(file_name, data={"with_attachments": "true"}) diff --git a/tests/api_tests/test_api_format_json.py b/tests/api_tests/test_api_format_json.py index ce84f073..8369a476 100644 --- a/tests/api_tests/test_api_format_json.py +++ b/tests/api_tests/test_api_format_json.py @@ -1,5 +1,6 @@ import json import os +import unittest from json import JSONDecodeError from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -27,6 +28,7 @@ def test_list(self) -> None: self.assertEqual("list_item", list_items[1]["metadata"]["paragraph_type"]) self.assertEqual("он её любил", list_items[1]["text"]) + @unittest.skip("TLDR-748") def test_dict(self) -> None: file_name = "dict.json" result = self._send_request(file_name)["content"]["structure"] @@ -36,6 +38,7 @@ def test_dict(self) -> None: self.assertEqual("key", nodes[1]["metadata"]["paragraph_type"]) self.assertEqual("он её любил", nodes[1]["subparagraphs"][0]["text"]) + @unittest.skip("TLDR-748") def test_dict_with_list(self) -> None: file_name = "dict_with_list.json" result = self._send_request(file_name)["content"]["structure"] @@ -54,6 +57,7 @@ def test_dict_with_list(self) -> None: self.assertEqual("понедельник", second_list_items[0]["text"]) self.assertEqual("вторник", second_list_items[1]["text"]) + @unittest.skip("TLDR-748") def test_list_with_dict(self) -> None: file_name = "list_with_dict.json" result = self._send_request(file_name)["content"]["structure"] @@ -67,6 +71,7 @@ def test_list_with_dict(self) -> None: self.assertEqual("понедельник", self._get_by_tree_path(result, "0.1.0.0.0.0")["text"]) self.assertEqual("вторник", self._get_by_tree_path(result, "0.1.0.0.0.1")["text"]) + @unittest.skip("TLDR-748") def test_realistic(self) -> None: file_name = "realistic_json.json" result = self._send_request(file_name)["content"]["structure"]["subparagraphs"] @@ -86,6 +91,7 @@ def test_json_attachments2(self) -> None: data = {"html_fields": '[["e"], ["f"]]', "with_attachments": "True", "return_base64": "true"} self._send_request(file_name, expected_code=200, data=data) + @unittest.skip("TLDR-748") def test_json_null(self) -> None: file_name = "test_null.json" result = self._send_request(file_name, expected_code=200) diff --git a/tests/api_tests/test_api_format_pdf.py b/tests/api_tests/test_api_format_pdf.py index ec474b16..6486c89d 100644 --- a/tests/api_tests/test_api_format_pdf.py +++ b/tests/api_tests/test_api_format_pdf.py @@ -36,6 +36,7 @@ def test_pdf(self) -> None: self.__check_metainfo(result["metadata"], "application/pdf", file_name) self.assertEqual([], result["attachments"]) + @unittest.skip("TLDR-748") def test_djvu(self) -> None: file_name = "example_with_table7.djvu" result = self._send_request(file_name, dict(document_type="")) @@ -46,6 +47,7 @@ def test_djvu(self) -> None: self.__check_metainfo(result["metadata"], "image/vnd.djvu", file_name) + @unittest.skip("TLDR-748") def test_djvu_2(self) -> None: file_name = "example_with_table9.djvu" result = self._send_request(file_name) @@ -63,6 +65,7 @@ def test_broken_djvu(self) -> None: file_name = "broken.djvu" _ = self._send_request(file_name, expected_code=415) + @unittest.skip("TLDR-748") def test_header_pdf(self) -> None: file_name = "header_test.pdf" result = self._send_request(file_name, data=dict(pdf_with_text_layer="true")) @@ -109,6 +112,7 @@ def test_image_binarization(self) -> None: self.assertIn("ЦЕНТРАЛЬНЫЙ БАНК РОССИЙСКОЙ ФЕДЕРАЦИИ\n", result["content"]["structure"]["subparagraphs"][0]["text"]) self.assertIn("Е.И Курицына\n(расшифровка подлиси", result["content"]["structure"]["subparagraphs"][-1]["text"]) + @unittest.skip("TLDR-748") def test_on_ocr_conf_threshold(self) -> None: result = self._send_request("with_trash.jpg", data=dict(structure_type="tree")) tree = result["content"]["structure"] @@ -134,6 +138,7 @@ def test_pdf_with_only_mp_table(self) -> None: for table in result["content"]["tables"]: self.assertTrue(table["metadata"]["uid"] in table_refs) + @unittest.skip("TLDR-748") def test_pdf_with_some_tables(self) -> None: file_name = os.path.join("..", "pdf_with_text_layer", "VVP_6_tables.pdf") result = self._send_request(file_name, data={"pdf_with_text_layer": "true"}) @@ -168,6 +173,7 @@ def test_document_orientation(self) -> None: self._check_similarity(tree["subparagraphs"][0]["text"], "Приложение к постановлению\n" "Губернатора Камчатского края") + @unittest.skip("TLDR-748") def test_bold_annotation(self) -> None: file_name = "bold_font.png" result = self._send_request(file_name) diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py index b2ff91a6..1e949986 100644 --- a/tests/api_tests/test_api_format_pdf_tabby_reader.py +++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py @@ -152,6 +152,7 @@ def test_tables2(self) -> None: self.assertEqual("raw_text", node["metadata"]["paragraph_type"]) self.assertEqual("", node["text"].strip()[:30]) + @unittest.skip("TLDR-748") def test_pdf_with_tables(self) -> None: file_name = "VVP_6_tables.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="tabby", document_orientation="no_change")) diff --git a/tests/api_tests/test_api_format_pdf_with_text.py b/tests/api_tests/test_api_format_pdf_with_text.py index a56bfd44..7efc5866 100644 --- a/tests/api_tests/test_api_format_pdf_with_text.py +++ b/tests/api_tests/test_api_format_pdf_with_text.py @@ -1,4 +1,5 @@ import os +import unittest from typing import List from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -19,6 +20,7 @@ def __extract_node_with_annotation(self, tree: dict, node_id: str, ann_name: str node_with_annotation = self._get_by_tree_path(tree["content"]["structure"], node_id) return self.__filter_by_name(node_with_annotation["annotations"], ann_name) + @unittest.skip("TLDR-748") def test_ref_tables(self) -> None: result = self._send_request("example.pdf", dict(pdf_with_text_layer="true")) tables_uids = [table["metadata"]["uid"] for table in result["content"]["tables"]] @@ -80,6 +82,7 @@ def test_pdf_with_text_style(self) -> None: self.assertEqual("Calibri18,", node["text"][word_bboxes[2]["start"]:word_bboxes[2]["end"]]) self.assertEqual("Tahoma16", node["text"][word_bboxes[3]["start"]:word_bboxes[3]["end"]]) + @unittest.skip("TLDR-748") def test_pdf_with_text_style_2(self) -> None: file_name = "2-column-state.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="true", need_pdf_table_analysis="false")) @@ -99,6 +102,7 @@ def test_pdf_with_text_style_2(self) -> None: self.assertIn("Pere Manils, Abdelberi Chaabane, Stevens Le Blond,", self._get_by_tree_path(tree, "0.1")["text"]) + @unittest.skip("TLDR-748") def test_pdf_with_2_columns_text(self) -> None: file_name = "2-column-state.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="tabby", document_type="")) @@ -120,6 +124,7 @@ def test_pdf_with_2_columns_text(self) -> None: "onion-routing [3, 9, 22, 24], are known to be robust, identity", self._get_by_tree_path(tree, "0.8.0.0")["text"]) + @unittest.skip("TLDR-748") def test_pdf_with_2_columns_text_2(self) -> None: file_name = "liters_state.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="true", need_pdf_table_analysis="false")) @@ -129,6 +134,7 @@ def test_pdf_with_2_columns_text_2(self) -> None: self.assertIn("References", self._get_by_tree_path(tree, "0.0")["text"]) self.assertIn("[1] Navaneeth Bodla, Bharat Singh, Rama Chellappa, and", self._get_by_tree_path(tree, "0.1")["text"]) + @unittest.skip("TLDR-748") def test_pdf_with_some_tables(self) -> None: file_name = "VVP_6_tables.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="true")) diff --git a/tests/api_tests/test_api_format_txt.py b/tests/api_tests/test_api_format_txt.py index 3be5b0e4..57cc55c3 100644 --- a/tests/api_tests/test_api_format_txt.py +++ b/tests/api_tests/test_api_format_txt.py @@ -1,4 +1,5 @@ import os +import unittest from tests.api_tests.abstract_api_test import AbstractTestApiDocReader from tests.test_utils import get_by_tree_path @@ -50,12 +51,14 @@ def test_special_symbols(self) -> None: with open(self._get_abs_path(file_name)) as file_in: self.assertEqual(file_in.read(), content["subparagraphs"][0]["text"]) + @unittest.skip("TLDR-748") def test_paragraphs(self) -> None: file_name = "football.txt" result = self._send_request(file_name, data={"structure_type": "tree"}) content = result["content"]["structure"]["subparagraphs"] self.__check_football(content) + @unittest.skip("TLDR-748") def test_paragraphs_gz(self) -> None: file_name = "football.txt.gz" result = self._send_request(file_name, data={"structure_type": "tree"}) From d58421e63cfcd27fb328e71b7943b51f173fbc6e Mon Sep 17 00:00:00 2001 From: Nasty Date: Wed, 14 Aug 2024 16:37:26 +0300 Subject: [PATCH 2/9] TLDR-748 fix tests --- .../pdf_txtlayer_reader/pdf_tabby_reader.py | 2 +- .../pdf_reader/utils/line_object_linker.py | 2 +- dedoc/readers/pptx_reader/paragraph.py | 2 +- dedoc/readers/txt_reader/raw_text_reader.py | 8 +--- .../default_structure_extractor.py | 16 ++++--- .../diploma_builder/body_builder.py | 20 ++++----- .../structure_extractors/patterns/__init__.py | 9 +++- .../patterns/abstract_pattern.py | 43 +++++++++++++++++-- .../patterns/bracket_list_pattern.py | 6 ++- .../patterns/bracket_roman_list_pattern.py | 6 ++- .../patterns/bullet_list_pattern.py | 6 ++- .../patterns/dotted_list_pattern.py | 14 ++++-- .../patterns/letter_list_pattern.py | 6 ++- .../patterns/regexp_pattern.py | 11 ++--- .../patterns/roman_list_pattern.py | 6 ++- .../patterns/start_word_pattern.py | 11 ++--- .../patterns/tag_header_pattern.py | 4 -- .../patterns/tag_list_pattern.py | 4 -- .../patterns/tag_pattern.py | 9 ++++ .../patterns/tag_type_pattern.py | 18 ++++++++ dedoc/structure_extractors/patterns/utils.py | 2 - tests/api_tests/test_api_doctype_diploma.py | 3 -- tests/api_tests/test_api_format_email.py | 3 -- tests/api_tests/test_api_format_json.py | 6 --- tests/api_tests/test_api_format_pdf.py | 6 --- .../test_api_format_pdf_tabby_reader.py | 1 - .../test_api_format_pdf_with_text.py | 6 --- tests/api_tests/test_api_format_txt.py | 3 -- 28 files changed, 146 insertions(+), 87 deletions(-) create mode 100644 dedoc/structure_extractors/patterns/tag_pattern.py create mode 100644 dedoc/structure_extractors/patterns/tag_type_pattern.py diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index c7316036..76c67dc0 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -277,7 +277,7 @@ def __get_tag(self, line: LineWithMeta, line_type: str) -> HierarchyLevel: if line_type == "litem": # TODO automatic list depth and merge list items from multiple lines return HierarchyLevel(None, None, False, HierarchyLevel.list_item) - return HierarchyLevel(None, None, True, line_type) + return HierarchyLevel.create_unknown() def __jar_path(self) -> str: import os diff --git a/dedoc/readers/pdf_reader/utils/line_object_linker.py b/dedoc/readers/pdf_reader/utils/line_object_linker.py index 2ddfc064..8c612e24 100644 --- a/dedoc/readers/pdf_reader/utils/line_object_linker.py +++ b/dedoc/readers/pdf_reader/utils/line_object_linker.py @@ -34,7 +34,7 @@ def link_objects(self, lines: List[LineWithLocation], tables: List[ScanTable], i @return: """ if len(lines) == 0: - metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_raw_text(), page_id=0, line_id=0) + metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_unknown(), page_id=0, line_id=0) lines = [LineWithLocation(line="", metadata=metadata, annotations=[], location=Location(page_number=0, bbox=BBox(0, 0, 1, 1)))] last_page_line = self._get_last_page_line(lines) all_objects = list(lines + tables + images) diff --git a/dedoc/readers/pptx_reader/paragraph.py b/dedoc/readers/pptx_reader/paragraph.py index 129ac3a3..6953454c 100644 --- a/dedoc/readers/pptx_reader/paragraph.py +++ b/dedoc/readers/pptx_reader/paragraph.py @@ -33,7 +33,7 @@ def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta: text = "" paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level) - hierarchy_level = HierarchyLevel.create_raw_text() + hierarchy_level = HierarchyLevel.create_unknown() if is_title or paragraph_properties.title: hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False) diff --git a/dedoc/readers/txt_reader/raw_text_reader.py b/dedoc/readers/txt_reader/raw_text_reader.py index 87453e5c..3718289e 100644 --- a/dedoc/readers/txt_reader/raw_text_reader.py +++ b/dedoc/readers/txt_reader/raw_text_reader.py @@ -108,15 +108,9 @@ def __get_starting_spacing(self, line: Optional[LineWithMeta]) -> int: return space_this.end() - space_this.start() def __is_paragraph(self, line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> bool: - from dedoc.data_structures.hierarchy_level import HierarchyLevel - - if not line.metadata.tag_hierarchy_level.can_be_multiline and \ - line.metadata.tag_hierarchy_level.line_type not in (HierarchyLevel.raw_text, HierarchyLevel.unknown): - return True space_this = self.__get_starting_spacing(line) space_prev = self.__get_starting_spacing(previous_line) - return line.metadata.tag_hierarchy_level.line_type in (HierarchyLevel.raw_text, HierarchyLevel.unknown) \ - and not line.line.isspace() and space_this - space_prev >= 2 + return not line.line.isspace() and space_this - space_prev >= 2 def _postprocess(self, document: UnstructuredDocument) -> UnstructuredDocument: previous_line = None diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py index 7607c5cf..a7b1763c 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py @@ -43,14 +43,18 @@ def __get_patterns(self, parameters: dict) -> List[AbstractPattern]: from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern + from dedoc.structure_extractors.patterns.tag_pattern import TagPattern + from dedoc.structure_extractors.patterns.tag_type_pattern import TagTypePattern patterns = [ - TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1), - TagListPattern(line_type=HierarchyLevel.list_item, level_1=2), - DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2), - BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1), - LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1), - BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1), + TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1, can_be_multiline=False), + TagListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False), + DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False), + BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False), + LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False), + BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False), + TagTypePattern(), + TagPattern(line_type=HierarchyLevel.raw_text) ] else: import json diff --git a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py index 98ae1ec6..da96408f 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py @@ -8,22 +8,22 @@ from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.abstract_body_hierarchy_level_builder import \ AbstractBodyHierarchyLevelBuilder from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_digits_with_dots -from dedoc.structure_extractors.patterns import BracketListPattern, BulletListPattern, DottedListPattern, LetterListPattern, TagHeaderPattern, TagListPattern +from dedoc.structure_extractors.patterns import BracketListPattern, BulletListPattern, DottedListPattern, LetterListPattern, TagListPattern, TagPattern class DiplomaBodyBuilder(AbstractHierarchyLevelBuilder): named_item_keywords = ("введение", "заключение", "библиографический список", "список литературы", "глава", "приложение", "приложения") - def __int__(self) -> None: + def __init__(self) -> None: super().__init__() self.digits_with_dots_regexp = regexps_digits_with_dots self.patterns = [ - TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1), - TagListPattern(line_type=HierarchyLevel.list_item, level_1=2), - DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2), - BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1), - LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1), - BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1), + TagListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False), + DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False), + BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False), + LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False), + BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False), + TagPattern(line_type=HierarchyLevel.raw_text) ] def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]: @@ -66,10 +66,10 @@ def __handle_named_item(self, init_hl_depth: int, line: LineWithMeta, prediction if text.startswith(self.named_item_keywords): hierarchy_level = HierarchyLevel(init_hl_depth, 0, True, prediction) elif item_depth == -1: - if previous_named_item_line and previous_named_item_line.metadata.hierarchy_level.line_type == "named_item": + if previous_named_item_line: hierarchy_level = previous_named_item_line.metadata.hierarchy_level else: - hierarchy_level = HierarchyLevel(init_hl_depth + 1, 0, True, prediction) + hierarchy_level = HierarchyLevel(init_hl_depth, 0, True, prediction) else: hierarchy_level = HierarchyLevel(init_hl_depth, item_depth - 1, True, prediction) line.metadata.hierarchy_level = hierarchy_level diff --git a/dedoc/structure_extractors/patterns/__init__.py b/dedoc/structure_extractors/patterns/__init__.py index dcda4109..ffccf7b3 100644 --- a/dedoc/structure_extractors/patterns/__init__.py +++ b/dedoc/structure_extractors/patterns/__init__.py @@ -1,8 +1,15 @@ from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern +from dedoc.structure_extractors.patterns.bracket_roman_list_pattern import BracketRomanListPattern from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern +from dedoc.structure_extractors.patterns.roman_list_pattern import RomanListPattern +from dedoc.structure_extractors.patterns.start_word_pattern import StartWordPattern from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern +from dedoc.structure_extractors.patterns.tag_pattern import TagPattern +from dedoc.structure_extractors.patterns.tag_type_pattern import TagTypePattern -__all__ = ["BracketListPattern", "BulletListPattern", "DottedListPattern", "LetterListPattern", "TagHeaderPattern", "TagListPattern"] +__all__ = ["BracketListPattern", "BracketRomanListPattern", "BulletListPattern", "DottedListPattern", "LetterListPattern", "RegexpPattern", "RomanListPattern", + "StartWordPattern", "TagHeaderPattern", "TagListPattern", "TagPattern", "TagTypePattern"] diff --git a/dedoc/structure_extractors/patterns/abstract_pattern.py b/dedoc/structure_extractors/patterns/abstract_pattern.py index b987ce6e..2391a7b5 100644 --- a/dedoc/structure_extractors/patterns/abstract_pattern.py +++ b/dedoc/structure_extractors/patterns/abstract_pattern.py @@ -8,10 +8,14 @@ class AbstractPattern(ABC): __name = "" - def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + def __init__(self, + line_type: Optional[str] = None, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[bool] = None) -> None: self._line_type = line_type self._level_1 = level_1 - self._level_2 = level_2 if level_2 else 1 + self._level_2 = level_2 self._can_be_multiline = can_be_multiline @classmethod @@ -22,6 +26,37 @@ def name(cls: "AbstractPattern") -> str: def match(self, line: LineWithMeta) -> bool: pass - @abstractmethod def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: - pass + return HierarchyLevel( + line_type=self._get_line_type(line), + level_1=self._get_level_1(line), + level_2=self._get_level_2(line), + can_be_multiline=self._get_can_be_multiline(line) + ) + + def _get_line_type(self, line: LineWithMeta) -> str: + if self._line_type is not None: + return self._line_type + + if line.metadata.tag_hierarchy_level is None: + raise ValueError(f"Cannot resolve line type: tag_hierarchy_level is missing and {self.__name} line_type isn't configured") + + return line.metadata.tag_hierarchy_level.line_type + + def _get_level_1(self, line: LineWithMeta) -> Optional[int]: + if self._level_1 is not None: + return self._level_1 + + return line.metadata.tag_hierarchy_level.level_1 if line.metadata.tag_hierarchy_level else None + + def _get_level_2(self, line: LineWithMeta) -> Optional[int]: + if self._level_2 is not None: + return self._level_2 + + return line.metadata.tag_hierarchy_level.level_2 if line.metadata.tag_hierarchy_level else None + + def _get_can_be_multiline(self, line: LineWithMeta) -> bool: + if self._can_be_multiline is not None: + return self._can_be_multiline + + return line.metadata.tag_hierarchy_level.can_be_multiline if line.metadata.tag_hierarchy_level else True diff --git a/dedoc/structure_extractors/patterns/bracket_list_pattern.py b/dedoc/structure_extractors/patterns/bracket_list_pattern.py index 496dd8c7..c92a251a 100644 --- a/dedoc/structure_extractors/patterns/bracket_list_pattern.py +++ b/dedoc/structure_extractors/patterns/bracket_list_pattern.py @@ -7,5 +7,9 @@ class BracketListPattern(RegexpPattern): __name = "bracket_list" - def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + def __init__(self, + line_type: Optional[str] = None, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[bool] = None) -> None: super().__init__(regexp=BracketPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py b/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py index effacb46..d1db663a 100644 --- a/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py +++ b/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py @@ -7,5 +7,9 @@ class BracketRomanListPattern(RegexpPattern): __name = "bracket_roman_list" - def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + def __init__(self, + line_type: Optional[str] = None, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[bool] = None) -> None: super().__init__(regexp=BracketRomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/bullet_list_pattern.py b/dedoc/structure_extractors/patterns/bullet_list_pattern.py index 10f5f319..59b616a3 100644 --- a/dedoc/structure_extractors/patterns/bullet_list_pattern.py +++ b/dedoc/structure_extractors/patterns/bullet_list_pattern.py @@ -7,5 +7,9 @@ class BulletListPattern(RegexpPattern): __name = "bullet_list" - def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + def __init__(self, + line_type: Optional[str] = None, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[bool] = None) -> None: super().__init__(regexp=BulletPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/dotted_list_pattern.py b/dedoc/structure_extractors/patterns/dotted_list_pattern.py index 34ec0222..03bf8347 100644 --- a/dedoc/structure_extractors/patterns/dotted_list_pattern.py +++ b/dedoc/structure_extractors/patterns/dotted_list_pattern.py @@ -9,12 +9,20 @@ class DottedListPattern(RegexpPattern): __name = "dotted_list" - def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + def __init__(self, + line_type: Optional[str] = None, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[bool] = None) -> None: super().__init__(regexp=DottedPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: - level_2 = self.__get_list_depth(line=line) - return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=level_2, can_be_multiline=self._can_be_multiline) + return HierarchyLevel( + line_type=self._get_line_type(line), + level_1=self._get_level_1(line), + level_2=self.__get_list_depth(line=line), + can_be_multiline=self._get_can_be_multiline(line) + ) def __get_list_depth(self, line: LineWithMeta) -> int: text = line.line.strip().lower() diff --git a/dedoc/structure_extractors/patterns/letter_list_pattern.py b/dedoc/structure_extractors/patterns/letter_list_pattern.py index e700ffe8..892e6a9b 100644 --- a/dedoc/structure_extractors/patterns/letter_list_pattern.py +++ b/dedoc/structure_extractors/patterns/letter_list_pattern.py @@ -7,5 +7,9 @@ class LetterListPattern(RegexpPattern): __name = "letter_list" - def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + def __init__(self, + line_type: Optional[str] = None, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[bool] = None) -> None: super().__init__(regexp=AnyLetterPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/regexp_pattern.py b/dedoc/structure_extractors/patterns/regexp_pattern.py index d7013b9b..b4d275f6 100644 --- a/dedoc/structure_extractors/patterns/regexp_pattern.py +++ b/dedoc/structure_extractors/patterns/regexp_pattern.py @@ -1,7 +1,6 @@ import re from typing import Optional -from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern @@ -9,7 +8,12 @@ class RegexpPattern(AbstractPattern): __name = "regexp" - def __init__(self, regexp: str or re.Pattern, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + def __init__(self, + regexp: str or re.Pattern, + line_type: Optional[str] = None, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[bool] = None) -> None: super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) self._regexp = re.compile(regexp) if isinstance(regexp, str) else regexp @@ -17,6 +21,3 @@ def match(self, line: LineWithMeta) -> bool: text = line.line.strip().lower() match = self._regexp.match(text) return match is not None - - def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: - return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/roman_list_pattern.py b/dedoc/structure_extractors/patterns/roman_list_pattern.py index c267d371..93026697 100644 --- a/dedoc/structure_extractors/patterns/roman_list_pattern.py +++ b/dedoc/structure_extractors/patterns/roman_list_pattern.py @@ -7,5 +7,9 @@ class RomanListPattern(RegexpPattern): __name = "roman_list" - def __init__(self, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + def __init__(self, + line_type: Optional[str] = None, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[bool] = None) -> None: super().__init__(regexp=RomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/start_word_pattern.py b/dedoc/structure_extractors/patterns/start_word_pattern.py index 8258122c..69f575f3 100644 --- a/dedoc/structure_extractors/patterns/start_word_pattern.py +++ b/dedoc/structure_extractors/patterns/start_word_pattern.py @@ -1,6 +1,5 @@ from typing import Optional -from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern @@ -8,13 +7,15 @@ class StartWordPattern(AbstractPattern): __name = "start_word" - def __init__(self, start_word: str, line_type: str, level_1: int, level_2: Optional[int] = None, can_be_multiline: bool = False) -> None: + def __init__(self, + start_word: str, + line_type: Optional[str] = None, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[bool] = None) -> None: super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) self.__start_word = start_word.strip().lower() def match(self, line: LineWithMeta) -> bool: text = line.line.strip().lower() return text.startswith(self.__start_word) - - def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: - return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/tag_header_pattern.py b/dedoc/structure_extractors/patterns/tag_header_pattern.py index a730b451..9d8ce555 100644 --- a/dedoc/structure_extractors/patterns/tag_header_pattern.py +++ b/dedoc/structure_extractors/patterns/tag_header_pattern.py @@ -12,7 +12,3 @@ def match(self, line: LineWithMeta) -> bool: level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2 return level_1 is not None and level_2 is not None - - def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: - level_2 = line.metadata.tag_hierarchy_level.level_2 - return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=level_2, can_be_multiline=self._can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/tag_list_pattern.py b/dedoc/structure_extractors/patterns/tag_list_pattern.py index 01647896..c5b2e92e 100644 --- a/dedoc/structure_extractors/patterns/tag_list_pattern.py +++ b/dedoc/structure_extractors/patterns/tag_list_pattern.py @@ -12,7 +12,3 @@ def match(self, line: LineWithMeta) -> bool: level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2 return level_1 is not None and level_2 is not None - - def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: - level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2 - return HierarchyLevel(line_type=self._line_type, level_1=level_1, level_2=level_2, can_be_multiline=self._can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/tag_pattern.py b/dedoc/structure_extractors/patterns/tag_pattern.py new file mode 100644 index 00000000..2fc128d7 --- /dev/null +++ b/dedoc/structure_extractors/patterns/tag_pattern.py @@ -0,0 +1,9 @@ +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +class TagPattern(AbstractPattern): + __name = "tag" + + def match(self, line: LineWithMeta) -> bool: + return line.metadata.tag_hierarchy_level is not None diff --git a/dedoc/structure_extractors/patterns/tag_type_pattern.py b/dedoc/structure_extractors/patterns/tag_type_pattern.py new file mode 100644 index 00000000..664f4101 --- /dev/null +++ b/dedoc/structure_extractors/patterns/tag_type_pattern.py @@ -0,0 +1,18 @@ +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +class TagTypePattern(AbstractPattern): + __name = "tag_type" + + def match(self, line: LineWithMeta) -> bool: + return line.metadata.tag_hierarchy_level is not None and not line.metadata.tag_hierarchy_level.is_unknown() + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + return HierarchyLevel( + line_type=line.metadata.tag_hierarchy_level.line_type, + level_1=self._get_level_1(line), + level_2=self._get_level_2(line), + can_be_multiline=self._get_can_be_multiline(line) + ) diff --git a/dedoc/structure_extractors/patterns/utils.py b/dedoc/structure_extractors/patterns/utils.py index 4d8a618a..0c41242c 100644 --- a/dedoc/structure_extractors/patterns/utils.py +++ b/dedoc/structure_extractors/patterns/utils.py @@ -6,8 +6,6 @@ def get_pattern(pattern_parameters: dict) -> AbstractPattern: assert isinstance(pattern_parameters, dict) assert "name" in pattern_parameters, "Pattern parameter missing 'name'" - assert "line_type" in pattern_parameters, "Pattern parameter missing 'line_type'" - assert "level_1" in pattern_parameters, "Pattern parameter missing 'level_1'" supported_patterns = {pattern.name: pattern for pattern in patterns_module.__all__} pattern_class = supported_patterns.get(pattern_parameters["name"]) diff --git a/tests/api_tests/test_api_doctype_diploma.py b/tests/api_tests/test_api_doctype_diploma.py index dd767dfe..24ed6aff 100644 --- a/tests/api_tests/test_api_doctype_diploma.py +++ b/tests/api_tests/test_api_doctype_diploma.py @@ -1,5 +1,4 @@ import os -import unittest from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -9,7 +8,6 @@ class TestApiDiploma(AbstractTestApiDocReader): def _get_abs_path(self, file_name: str) -> str: return os.path.join(self.data_directory_path, "diplomas", file_name) - @unittest.skip("TLDR-748") def test_diploma_pdf(self) -> None: file_name = "diploma.pdf" result = self._send_request(file_name, dict(document_type="diploma", pdf_with_text_layer="tabby")) @@ -53,7 +51,6 @@ def test_diploma_pdf(self) -> None: self.assertEqual("БИБЛИОГРАФИЧЕСКИЙ СПИСОК", node["text"].strip()) self.assertEqual("named_item", node["metadata"]["paragraph_type"]) - @unittest.skip("TLDR-748") def test_diploma_docx(self) -> None: file_name = "diploma.docx" result = self._send_request(file_name, dict(document_type="diploma")) diff --git a/tests/api_tests/test_api_format_email.py b/tests/api_tests/test_api_format_email.py index 02e0e66b..0846538f 100644 --- a/tests/api_tests/test_api_format_email.py +++ b/tests/api_tests/test_api_format_email.py @@ -1,5 +1,4 @@ import os -import unittest from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -9,7 +8,6 @@ class TestApiEmailReader(AbstractTestApiDocReader): def _get_abs_path(self, file_name: str) -> str: return os.path.join(self.data_directory_path, "eml", file_name) - @unittest.skip("TLDR-748") def test_email_file(self) -> None: file_name = "spam_mail.eml" result = self._send_request(file_name, data={"with_attachments": "true"}) @@ -31,7 +29,6 @@ def test_email_file(self) -> None: self.assertEqual('"sunny_goldensun@126.com" ', from_message["text"]) self.assertEqual("from", from_message["metadata"]["paragraph_type"]) - @unittest.skip("TLDR-748") def test_email_with_attachments(self) -> None: file_name = "message.eml" result = self._send_request(file_name, data={"with_attachments": "true"}) diff --git a/tests/api_tests/test_api_format_json.py b/tests/api_tests/test_api_format_json.py index 8369a476..ce84f073 100644 --- a/tests/api_tests/test_api_format_json.py +++ b/tests/api_tests/test_api_format_json.py @@ -1,6 +1,5 @@ import json import os -import unittest from json import JSONDecodeError from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -28,7 +27,6 @@ def test_list(self) -> None: self.assertEqual("list_item", list_items[1]["metadata"]["paragraph_type"]) self.assertEqual("он её любил", list_items[1]["text"]) - @unittest.skip("TLDR-748") def test_dict(self) -> None: file_name = "dict.json" result = self._send_request(file_name)["content"]["structure"] @@ -38,7 +36,6 @@ def test_dict(self) -> None: self.assertEqual("key", nodes[1]["metadata"]["paragraph_type"]) self.assertEqual("он её любил", nodes[1]["subparagraphs"][0]["text"]) - @unittest.skip("TLDR-748") def test_dict_with_list(self) -> None: file_name = "dict_with_list.json" result = self._send_request(file_name)["content"]["structure"] @@ -57,7 +54,6 @@ def test_dict_with_list(self) -> None: self.assertEqual("понедельник", second_list_items[0]["text"]) self.assertEqual("вторник", second_list_items[1]["text"]) - @unittest.skip("TLDR-748") def test_list_with_dict(self) -> None: file_name = "list_with_dict.json" result = self._send_request(file_name)["content"]["structure"] @@ -71,7 +67,6 @@ def test_list_with_dict(self) -> None: self.assertEqual("понедельник", self._get_by_tree_path(result, "0.1.0.0.0.0")["text"]) self.assertEqual("вторник", self._get_by_tree_path(result, "0.1.0.0.0.1")["text"]) - @unittest.skip("TLDR-748") def test_realistic(self) -> None: file_name = "realistic_json.json" result = self._send_request(file_name)["content"]["structure"]["subparagraphs"] @@ -91,7 +86,6 @@ def test_json_attachments2(self) -> None: data = {"html_fields": '[["e"], ["f"]]', "with_attachments": "True", "return_base64": "true"} self._send_request(file_name, expected_code=200, data=data) - @unittest.skip("TLDR-748") def test_json_null(self) -> None: file_name = "test_null.json" result = self._send_request(file_name, expected_code=200) diff --git a/tests/api_tests/test_api_format_pdf.py b/tests/api_tests/test_api_format_pdf.py index 6486c89d..ec474b16 100644 --- a/tests/api_tests/test_api_format_pdf.py +++ b/tests/api_tests/test_api_format_pdf.py @@ -36,7 +36,6 @@ def test_pdf(self) -> None: self.__check_metainfo(result["metadata"], "application/pdf", file_name) self.assertEqual([], result["attachments"]) - @unittest.skip("TLDR-748") def test_djvu(self) -> None: file_name = "example_with_table7.djvu" result = self._send_request(file_name, dict(document_type="")) @@ -47,7 +46,6 @@ def test_djvu(self) -> None: self.__check_metainfo(result["metadata"], "image/vnd.djvu", file_name) - @unittest.skip("TLDR-748") def test_djvu_2(self) -> None: file_name = "example_with_table9.djvu" result = self._send_request(file_name) @@ -65,7 +63,6 @@ def test_broken_djvu(self) -> None: file_name = "broken.djvu" _ = self._send_request(file_name, expected_code=415) - @unittest.skip("TLDR-748") def test_header_pdf(self) -> None: file_name = "header_test.pdf" result = self._send_request(file_name, data=dict(pdf_with_text_layer="true")) @@ -112,7 +109,6 @@ def test_image_binarization(self) -> None: self.assertIn("ЦЕНТРАЛЬНЫЙ БАНК РОССИЙСКОЙ ФЕДЕРАЦИИ\n", result["content"]["structure"]["subparagraphs"][0]["text"]) self.assertIn("Е.И Курицына\n(расшифровка подлиси", result["content"]["structure"]["subparagraphs"][-1]["text"]) - @unittest.skip("TLDR-748") def test_on_ocr_conf_threshold(self) -> None: result = self._send_request("with_trash.jpg", data=dict(structure_type="tree")) tree = result["content"]["structure"] @@ -138,7 +134,6 @@ def test_pdf_with_only_mp_table(self) -> None: for table in result["content"]["tables"]: self.assertTrue(table["metadata"]["uid"] in table_refs) - @unittest.skip("TLDR-748") def test_pdf_with_some_tables(self) -> None: file_name = os.path.join("..", "pdf_with_text_layer", "VVP_6_tables.pdf") result = self._send_request(file_name, data={"pdf_with_text_layer": "true"}) @@ -173,7 +168,6 @@ def test_document_orientation(self) -> None: self._check_similarity(tree["subparagraphs"][0]["text"], "Приложение к постановлению\n" "Губернатора Камчатского края") - @unittest.skip("TLDR-748") def test_bold_annotation(self) -> None: file_name = "bold_font.png" result = self._send_request(file_name) diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py index 1e949986..b2ff91a6 100644 --- a/tests/api_tests/test_api_format_pdf_tabby_reader.py +++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py @@ -152,7 +152,6 @@ def test_tables2(self) -> None: self.assertEqual("raw_text", node["metadata"]["paragraph_type"]) self.assertEqual("", node["text"].strip()[:30]) - @unittest.skip("TLDR-748") def test_pdf_with_tables(self) -> None: file_name = "VVP_6_tables.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="tabby", document_orientation="no_change")) diff --git a/tests/api_tests/test_api_format_pdf_with_text.py b/tests/api_tests/test_api_format_pdf_with_text.py index 7efc5866..a56bfd44 100644 --- a/tests/api_tests/test_api_format_pdf_with_text.py +++ b/tests/api_tests/test_api_format_pdf_with_text.py @@ -1,5 +1,4 @@ import os -import unittest from typing import List from tests.api_tests.abstract_api_test import AbstractTestApiDocReader @@ -20,7 +19,6 @@ def __extract_node_with_annotation(self, tree: dict, node_id: str, ann_name: str node_with_annotation = self._get_by_tree_path(tree["content"]["structure"], node_id) return self.__filter_by_name(node_with_annotation["annotations"], ann_name) - @unittest.skip("TLDR-748") def test_ref_tables(self) -> None: result = self._send_request("example.pdf", dict(pdf_with_text_layer="true")) tables_uids = [table["metadata"]["uid"] for table in result["content"]["tables"]] @@ -82,7 +80,6 @@ def test_pdf_with_text_style(self) -> None: self.assertEqual("Calibri18,", node["text"][word_bboxes[2]["start"]:word_bboxes[2]["end"]]) self.assertEqual("Tahoma16", node["text"][word_bboxes[3]["start"]:word_bboxes[3]["end"]]) - @unittest.skip("TLDR-748") def test_pdf_with_text_style_2(self) -> None: file_name = "2-column-state.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="true", need_pdf_table_analysis="false")) @@ -102,7 +99,6 @@ def test_pdf_with_text_style_2(self) -> None: self.assertIn("Pere Manils, Abdelberi Chaabane, Stevens Le Blond,", self._get_by_tree_path(tree, "0.1")["text"]) - @unittest.skip("TLDR-748") def test_pdf_with_2_columns_text(self) -> None: file_name = "2-column-state.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="tabby", document_type="")) @@ -124,7 +120,6 @@ def test_pdf_with_2_columns_text(self) -> None: "onion-routing [3, 9, 22, 24], are known to be robust, identity", self._get_by_tree_path(tree, "0.8.0.0")["text"]) - @unittest.skip("TLDR-748") def test_pdf_with_2_columns_text_2(self) -> None: file_name = "liters_state.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="true", need_pdf_table_analysis="false")) @@ -134,7 +129,6 @@ def test_pdf_with_2_columns_text_2(self) -> None: self.assertIn("References", self._get_by_tree_path(tree, "0.0")["text"]) self.assertIn("[1] Navaneeth Bodla, Bharat Singh, Rama Chellappa, and", self._get_by_tree_path(tree, "0.1")["text"]) - @unittest.skip("TLDR-748") def test_pdf_with_some_tables(self) -> None: file_name = "VVP_6_tables.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="true")) diff --git a/tests/api_tests/test_api_format_txt.py b/tests/api_tests/test_api_format_txt.py index 57cc55c3..3be5b0e4 100644 --- a/tests/api_tests/test_api_format_txt.py +++ b/tests/api_tests/test_api_format_txt.py @@ -1,5 +1,4 @@ import os -import unittest from tests.api_tests.abstract_api_test import AbstractTestApiDocReader from tests.test_utils import get_by_tree_path @@ -51,14 +50,12 @@ def test_special_symbols(self) -> None: with open(self._get_abs_path(file_name)) as file_in: self.assertEqual(file_in.read(), content["subparagraphs"][0]["text"]) - @unittest.skip("TLDR-748") def test_paragraphs(self) -> None: file_name = "football.txt" result = self._send_request(file_name, data={"structure_type": "tree"}) content = result["content"]["structure"]["subparagraphs"] self.__check_football(content) - @unittest.skip("TLDR-748") def test_paragraphs_gz(self) -> None: file_name = "football.txt.gz" result = self._send_request(file_name, data={"structure_type": "tree"}) From 697946d30080a37a8b749a54b6f47adaa9371bcd Mon Sep 17 00:00:00 2001 From: Nasty Date: Wed, 14 Aug 2024 17:54:16 +0300 Subject: [PATCH 3/9] TLDR-748 unit test added --- .../docx_reader/line_with_meta_converter.py | 2 +- .../default_structure_extractor.py | 4 +- .../structure_extractors/patterns/__init__.py | 4 +- .../patterns/abstract_pattern.py | 6 +- .../patterns/bracket_list_pattern.py | 2 +- .../patterns/bracket_roman_list_pattern.py | 2 +- .../patterns/bullet_list_pattern.py | 2 +- .../patterns/dotted_list_pattern.py | 2 +- .../patterns/letter_list_pattern.py | 2 +- .../patterns/regexp_pattern.py | 2 +- .../patterns/roman_list_pattern.py | 2 +- .../patterns/start_word_pattern.py | 2 +- .../patterns/tag_header_pattern.py | 8 +- .../patterns/tag_list_pattern.py | 15 ++-- .../patterns/tag_pattern.py | 2 +- .../patterns/tag_type_pattern.py | 2 +- dedoc/structure_extractors/patterns/utils.py | 3 +- tests/api_tests/test_api_doctype_default.py | 10 +++ ...est_doctype_default_structure_extractor.py | 84 +++++++++++++++++++ 19 files changed, 126 insertions(+), 30 deletions(-) create mode 100644 tests/api_tests/test_api_doctype_default.py create mode 100644 tests/unit_tests/test_doctype_default_structure_extractor.py diff --git a/dedoc/readers/docx_reader/line_with_meta_converter.py b/dedoc/readers/docx_reader/line_with_meta_converter.py index eb65b3eb..ead068d7 100644 --- a/dedoc/readers/docx_reader/line_with_meta_converter.py +++ b/dedoc/readers/docx_reader/line_with_meta_converter.py @@ -64,4 +64,4 @@ def __get_tag(self, paragraph: Paragraph) -> HierarchyLevel: if paragraph.list_level is not None: return HierarchyLevel(2, paragraph.list_level, False, HierarchyLevel.list_item) - return HierarchyLevel(None, None, True, HierarchyLevel.unknown) + return HierarchyLevel.create_unknown() diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py index a7b1763c..f2ea745b 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py @@ -48,8 +48,8 @@ def __get_patterns(self, parameters: dict) -> List[AbstractPattern]: patterns = [ TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1, can_be_multiline=False), - TagListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False), - DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False), + TagListPattern(line_type=HierarchyLevel.list_item, can_be_multiline=False), + DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, level_2=1, can_be_multiline=False), BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False), LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False), BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False), diff --git a/dedoc/structure_extractors/patterns/__init__.py b/dedoc/structure_extractors/patterns/__init__.py index ffccf7b3..7f905e46 100644 --- a/dedoc/structure_extractors/patterns/__init__.py +++ b/dedoc/structure_extractors/patterns/__init__.py @@ -11,5 +11,5 @@ from dedoc.structure_extractors.patterns.tag_pattern import TagPattern from dedoc.structure_extractors.patterns.tag_type_pattern import TagTypePattern -__all__ = ["BracketListPattern", "BracketRomanListPattern", "BulletListPattern", "DottedListPattern", "LetterListPattern", "RegexpPattern", "RomanListPattern", - "StartWordPattern", "TagHeaderPattern", "TagListPattern", "TagPattern", "TagTypePattern"] +__all__ = [BracketListPattern, BracketRomanListPattern, BulletListPattern, DottedListPattern, LetterListPattern, RegexpPattern, RomanListPattern, + StartWordPattern, TagHeaderPattern, TagListPattern, TagPattern, TagTypePattern] diff --git a/dedoc/structure_extractors/patterns/abstract_pattern.py b/dedoc/structure_extractors/patterns/abstract_pattern.py index 2391a7b5..c3d6f8b9 100644 --- a/dedoc/structure_extractors/patterns/abstract_pattern.py +++ b/dedoc/structure_extractors/patterns/abstract_pattern.py @@ -6,7 +6,7 @@ class AbstractPattern(ABC): - __name = "" + _name = "" def __init__(self, line_type: Optional[str] = None, @@ -20,7 +20,7 @@ def __init__(self, @classmethod def name(cls: "AbstractPattern") -> str: - return cls.__name + return cls._name @abstractmethod def match(self, line: LineWithMeta) -> bool: @@ -39,7 +39,7 @@ def _get_line_type(self, line: LineWithMeta) -> str: return self._line_type if line.metadata.tag_hierarchy_level is None: - raise ValueError(f"Cannot resolve line type: tag_hierarchy_level is missing and {self.__name} line_type isn't configured") + raise ValueError(f"Cannot resolve line type: tag_hierarchy_level is missing and {self._name} line_type isn't configured") return line.metadata.tag_hierarchy_level.line_type diff --git a/dedoc/structure_extractors/patterns/bracket_list_pattern.py b/dedoc/structure_extractors/patterns/bracket_list_pattern.py index c92a251a..1c9e5a03 100644 --- a/dedoc/structure_extractors/patterns/bracket_list_pattern.py +++ b/dedoc/structure_extractors/patterns/bracket_list_pattern.py @@ -5,7 +5,7 @@ class BracketListPattern(RegexpPattern): - __name = "bracket_list" + _name = "bracket_list" def __init__(self, line_type: Optional[str] = None, diff --git a/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py b/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py index d1db663a..69f49b8b 100644 --- a/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py +++ b/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py @@ -5,7 +5,7 @@ class BracketRomanListPattern(RegexpPattern): - __name = "bracket_roman_list" + _name = "bracket_roman_list" def __init__(self, line_type: Optional[str] = None, diff --git a/dedoc/structure_extractors/patterns/bullet_list_pattern.py b/dedoc/structure_extractors/patterns/bullet_list_pattern.py index 59b616a3..c8cabe0f 100644 --- a/dedoc/structure_extractors/patterns/bullet_list_pattern.py +++ b/dedoc/structure_extractors/patterns/bullet_list_pattern.py @@ -5,7 +5,7 @@ class BulletListPattern(RegexpPattern): - __name = "bullet_list" + _name = "bullet_list" def __init__(self, line_type: Optional[str] = None, diff --git a/dedoc/structure_extractors/patterns/dotted_list_pattern.py b/dedoc/structure_extractors/patterns/dotted_list_pattern.py index 03bf8347..1bb1aef8 100644 --- a/dedoc/structure_extractors/patterns/dotted_list_pattern.py +++ b/dedoc/structure_extractors/patterns/dotted_list_pattern.py @@ -7,7 +7,7 @@ class DottedListPattern(RegexpPattern): - __name = "dotted_list" + _name = "dotted_list" def __init__(self, line_type: Optional[str] = None, diff --git a/dedoc/structure_extractors/patterns/letter_list_pattern.py b/dedoc/structure_extractors/patterns/letter_list_pattern.py index 892e6a9b..eb1613a2 100644 --- a/dedoc/structure_extractors/patterns/letter_list_pattern.py +++ b/dedoc/structure_extractors/patterns/letter_list_pattern.py @@ -5,7 +5,7 @@ class LetterListPattern(RegexpPattern): - __name = "letter_list" + _name = "letter_list" def __init__(self, line_type: Optional[str] = None, diff --git a/dedoc/structure_extractors/patterns/regexp_pattern.py b/dedoc/structure_extractors/patterns/regexp_pattern.py index b4d275f6..93cc41a3 100644 --- a/dedoc/structure_extractors/patterns/regexp_pattern.py +++ b/dedoc/structure_extractors/patterns/regexp_pattern.py @@ -6,7 +6,7 @@ class RegexpPattern(AbstractPattern): - __name = "regexp" + _name = "regexp" def __init__(self, regexp: str or re.Pattern, diff --git a/dedoc/structure_extractors/patterns/roman_list_pattern.py b/dedoc/structure_extractors/patterns/roman_list_pattern.py index 93026697..425b990c 100644 --- a/dedoc/structure_extractors/patterns/roman_list_pattern.py +++ b/dedoc/structure_extractors/patterns/roman_list_pattern.py @@ -5,7 +5,7 @@ class RomanListPattern(RegexpPattern): - __name = "roman_list" + _name = "roman_list" def __init__(self, line_type: Optional[str] = None, diff --git a/dedoc/structure_extractors/patterns/start_word_pattern.py b/dedoc/structure_extractors/patterns/start_word_pattern.py index 69f575f3..3194bd70 100644 --- a/dedoc/structure_extractors/patterns/start_word_pattern.py +++ b/dedoc/structure_extractors/patterns/start_word_pattern.py @@ -5,7 +5,7 @@ class StartWordPattern(AbstractPattern): - __name = "start_word" + _name = "start_word" def __init__(self, start_word: str, diff --git a/dedoc/structure_extractors/patterns/tag_header_pattern.py b/dedoc/structure_extractors/patterns/tag_header_pattern.py index 9d8ce555..e390db58 100644 --- a/dedoc/structure_extractors/patterns/tag_header_pattern.py +++ b/dedoc/structure_extractors/patterns/tag_header_pattern.py @@ -4,11 +4,7 @@ class TagHeaderPattern(AbstractPattern): - __name = "tag_header" + _name = "tag_header" def match(self, line: LineWithMeta) -> bool: - if line.metadata.tag_hierarchy_level is None or line.metadata.tag_hierarchy_level.line_type != HierarchyLevel.header: - return False - - level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2 - return level_1 is not None and level_2 is not None + return line.metadata.tag_hierarchy_level is not None and line.metadata.tag_hierarchy_level.line_type == HierarchyLevel.header diff --git a/dedoc/structure_extractors/patterns/tag_list_pattern.py b/dedoc/structure_extractors/patterns/tag_list_pattern.py index c5b2e92e..cd802ed2 100644 --- a/dedoc/structure_extractors/patterns/tag_list_pattern.py +++ b/dedoc/structure_extractors/patterns/tag_list_pattern.py @@ -4,11 +4,16 @@ class TagListPattern(AbstractPattern): - __name = "tag_list" + _name = "tag_list" def match(self, line: LineWithMeta) -> bool: - if line.metadata.tag_hierarchy_level is None or line.metadata.tag_hierarchy_level.line_type != HierarchyLevel.list_item: - return False + return line.metadata.tag_hierarchy_level is not None and line.metadata.tag_hierarchy_level.line_type == HierarchyLevel.list_item - level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2 - return level_1 is not None and level_2 is not None + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + level_1, level_2 = self._get_level_1(line), self._get_level_2(line) + return HierarchyLevel( + line_type=self._get_line_type(line), + level_1=level_1 if level_1 is not None else 2, + level_2=level_2 if level_2 is not None else 1, + can_be_multiline=self._get_can_be_multiline(line) + ) diff --git a/dedoc/structure_extractors/patterns/tag_pattern.py b/dedoc/structure_extractors/patterns/tag_pattern.py index 2fc128d7..20ad469b 100644 --- a/dedoc/structure_extractors/patterns/tag_pattern.py +++ b/dedoc/structure_extractors/patterns/tag_pattern.py @@ -3,7 +3,7 @@ class TagPattern(AbstractPattern): - __name = "tag" + _name = "tag" def match(self, line: LineWithMeta) -> bool: return line.metadata.tag_hierarchy_level is not None diff --git a/dedoc/structure_extractors/patterns/tag_type_pattern.py b/dedoc/structure_extractors/patterns/tag_type_pattern.py index 664f4101..a99f4974 100644 --- a/dedoc/structure_extractors/patterns/tag_type_pattern.py +++ b/dedoc/structure_extractors/patterns/tag_type_pattern.py @@ -4,7 +4,7 @@ class TagTypePattern(AbstractPattern): - __name = "tag_type" + _name = "tag_type" def match(self, line: LineWithMeta) -> bool: return line.metadata.tag_hierarchy_level is not None and not line.metadata.tag_hierarchy_level.is_unknown() diff --git a/dedoc/structure_extractors/patterns/utils.py b/dedoc/structure_extractors/patterns/utils.py index 0c41242c..99ba34c7 100644 --- a/dedoc/structure_extractors/patterns/utils.py +++ b/dedoc/structure_extractors/patterns/utils.py @@ -7,10 +7,11 @@ def get_pattern(pattern_parameters: dict) -> AbstractPattern: assert isinstance(pattern_parameters, dict) assert "name" in pattern_parameters, "Pattern parameter missing 'name'" - supported_patterns = {pattern.name: pattern for pattern in patterns_module.__all__} + supported_patterns = {pattern.name(): pattern for pattern in patterns_module.__all__} pattern_class = supported_patterns.get(pattern_parameters["name"]) if pattern_class is None: raise ValueError(f"Pattern {pattern_parameters['name']} is not found in supported patterns: {supported_patterns.keys()}") + pattern_parameters.pop("name") pattern = pattern_class(**pattern_parameters) return pattern diff --git a/tests/api_tests/test_api_doctype_default.py b/tests/api_tests/test_api_doctype_default.py new file mode 100644 index 00000000..25932630 --- /dev/null +++ b/tests/api_tests/test_api_doctype_default.py @@ -0,0 +1,10 @@ +from tests.api_tests.abstract_api_test import AbstractTestApiDocReader + + +class TestApiDefaultStructure(AbstractTestApiDocReader): + + def test_all_patterns(self) -> None: + pass + + def test_wrong_patterns(self) -> None: + pass diff --git a/tests/unit_tests/test_doctype_default_structure_extractor.py b/tests/unit_tests/test_doctype_default_structure_extractor.py new file mode 100644 index 00000000..05f6c56d --- /dev/null +++ b/tests/unit_tests/test_doctype_default_structure_extractor.py @@ -0,0 +1,84 @@ +import os +import re +import unittest + +from dedoc.readers.docx_reader.docx_reader import DocxReader +from dedoc.readers.reader_composition import ReaderComposition +from dedoc.readers.txt_reader.raw_text_reader import RawTextReader +from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor +from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern +from dedoc.structure_extractors.patterns.roman_list_pattern import RomanListPattern +from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern +from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern +from tests.test_utils import get_test_config + + +class TestDefaultStructureExtractor(unittest.TestCase): + data_directory_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data")) + structure_extractor = DefaultStructureExtractor(config=get_test_config()) + reader = ReaderComposition(readers=[RawTextReader(), DocxReader()]) + + def test_tag_patterns(self) -> None: + file_path = os.path.join(self.data_directory_path, "docx", "with_tags.docx") + patterns = [ + TagHeaderPattern(line_type="custom_header", level_1=1, can_be_multiline=False), + TagListPattern(line_type="custom_list", level_1=2), + ] + document = self.reader.read(file_path=file_path) + document = self.structure_extractor.extract(document=document, parameters={"patterns": patterns}) + self.assertEqual(document.lines[0].metadata.hierarchy_level.line_type, "custom_header") + self.assertEqual(document.lines[0].metadata.hierarchy_level.level_1, 1) + self.assertEqual(document.lines[0].metadata.hierarchy_level.level_2, 1) + self.assertFalse(document.lines[0].metadata.hierarchy_level.can_be_multiline) + + self.assertEqual(document.lines[1].metadata.hierarchy_level.line_type, "custom_header") + self.assertEqual(document.lines[1].metadata.hierarchy_level.level_1, 1) + self.assertEqual(document.lines[1].metadata.hierarchy_level.level_2, 2) + + self.assertEqual(document.lines[3].metadata.hierarchy_level.line_type, "raw_text") + self.assertTrue(document.lines[3].metadata.hierarchy_level.can_be_multiline) + + self.assertEqual(document.lines[4].metadata.hierarchy_level.line_type, "custom_list") + self.assertEqual(document.lines[4].metadata.hierarchy_level.level_1, 2) + self.assertEqual(document.lines[4].metadata.hierarchy_level.level_2, 1) + self.assertFalse(document.lines[4].metadata.hierarchy_level.can_be_multiline) + + def test_list_patterns(self) -> None: + file_path = os.path.join(self.data_directory_path, "txt", "pr_17.txt") + patterns = [ + RomanListPattern(line_type="chapter", level_1=1, level_2=1, can_be_multiline=False), + DottedListPattern(line_type="dotted_list", level_1=2, can_be_multiline=False), + ] + document = self.reader.read(file_path=file_path) + document = self.structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + self.assertEqual(document.lines[0].metadata.hierarchy_level.line_type, "raw_text") + self.assertEqual(document.lines[12].metadata.hierarchy_level.line_type, "chapter") + self.assertEqual(document.lines[14].metadata.hierarchy_level.line_type, "dotted_list") + + def test_regexp_patterns(self) -> None: + file_path = os.path.join(self.data_directory_path, "docx", "without_numbering.docx") + patterns = [ + RegexpPattern(regexp="^глава\s\d+\.", line_type="глава", level_1=1), # noqa + RegexpPattern(regexp=re.compile(r"^статья\s\d+\.\d+\."), line_type="статья", level_1=2) + ] + document = self.reader.read(file_path=file_path) + document = self.structure_extractor.extract(document=document, parameters={"patterns": patterns}) + self.assertEqual(document.lines[0].metadata.hierarchy_level.line_type, "raw_text") + self.assertEqual(document.lines[9].metadata.hierarchy_level.line_type, "глава") + self.assertEqual(document.lines[11].metadata.hierarchy_level.line_type, "статья") + self.assertEqual(document.lines[15].metadata.hierarchy_level.line_type, "статья") + self.assertEqual(document.lines[83].metadata.hierarchy_level.line_type, "глава") + + def test_start_word_patterns(self) -> None: + file_path = os.path.join(self.data_directory_path, "docx", "example.docx") + patterns = [ + {"name": "start_word", "start_word": "глава", "level_1": 1, "line_type": "глава"}, + {"name": "start_word", "start_word": "статья", "level_1": 2, "line_type": "статья"}, + ] + document = self.reader.read(file_path=file_path) + document = self.structure_extractor.extract(document=document, parameters={"patterns": patterns}) + self.assertEqual(document.lines[1].metadata.hierarchy_level.line_type, "глава") + self.assertEqual(document.lines[3].metadata.hierarchy_level.line_type, "статья") + self.assertEqual(document.lines[5].metadata.hierarchy_level.line_type, "статья") From d39d5939fe1dd6b9f301562e14dd1d3ef8e5a7ce Mon Sep 17 00:00:00 2001 From: Nasty Date: Thu, 15 Aug 2024 17:49:34 +0300 Subject: [PATCH 4/9] TLDR-748 API tests added --- dedoc/api/api_args.py | 1 + .../default_structure_extractor.py | 21 +++++++++------- dedoc/structure_extractors/patterns/utils.py | 11 +++++---- tests/api_tests/test_api_doctype_default.py | 24 ++++++++++++++++--- 4 files changed, 42 insertions(+), 15 deletions(-) diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index 8f3e1415..cd75b6e5 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -8,6 +8,7 @@ class QueryParameters: # type of document structure parsing document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain") + patterns: str = Form(None, description='Patterns for default document type (when document_type="")') structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type") return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"], description="Response representation, most types (except json) are used for debug purposes only") diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py index f2ea745b..8b4ac6cb 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py @@ -1,5 +1,6 @@ from typing import List, Optional +from dedoc.common.exceptions.structure_extractor_error import StructureExtractorError from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor @@ -36,7 +37,8 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N return document def __get_patterns(self, parameters: dict) -> List[AbstractPattern]: - if "patterns" not in parameters: + patterns = parameters.get("patterns") + if not patterns: from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern @@ -46,7 +48,7 @@ def __get_patterns(self, parameters: dict) -> List[AbstractPattern]: from dedoc.structure_extractors.patterns.tag_pattern import TagPattern from dedoc.structure_extractors.patterns.tag_type_pattern import TagTypePattern - patterns = [ + return [ TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1, can_be_multiline=False), TagListPattern(line_type=HierarchyLevel.list_item, can_be_multiline=False), DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, level_2=1, can_be_multiline=False), @@ -56,17 +58,20 @@ def __get_patterns(self, parameters: dict) -> List[AbstractPattern]: TagTypePattern(), TagPattern(line_type=HierarchyLevel.raw_text) ] - else: - import json + + try: + import ast from dedoc.structure_extractors.patterns.utils import get_pattern patterns = parameters["patterns"] if isinstance(patterns, str): - patterns = json.loads(patterns) - assert isinstance(patterns, list) - assert len(patterns) > 0 + patterns = ast.literal_eval(patterns) + assert isinstance(patterns, list), "Patterns parameter should contain a list of patterns" + assert len(patterns) > 0, "Patterns parameter should contain a non-empty list of patterns" if isinstance(patterns[0], dict): patterns = [get_pattern(pattern) for pattern in patterns] - assert isinstance(patterns[0], AbstractPattern) + assert isinstance(patterns[0], AbstractPattern), "Patterns should be initialized properly" + except AssertionError as e: + raise StructureExtractorError(msg=str(e)) return patterns diff --git a/dedoc/structure_extractors/patterns/utils.py b/dedoc/structure_extractors/patterns/utils.py index 99ba34c7..6f79ac5b 100644 --- a/dedoc/structure_extractors/patterns/utils.py +++ b/dedoc/structure_extractors/patterns/utils.py @@ -1,17 +1,20 @@ +from dedoc.common.exceptions.structure_extractor_error import StructureExtractorError from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern def get_pattern(pattern_parameters: dict) -> AbstractPattern: import dedoc.structure_extractors.patterns as patterns_module - assert isinstance(pattern_parameters, dict) + assert isinstance(pattern_parameters, dict), "Pattern configuration must be a dict" assert "name" in pattern_parameters, "Pattern parameter missing 'name'" supported_patterns = {pattern.name(): pattern for pattern in patterns_module.__all__} pattern_class = supported_patterns.get(pattern_parameters["name"]) - if pattern_class is None: - raise ValueError(f"Pattern {pattern_parameters['name']} is not found in supported patterns: {supported_patterns.keys()}") + assert pattern_class is not None, f"Pattern {pattern_parameters['name']} is not found in supported patterns: {supported_patterns.keys()}" pattern_parameters.pop("name") - pattern = pattern_class(**pattern_parameters) + try: + pattern = pattern_class(**pattern_parameters) + except TypeError as e: + raise StructureExtractorError(msg=str(e)) return pattern diff --git a/tests/api_tests/test_api_doctype_default.py b/tests/api_tests/test_api_doctype_default.py index 25932630..40345629 100644 --- a/tests/api_tests/test_api_doctype_default.py +++ b/tests/api_tests/test_api_doctype_default.py @@ -3,8 +3,26 @@ class TestApiDefaultStructure(AbstractTestApiDocReader): - def test_all_patterns(self) -> None: - pass + def test_patterns(self) -> None: + file_name = "docx/without_numbering.docx" + patterns = [ + {"name": "regexp", "regexp": "^глава\s\d+\.", "line_type": "глава", "level_1": 1}, # noqa + {"name": "start_word", "start_word": "статья", "level_1": 2, "line_type": "статья"}, + {"name": "dotted_list", "level_1": 3, "line_type": "list_item"}, + {"name": "bracket_list", "level_1": 4, "level_2": 1, "line_type": "list_item"} + ] + result = self._send_request(file_name, {"patterns": str(patterns)}) + structure = result["content"]["structure"] + + node = self._get_by_tree_path(structure, "0.1") + self.assertEqual(node["text"], "Глава 1. Общие положения") + self.assertEqual(node["metadata"]["paragraph_type"], "глава") + node = self._get_by_tree_path(structure, "0.2") + self.assertEqual(node["text"], "Глава 2. Административные правонарушения, посягающие на права граждан и здоровье населения") + self.assertEqual(node["metadata"]["paragraph_type"], "глава") def test_wrong_patterns(self) -> None: - pass + file_name = "docx/example.docx" + self._send_request(file_name, {"patterns": str([{"regexp": "^глава\s\d+\.", "line_type": "глава", "level_1": 1}])}, expected_code=400) # noqa + self._send_request(file_name, {"patterns": str([{"name": "start_word", "line_type": "глава", "level_1": 1}])}, expected_code=400) + self._send_request(file_name, {"patterns": str([1])}, expected_code=400) From c505eaa4c53d6e12115c6edaeb9dbb4f03ba5f03 Mon Sep 17 00:00:00 2001 From: Nasty Date: Fri, 16 Aug 2024 15:20:06 +0300 Subject: [PATCH 5/9] TLDR-748 finishing fixes --- dedoc/api/api_args.py | 2 +- dedoc/api/web/index.html | 26 ++++++++- .../default_structure_extractor.py | 40 +++++++------ .../structure_extractors/patterns/__init__.py | 3 +- .../patterns/abstract_pattern.py | 45 ++------------ .../patterns/bracket_list_pattern.py | 6 +- .../patterns/bracket_roman_list_pattern.py | 6 +- .../patterns/bullet_list_pattern.py | 6 +- .../patterns/dotted_list_pattern.py | 14 ++--- .../patterns/letter_list_pattern.py | 6 +- .../patterns/regexp_pattern.py | 8 ++- .../patterns/roman_list_pattern.py | 6 +- .../patterns/start_word_pattern.py | 8 ++- .../patterns/tag_header_pattern.py | 25 +++++++- .../patterns/tag_list_pattern.py | 22 +++++-- .../patterns/tag_pattern.py | 58 +++++++++++++++++++ .../patterns/tag_type_pattern.py | 18 ------ dedoc/structure_extractors/patterns/utils.py | 15 +++-- dedoc/utils/parameter_utils.py | 7 +++ tests/api_tests/test_api_doctype_default.py | 29 ++++++++-- 20 files changed, 215 insertions(+), 135 deletions(-) delete mode 100644 dedoc/structure_extractors/patterns/tag_type_pattern.py diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index cd75b6e5..8ffdc7b9 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -8,7 +8,7 @@ class QueryParameters: # type of document structure parsing document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain") - patterns: str = Form(None, description='Patterns for default document type (when document_type="")') + patterns: str = Form("", description='Patterns for default document type (when document_type="")') structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type") return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"], description="Response representation, most types (except json) are used for debug purposes only") diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index 423dbcfe..d0c8b984 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -28,7 +28,7 @@

Parameters configuration

Type of document structure parsing

-
document_type, structure_type, return_format +
document_type, patterns, structure_type, return_format

Type of document structure parsing

+

+

+ Patterns for default structure extractor (document_type="other")
+
+ +
+

+