Skip to content

Commit

Permalink
TLDR-748 API tests added
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget committed Aug 15, 2024
1 parent 5a9cfb2 commit 0c5fb93
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 15 deletions.
1 change: 1 addition & 0 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
class QueryParameters:
# type of document structure parsing
document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain")
patterns: str = Form(None, description='Patterns for default document type (when document_type="")')
structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
description="Response representation, most types (except json) are used for debug purposes only")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import List, Optional

from dedoc.common.exceptions.structure_extractor_error import StructureExtractorError
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor
Expand Down Expand Up @@ -36,7 +37,8 @@ def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = N
return document

def __get_patterns(self, parameters: dict) -> List[AbstractPattern]:
if "patterns" not in parameters:
patterns = parameters.get("patterns")
if not patterns:
from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern
from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern
from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern
Expand All @@ -46,7 +48,7 @@ def __get_patterns(self, parameters: dict) -> List[AbstractPattern]:
from dedoc.structure_extractors.patterns.tag_pattern import TagPattern
from dedoc.structure_extractors.patterns.tag_type_pattern import TagTypePattern

patterns = [
return [
TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1, can_be_multiline=False),
TagListPattern(line_type=HierarchyLevel.list_item, can_be_multiline=False),
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, level_2=1, can_be_multiline=False),
Expand All @@ -56,17 +58,20 @@ def __get_patterns(self, parameters: dict) -> List[AbstractPattern]:
TagTypePattern(),
TagPattern(line_type=HierarchyLevel.raw_text)
]
else:
import json

try:
import ast
from dedoc.structure_extractors.patterns.utils import get_pattern

patterns = parameters["patterns"]
if isinstance(patterns, str):
patterns = json.loads(patterns)
assert isinstance(patterns, list)
assert len(patterns) > 0
patterns = ast.literal_eval(patterns)
assert isinstance(patterns, list), "Patterns parameter should contain a list of patterns"
assert len(patterns) > 0, "Patterns parameter should contain a non-empty list of patterns"
if isinstance(patterns[0], dict):
patterns = [get_pattern(pattern) for pattern in patterns]

assert isinstance(patterns[0], AbstractPattern)
assert isinstance(patterns[0], AbstractPattern), "Patterns should be initialized properly"
except AssertionError as e:
raise StructureExtractorError(msg=str(e))
return patterns
11 changes: 7 additions & 4 deletions dedoc/structure_extractors/patterns/utils.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
from dedoc.common.exceptions.structure_extractor_error import StructureExtractorError
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern


def get_pattern(pattern_parameters: dict) -> AbstractPattern:
    """
    Build a pattern instance from its dictionary configuration.

    :param pattern_parameters: pattern configuration; the mandatory "name" item selects the pattern class
        (it is compared with the value returned by the class ``name()`` method), all remaining items are passed
        to the class constructor as keyword arguments. The dictionary itself is left unmodified.
    :return: the initialized pattern
    :raises AssertionError: if the configuration is not a dict, has no "name" item or the name is not supported
    :raises StructureExtractorError: if the pattern class constructor rejects the given arguments
    """
    # local import, same as in the original code (presumably to avoid a circular import - TODO confirm)
    import dedoc.structure_extractors.patterns as patterns_module

    # NOTE: validation is done with asserts on purpose - the caller visible in the default structure extractor
    # catches AssertionError and converts it into StructureExtractorError; the exception type is kept for compatibility
    assert isinstance(pattern_parameters, dict), "Pattern configuration must be a dict"
    assert "name" in pattern_parameters, "Pattern parameter missing 'name'"

    # every item exported by the patterns package is expected to provide name()
    supported_patterns = {pattern.name(): pattern for pattern in patterns_module.__all__}
    pattern_class = supported_patterns.get(pattern_parameters["name"])
    assert pattern_class is not None, f"Pattern {pattern_parameters['name']} is not found in supported patterns: {supported_patterns.keys()}"

    # build the constructor arguments from a filtered copy instead of pop("name"), so the caller's dict is not mutated
    constructor_parameters = {key: value for key, value in pattern_parameters.items() if key != "name"}
    try:
        return pattern_class(**constructor_parameters)
    except TypeError as e:
        # missing, unexpected or non-string keyword arguments are reported as a structure extractor error
        raise StructureExtractorError(msg=str(e)) from e
24 changes: 21 additions & 3 deletions tests/api_tests/test_api_doctype_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,26 @@

class TestApiDefaultStructure(AbstractTestApiDocReader):
    """API tests for the `patterns` request parameter."""

    def test_patterns(self) -> None:
        """Send a list of pattern configurations and check that the regexp pattern marks the chapter lines."""
        file_name = "docx/without_numbering.docx"
        patterns = [
            # raw string: \s, \d and \. are regular expression escapes, not string escapes
            {"name": "regexp", "regexp": r"^глава\s\d+\.", "line_type": "глава", "level_1": 1},
            {"name": "start_word", "start_word": "статья", "level_1": 2, "line_type": "статья"},
            {"name": "dotted_list", "level_1": 3, "line_type": "list_item"},
            {"name": "bracket_list", "level_1": 4, "level_2": 1, "line_type": "list_item"}
        ]
        # the parameter is sent as text, so the list is serialized with str()
        result = self._send_request(file_name, {"patterns": str(patterns)})
        structure = result["content"]["structure"]

        node = self._get_by_tree_path(structure, "0.1")
        self.assertEqual(node["text"], "Глава 1. Общие положения")
        self.assertEqual(node["metadata"]["paragraph_type"], "глава")
        node = self._get_by_tree_path(structure, "0.2")
        self.assertEqual(node["text"], "Глава 2. Административные правонарушения, посягающие на права граждан и здоровье населения")
        self.assertEqual(node["metadata"]["paragraph_type"], "глава")

    def test_wrong_patterns(self) -> None:
        """Check that incorrect pattern configurations are answered with code 400."""
        file_name = "docx/example.docx"

        # the pattern configuration has no "name" item
        patterns_without_name = [{"regexp": r"^глава\s\d+\.", "line_type": "глава", "level_1": 1}]
        self._send_request(file_name, {"patterns": str(patterns_without_name)}, expected_code=400)

        # the "start_word" item is absent - presumably it is a required argument of the start_word pattern (TODO confirm)
        patterns_without_argument = [{"name": "start_word", "line_type": "глава", "level_1": 1}]
        self._send_request(file_name, {"patterns": str(patterns_without_argument)}, expected_code=400)

        # the list item is not a pattern configuration at all
        self._send_request(file_name, {"patterns": str([1])}, expected_code=400)

0 comments on commit 0c5fb93

Please sign in to comment.