Skip to content

Commit

Permalink
TLDR-748 finishing fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
NastyBoget committed Aug 16, 2024
1 parent 0c5fb93 commit 3322468
Show file tree
Hide file tree
Showing 20 changed files with 215 additions and 135 deletions.
2 changes: 1 addition & 1 deletion dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
class QueryParameters:
# type of document structure parsing
document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain")
patterns: str = Form(None, description='Patterns for default document type (when document_type="")')
patterns: str = Form("", description='Patterns for default document type (when document_type="")')
structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
description="Response representation, most types (except json) are used for debug purposes only")
Expand Down
26 changes: 24 additions & 2 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ <h3>Parameters configuration</h3>

<div class="parameters">
<h4>Type of document structure parsing</h4>
<details><summary>document_type, structure_type, return_format</summary>
<details><summary>document_type, patterns, structure_type, return_format</summary>
<br>
<p>
<label>
Expand All @@ -43,6 +43,14 @@ <h4>Type of document structure parsing</h4>
</label>
</p>

<!-- Only phrasing content is used here: a <div> start tag implicitly closes an open <p>, so the former wrapper <div> left this paragraph empty. -->
<p>
Patterns for default structure extractor (document_type="other")<br>
<label><textarea id="patterns" name="patterns" style="width:450px;height:75px;"></textarea></label><br>
<button type="button" onclick="Format()">Format</button>
</p>

<p>
<label>
<select name="structure_type">
Expand Down Expand Up @@ -114,7 +122,7 @@ <h4>Tables handling </h4>

<div class="parameters">
<h4>PDF handling</h4>
<details><summary>pdf_with_text_layer, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization</summary>
<details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization</summary>
<br>
<p>
<label>
Expand Down Expand Up @@ -213,4 +221,18 @@ <h3>Useful links</h3>
</ul>

</body>

<script>
// Pretty-print the JSON typed into the "patterns" textarea (2-space indent).
// Backslashes are doubled before parsing and halved again after serialization,
// presumably so that patterns written with single backslashes (e.g. regexps) are accepted - TODO confirm.
// Any failure inside the try block is reported to the user as a JSON syntax error.
function Format() {
  try {
    const patternsField = document.getElementById("patterns")
    const escapedSource = patternsField.value.replaceAll("\\", "\\\\")
    const parsedPatterns = JSON.parse(escapedSource)
    const prettyPrinted = JSON.stringify(parsedPatterns, null, 2)
    patternsField.value = prettyPrinted.replaceAll("\\\\", "\\")
  } catch (error) {
    alert("Incorrect JSON syntax")
  }
}
</script>

</html>
Original file line number Diff line number Diff line change
Expand Up @@ -46,32 +46,36 @@ def __get_patterns(self, parameters: dict) -> List[AbstractPattern]:
from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern
from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern
from dedoc.structure_extractors.patterns.tag_pattern import TagPattern
from dedoc.structure_extractors.patterns.tag_type_pattern import TagTypePattern

return [
TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1, can_be_multiline=False),
TagListPattern(line_type=HierarchyLevel.list_item, can_be_multiline=False),
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, level_2=1, can_be_multiline=False),
TagListPattern(line_type=HierarchyLevel.list_item, default_level_1=2, default_level_2=1, can_be_multiline=False),
DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False),
BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False),
LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False),
BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False),
TagTypePattern(),
TagPattern(line_type=HierarchyLevel.raw_text)
TagPattern(default_line_type=HierarchyLevel.raw_text)
]

try:
import ast
from dedoc.structure_extractors.patterns.utils import get_pattern
import ast
from dedoc.structure_extractors.patterns.utils import get_pattern

patterns = parameters["patterns"]
if isinstance(patterns, str):
if isinstance(patterns, str):
try:
patterns = ast.literal_eval(patterns)
assert isinstance(patterns, list), "Patterns parameter should contain a list of patterns"
assert len(patterns) > 0, "Patterns parameter should contain a non-empty list of patterns"
if isinstance(patterns[0], dict):
patterns = [get_pattern(pattern) for pattern in patterns]
except ValueError as e:
raise StructureExtractorError(msg=f"Bad syntax for patterns: {str(e)}")

assert isinstance(patterns[0], AbstractPattern), "Patterns should be initialized properly"
except AssertionError as e:
raise StructureExtractorError(msg=str(e))
return patterns
if not isinstance(patterns, list):
raise StructureExtractorError(msg="Patterns parameter should contain a list of patterns")

pattern_classes = []
for pattern in patterns:
if isinstance(pattern, dict):
pattern_classes.append(get_pattern(pattern))
elif isinstance(pattern, AbstractPattern):
pattern_classes.append(pattern)
else:
raise StructureExtractorError(msg="Pattern should be dict or `AbstractPattern`")

return pattern_classes
3 changes: 1 addition & 2 deletions dedoc/structure_extractors/patterns/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern
from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern
from dedoc.structure_extractors.patterns.tag_pattern import TagPattern
from dedoc.structure_extractors.patterns.tag_type_pattern import TagTypePattern

__all__ = [BracketListPattern, BracketRomanListPattern, BulletListPattern, DottedListPattern, LetterListPattern, RegexpPattern, RomanListPattern,
StartWordPattern, TagHeaderPattern, TagListPattern, TagPattern, TagTypePattern]
StartWordPattern, TagHeaderPattern, TagListPattern, TagPattern]
45 changes: 6 additions & 39 deletions dedoc/structure_extractors/patterns/abstract_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,13 @@
class AbstractPattern(ABC):
_name = ""

def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
def __init__(self, line_type: Optional[str], level_1: Optional[int], level_2: Optional[int], can_be_multiline: Optional[bool or str]) -> None:
from dedoc.utils.parameter_utils import get_bool_value

self._line_type = line_type
self._level_1 = level_1
self._level_2 = level_2
self._can_be_multiline = can_be_multiline
self._can_be_multiline = get_bool_value(can_be_multiline, default_value=True)

@classmethod
def name(cls: "AbstractPattern") -> str:
Expand All @@ -26,37 +24,6 @@ def name(cls: "AbstractPattern") -> str:
def match(self, line: LineWithMeta) -> bool:
pass

@abstractmethod
def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
return HierarchyLevel(
line_type=self._get_line_type(line),
level_1=self._get_level_1(line),
level_2=self._get_level_2(line),
can_be_multiline=self._get_can_be_multiline(line)
)

def _get_line_type(self, line: LineWithMeta) -> str:
if self._line_type is not None:
return self._line_type

if line.metadata.tag_hierarchy_level is None:
raise ValueError(f"Cannot resolve line type: tag_hierarchy_level is missing and {self._name} line_type isn't configured")

return line.metadata.tag_hierarchy_level.line_type

def _get_level_1(self, line: LineWithMeta) -> Optional[int]:
if self._level_1 is not None:
return self._level_1

return line.metadata.tag_hierarchy_level.level_1 if line.metadata.tag_hierarchy_level else None

def _get_level_2(self, line: LineWithMeta) -> Optional[int]:
if self._level_2 is not None:
return self._level_2

return line.metadata.tag_hierarchy_level.level_2 if line.metadata.tag_hierarchy_level else None

def _get_can_be_multiline(self, line: LineWithMeta) -> bool:
if self._can_be_multiline is not None:
return self._can_be_multiline

return line.metadata.tag_hierarchy_level.can_be_multiline if line.metadata.tag_hierarchy_level else True
pass
6 changes: 1 addition & 5 deletions dedoc/structure_extractors/patterns/bracket_list_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,5 @@
class BracketListPattern(RegexpPattern):
_name = "bracket_list"

def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None:
super().__init__(regexp=BracketPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,5 @@
class BracketRomanListPattern(RegexpPattern):
_name = "bracket_roman_list"

def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None:
super().__init__(regexp=BracketRomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
6 changes: 1 addition & 5 deletions dedoc/structure_extractors/patterns/bullet_list_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,5 @@
class BulletListPattern(RegexpPattern):
_name = "bullet_list"

def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None:
super().__init__(regexp=BulletPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
14 changes: 5 additions & 9 deletions dedoc/structure_extractors/patterns/dotted_list_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,15 @@
class DottedListPattern(RegexpPattern):
_name = "dotted_list"

def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
super().__init__(regexp=DottedPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
def __init__(self, line_type: str, level_1: int, can_be_multiline: Optional[bool or str] = None) -> None:
super().__init__(regexp=DottedPrefix.regexp, line_type=line_type, level_1=level_1, level_2=None, can_be_multiline=can_be_multiline)

def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
return HierarchyLevel(
line_type=self._get_line_type(line),
level_1=self._get_level_1(line),
line_type=self._line_type,
level_1=self._level_1,
level_2=self.__get_list_depth(line=line),
can_be_multiline=self._get_can_be_multiline(line)
can_be_multiline=self._can_be_multiline
)

def __get_list_depth(self, line: LineWithMeta) -> int:
Expand Down
6 changes: 1 addition & 5 deletions dedoc/structure_extractors/patterns/letter_list_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,5 @@
class LetterListPattern(RegexpPattern):
_name = "letter_list"

def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None:
super().__init__(regexp=AnyLetterPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
8 changes: 6 additions & 2 deletions dedoc/structure_extractors/patterns/regexp_pattern.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
from typing import Optional

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern

Expand All @@ -10,14 +11,17 @@ class RegexpPattern(AbstractPattern):

def __init__(self,
regexp: str or re.Pattern,
line_type: Optional[str] = None,
line_type: str,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
can_be_multiline: Optional[bool or str] = None) -> None:
super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
self._regexp = re.compile(regexp) if isinstance(regexp, str) else regexp

def match(self, line: LineWithMeta) -> bool:
    """
    Check whether the line text matches the pattern's regular expression.

    The text is stripped and lower-cased first; the compiled expression is applied with ``match``,
    i.e. it is anchored at the beginning of the normalized text.
    """
    normalized_text = line.line.strip().lower()
    return self._regexp.match(normalized_text) is not None

def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
    """
    Build the hierarchy level from the values stored on the pattern.

    The ``line`` argument is not inspected here: every field comes from the pattern's own attributes.
    """
    level_fields = dict(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline)
    return HierarchyLevel(**level_fields)
6 changes: 1 addition & 5 deletions dedoc/structure_extractors/patterns/roman_list_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,5 @@
class RomanListPattern(RegexpPattern):
_name = "roman_list"

def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None:
super().__init__(regexp=RomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
8 changes: 6 additions & 2 deletions dedoc/structure_extractors/patterns/start_word_pattern.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Optional

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern

Expand All @@ -9,13 +10,16 @@ class StartWordPattern(AbstractPattern):

def __init__(self,
start_word: str,
line_type: Optional[str] = None,
line_type: str,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool] = None) -> None:
can_be_multiline: Optional[bool or str] = None) -> None:
super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
self.__start_word = start_word.strip().lower()

def match(self, line: LineWithMeta) -> bool:
    """
    Check whether the stripped, lower-cased line text begins with the configured start word.
    """
    return line.line.strip().lower().startswith(self.__start_word)

def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
    """
    Return a hierarchy level assembled from the pattern's stored attributes; ``line`` itself is not read.
    """
    line_type = self._line_type
    level_1, level_2 = self._level_1, self._level_2
    multiline = self._can_be_multiline
    return HierarchyLevel(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=multiline)
25 changes: 23 additions & 2 deletions dedoc/structure_extractors/patterns/tag_header_pattern.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,31 @@
from typing import Optional

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern
from dedoc.structure_extractors.patterns.tag_pattern import TagPattern


class TagHeaderPattern(AbstractPattern):
class TagHeaderPattern(TagPattern):
_name = "tag_header"

# Constructor: forwards every argument unchanged (by keyword) to the parent pattern constructor.
# Defaults specific to this pattern: default_line_type=HierarchyLevel.header, default_level_1=1, default_level_2=None.
# Presumably the `default_*` values are fallbacks used when neither the explicit value nor the line's tag
# hierarchy level provides one - TODO confirm against the parent class.
# NOTE(review): `bool or str` evaluates to `bool` at runtime, so the annotation is effectively Optional[bool];
# Optional[Union[bool, str]] is probably what was intended.
def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool or str] = None,
default_line_type: str = HierarchyLevel.header,
default_level_1: int = 1,
default_level_2: Optional[int] = None) -> None:
super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline, default_line_type=default_line_type,
default_level_1=default_level_1, default_level_2=default_level_2)

def match(self, line: LineWithMeta) -> bool:
    """
    Check whether the line carries a tag hierarchy level whose line type is ``HierarchyLevel.header``.

    Lines without a tag hierarchy level never match.
    """
    tag_level = line.metadata.tag_hierarchy_level
    if tag_level is None:
        return False
    return tag_level.line_type == HierarchyLevel.header

def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
    """
    Assemble the hierarchy level for the line from the pattern's resolver helpers.

    Each field is obtained from a helper method called with the line (these helpers are defined outside
    this block, presumably in the parent class); the helpers are invoked in the same order as the fields.
    """
    line_type = self._get_line_type(line)
    level_1 = self._get_level_1(line)
    level_2 = self._get_regexp_level_2(line)
    can_be_multiline = self._get_can_be_multiline(line)
    return HierarchyLevel(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
22 changes: 17 additions & 5 deletions dedoc/structure_extractors/patterns/tag_list_pattern.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,31 @@
from typing import Optional

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern
from dedoc.structure_extractors.patterns.tag_pattern import TagPattern


class TagListPattern(AbstractPattern):
class TagListPattern(TagPattern):
_name = "tag_list"

# Constructor: forwards every argument unchanged (by keyword) to the parent pattern constructor.
# Defaults specific to this pattern: default_line_type=HierarchyLevel.list_item, default_level_1=2, default_level_2=None.
# Presumably the `default_*` values are fallbacks used when neither the explicit value nor the line's tag
# hierarchy level provides one - TODO confirm against the parent class.
# NOTE(review): `bool or str` evaluates to `bool` at runtime, so the annotation is effectively Optional[bool];
# Optional[Union[bool, str]] is probably what was intended.
def __init__(self,
line_type: Optional[str] = None,
level_1: Optional[int] = None,
level_2: Optional[int] = None,
can_be_multiline: Optional[bool or str] = None,
default_line_type: str = HierarchyLevel.list_item,
default_level_1: int = 2,
default_level_2: Optional[int] = None) -> None:
super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline, default_line_type=default_line_type,
default_level_1=default_level_1, default_level_2=default_level_2)

def match(self, line: LineWithMeta) -> bool:
    """
    Check whether the line carries a tag hierarchy level whose line type is ``HierarchyLevel.list_item``.

    Lines without a tag hierarchy level never match.
    """
    tag_level = line.metadata.tag_hierarchy_level
    return False if tag_level is None else tag_level.line_type == HierarchyLevel.list_item

def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
level_1, level_2 = self._get_level_1(line), self._get_level_2(line)
return HierarchyLevel(
line_type=self._get_line_type(line),
level_1=level_1 if level_1 is not None else 2,
level_2=level_2 if level_2 is not None else 1,
level_1=self._get_level_1(line),
level_2=self._get_regexp_level_2(line),
can_be_multiline=self._get_can_be_multiline(line)
)
Loading

0 comments on commit 3322468

Please sign in to comment.