Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TLDR-748 structure pattern #483

Merged
merged 9 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ exclude =
*__init__.py,
resources,
venv,
.venv,
build,
dedoc.egg-info,
docs/_build,
Expand All @@ -48,5 +49,5 @@ per-file-ignores =
scripts/*:T201
scripts/benchmark_pdf_performance*:JS101
tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
docs/source/_static/code_examples/*:I251
docs/source/_static/code_examples/*:I251,T201
docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251
1 change: 1 addition & 0 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,4 @@ jobs:
python dedoc_usage_tutorial.py
python dedoc_add_new_doc_type_tutorial.py
python dedoc_add_new_structure_type_tutorial.py
python dedoc_using_patterns_tutorial.py
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
exclude: \.github|.*__init__\.py|resources|docs|venv|\.venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
args:
- "--config=.flake8"
additional_dependencies: [
Expand Down
1 change: 1 addition & 0 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
class QueryParameters:
# type of document structure parsing
document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain")
patterns: str = Form("", description='Patterns for default document type (when document_type="")')
oksidgy marked this conversation as resolved.
Show resolved Hide resolved
structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
description="Response representation, most types (except json) are used for debug purposes only")
Expand Down
26 changes: 24 additions & 2 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ <h3>Parameters configuration</h3>

<div class="parameters">
<h4>Type of document structure parsing</h4>
<details><summary>document_type, structure_type, return_format</summary>
<details><summary>document_type, patterns, structure_type, return_format</summary>
<br>
<p>
<label>
Expand All @@ -43,6 +43,14 @@ <h4>Type of document structure parsing</h4>
</label>
</p>

<p>
<div>
Patterns for default structure extractor (document_type="other")<br>
<label><textarea id="patterns" name="patterns" style="width:450px;height:75px;"></textarea></label><br>
<button type="button" onclick="Format()">Format</button>
</div>
</p>

<p>
<label>
<select name="structure_type">
Expand Down Expand Up @@ -114,7 +122,7 @@ <h4>Tables handling </h4>

<div class="parameters">
<h4>PDF handling</h4>
<details><summary>pdf_with_text_layer, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization</summary>
<details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization</summary>
<br>
<p>
<label>
Expand Down Expand Up @@ -213,4 +221,18 @@ <h3>Useful links</h3>
</ul>

</body>

<script>
// Pretty-print the JSON typed into the "patterns" textarea, rewriting the textarea in place.
// Any failure inside the try block (including a JSON syntax error) is reported with an alert.
function Format() {
    try {
        const patternsField = document.getElementById("patterns");
        // Double every backslash before parsing, so a single backslash in the textarea
        // (presumably a regex escape such as \d) becomes a valid JSON escape sequence.
        const escapedText = patternsField.value.replaceAll("\\", "\\\\");
        const parsedPatterns = JSON.parse(escapedText);
        // Re-serialize with a 2-space indent, then collapse the doubled backslashes
        // emitted by JSON.stringify back to the single-backslash form the user typed.
        const prettyText = JSON.stringify(parsedPatterns, null, 2);
        patternsField.value = prettyText.replaceAll("\\\\", "\\");
    } catch (error) {
        alert("Incorrect JSON syntax");
    }
}
</script>

</html>
3 changes: 1 addition & 2 deletions dedoc/data_structures/line_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ def __init__(self,
:param hierarchy_level: the hierarchy level of the line extracted by some of the structure extractors - the result type and level of the line.
The lower the level of the hierarchy, the closer it is to the root; it is used to construct the document tree.
"""
self.tag_hierarchy_level = HierarchyLevel(None, None, can_be_multiline=True, line_type=HierarchyLevel.unknown) \
if tag_hierarchy_level is None else tag_hierarchy_level
self.tag_hierarchy_level = HierarchyLevel.create_unknown() if tag_hierarchy_level is None else tag_hierarchy_level
self.hierarchy_level = hierarchy_level
self.page_id = page_id
self.line_id = line_id
Expand Down
8 changes: 6 additions & 2 deletions dedoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,12 @@ def set_metadata(self, metadata: LineMetadata) -> None:
self._metadata = metadata

def __repr__(self) -> str:
return (f"LineWithMeta({self.line[:65]}, "
f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})")
text = self.line if len(self.line) < 65 else self.line[:62] + "..."
tag_hl = "None" if self.metadata.tag_hierarchy_level is None else \
f"{self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type}"
hl = "None" if self.metadata.hierarchy_level is None else \
f"{self.metadata.hierarchy_level.level_1, self.metadata.hierarchy_level.level_2, self.metadata.hierarchy_level.line_type}"
return f"LineWithMeta({text.strip()}, tagHL={tag_hl}, HL={hl})"

def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta":
from dedoc.utils.annotation_merger import AnnotationMerger
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/docx_reader/line_with_meta_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,4 @@ def __get_tag(self, paragraph: Paragraph) -> HierarchyLevel:
if paragraph.list_level is not None:
return HierarchyLevel(2, paragraph.list_level, False, HierarchyLevel.list_item)

return HierarchyLevel(None, None, True, HierarchyLevel.unknown)
return HierarchyLevel.create_unknown()
4 changes: 2 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/line_with_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
super().__init__(line, metadata, annotations, uid)

def __repr__(self) -> str:
text = self.line if len(self.line) < 65 else self.line[:62] + "..."
return f"LineWithLocation({text[:65]})"
parent_repr = super().__repr__()
return parent_repr.replace("LineWithMeta", "LineWithLocation")

def __str__(self) -> str:
return self.__repr__()
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class PdfAutoReader(BaseReader):

:class:`~dedoc.readers.PdfAutoReader` is used for automatic detection of a correct textual layer in the given PDF file:

* if PDF document has a correct textual layer then :class:`~dedoc.readers.PdfTxtLayerReader` or :class:`~dedoc.readers.PdfTabbyReader` is used \
* if PDF document has a correct textual layer then :class:`~dedoc.readers.PdfTxtlayerReader` or :class:`~dedoc.readers.PdfTabbyReader` is used \
for document content extraction;

* if PDF document doesn't have a correct textual layer then :class:`~dedoc.readers.PdfImageReader` is used for document content extraction.
Expand Down
9 changes: 4 additions & 5 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
The method returns the document content with all of the document's lines, tables and attachments.
This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`
(``can_be_multiline`` attribute is important for paragraph extraction).
See the documentation of :meth:`~dedoc.readers.BaseReader.read` for information about the method's parameters.

See also :ref:`pdf_handling_parameters` for more information about the possible arguments of the `parameters` dictionary.
Expand Down Expand Up @@ -94,8 +95,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
Tuple)[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]:
import math
from joblib import Parallel, delayed
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.utils.utils import flatten

Expand Down Expand Up @@ -129,10 +130,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines)
all_lines_with_links = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=attachments)

prev_line = None
for line in all_lines_with_links:
line.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line)
prev_line = line
line.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown()

all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
if page_angles:
Expand Down
11 changes: 4 additions & 7 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,6 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith

lines = []
page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"])
prev_line = None
labeling_mode = self.config.get("labeling_mode", False)

for block in page["blocks"]:
Expand Down Expand Up @@ -261,15 +260,13 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
uid=uid,
location=Location(bbox=bbox, page_number=page_number),
order=order)
line_with_location.metadata.tag_hierarchy_level = self.__get_tag(line_with_location, prev_line, meta)
prev_line = line_with_location
line_with_location.metadata.tag_hierarchy_level = self.__get_tag(line_with_location, meta)

lines.append(line_with_location)

return lines

def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_type: str) -> HierarchyLevel:
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
def __get_tag(self, line: LineWithMeta, line_type: str) -> HierarchyLevel:
from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth

if line_type == HierarchyLevel.header:
Expand All @@ -278,9 +275,9 @@ def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_
return HierarchyLevel(1, header_level, False, line_type)

if line_type == "litem": # TODO automatic list depth and merge list items from multiple lines
return DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line)
return HierarchyLevel(None, None, False, HierarchyLevel.list_item)

return HierarchyLevel(None, None, True, line_type)
return HierarchyLevel.create_unknown()

def __jar_path(self) -> str:
import os
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/pdf_reader/utils/line_object_linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def link_objects(self, lines: List[LineWithLocation], tables: List[ScanTable], i
@return:
"""
if len(lines) == 0:
metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_raw_text(), page_id=0, line_id=0)
metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_unknown(), page_id=0, line_id=0)
lines = [LineWithLocation(line="", metadata=metadata, annotations=[], location=Location(page_number=0, bbox=BBox(0, 0, 1, 1)))]
last_page_line = self._get_last_page_line(lines)
all_objects = list(lines + tables + images)
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/pptx_reader/paragraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties
def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta:
text = ""
paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level)
hierarchy_level = HierarchyLevel.create_raw_text()
hierarchy_level = HierarchyLevel.create_unknown()

if is_title or paragraph_properties.title:
hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False)
Expand Down
21 changes: 5 additions & 16 deletions dedoc/readers/txt_reader/raw_text_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None,

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
This method returns only document lines, some types of the lines (e.g. `list_item`) may be found using regular expressions.
This method returns only document lines.
See the documentation of :meth:`~dedoc.readers.BaseReader.read` for information about the method's parameters.
"""
parameters = {} if parameters is None else parameters
Expand All @@ -54,15 +54,14 @@ def __get_encoding(self, path: str, parameters: dict) -> str:
def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]:
import time
from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor
from dedoc.utils.utils import calculate_file_hash

lines = []
file_hash = calculate_file_hash(path=path)
number_of_empty_lines = 0
previous_log_time = time.time()
prev_line = None

for line_id, line in self.__get_lines(path=path, encoding=encoding):
if time.time() - previous_log_time > 5:
Expand All @@ -76,14 +75,10 @@ def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]:
indent_annotation = self.__get_indent_annotation(line)

line_with_meta = LineWithMeta(line=line, metadata=metadata, annotations=[spacing_annotation, indent_annotation], uid=uid)
line_with_meta.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line_with_meta, prev_line)
prev_line = line_with_meta
line_with_meta.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown()
lines.append(line_with_meta)

if line.isspace():
number_of_empty_lines += 1
else:
number_of_empty_lines = 0
number_of_empty_lines = number_of_empty_lines + 1 if line.isspace() else 0

return lines

Expand Down Expand Up @@ -113,15 +108,9 @@ def __get_starting_spacing(self, line: Optional[LineWithMeta]) -> int:
return space_this.end() - space_this.start()

def __is_paragraph(self, line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> bool:
from dedoc.data_structures.hierarchy_level import HierarchyLevel

if not line.metadata.tag_hierarchy_level.can_be_multiline and \
line.metadata.tag_hierarchy_level.line_type not in (HierarchyLevel.raw_text, HierarchyLevel.unknown):
return True
space_this = self.__get_starting_spacing(line)
space_prev = self.__get_starting_spacing(previous_line)
return line.metadata.tag_hierarchy_level.line_type in (HierarchyLevel.raw_text, HierarchyLevel.unknown) \
and not line.line.isspace() and space_this - space_prev >= 2
return not line.line.isspace() and space_this - space_prev >= 2

def _postprocess(self, document: UnstructuredDocument) -> UnstructuredDocument:
previous_line = None
Expand Down
Loading
Loading