Skip to content

Commit

Permalink
update master (#321)
Browse files Browse the repository at this point in the history
* Add BBoxAnnotation to TabbyPDF reader (#312)

* Add BBoxAnnotation to TabbyPDF reader

* Fix import and add test

* Remove unused import

* TLDR-444 added words bbox pdfminer (#313)

* TLDR-444 added word supporting into pdfminer-reader

* TLDR-444 added word extraction from pdfminer; pdfminer refactoring

* TLDR-444 added tests (word bounding box)

* TLDR-444 fixed code style

* TLDR-444 fixed after review

* TLDR-437 plain_text return format added (#314)

* Change base image name and tesseract benchmark script (#315)

* Change base image name and tesseract benchmark script

* Benchmarks updated

* Added error hint

* Small fix

* add version ranges to requirements (#316)

* TLDR-440: Tabby pdf cell properties (#319)

* Add BBoxAnnotation to TabbyPDF reader

* Fix import and add test

* Remove unused import

* Add cell properties

* Fix add CellPropertyInfo

* Fix invisible property

* Fix colspan and row_span

* Add test for tables with merged cells

* Add data to test merged cells

* Add BBoxAnnotation to TabbyPDF reader (#312)

* Add BBoxAnnotation to TabbyPDF reader

* Fix import and add test

* Remove unused import

* TLDR-444 added words bbox pdfminer (#313)

* TLDR-444 added word supporting into pdfminer-reader

* TLDR-444 added word extraction from pdfminer; pdfminer refactoring

* TLDR-444 added tests (word bounding box)

* TLDR-444 fixed code style

* TLDR-444 fixed after review

* TLDR-437 plain_text return format added (#314)

* Change base image name and tesseract benchmark script (#315)

* Change base image name and tesseract benchmark script

* Benchmarks updated

* Added error hint

* Small fix

* Add BBoxAnnotation to TabbyPDF reader

* Fix import and add test

* Remove unused import

* Add BBoxAnnotation to TabbyPDF reader (#312)

* Add BBoxAnnotation to TabbyPDF reader

* Fix import and add test

* Remove unused import

* Fix import and add test

* Remove unused import

* Add test for tables with merged cells

* Fix flake8 warnings

* Resolve comments
Add assert

---------

Co-authored-by: Oksana Belyaeva <[email protected]>
Co-authored-by: Bogatenkova Anastasiya <[email protected]>

* new version 0.11.1 (#320)

* new version 0.11.1

---------

Co-authored-by: Andrey Mikhailov <[email protected]>
Co-authored-by: Oksana Belyaeva <[email protected]>
Co-authored-by: Andrew Perminov <[email protected]>
  • Loading branch information
4 people authored Aug 30, 2023
1 parent 9a1f7ff commit 79fb6e3
Show file tree
Hide file tree
Showing 26 changed files with 435 additions and 416 deletions.
6 changes: 3 additions & 3 deletions docker/Dockerfile → Dockerfile
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
ARG REPOSITORY="docker.io"
FROM dedocproject/baseimg
FROM dedocproject/dedoc_p3.9_base:version_2023_08_28

ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root"
ENV RESOURCES_PATH "/dedoc_root/resources"

ADD requirements.txt .
RUN pip3 install -r requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

RUN mkdir /dedoc_root
ADD dedoc /dedoc_root/dedoc
Expand All @@ -17,4 +17,4 @@ RUN python3 /dedoc_root/dedoc/download_models.py
ADD tests /dedoc_root/tests
ADD resources /dedoc_root/resources

CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"]
CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"]
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.11.0
0.11.1
6 changes: 6 additions & 0 deletions dedoc/api/api_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,9 @@ def __table2html(table: Table, table2id: Dict[str, int]) -> str:
text += "</tr>\n"
text += "</tbody>\n</table>"
return text


def json2txt(paragraph: TreeNode) -> str:
    """
    Render a paragraph tree as plain text.

    The text of the given node is written first and is followed by a newline;
    after it come the renderings of its sub-paragraphs (visited recursively,
    in their stored order), joined with newline characters.

    :param paragraph: node whose ``text`` and ``subparagraphs`` are rendered
    :return: plain-text representation of the node together with all its descendants
    """
    # every child is rendered recursively before being glued to the parent text
    children_text = "\n".join(map(json2txt, paragraph.subparagraphs))
    return f"{paragraph.text}\n{children_text}"
5 changes: 4 additions & 1 deletion dedoc/api/dedoc_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import dedoc
from dedoc.api.api_args import QueryParameters
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
from dedoc.common.exceptions.dedoc_error import DedocError
from dedoc.common.exceptions.missing_file_error import MissingFileError
from dedoc.config import get_config
Expand Down Expand Up @@ -76,6 +76,9 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
if return_format == "html":
html_content = json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0)
return HTMLResponse(content=html_content, status_code=200)
elif return_format == "plain_text":
txt_content = json2txt(paragraph=document_tree.content.structure)
return PlainTextResponse(content=txt_content, status_code=200)
elif return_format == "tree":
html_content = json2tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content, status_code=200)
Expand Down
1 change: 1 addition & 0 deletions dedoc/api/static/html_eng/form_input.html
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ <h2>Structure Document Recognition</h2>
<select name="return_format">
<option value="html" selected>html</option>
<option value="pretty_json">pretty_json</option>
<option value="plain_text">plain_text</option>
<option value="tree">tree</option>
<option value="json">json</option>
<option value="collapsed_tree">collapsed_tree</option>
Expand Down
1 change: 1 addition & 0 deletions dedoc/api/static/html_rus/form_input.html
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ <h2>Распознавание структуры документа</h2>
<select name="return_format">
<option value="html" selected>html</option>
<option value="pretty_json">pretty_json</option>
<option value="plain_text">plain_text</option>
<option value="tree">tree</option>
<option value="json">json</option>
<option value="collapsed_tree">collapsed_tree</option>
Expand Down
57 changes: 45 additions & 12 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
import math
import os
import subprocess
from collections import namedtuple
from typing import List, Optional, Tuple

import numpy as np

from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError
from dedoc.common.exceptions.tabby_pdf_error import TabbyPdfError
from dedoc.data_structures.bbox import BBox
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation
from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
Expand All @@ -33,6 +35,8 @@
from dedoc.utils.parameter_utils import get_param_page_slice
from dedoc.utils.utils import calculate_file_hash

# Properties of a single table cell built from the TabbyPDF output:
# column span, row span and the "invisible" flag.
# The typename must match the name the class is bound to, otherwise repr() is misleading
# and instances cannot be pickled (pickle looks the class up by its typename in the module).
CellPropertyInfo = namedtuple("CellPropertyInfo", "colspan, rowspan, invisible")


class PdfTabbyReader(PdfBaseReader):
"""
Expand Down Expand Up @@ -76,7 +80,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
parameters = {} if parameters is None else parameters
lines, scan_tables = self.__extract(path=path)
lines, scan_tables, tables_cell_properties = self.__extract(path=path)
warnings = []
document_metadata = None

Expand All @@ -93,10 +97,12 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio

lines = self.linker.link_objects(lines=lines, tables=scan_tables, images=[])
tables = []
for scan_table in scan_tables:
assert len(scan_tables) == len(tables_cell_properties)
for scan_table, table_cells_property in zip(scan_tables, tables_cell_properties):
cell_properties = [[cellp for cellp in row] for row in table_cells_property]
metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name)
cells = [[cell for cell in row] for row in scan_table.matrix_cells]
table = Table(metadata=metadata, cells=cells)
table = Table(metadata=metadata, cells=cells, cells_properties=cell_properties)
tables.append(table)

attachments = []
Expand All @@ -111,23 +117,26 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio

return self._postprocess(result)

def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable]]:
def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable], List[List[CellPropertyInfo]]]:
file_hash = calculate_file_hash(path=path)
document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)
all_lines = []
all_tables = []
all_cell_properties = []
for page in document.get("pages", []):
lines = self.__get_lines_with_location(page, file_hash)
if lines:
all_lines.extend(lines)
tables = self.__get_tables(page, file_hash)
tables, cell_properties = self.__get_tables(page, file_hash)
if tables:
all_tables.extend(tables)
all_cell_properties.extend(cell_properties)

return all_lines, all_tables
return all_lines, all_tables, all_cell_properties

def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
tables = []
cell_properties = []
page_number = page["number"]
i = 0
for table in page["tables"]:
Expand All @@ -138,26 +147,44 @@ def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
y_bottom_right = y_top_left + table["height"]
order = table["order"]
rows = table["rows"]
cell_properties_json = table["cell_properties"]
cell_property_list = []

for cell_properties_row in cell_properties_json:
cell_property_row_list = []

for cell_property in cell_properties_row:
cell_property_info = CellPropertyInfo(cell_property["col_span"],
cell_property["row_span"],
bool(cell_property["invisible"]))

cell_property_row_list.append(cell_property_info)

cell_property_list.append(cell_property_row_list)

cells = [row for row in rows]
bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))

tables.append(ScanTable(matrix_cells=cells, page_number=page_number, bbox=bbox, name=file_hash + str(page_number) + str(i), order=order))
cell_properties.append(cell_property_list)

return tables
return tables, cell_properties

def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
lines = []
page_number = page["number"]
page_width = int(page["width"])
page_height = int(page["height"])
prev_line = None

for block in page["blocks"]:
annotations = []
order = block["order"]
block_text = block["text"]
bx_top_left = block["x_top_left"]
by_top_left = block["y_top_left"]
bx_bottom_right = bx_top_left + block["width"]
by_bottom_right = by_top_left + block["height"]
bx_top_left = int(block["x_top_left"])
by_top_left = int(block["y_top_left"])
bx_bottom_right = bx_top_left + int(block["width"])
by_bottom_right = by_top_left + int(block["height"])
indent = block["indent"]
spacing = block["spacing"]
len_block = len(block_text)
Expand All @@ -173,7 +200,12 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
url = annotation["url"]
start = annotation["start"]
end = annotation["end"]

x_top_left = int(annotation["x_top_left"])
y_top_left = int(annotation["y_top_left"])
x_bottom_right = bx_top_left + int(annotation["width"])
y_bottom_right = by_top_left + int(annotation["height"])
box = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))
annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height))
annotations.append(SizeAnnotation(start, end, str(font_size)))
annotations.append(StyleAnnotation(start, end, font_name))

Expand All @@ -189,6 +221,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
meta = block["metadata"].lower()
uid = f"txt_{file_hash}_{order}"
bbox = BBox.from_two_points((bx_top_left, by_top_left), (bx_bottom_right, by_bottom_right))
annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height))

metadata = LineMetadata(page_id=page_number, line_id=order)
line_with_location = LineWithLocation(line=block_text,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.extractor_pdf_textlayer import ExtractorPdfTextLayer
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor
from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox


Expand All @@ -25,7 +25,7 @@ def __init__(self, *, config: dict) -> None:
:param config: configuration of the reader, e.g. logger for logging
"""
super().__init__(config=config)
self.extractor_layer = ExtractorPdfTextLayer(config=config)
self.extractor_layer = PdfminerExtractor(config=config)

def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
"""
Expand Down
Empty file.
Loading

0 comments on commit 79fb6e3

Please sign in to comment.