Skip to content

Commit

Permalink
TLDR-440: Tabby pdf cell properties (#319)
Browse files Browse the repository at this point in the history
* Add BBoxAnnotation to TabbyPDF reader

* Fix import and add test

* Remove unused import

* Add cell properties

* Fix add CellPropertyInfo

* Fix invisible property

* Fix colspan and row_span

* Add test for tables with merged cells

* Add data to test merged cells

* Add BBoxAnnotation to TabbyPDF reader (#312)

* Add BBoxAnnotation to TabbyPDF reader

* Fix import and add test

* Remove unused import

* TLDR-444 added words bbox pdfminer (#313)

* TLDR-444 added word supporting into pdfminer-reader

* TLDR-444 added word extraction from pdfminer; pdfminer refactoring

* TLDR-444 added tests (word bounding box)

* TLDR-444 fixed code style

* TLDR-444 fixed after review

* TLDR-437 plain_text return format added (#314)

* Change base image name and tesseract benchmark script (#315)

* Change base image name and tesseract benchmark script

* Benchmarks updated

* Added error hint

* Small fix

* Add BBoxAnnotation to TabbyPDF reader

* Fix import and add test

* Remove unused import

* Add BBoxAnnotation to TabbyPDF reader (#312)

* Add BBoxAnnotation to TabbyPDF reader

* Fix import and add test

* Remove unused import

* Fix import and add test

* Remove unused import

* Add test for tables with merged cells

* Fix flake8 warnings

* Resolve comments
Add assert

---------

Co-authored-by: Oksana Belyaeva <[email protected]>
Co-authored-by: Bogatenkova Anastasiya <[email protected]>
  • Loading branch information
3 people authored Aug 29, 2023
1 parent 3190f41 commit 53c346f
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 7 deletions.
38 changes: 31 additions & 7 deletions dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import math
import os
import subprocess
from collections import namedtuple
from typing import List, Optional, Tuple

import numpy as np
Expand Down Expand Up @@ -34,6 +35,8 @@
from dedoc.utils.parameter_utils import get_param_page_slice
from dedoc.utils.utils import calculate_file_hash

# Column span, row span and invisibility flag of a single table cell.
# The typename must equal the variable name: it is what repr() prints and what
# pickle uses to look the class up in this module.
CellPropertyInfo = namedtuple("CellPropertyInfo", "colspan, rowspan, invisible")


class PdfTabbyReader(PdfBaseReader):
"""
Expand Down Expand Up @@ -77,7 +80,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
parameters = {} if parameters is None else parameters
lines, scan_tables = self.__extract(path=path)
lines, scan_tables, tables_cell_properties = self.__extract(path=path)
warnings = []
document_metadata = None

Expand All @@ -94,10 +97,12 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio

lines = self.linker.link_objects(lines=lines, tables=scan_tables, images=[])
tables = []
for scan_table in scan_tables:
assert len(scan_tables) == len(tables_cell_properties)
for scan_table, table_cells_property in zip(scan_tables, tables_cell_properties):
cell_properties = [[cellp for cellp in row] for row in table_cells_property]
metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name)
cells = [[cell for cell in row] for row in scan_table.matrix_cells]
table = Table(metadata=metadata, cells=cells)
table = Table(metadata=metadata, cells=cells, cells_properties=cell_properties)
tables.append(table)

attachments = []
Expand All @@ -112,23 +117,26 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio

return self._postprocess(result)

def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable]]:
def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable], List[List[CellPropertyInfo]]]:
file_hash = calculate_file_hash(path=path)
document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)
all_lines = []
all_tables = []
all_cell_properties = []
for page in document.get("pages", []):
lines = self.__get_lines_with_location(page, file_hash)
if lines:
all_lines.extend(lines)
tables = self.__get_tables(page, file_hash)
tables, cell_properties = self.__get_tables(page, file_hash)
if tables:
all_tables.extend(tables)
all_cell_properties.extend(cell_properties)

return all_lines, all_tables
return all_lines, all_tables, all_cell_properties

def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
tables = []
cell_properties = []
page_number = page["number"]
i = 0
for table in page["tables"]:
Expand All @@ -139,12 +147,28 @@ def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
y_bottom_right = y_top_left + table["height"]
order = table["order"]
rows = table["rows"]
cell_properties_json = table["cell_properties"]
cell_property_list = []

for cell_properties_row in cell_properties_json:
cell_property_row_list = []

for cell_property in cell_properties_row:
cell_property_info = CellPropertyInfo(cell_property["col_span"],
cell_property["row_span"],
bool(cell_property["invisible"]))

cell_property_row_list.append(cell_property_info)

cell_property_list.append(cell_property_row_list)

cells = [row for row in rows]
bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))

tables.append(ScanTable(matrix_cells=cells, page_number=page_number, bbox=bbox, name=file_hash + str(page_number) + str(i), order=order))
cell_properties.append(cell_property_list)

return tables
return tables, cell_properties

def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
lines = []
Expand Down
Binary file not shown.
17 changes: 17 additions & 0 deletions tests/api_tests/test_api_format_pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,3 +280,20 @@ def test_pdf_annotations(self) -> None:
self.assertIn(BoldAnnotation.name, annotation_names)
self.assertIn(SpacingAnnotation.name, annotation_names)
self.assertIn(BBoxAnnotation.name, annotation_names)

def test_tables_with_merged_cells(self) -> None:
    """Check span and visibility properties of selected cells in the first table returned for the merged-cells PDF."""
    response = self._send_request("big_table_with_merged_cells.pdf", data={"pdf_with_text_layer": "tabby"})
    properties = response["content"]["tables"][0]["metadata"]["cell_properties"]

    # Each entry: (row index, column index, expected rowspan, expected colspan).
    expected_spans = [(1, 0, 1, 10), (5, 5, 1, 5), (3, 0, 3, 4)]

    for row, column, rowspan, colspan in expected_spans:
        cell = properties[row][column]
        # Every checked cell must be reported as visible.
        self.assertFalse(cell["invisible"])
        self.assertEqual(cell["rowspan"], rowspan)
        self.assertEqual(cell["colspan"], colspan)
Binary file not shown.

0 comments on commit 53c346f

Please sign in to comment.