TLDR-440: Tabby pdf cell properties (#319)

* Add BBoxAnnotation to TabbyPDF reader
* Fix import and add test
* Remove unused import
* Add cell properties
* Fix add CellPropertyInfo
* Fix invisible property
* Fix colspan and row_span
* Add test for tables with merged cells
* Add data to test merged cells
* Add BBoxAnnotation to TabbyPDF reader (#312)
* Add BBoxAnnotation to TabbyPDF reader
* Fix import and add test
* Remove unused import
* TLDR-444 added words bbox pdfminer (#313)
* TLDR-444 added word supporting into pdfminer-reader
* TLDR-444 added word extraction from pdfminer; pdfminer refactoring
* TLDR-444 added tests (word bounding box)
* TLDR-444 fixed code style
* TLDR-444 fixed after review
* TLDR-437 plain_text return format added (#314)
* Change base image name and tesseract benchmark script (#315)
* Change base image name and tesseract benchmark script
* Benchmarks updated
* Added error hint
* Small fix
* Add BBoxAnnotation to TabbyPDF reader
* Fix import and add test
* Remove unused import
* Add BBoxAnnotation to TabbyPDF reader (#312)
* Add BBoxAnnotation to TabbyPDF reader
* Fix import and add test
* Remove unused import
* Fix import and add test
* Remove unused import
* Add test for tables with merged cells
* Fix flake8 warnings
* Resolve comments

Add assert

---------

Co-authored-by: Oksana Belyaeva <[email protected]>
Co-authored-by: Bogatenkova Anastasiya <[email protected]>
ispras · Aug 29, 2023 · 53c346f · 53c346f
1 parent 3190f41
commit 53c346f
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 7 deletions.
diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -3,6 +3,7 @@
 import math
 import os
 import subprocess
+from collections import namedtuple
 from typing import List, Optional, Tuple
 
 import numpy as np
@@ -34,6 +35,8 @@
 from dedoc.utils.parameter_utils import get_param_page_slice
 from dedoc.utils.utils import calculate_file_hash
 
+CellPropertyInfo = namedtuple("NamedTuple", "colspan, rowspan, invisible")
+
 
 class PdfTabbyReader(PdfBaseReader):
     """
@@ -77,7 +80,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
         """
         parameters = {} if parameters is None else parameters
-        lines, scan_tables = self.__extract(path=path)
+        lines, scan_tables, tables_cell_properties = self.__extract(path=path)
         warnings = []
         document_metadata = None
 
@@ -94,10 +97,12 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
 
         lines = self.linker.link_objects(lines=lines, tables=scan_tables, images=[])
         tables = []
-        for scan_table in scan_tables:
+        assert len(scan_tables) == len(tables_cell_properties)
+        for scan_table, table_cells_property in zip(scan_tables, tables_cell_properties):
+            cell_properties = [[cellp for cellp in row] for row in table_cells_property]
             metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name)
             cells = [[cell for cell in row] for row in scan_table.matrix_cells]
-            table = Table(metadata=metadata, cells=cells)
+            table = Table(metadata=metadata, cells=cells, cells_properties=cell_properties)
             tables.append(table)
 
         attachments = []
@@ -112,23 +117,26 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
 
         return self._postprocess(result)
 
-    def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable]]:
+    def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable], List[List[CellPropertyInfo]]]:
         file_hash = calculate_file_hash(path=path)
         document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)
         all_lines = []
         all_tables = []
+        all_cell_properties = []
         for page in document.get("pages", []):
             lines = self.__get_lines_with_location(page, file_hash)
             if lines:
                 all_lines.extend(lines)
-            tables = self.__get_tables(page, file_hash)
+            tables, cell_properties = self.__get_tables(page, file_hash)
             if tables:
                 all_tables.extend(tables)
+                all_cell_properties.extend(cell_properties)
 
-        return all_lines, all_tables
+        return all_lines, all_tables, all_cell_properties
 
     def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
         tables = []
+        cell_properties = []
         page_number = page["number"]
         i = 0
         for table in page["tables"]:
@@ -139,12 +147,28 @@ def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
             y_bottom_right = y_top_left + table["height"]
             order = table["order"]
             rows = table["rows"]
+            cell_properties_json = table["cell_properties"]
+            cell_property_list = []
+
+            for cell_properties_row in cell_properties_json:
+                cell_property_row_list = []
+
+                for cell_property in cell_properties_row:
+                    cell_property_info = CellPropertyInfo(cell_property["col_span"],
+                                                          cell_property["row_span"],
+                                                          bool(cell_property["invisible"]))
+
+                    cell_property_row_list.append(cell_property_info)
+
+                cell_property_list.append(cell_property_row_list)
+
             cells = [row for row in rows]
             bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))
 
             tables.append(ScanTable(matrix_cells=cells, page_number=page_number, bbox=bbox, name=file_hash + str(page_number) + str(i), order=order))
+            cell_properties.append(cell_property_list)
 
-        return tables
+        return tables, cell_properties
 
     def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
         lines = []

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar
diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py
@@ -280,3 +280,20 @@ def test_pdf_annotations(self) -> None:
         self.assertIn(BoldAnnotation.name, annotation_names)
         self.assertIn(SpacingAnnotation.name, annotation_names)
         self.assertIn(BBoxAnnotation.name, annotation_names)
+
+    def test_tables_with_merged_cells(self) -> None:
+        file_name = "big_table_with_merged_cells.pdf"
+        result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby"))
+        table = result["content"]["tables"][0]
+        cell_properties = table["metadata"]["cell_properties"]
+
+        hidden_cells_big_table_with_colspan = [[(1, 0), 10], [(5, 5), 5]]
+
+        for (i, j), k in hidden_cells_big_table_with_colspan:
+            self.assertFalse(cell_properties[i][j]["invisible"])
+            self.assertEqual(cell_properties[i][j]["rowspan"], 1)
+            self.assertEqual(cell_properties[i][j]["colspan"], k)
+
+        self.assertFalse(cell_properties[3][0]["invisible"])
+        self.assertEqual(cell_properties[3][0]["rowspan"], 3)
+        self.assertEqual(cell_properties[3][0]["colspan"], 4)
diff --git a/tests/data/pdf_with_text_layer/big_table_with_merged_cells.pdf b/tests/data/pdf_with_text_layer/big_table_with_merged_cells.pdf