Refactored GOST table recognizer and added tests

ispras · Sep 5, 2024 · 9b8087a · 9b8087a
1 parent b643a1b
commit 9b8087a
Show file tree

Hide file tree

Showing 11 changed files with 117 additions and 126 deletions.
diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py
@@ -181,23 +181,11 @@ def to_api_schema(self) -> ApiLineWithMeta:
         annotations = [annotation.to_api_schema() for annotation in self.annotations]
         return ApiLineWithMeta(text=self._line, annotations=annotations)
 
-    @staticmethod
-    def shift_line_with_meta(line_with_meta: "LineWithMeta", shift_x: int, shift_y: int, image_width: int, image_height: int) -> "LineWithMeta":
+    def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
+        import json
         from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
-        from dedocutils.data_structures import BBox
-        new_annotations = []
-        for annotation in line_with_meta.annotations:
-            if annotation.name != "bounding box":
-                new_annotations.append(annotation)
-            else:
+        for i_ann, annotation in enumerate(self.annotations):
+            if annotation.name == "bounding box":
                 bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(annotation.value)
-                new_bbox = BBox.shift_bbox(bbox, shift_x, shift_y)
-                new_annotations.append(BBoxAnnotation(start=annotation.start,
-                                                      end=annotation.end,
-                                                      value=new_bbox,
-                                                      page_width=image_width,
-                                                      page_height=image_height))
-        return LineWithMeta(line=line_with_meta.line,
-                            metadata=line_with_meta.metadata,
-                            annotations=new_annotations,
-                            uid=line_with_meta.uid)
+                bbox.shift(shift_x, shift_y)
+                self.annotations[i_ann].value = json.dumps(bbox.to_relative_dict(image_width, image_height))
diff --git a/dedoc/readers/pdf_reader/data_classes/line_with_location.py b/dedoc/readers/pdf_reader/data_classes/line_with_location.py
@@ -13,30 +13,15 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
         self.order = order
         super().__init__(line, metadata, annotations, uid)
 
-    @staticmethod
-    def shift_line_with_location(line_with_location: "LineWithLocation", shift_x: int, shift_y: int, image_width: int, image_height: int) -> "LineWithLocation":
+    def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
+        import json
         from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
-        from dedocutils.data_structures import BBox
-        new_annotations = []
-        for i_ann, annotation in enumerate(line_with_location.annotations):
-            if line_with_location.annotations[i_ann].name != "bounding box":
-                new_annotations.append(annotation)
-            else:
+        for i_ann, annotation in enumerate(self.annotations):
+            if self.annotations[i_ann].name == "bounding box":
                 bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(annotation.value)
-                new_bbox = BBox.shift_bbox(bbox, shift_x, shift_y)
-                new_bbox_annotation = BBoxAnnotation(start=annotation.start,
-                                                     end=annotation.end,
-                                                     value=new_bbox,
-                                                     page_width=image_width,
-                                                     page_height=image_height)
-                new_annotations.append(new_bbox_annotation)
-        new_location = Location.shift_location(line_with_location.location, shift_x, shift_y)
-        return LineWithLocation(line=line_with_location.line,
-                                metadata=line_with_location.metadata,
-                                annotations=new_annotations,
-                                location=new_location,
-                                uid=line_with_location.uid,
-                                order=line_with_location.order)
+                bbox.shift(shift_x, shift_y)
+                self.annotations[i_ann].value = json.dumps(bbox.to_relative_dict(image_width, image_height))
+        self.location.shift(shift_x, shift_y)
 
     def __repr__(self) -> str:
         parent_repr = super().__repr__()

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py
@@ -30,31 +30,16 @@ def copy_from(cell: "Cell",
                     uid=cell.cell_uid,
                     contour_coord=cell.con_coord)
 
-    @staticmethod
-    def shift_cell(cell: "Cell", shift_x: int, shift_y: int, image_width: int, image_height: int) -> "Cell":
-        if cell.lines and len(cell.lines) >= 1:
-            shifted_lines = []
-            for line in cell.lines:
-                shifted_lines.append(LineWithMeta.shift_line_with_meta(line_with_meta=line,
-                                                                       shift_x=shift_x,
-                                                                       shift_y=shift_y,
-                                                                       image_width=image_width,
-                                                                       image_height=image_height))
-        else:
-            shifted_lines = cell.lines
-        return Cell(x_top_left=cell.x_top_left + shift_x,
-                    x_bottom_right=cell.x_bottom_right + shift_x,
-                    y_top_left=cell.y_top_left + shift_y,
-                    y_bottom_right=cell.y_bottom_right + shift_y,
-                    id_con=cell.id_con,
-                    lines=shifted_lines,
-                    is_attribute=cell.is_attribute,
-                    is_attribute_required=cell.is_attribute_required,
-                    rotated_angle=cell.rotated_angle,
-                    uid=cell.cell_uid,
-                    contour_coord=BBox.shift_bbox(bbox=cell.con_coord,
-                                                  shift_x=shift_x,
-                                                  shift_y=shift_y)) if cell.con_coord else None
+    def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
+        if self.lines and len(self.lines) >= 1:
+            for i_lin, _line in enumerate(self.lines):
+                self.lines[i_lin].shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
+        self.x_top_left += shift_x
+        self.x_bottom_right += shift_x
+        self.y_top_left += shift_y
+        self.y_bottom_right += shift_y
+        if self.con_coord:
+            self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)
 
     def __init__(self,
                  x_top_left: int,

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/location.py b/dedoc/readers/pdf_reader/data_classes/tables/location.py
@@ -12,12 +12,8 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
         self.name = name
         self.rotated_angle = rotated_angle
 
-    @staticmethod
-    def shift_location(location: "Location", shift_x: int, shift_y: int):
-        return Location(page_number=location.page_number,
-                        bbox=BBox.shift_bbox(location.bbox, shift_x, shift_y),
-                        name=location.name,
-                        rotated_angle=location.rotated_angle)
+    def shift(self, shift_x: int, shift_y: int) -> None:
+        self.bbox.shift(shift_x, shift_y)
 
     def to_dict(self) -> Dict[str, Any]:
         from collections import OrderedDict

diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -2,6 +2,8 @@
 from collections import namedtuple
 from typing import Iterator, List, Optional, Set, Tuple
 
+import numpy as np
+from dedocutils.data_structures.bbox import BBox
 from numpy import ndarray
 
 from dedoc.common.exceptions.bad_file_error import BadFileFormatError
@@ -107,14 +109,14 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
         images = self._get_images(path, first_page, last_page)
 
         if parameters.need_gost_frame_analysis:
-            from dedocutils.data_structures import BBox
             from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import GOSTFrameRecognizer
             self.gost_frame_recognizer = GOSTFrameRecognizer(config=self.config)
             gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(
                 delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images
             )
             result = Parallel(n_jobs=self.config["n_jobs"])(
-                delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box) in enumerate(gost_analyzed_images, start=first_page)
+                delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box) in
+                enumerate(gost_analyzed_images, start=first_page)
             )
         else:
             result = Parallel(n_jobs=self.config["n_jobs"])(
@@ -150,77 +152,49 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
         if page_angles:
             metadata["rotated_page_angles"] = page_angles
         if parameters.need_gost_frame_analysis:
-            shifted_lines, shifted_scan_tables, shifted_attachments = self._shift_all_contents(all_lines_with_paragraphs=all_lines_with_paragraphs,
-                                                                                               mp_tables=mp_tables,
-                                                                                               attachments=attachments,
-                                                                                               gost_analyzed_images=gost_analyzed_images)
-            return shifted_lines, shifted_scan_tables, shifted_attachments, warnings, metadata
+            self._shift_all_contents(all_lines_with_paragraphs=all_lines_with_paragraphs,
+                                     mp_tables=mp_tables,
+                                     attachments=attachments,
+                                     gost_analyzed_images=gost_analyzed_images)
         return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata
 
-    def _shift_all_contents(self, all_lines_with_paragraphs, mp_tables, attachments, gost_analyzed_images) \
-            -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
-        from dedocutils.data_structures import BBox
-        from dedoc.readers.pdf_reader.data_classes.tables.location import Location
-        from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
+    def _shift_all_contents(self, all_lines_with_paragraphs: List[LineWithMeta], mp_tables: List[ScanTable], attachments: List[PdfImageAttachment],
+                            gost_analyzed_images: List[Tuple[np.ndarray, BBox]]) -> None:
         # shift mp_tables
-        shifted_scan_tables = []
         for scan_table in mp_tables:
-            shifted_table_locations = []
-            for location in scan_table.locations:
+            for i_loc, location in enumerate(scan_table.locations):
                 table_page_number = location.page_number
-                shifted_table_locations.append(Location.shift_location(location=location,
-                                                                       shift_x=gost_analyzed_images[table_page_number][1].x_top_left,
-                                                                       shift_y=gost_analyzed_images[table_page_number][1].y_top_left))
-            shifted_matrix_cells = []
+                scan_table.locations[i_loc].shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left,
+                                                  shift_y=gost_analyzed_images[table_page_number][1].y_top_left)
             for row in scan_table.matrix_cells:
-                shifted_matrix_cells_row = []
                 row_page_number = scan_table.page_number
                 for cell in row:  # check page number information in the current table row, because table can be located on multiple pages
                     if cell.lines and len(cell.lines) >= 1:
                         row_page_number = cell.lines[0].metadata.page_id
                         break
-                for cell in row:  # if cell doesn't contain page number information we use row_page_number
+                for i_cel, cell in enumerate(row):  # if cell doesn't contain page number information we use row_page_number
                     page_number = cell.lines[0].metadata.page_id if cell.lines and len(cell.lines) >= 1 else row_page_number
                     image_width, image_height = gost_analyzed_images[page_number][0].shape[1], gost_analyzed_images[page_number][0].shape[0]
                     shift_x, shift_y = gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left
-                    shifted_cell = Cell.shift_cell(cell=cell,
-                                                   shift_x=shift_x,
-                                                   shift_y=shift_y,
-                                                   image_width=image_width,
-                                                   image_height=image_height)
-                    shifted_matrix_cells_row.append(shifted_cell)
-                shifted_matrix_cells.append(shifted_matrix_cells_row)
-            shifted_scan_table = ScanTable(page_number=scan_table.page_number,
-                                           name=scan_table.name,
-                                           order=scan_table.order,
-                                           matrix_cells=shifted_matrix_cells,
-                                           bbox=BBox(0, 0, 0, 0))  # pass empty box since we overwrite locations in the next line anyway
-            shifted_scan_table.locations = shifted_table_locations  # ScanTable doesn't use bbox for anything else but locations and now we overwrite them
-            shifted_scan_tables.append(shifted_scan_table)
+                    row[i_cel].shift(shift_x=shift_x,
+                                     shift_y=shift_y,
+                                     image_width=image_width,
+                                     image_height=image_height)
 
         # shift attachments
-        shifted_attachments = []
-        for attachment in attachments:
+        for i_att, attachment in enumerate(attachments):
             attachment_page_number = attachment.location.page_number
             shift_x, shift_y = gost_analyzed_images[attachment_page_number][1].x_top_left, gost_analyzed_images[attachment_page_number][1].y_top_left
-            new_attachment_location = Location.shift_location(attachment.location, shift_x, shift_y)
-            shifted_attachments.append(PdfImageAttachment(original_name=attachment.original_name,
-                                                          tmp_file_path=attachment.tmp_file_path,
-                                                          need_content_analysis=attachment.need_content_analysis,
-                                                          location=new_attachment_location,
-                                                          uid=attachment.uid))
+            attachments[i_att].location.shift(shift_x, shift_y)
+
         # shift lines
-        shifted_lines = []
-        for line in all_lines_with_paragraphs:
+        for i_lin, line in enumerate(all_lines_with_paragraphs):
             page_number = line.metadata.page_id
             image_width, image_height = gost_analyzed_images[page_number][0].shape[1], gost_analyzed_images[page_number][0].shape[0]
-            shifted_line = LineWithLocation.shift_line_with_location(line_with_location=line,
-                                                                     shift_x=gost_analyzed_images[page_number][1].x_top_left,
-                                                                     shift_y=gost_analyzed_images[page_number][1].y_top_left,
-                                                                     image_width=image_width,
-                                                                     image_height=image_height)
-            shifted_lines.append(shifted_line)
-        return shifted_lines, shifted_scan_tables, shifted_attachments
+            all_lines_with_paragraphs[i_lin].shift(shift_x=gost_analyzed_images[page_number][1].x_top_left,
+                                                   shift_y=gost_analyzed_images[page_number][1].y_top_left,
+                                                   image_width=image_width,
+                                                   image_height=image_height)
 
     @abstractmethod
     def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
@@ -26,7 +26,10 @@ def __init__(self, *, config: dict = None) -> None:
         self.config = config
 
     def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox]:
-        thresh, img_bin = cv2.threshold(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), 225, 255, cv2.THRESH_BINARY)
+        if len(image.shape) < 3:  # check if an image is already converted to grayscale
+            thresh, img_bin = cv2.threshold(image, 225, 255, cv2.THRESH_BINARY)
+        else:
+            thresh, img_bin = cv2.threshold(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), 225, 255, cv2.THRESH_BINARY)
         lines_bin = detect_lines(255 - img_bin, self.config, "tables")
         contours, hierarchy = cv2.findContours(lines_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
         tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=self.config)
@@ -37,7 +40,7 @@ def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox]:
             return BBox.crop_image_by_box(image, main_box), main_box
         return image, BBox(0, 0, image.shape[1], image.shape[0])
 
-    def _analyze_table_on_frame(self, tree_table, img_area) -> Tuple[bool, Optional[BBox]]:
+    def _analyze_table_on_frame(self, tree_table: "TableTree", img_area: "int") -> Tuple[bool, Optional[BBox]]:
         try:
             sub_bboxes = tree_table.children[0].children
             for box in sub_bboxes:

diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 beautifulsoup4>=4.10.0,<=4.12.2
 charset-normalizer>=2.0.12,<=3.2.0
 Cython>=0.29.28,<=3.0.2
-dedoc-utils==0.3.7
+dedoc-utils==0.3.8
 fastapi>=0.77.0,<1.0
 huggingface-hub>=0.14.1,<1.0
 imutils==0.5.4

diff --git a/tests/data/tables/gost_frame_1.png b/tests/data/tables/gost_frame_1.png
diff --git a/tests/data/tables/gost_frame_2.png b/tests/data/tables/gost_frame_2.png
diff --git a/tests/data/tables/gost_frame_3.png b/tests/data/tables/gost_frame_3.png