partially fixed code

ispras · Sep 10, 2024 · f925a65 · f925a65
1 parent e923216
commit f925a65
Show file tree

Hide file tree

Showing 9 changed files with 105 additions and 75 deletions.
diff --git a/dedoc/readers/pdf_reader/data_classes/line_with_location.py b/dedoc/readers/pdf_reader/data_classes/line_with_location.py
@@ -14,13 +14,7 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
         super().__init__(line, metadata, annotations, uid)
 
     def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
-        import json
-        from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
-        for i_ann, annotation in enumerate(self.annotations):
-            if self.annotations[i_ann].name == "bounding box":
-                bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(annotation.value)
-                bbox.shift(shift_x, shift_y)
-                self.annotations[i_ann].value = json.dumps(bbox.to_relative_dict(image_width, image_height))
+        super().shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
         self.location.shift(shift_x, shift_y)
 
     def __repr__(self) -> str:

diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -13,6 +13,7 @@
 from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
 from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
+from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer
 
 ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
     "orient_analysis_cells",
@@ -29,7 +30,8 @@
     "with_attachments",
     "attachments_dir",
     "need_content_analysis",
-    "need_gost_frame_analysis"
+    "need_gost_frame_analysis",
+    "pdf_with_txt_layer"
 ])
 
 
@@ -53,6 +55,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
         self.attachment_extractor = PDFAttachmentsExtractor(config=self.config)
         self.linker = LineObjectLinker(config=self.config)
         self.paragraph_extractor = ScanParagraphClassifierExtractor(config=self.config)
+        self.gost_frame_recognizer = GOSTFrameRecognizer(config=self.config)
 
     def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """
@@ -83,7 +86,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
             with_attachments=param_utils.get_param_with_attachments(parameters),
             attachments_dir=param_utils.get_param_attachments_dir(parameters, file_path),
             need_content_analysis=param_utils.get_param_need_content_analysis(parameters),
-            need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters)
+            need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters),
+            pdf_with_txt_layer=param_utils.get_param_pdf_with_txt_layer(parameters)
+
         )
 
         lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse)
@@ -108,12 +113,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
         last_page = math.inf if parameters.last_page is None else parameters.last_page
         images = self._get_images(path, first_page, last_page)
 
-        if parameters.need_gost_frame_analysis:
-            from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import GOSTFrameRecognizer
-            self.gost_frame_recognizer = GOSTFrameRecognizer(config=self.config)
-            gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(
-                delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images
-            )
+        if parameters.need_gost_frame_analysis and parameters.pdf_with_txt_layer == "false":
+            gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
             result = Parallel(n_jobs=self.config["n_jobs"])(
                 delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box) in
                 enumerate(gost_analyzed_images, start=first_page)
@@ -151,14 +152,14 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
         all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
         if page_angles:
             metadata["rotated_page_angles"] = page_angles
-        if parameters.need_gost_frame_analysis:
-            self._shift_all_contents(all_lines_with_paragraphs=all_lines_with_paragraphs,
+        if parameters.need_gost_frame_analysis and parameters.pdf_with_txt_layer == "false":
+            self._shift_all_contents(lines=all_lines_with_paragraphs,
                                      mp_tables=mp_tables,
                                      attachments=attachments,
                                      gost_analyzed_images=gost_analyzed_images)
         return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata
 
-    def _shift_all_contents(self, all_lines_with_paragraphs: List[LineWithMeta], mp_tables: List[ScanTable], attachments: List[PdfImageAttachment],
+    def _shift_all_contents(self, lines: List[LineWithMeta], mp_tables: List[ScanTable], attachments: List[PdfImageAttachment],
                             gost_analyzed_images: List[Tuple[np.ndarray, BBox]]) -> None:
         # shift mp_tables
         for scan_table in mp_tables:
@@ -185,13 +186,13 @@ def _shift_all_contents(self, all_lines_with_paragraphs: List[LineWithMeta], mp_
             attachments[i_att].location.shift(shift_x, shift_y)
 
         # shift lines
-        for i_lin, line in enumerate(all_lines_with_paragraphs):
+        for i_lin, line in enumerate(lines):
             page_number = line.metadata.page_id
             image_width, image_height = gost_analyzed_images[page_number][0].shape[1], gost_analyzed_images[page_number][0].shape[0]
-            all_lines_with_paragraphs[i_lin].shift(shift_x=gost_analyzed_images[page_number][1].x_top_left,
-                                                   shift_y=gost_analyzed_images[page_number][1].y_top_left,
-                                                   image_width=image_width,
-                                                   image_height=image_height)
+            lines[i_lin].shift(shift_x=gost_analyzed_images[page_number][1].x_top_left,
+                               shift_y=gost_analyzed_images[page_number][1].y_top_left,
+                               image_width=image_width,
+                               image_height=image_height)
 
     @abstractmethod
     def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/gost_frame_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/gost_frame_recognizer.py
@@ -0,0 +1,43 @@
+import logging
+from typing import Optional, Tuple
+
+import cv2
+import numpy as np
+from dedocutils.data_structures import BBox
+
+from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
+from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_horizontal_and_vertical_lines as detect_lines
+
+MIN_FRAME_CONTENT_AREA = 0.7
+
+
+class GOSTFrameRecognizer:
+    def __init__(self, *, config: dict = None) -> None:
+        self.logger = config.get("logger", logging.getLogger())
+        self.config = config
+
+    def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox]:
+        if len(image.shape) < 3:  # check if an image is already converted to grayscale
+            thresh, img_bin = cv2.threshold(image, 225, 255, cv2.THRESH_BINARY)
+        else:
+            thresh, img_bin = cv2.threshold(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), 225, 255, cv2.THRESH_BINARY)
+        lines_bin = detect_lines(255 - img_bin, self.config, "tables")
+        contours, hierarchy = cv2.findContours(lines_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
+        tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=self.config)
+
+        img_area = image.shape[0] * image.shape[1]
+        has_gost_frame, main_box = self._analyze_cells_on_frame(tree_table, img_area)
+        if has_gost_frame:
+            return BBox.crop_image_by_box(image, main_box), main_box
+        return image, BBox(0, 0, image.shape[1], image.shape[0])
+
+    def _analyze_cells_on_frame(self, tree_table: "TableTree", img_area: "int") -> Tuple[bool, Optional[BBox]]:
+        try:
+            sub_bboxes = tree_table.children[0].children
+            for box in sub_bboxes:
+                if box.cell_box.square / img_area > MIN_FRAME_CONTENT_AREA:
+                    return True, box.cell_box
+            return False, None
+        except Exception as ex:
+            self.logger.warning(ex)
+            return False, None
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py
@@ -11,47 +11,13 @@
 
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
-from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
 from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.multipage_table_extractor import MultiPageTableExtractor
 from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor
-from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import __detect_horizontal_and_vertical_lines as detect_lines
 
 """-------------------------------------entry class of Table Recognizer Module---------------------------------------"""
 
 
-class GOSTFrameRecognizer(object):
-    def __init__(self, *, config: dict = None) -> None:
-        self.logger = config.get("logger", logging.getLogger())
-        self.config = config
-
-    def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox]:
-        if len(image.shape) < 3:  # check if an image is already converted to grayscale
-            thresh, img_bin = cv2.threshold(image, 225, 255, cv2.THRESH_BINARY)
-        else:
-            thresh, img_bin = cv2.threshold(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), 225, 255, cv2.THRESH_BINARY)
-        lines_bin = detect_lines(255 - img_bin, self.config, "tables")
-        contours, hierarchy = cv2.findContours(lines_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
-        tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=self.config)
-
-        img_area = image.shape[0] * image.shape[1]
-        has_gost_frame, main_box = self._analyze_table_on_frame(tree_table, img_area)
-        if has_gost_frame:
-            return BBox.crop_image_by_box(image, main_box), main_box
-        return image, BBox(0, 0, image.shape[1], image.shape[0])
-
-    def _analyze_table_on_frame(self, tree_table: "TableTree", img_area: "int") -> Tuple[bool, Optional[BBox]]:
-        try:
-            sub_bboxes = tree_table.children[0].children
-            for box in sub_bboxes:
-                if box.cell_box.square / img_area > 0.7:
-                    return True, box.cell_box
-            return False, None
-        except Exception as ex:
-            self.logger.warning(ex)
-            return False, None
-
-
 class TableRecognizer(object):
 
     def __init__(self, *, config: dict = None) -> None:

diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py
@@ -83,7 +83,7 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An
     if config.get("debug_mode", False):
         cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "image_bin.jpg"), img_bin)
     # step 2
-    img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables")
+    img_final_bin = detect_horizontal_and_vertical_lines(img_bin, config, "tables")
     # step 3
     img_final_bin_houph, angle_alignment = __apply_houph_lines_and_detect_angle(img_final_bin, config)
 
@@ -182,7 +182,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np
     return img_final_bin_houph, angle_alignment
 
 
-def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray:
+def detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray:
     # Defining a kernel length
 
     if task == "orientation":

diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst
@@ -224,6 +224,13 @@ Api parameters description
         * **true** -- if any text is detected in a PDF file, Dedoc assumes that textual layer is detected and it is correct. Much faster but less accurate.
         * **false** -- use the textual layer classifier to detect textual layer and prove its correctness.
 
+    * - need_gost_frame_analysis
+      - True, False
+      - False
+      - This option enables GOST (Russian government standard) frame recognition for PDF documents or images.
+        The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and
+        ignore the GOST frame of the document. It allows :class:`dedoc.readers.PdfImageReader` to properly process the content
+        of a document containing a GOST frame. Currently works only when ``pdf_with_text_layer="false"``.
 
     * - language
       - rus, eng, rus+eng, fra, spa

diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst
@@ -151,6 +151,17 @@ PDF and images handling
         If the document has a textual layer, it is recommended to use :class:`dedoc.readers.PdfTabbyReader`,
         in this case tables will be parsed much easier and faster.
 
+    * - need_gost_frame_analysis
+      - True, False
+      - False
+      - * :meth:`dedoc.DedocManager.parse`
+        * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
+        * :meth:`dedoc.readers.ReaderComposition.read`
+      - This option enables GOST (Russian government standard) frame recognition for PDF documents or images.
+        The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and
+        ignore the GOST frame of the document. It allows :class:`dedoc.readers.PdfImageReader` to properly process the content
+        of a document containing a GOST frame. Currently works only when ``pdf_with_text_layer="false"``.
+
     * - orient_analysis_cells
       - True, False
       - False

diff --git a/tests/data/tables/gost_multipage_table.pdf b/tests/data/tables/gost_multipage_table.pdf
diff --git a/tests/unit_tests/test_module_gost_frame_recognizer.py b/tests/unit_tests/test_module_gost_frame_recognizer.py
@@ -1,13 +1,14 @@
 import os.path
 import unittest
+from typing import Optional
 
 import cv2
 import numpy as np
 
 import dedoc.utils.parameter_utils as param_utils
 from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc
 from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
-from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import GOSTFrameRecognizer
+from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer
 from tests.test_utils import get_test_config
 
 
@@ -17,20 +18,9 @@ class TestGOSTFrameRecognizer(unittest.TestCase):
     test_data_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "tables"))
     pdf_image_reader = PdfImageReader(config=get_test_config())
 
-    def test_gost_frame_recognition(self) -> None:
-        image_names = [
-            "gost_frame_1.png", "gost_frame_2.png", "gost_frame_3.png", "example_with_table6.png", "example_with_table5.png", "example_with_table3.png"
-        ]
-        gt = [True, True, True, False, False, False]
-        for index, image_name in enumerate(image_names):
-            path_image = os.path.join(self.test_data_folder, image_name)
-            image = cv2.imread(path_image)
-            result_image, result_bbox = self.gost_frame_recognizer.rec_and_clean_frame(image)
-            self.assertEqual(not np.array_equal(result_image, image), gt[index])  # check if we cut something from original image or not
-
-    def test_coordinates_shift(self) -> None:
-        file_path = os.path.join(self.test_data_folder, "gost_frame_2.png")
-        parameters = {"need_gost_frame_analysis": "True"}
+    def _get_params_for_parse(self, parameters: Optional[dict], file_path: Optional[str]) -> ParametersForParseDoc:
+        parameters = parameters if parameters else {}
+        file_path = file_path if file_path else ""
         params_for_parse = ParametersForParseDoc(
             language=param_utils.get_param_language(parameters),
             orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters),
@@ -46,8 +36,26 @@ def test_coordinates_shift(self) -> None:
             with_attachments=param_utils.get_param_with_attachments(parameters),
             attachments_dir=param_utils.get_param_attachments_dir(parameters, file_path),
             need_content_analysis=param_utils.get_param_need_content_analysis(parameters),
-            need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters)
+            need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters),
+            pdf_with_txt_layer=param_utils.get_param_pdf_with_txt_layer(parameters)
         )
+        return params_for_parse
+
+    def test_gost_frame_recognition(self) -> None:
+        image_names = [
+            "gost_frame_1.png", "gost_frame_2.png", "gost_frame_3.png", "example_with_table6.png", "example_with_table5.png", "example_with_table3.png"
+        ]
+        gt = [True, True, True, False, False, False]
+        for index, image_name in enumerate(image_names):
+            path_image = os.path.join(self.test_data_folder, image_name)
+            image = cv2.imread(path_image)
+            result_image, result_bbox = self.gost_frame_recognizer.rec_and_clean_frame(image)
+            self.assertEqual(not np.array_equal(result_image, image), gt[index])  # check if we cut something from original image or not
+
+    def test_coordinates_shift(self) -> None:
+        file_path = os.path.join(self.test_data_folder, "gost_frame_2.png")
+        parameters = {"need_gost_frame_analysis": "True", "pdf_with_text_layer": "false"}
+        params_for_parse = self._get_params_for_parse(parameters=parameters, file_path=file_path)
         result = self.pdf_image_reader._parse_document(path=file_path, parameters=params_for_parse)
         self.assertTrue(len(result[0]) > 0)
         self.assertTrue(abs(result[0][0].location.bbox.x_top_left - 365) < 10)