diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 4b7c7969..0b34ce72 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -1,6 +1,6 @@ from abc import abstractmethod from collections import namedtuple -from typing import Iterator, List, Optional, Set, Tuple +from typing import Dict, Iterator, List, Optional, Set, Tuple import numpy as np from dedocutils.data_structures.bbox import BBox @@ -88,7 +88,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure need_content_analysis=param_utils.get_param_need_content_analysis(parameters), need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters), pdf_with_txt_layer=param_utils.get_param_pdf_with_txt_layer(parameters) - ) lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse) @@ -108,18 +107,15 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis from dedoc.utils.pdf_utils import get_pdf_page_count from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader from dedoc.utils.utils import flatten first_page = 0 if parameters.first_page is None or parameters.first_page < 0 else parameters.first_page last_page = math.inf if parameters.last_page is None else parameters.last_page images = self._get_images(path, first_page, last_page) - if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader): - gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images) - result = Parallel(n_jobs=self.config["n_jobs"])( - delayed(self._process_one_page)(image, parameters, page_number, path) 
for page_number, (image, box) in - enumerate(gost_analyzed_images, start=first_page) - ) + if parameters.need_gost_frame_analysis and isinstance(self, (PdfImageReader, PdfTxtlayerReader)): + result, gost_analyzed_images = self._process_document_with_gost_frame(images=images, first_page=first_page, parameters=parameters, path=path) else: result = Parallel(n_jobs=self.config["n_jobs"])( delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, image in enumerate(images, start=first_page) @@ -144,6 +140,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( lines = [lines for lines, _, _, _ in result] lines, headers, footers = footer_header_analysis(lines) all_lines = list(flatten(lines)) + if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader): + self._shift_all_contents(lines=all_lines, unref_tables=unref_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images) mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines) all_lines_with_links = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=attachments) @@ -153,27 +151,36 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links) if page_angles: metadata["rotated_page_angles"] = page_angles - if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader): - self._shift_all_contents(lines=all_lines_with_paragraphs, mp_tables=mp_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images) return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata - def _shift_all_contents(self, lines: List[LineWithMeta], mp_tables: List[ScanTable], attachments: List[PdfImageAttachment], - gost_analyzed_images: List[Tuple[np.ndarray, BBox]]) -> None: - # shift mp_tables - for scan_table in mp_tables: + def 
_process_document_with_gost_frame(self, images: Iterator[np.ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \ + Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]]: + from joblib import Parallel, delayed + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader + + gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images) + page_range = range(first_page, first_page + len(gost_analyzed_images)) + gost_analyzed_images = dict(zip(page_range, gost_analyzed_images)) + if isinstance(self, PdfTxtlayerReader): + self.gost_frame_boxes = dict(zip(page_range, [item[1] for item in gost_analyzed_images.values()])) + result = Parallel(n_jobs=self.config["n_jobs"])( + delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in + gost_analyzed_images.items() + ) + return result, gost_analyzed_images + + def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment], + gost_analyzed_images: Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]) -> None: + # shift unref_tables + for scan_table in unref_tables: for location in scan_table.locations: table_page_number = location.page_number location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left) + page_number = scan_table.locations[0].page_number for row in scan_table.matrix_cells: - row_page_number = scan_table.page_number - for cell in row: # check page number information in the current table row, because table can be located on multiple pages - if cell.lines and len(cell.lines) >= 1: - row_page_number = cell.lines[0].metadata.page_id - break - for cell in row: # if cell doesn't 
contain page number information we use row_page_number - page_number = cell.lines[0].metadata.page_id if cell.lines and len(cell.lines) >= 1 else row_page_number - image_width, image_height = gost_analyzed_images[page_number][0].shape[1], gost_analyzed_images[page_number][0].shape[0] - shift_x, shift_y = gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left + for cell in row: + image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0] + shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left) cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) # shift attachments @@ -185,7 +192,7 @@ def _shift_all_contents(self, lines: List[LineWithMeta], mp_tables: List[ScanTab # shift lines for line in lines: page_number = line.metadata.page_id - image_width, image_height = gost_analyzed_images[page_number][0].shape[1], gost_analyzed_images[page_number][0].shape[0] + image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0] line.shift(shift_x=gost_analyzed_images[page_number][1].x_top_left, shift_y=gost_analyzed_images[page_number][1].y_top_left, image_width=image_width, diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/gost_frame_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/gost_frame_recognizer.py index 1b113c48..53201a1a 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/gost_frame_recognizer.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/gost_frame_recognizer.py @@ -8,7 +8,7 @@ from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_horizontal_and_vertical_lines as detect_lines -MIN_FRAME_CONTENT_AREA = 0.7 
+MIN_FRAME_CONTENT_AREA = 0.65 class GOSTFrameRecognizer: @@ -16,7 +16,7 @@ def __init__(self, *, config: dict = None) -> None: self.logger = config.get("logger", logging.getLogger()) self.config = config - def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox]: + def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox, Tuple[int, ...]]: if len(image.shape) < 3: # check if an image is already converted to grayscale thresh, img_bin = cv2.threshold(image, 225, 255, cv2.THRESH_BINARY) else: @@ -28,8 +28,8 @@ def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox]: img_area = image.shape[0] * image.shape[1] has_gost_frame, main_box = self._analyze_cells_on_frame(tree_table, img_area) if has_gost_frame: - return BBox.crop_image_by_box(image, main_box), main_box - return image, BBox(0, 0, image.shape[1], image.shape[0]) + return BBox.crop_image_by_box(image, main_box), main_box, (int(image.shape[0]), int(image.shape[1])) + return image, BBox(0, 0, image.shape[1], image.shape[0]), (int(image.shape[0]), int(image.shape[1])) def _analyze_cells_on_frame(self, tree_table: "TableTree", img_area: "int") -> Tuple[bool, Optional[BBox]]: try: diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index d7bb2b6a..46528fcd 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -4,6 +4,7 @@ from numpy import ndarray from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation +from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, 
PdfBaseReader @@ -58,6 +59,11 @@ def _process_one_page(self, page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number, parameters=parameters) if page is None: return [], [], [], [] + if parameters.need_gost_frame_analysis: + page_shift = self.gost_frame_boxes[page_number] + self._move_table_cells(tables=tables, page_shift=page_shift, page=page) + readable_block = page_shift # bbox representing the content of the gost frame + page.bboxes = [bbox for bbox in page.bboxes if self._inside_any_unreadable_block(bbox.bbox, [readable_block])] # exclude boxes outside the frame unreadable_blocks = [location.bbox for table in tables for location in table.locations] page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)] lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False) @@ -65,6 +71,19 @@ def _process_one_page(self, return lines, tables, page.attachments, [] + def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: PageWithBBox) -> None: + """ + Move tables back to original coordinates when parsing a document containing a gost frame + """ + for table in tables: + shift_x, shift_y = page_shift.x_top_left, page_shift.y_top_left # shift tables to original coordinates + for location in table.locations: + location.bbox.shift(shift_x=shift_x, shift_y=shift_y) + for row in table.matrix_cells: + for cell in row: + image_width, image_height = page.pdf_page_width, page.pdf_page_height + cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) + def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None: """ Change table boxes' width height into pdf space like textual lines diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index 4b3477bd..c357ac78 100644 --- 
a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -228,7 +228,7 @@ Api parameters description - true, false - false - This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images. - The GOST frame recognizer is used recognize and ignore GOST frame on images and PDF documents without correct textual layer. + The GOST frame recognizer is used to recognize and ignore the GOST frame on images and PDF documents. * - language - rus, eng, rus+eng, fra, spa diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst index 71f80e9c..d8788089 100644 --- a/docs/source/parameters/pdf_handling.rst +++ b/docs/source/parameters/pdf_handling.rst @@ -159,8 +159,8 @@ PDF and images handling * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images. The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and - ignore the GOST frame on the document. It allows :class:`dedoc.readers.PdfImageReader` to properly process the content - of the document containing GOST frame. + ignore the GOST frame on the document. It allows :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader` + to properly process the content of the document containing a GOST frame.
* - orient_analysis_cells - True, False diff --git a/tests/api_tests/test_api_module_table_recognizer.py b/tests/api_tests/test_api_module_table_recognizer.py index 5a815663..a1e48a78 100644 --- a/tests/api_tests/test_api_module_table_recognizer.py +++ b/tests/api_tests/test_api_module_table_recognizer.py @@ -223,3 +223,19 @@ def test_multipage_gost_table(self) -> None: self.assertTrue("Испытание по проверке" in result["content"]["tables"][0]["cells"][-1][2]["lines"][0]["text"]) self.assertTrue("3.6" in result["content"]["tables"][0]["cells"][-1][3]["lines"][0]["text"]) self.assertTrue("7.4.9" in result["content"]["tables"][0]["cells"][-1][4]["lines"][0]["text"]) + + def test_multipage_gost_table_with_text_layer(self) -> None: + file_name = "gost_multipage_table_2.pdf" + result = self._send_request(file_name, data={"need_gost_frame_analysis": "True", "pdf_with_text_layer": "True"}) + self.assertEqual(len(result["content"]["tables"][0]["cells"]), 14) + self.assertTrue("SAMPLE TEXT" in result["content"]["tables"][0]["cells"][0][0]["lines"][0]["text"]) + self.assertTrue("2" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"]) + self.assertEqual(len(result["content"]["tables"]), 1) + + def test_multipage_gost_table_with_text_layer_and_pages_param(self) -> None: + file_name = "gost_multipage_table_2.pdf" + result = self._send_request(file_name, data={"need_gost_frame_analysis": "True", "pdf_with_text_layer": "True", "pages": "2:"}) + self.assertEqual(len(result["content"]["tables"]), 1) + self.assertEqual(len(result["content"]["tables"][0]["cells"]), 5) + self.assertTrue("SAMPLE TEXT" in result["content"]["tables"][0]["cells"][0][0]["lines"][0]["text"]) + self.assertTrue("2" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"]) diff --git a/tests/data/tables/gost_multipage_table_2.pdf b/tests/data/tables/gost_multipage_table_2.pdf new file mode 100644 index 00000000..295df746 Binary files /dev/null and 
b/tests/data/tables/gost_multipage_table_2.pdf differ diff --git a/tests/unit_tests/test_module_gost_frame_recognizer.py b/tests/unit_tests/test_module_gost_frame_recognizer.py index a1e65222..d3f35938 100644 --- a/tests/unit_tests/test_module_gost_frame_recognizer.py +++ b/tests/unit_tests/test_module_gost_frame_recognizer.py @@ -51,13 +51,13 @@ def test_gost_frame_recognition(self) -> None: for index, image_name in enumerate(image_names): path_image = os.path.join(self.test_data_folder, image_name) image = cv2.imread(path_image) - result_image, result_bbox = self.gost_frame_recognizer.rec_and_clean_frame(image) + result_image, result_bbox, original_image_shape = self.gost_frame_recognizer.rec_and_clean_frame(image) self.assertEqual(not np.array_equal(result_image, image), gt[index]) # check if we cut something from original image or not def test_not_gost_frame(self) -> None: path_image = os.path.join(self.test_data_folder, "not_gost_frame.jpg") image = cv2.imread(path_image) - result_image, result_bbox = self.gost_frame_recognizer.rec_and_clean_frame(image) + result_image, result_bbox, original_image_shape = self.gost_frame_recognizer.rec_and_clean_frame(image) self.assertTrue(abs(result_bbox.x_top_left - 26) < 10) self.assertTrue(abs(result_bbox.y_top_left - 26) < 10) self.assertTrue(abs(result_bbox.width - 722) < 10)