Skip to content

Commit

Permalink
TLDR-807 ignore gost frame in PdfTxtlayerReader (#496)
Browse files Browse the repository at this point in the history
Co-authored-by: Alexander Golodkov <[email protected]>
  • Loading branch information
alexander1999-hub and Alexander Golodkov authored Sep 19, 2024
1 parent f6f6b47 commit 782ef59
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 33 deletions.
55 changes: 31 additions & 24 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from abc import abstractmethod
from collections import namedtuple
from typing import Iterator, List, Optional, Set, Tuple
from typing import Dict, Iterator, List, Optional, Set, Tuple

import numpy as np
from dedocutils.data_structures.bbox import BBox
Expand Down Expand Up @@ -88,7 +88,6 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
need_content_analysis=param_utils.get_param_need_content_analysis(parameters),
need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters),
pdf_with_txt_layer=param_utils.get_param_pdf_with_txt_layer(parameters)

)

lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse)
Expand All @@ -108,18 +107,15 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis
from dedoc.utils.pdf_utils import get_pdf_page_count
from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
from dedoc.utils.utils import flatten

first_page = 0 if parameters.first_page is None or parameters.first_page < 0 else parameters.first_page
last_page = math.inf if parameters.last_page is None else parameters.last_page
images = self._get_images(path, first_page, last_page)

if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box) in
enumerate(gost_analyzed_images, start=first_page)
)
if parameters.need_gost_frame_analysis and isinstance(self, (PdfImageReader, PdfTxtlayerReader)):
result, gost_analyzed_images = self._process_document_with_gost_frame(images=images, first_page=first_page, parameters=parameters, path=path)
else:
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, image in enumerate(images, start=first_page)
Expand All @@ -144,6 +140,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
lines = [lines for lines, _, _, _ in result]
lines, headers, footers = footer_header_analysis(lines)
all_lines = list(flatten(lines))
if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
self._shift_all_contents(lines=all_lines, unref_tables=unref_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines)
all_lines_with_links = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=attachments)

Expand All @@ -153,27 +151,36 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
if page_angles:
metadata["rotated_page_angles"] = page_angles
if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
self._shift_all_contents(lines=all_lines_with_paragraphs, mp_tables=mp_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata

def _shift_all_contents(self, lines: List[LineWithMeta], mp_tables: List[ScanTable], attachments: List[PdfImageAttachment],
gost_analyzed_images: List[Tuple[np.ndarray, BBox]]) -> None:
# shift mp_tables
for scan_table in mp_tables:
def _process_document_with_gost_frame(self, images: Iterator[np.ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \
        Tuple[List[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]], Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]]:
    """
    Run GOST frame recognition on every page image and then parse each page.

    :param images: page images of the document
    :param first_page: page number assigned to the first image, the following images get consecutive numbers
    :param parameters: parsing parameters, passed unchanged to `_process_one_page`
    :param path: path to the document, passed unchanged to `_process_one_page`
    :return: list with the `_process_one_page` result for each page and a dict that maps a page number
        to the output of `gost_frame_recognizer.rec_and_clean_frame` for that page
    """
    from joblib import Parallel, delayed
    from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader

    n_jobs = self.config["n_jobs"]
    frame_results = Parallel(n_jobs=n_jobs)(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
    # key every recognizer output by its page number, starting from first_page
    gost_analyzed_images = dict(enumerate(frame_results, start=first_page))
    if isinstance(self, PdfTxtlayerReader):
        # keep only the frame box (second item of the recognizer output) for each page on the reader
        self.gost_frame_boxes = {page_number: item[1] for page_number, item in gost_analyzed_images.items()}
    result = Parallel(n_jobs=n_jobs)(
        delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, _, _) in gost_analyzed_images.items()
    )
    return result, gost_analyzed_images

def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment],
gost_analyzed_images: Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]) -> None:
# shift unref_tables
for scan_table in unref_tables:
for location in scan_table.locations:
table_page_number = location.page_number
location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left)
page_number = scan_table.locations[0].page_number
for row in scan_table.matrix_cells:
row_page_number = scan_table.page_number
for cell in row: # check page number information in the current table row, because table can be located on multiple pages
if cell.lines and len(cell.lines) >= 1:
row_page_number = cell.lines[0].metadata.page_id
break
for cell in row: # if cell doesn't contain page number information we use row_page_number
page_number = cell.lines[0].metadata.page_id if cell.lines and len(cell.lines) >= 1 else row_page_number
image_width, image_height = gost_analyzed_images[page_number][0].shape[1], gost_analyzed_images[page_number][0].shape[0]
shift_x, shift_y = gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left
for cell in row:
image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left)
cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)

# shift attachments
Expand All @@ -185,7 +192,7 @@ def _shift_all_contents(self, lines: List[LineWithMeta], mp_tables: List[ScanTab
# shift lines
for line in lines:
page_number = line.metadata.page_id
image_width, image_height = gost_analyzed_images[page_number][0].shape[1], gost_analyzed_images[page_number][0].shape[0]
image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
line.shift(shift_x=gost_analyzed_images[page_number][1].x_top_left,
shift_y=gost_analyzed_images[page_number][1].y_top_left,
image_width=image_width,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_horizontal_and_vertical_lines as detect_lines

MIN_FRAME_CONTENT_AREA = 0.7
MIN_FRAME_CONTENT_AREA = 0.65


class GOSTFrameRecognizer:
def __init__(self, *, config: dict = None) -> None:
self.logger = config.get("logger", logging.getLogger())
self.config = config

def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox]:
def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox, Tuple[int, ...]]:
if len(image.shape) < 3: # check if an image is already converted to grayscale
thresh, img_bin = cv2.threshold(image, 225, 255, cv2.THRESH_BINARY)
else:
Expand All @@ -28,8 +28,8 @@ def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox]:
img_area = image.shape[0] * image.shape[1]
has_gost_frame, main_box = self._analyze_cells_on_frame(tree_table, img_area)
if has_gost_frame:
return BBox.crop_image_by_box(image, main_box), main_box
return image, BBox(0, 0, image.shape[1], image.shape[0])
return BBox.crop_image_by_box(image, main_box), main_box, (int(image.shape[0]), int(image.shape[1]))
return image, BBox(0, 0, image.shape[1], image.shape[0]), (int(image.shape[0]), int(image.shape[1]))

def _analyze_cells_on_frame(self, tree_table: "TableTree", img_area: "int") -> Tuple[bool, Optional[BBox]]:
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from numpy import ndarray

from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
Expand Down Expand Up @@ -58,13 +59,31 @@ def _process_one_page(self,
page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number, parameters=parameters)
if page is None:
return [], [], [], []
if parameters.need_gost_frame_analysis:
page_shift = self.gost_frame_boxes[page_number]
self._move_table_cells(tables=tables, page_shift=page_shift, page=page)
readable_block = page_shift # bbox representing the content of the gost frame
page.bboxes = [bbox for bbox in page.bboxes if self._inside_any_unreadable_block(bbox.bbox, [readable_block])] # exclude boxes outside the frame
unreadable_blocks = [location.bbox for table in tables for location in table.locations]
page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)
self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)

return lines, tables, page.attachments, []

def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: PageWithBBox) -> None:
    """
    Move tables back to original coordinates when parsing a document containing a gost frame.

    :param tables: tables whose locations and cells are shifted in place
    :param page_shift: box whose top-left corner gives the shift along x and y
    :param page: page whose pdf_page_width and pdf_page_height are passed to every cell shift
    """
    # the shift and the page size do not depend on the table, row or cell, so read them once
    shift_x, shift_y = page_shift.x_top_left, page_shift.y_top_left
    image_width, image_height = page.pdf_page_width, page.pdf_page_height
    for table in tables:
        for location in table.locations:
            location.bbox.shift(shift_x=shift_x, shift_y=shift_y)
        for row in table.matrix_cells:
            for cell in row:
                cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)

def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None:
"""
Change table boxes' width height into pdf space like textual lines
Expand Down
2 changes: 1 addition & 1 deletion docs/source/dedoc_api_usage/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ Api parameters description
- true, false
- false
- This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images.
The GOST frame recognizer is used recognize and ignore GOST frame on images and PDF documents without correct textual layer.
The GOST frame recognizer is used to recognize and ignore the GOST frame on images and PDF documents.

* - language
- rus, eng, rus+eng, fra, spa
Expand Down
4 changes: 2 additions & 2 deletions docs/source/parameters/pdf_handling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,8 @@ PDF and images handling
* :meth:`dedoc.readers.ReaderComposition.read`
- This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images.
The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and
ignore the GOST frame on the document. It allows :class:`dedoc.readers.PdfImageReader` to properly process the content
of the document containing GOST frame.
ignore the GOST frame on the document. It allows :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader`
to properly process the content of the document containing GOST frame.

* - orient_analysis_cells
- True, False
Expand Down
16 changes: 16 additions & 0 deletions tests/api_tests/test_api_module_table_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,3 +223,19 @@ def test_multipage_gost_table(self) -> None:
self.assertTrue("Испытание по проверке" in result["content"]["tables"][0]["cells"][-1][2]["lines"][0]["text"])
self.assertTrue("3.6" in result["content"]["tables"][0]["cells"][-1][3]["lines"][0]["text"])
self.assertTrue("7.4.9" in result["content"]["tables"][0]["cells"][-1][4]["lines"][0]["text"])

def test_multipage_gost_table_with_text_layer(self) -> None:
    """GOST frame analysis on a PDF with a text layer: a single table with 14 rows of cells is expected."""
    file_name = "gost_multipage_table_2.pdf"
    result = self._send_request(file_name, data={"need_gost_frame_analysis": "True", "pdf_with_text_layer": "True"})
    tables = result["content"]["tables"]
    # check the table count first, so a missing table is reported as an assertion failure rather than an IndexError
    self.assertEqual(len(tables), 1)
    cells = tables[0]["cells"]
    self.assertEqual(len(cells), 14)
    self.assertIn("SAMPLE TEXT", cells[0][0]["lines"][0]["text"])
    self.assertIn("2", cells[-1][0]["lines"][0]["text"])

def test_multipage_gost_table_with_text_layer_and_pages_param(self) -> None:
    """Same document requested with pages="2:": a single table with 5 rows of cells is expected."""
    file_name = "gost_multipage_table_2.pdf"
    result = self._send_request(file_name, data={"need_gost_frame_analysis": "True", "pdf_with_text_layer": "True", "pages": "2:"})
    tables = result["content"]["tables"]
    self.assertEqual(len(tables), 1)
    cells = tables[0]["cells"]
    self.assertEqual(len(cells), 5)
    self.assertIn("SAMPLE TEXT", cells[0][0]["lines"][0]["text"])
    self.assertIn("2", cells[-1][0]["lines"][0]["text"])
Binary file added tests/data/tables/gost_multipage_table_2.pdf
Binary file not shown.
4 changes: 2 additions & 2 deletions tests/unit_tests/test_module_gost_frame_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,13 @@ def test_gost_frame_recognition(self) -> None:
for index, image_name in enumerate(image_names):
path_image = os.path.join(self.test_data_folder, image_name)
image = cv2.imread(path_image)
result_image, result_bbox = self.gost_frame_recognizer.rec_and_clean_frame(image)
result_image, result_bbox, original_image_shape = self.gost_frame_recognizer.rec_and_clean_frame(image)
self.assertEqual(not np.array_equal(result_image, image), gt[index]) # check if we cut something from original image or not

def test_not_gost_frame(self) -> None:
path_image = os.path.join(self.test_data_folder, "not_gost_frame.jpg")
image = cv2.imread(path_image)
result_image, result_bbox = self.gost_frame_recognizer.rec_and_clean_frame(image)
result_image, result_bbox, original_image_shape = self.gost_frame_recognizer.rec_and_clean_frame(image)
self.assertTrue(abs(result_bbox.x_top_left - 26) < 10)
self.assertTrue(abs(result_bbox.y_top_left - 26) < 10)
self.assertTrue(abs(result_bbox.width - 722) < 10)
Expand Down

0 comments on commit 782ef59

Please sign in to comment.