Skip to content

Commit

Permalink
partially fixed code
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander1999-hub committed Sep 10, 2024
1 parent e923216 commit f925a65
Show file tree
Hide file tree
Showing 9 changed files with 105 additions and 75 deletions.
8 changes: 1 addition & 7 deletions dedoc/readers/pdf_reader/data_classes/line_with_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,7 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
super().__init__(line, metadata, annotations, uid)

def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
import json
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
for i_ann, annotation in enumerate(self.annotations):
if self.annotations[i_ann].name == "bounding box":
bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(annotation.value)
bbox.shift(shift_x, shift_y)
self.annotations[i_ann].value = json.dumps(bbox.to_relative_dict(image_width, image_height))
super().shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
self.location.shift(shift_x, shift_y)

def __repr__(self) -> str:
Expand Down
33 changes: 17 additions & 16 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer

ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
"orient_analysis_cells",
Expand All @@ -29,7 +30,8 @@
"with_attachments",
"attachments_dir",
"need_content_analysis",
"need_gost_frame_analysis"
"need_gost_frame_analysis",
"pdf_with_txt_layer"
])


Expand All @@ -53,6 +55,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
self.attachment_extractor = PDFAttachmentsExtractor(config=self.config)
self.linker = LineObjectLinker(config=self.config)
self.paragraph_extractor = ScanParagraphClassifierExtractor(config=self.config)
self.gost_frame_recognizer = GOSTFrameRecognizer(config=self.config)

def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
Expand Down Expand Up @@ -83,7 +86,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
with_attachments=param_utils.get_param_with_attachments(parameters),
attachments_dir=param_utils.get_param_attachments_dir(parameters, file_path),
need_content_analysis=param_utils.get_param_need_content_analysis(parameters),
need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters)
need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters),
pdf_with_txt_layer=param_utils.get_param_pdf_with_txt_layer(parameters)

)

lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse)
Expand All @@ -108,12 +113,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
last_page = math.inf if parameters.last_page is None else parameters.last_page
images = self._get_images(path, first_page, last_page)

if parameters.need_gost_frame_analysis:
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import GOSTFrameRecognizer
self.gost_frame_recognizer = GOSTFrameRecognizer(config=self.config)
gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images
)
if parameters.need_gost_frame_analysis and parameters.pdf_with_txt_layer == "false":
gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box) in
enumerate(gost_analyzed_images, start=first_page)
Expand Down Expand Up @@ -151,14 +152,14 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
if page_angles:
metadata["rotated_page_angles"] = page_angles
if parameters.need_gost_frame_analysis:
self._shift_all_contents(all_lines_with_paragraphs=all_lines_with_paragraphs,
if parameters.need_gost_frame_analysis and parameters.pdf_with_txt_layer == "false":
self._shift_all_contents(lines=all_lines_with_paragraphs,
mp_tables=mp_tables,
attachments=attachments,
gost_analyzed_images=gost_analyzed_images)
return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata

def _shift_all_contents(self, all_lines_with_paragraphs: List[LineWithMeta], mp_tables: List[ScanTable], attachments: List[PdfImageAttachment],
def _shift_all_contents(self, lines: List[LineWithMeta], mp_tables: List[ScanTable], attachments: List[PdfImageAttachment],
gost_analyzed_images: List[Tuple[np.ndarray, BBox]]) -> None:
# shift mp_tables
for scan_table in mp_tables:
Expand All @@ -185,13 +186,13 @@ def _shift_all_contents(self, all_lines_with_paragraphs: List[LineWithMeta], mp_
attachments[i_att].location.shift(shift_x, shift_y)

# shift lines
for i_lin, line in enumerate(all_lines_with_paragraphs):
for i_lin, line in enumerate(lines):
page_number = line.metadata.page_id
image_width, image_height = gost_analyzed_images[page_number][0].shape[1], gost_analyzed_images[page_number][0].shape[0]
all_lines_with_paragraphs[i_lin].shift(shift_x=gost_analyzed_images[page_number][1].x_top_left,
shift_y=gost_analyzed_images[page_number][1].y_top_left,
image_width=image_width,
image_height=image_height)
lines[i_lin].shift(shift_x=gost_analyzed_images[page_number][1].x_top_left,
shift_y=gost_analyzed_images[page_number][1].y_top_left,
image_width=image_width,
image_height=image_height)

@abstractmethod
def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import logging
from typing import Optional, Tuple

import cv2
import numpy as np
from dedocutils.data_structures import BBox

from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_horizontal_and_vertical_lines as detect_lines

MIN_FRAME_CONTENT_AREA = 0.7


class GOSTFrameRecognizer:
    """
    Recognizer of a GOST (Russian government standard) frame on a page image.

    If a single cell of the detected line structure covers more than ``MIN_FRAME_CONTENT_AREA``
    of the image area, the image is cropped to that cell.
    """

    def __init__(self, *, config: Optional[dict] = None) -> None:
        """
        :param config: optional configuration dictionary; its "logger" entry (if present) is used for warnings
        """
        # `config` defaults to None, so it must be normalised before `.get` is called on it
        self.config = {} if config is None else config
        self.logger = self.config.get("logger", logging.getLogger())

    def rec_and_clean_frame(self, image: np.ndarray) -> "Tuple[np.ndarray, BBox]":
        """
        Look for a frame on the image and crop the image to the frame content.

        :param image: page image; an array with fewer than 3 dimensions is treated as grayscale, otherwise as BGR
        :return: the cropped image and the box it was cropped by if a frame is found,
            otherwise the unchanged image and a box built from the full image size
        """
        # an image with fewer than 3 dimensions is considered already converted to grayscale
        gray_image = image if len(image.shape) < 3 else cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, img_bin = cv2.threshold(gray_image, 225, 255, cv2.THRESH_BINARY)

        # line detection receives the inverted binary image
        lines_bin = detect_lines(255 - img_bin, self.config, "tables")
        contours, hierarchy = cv2.findContours(lines_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
        tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=self.config)

        image_height, image_width = image.shape[0], image.shape[1]
        has_gost_frame, main_box = self._analyze_cells_on_frame(tree_table, image_height * image_width)
        if has_gost_frame:
            return BBox.crop_image_by_box(image, main_box), main_box
        return image, BBox(0, 0, image_width, image_height)

    def _analyze_cells_on_frame(self, tree_table: "TableTree", img_area: int) -> "Tuple[bool, Optional[BBox]]":
        """
        Search the children of the first child of the tree for a cell that occupies most of the image.

        :param tree_table: tree built from the contours of the detected lines
        :param img_area: area of the analysed image (height * width)
        :return: (True, cell box) for the first cell whose area share exceeds ``MIN_FRAME_CONTENT_AREA``,
            otherwise (False, None)
        """
        try:
            sub_bboxes = tree_table.children[0].children
            for box in sub_bboxes:
                if box.cell_box.square / img_area > MIN_FRAME_CONTENT_AREA:
                    return True, box.cell_box
            return False, None
        except Exception as ex:
            # best effort: any failure while walking the tree is logged and reported as "no frame found"
            self.logger.warning(ex)
            return False, None
Original file line number Diff line number Diff line change
Expand Up @@ -11,47 +11,13 @@

from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree
from dedoc.readers.pdf_reader.data_classes.tables.table_type import TableTypeAdditionalOptions
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.multipage_table_extractor import MultiPageTableExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_extractors.concrete_extractors.onepage_table_extractor import OnePageTableExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import __detect_horizontal_and_vertical_lines as detect_lines

"""-------------------------------------entry class of Table Recognizer Module---------------------------------------"""


class GOSTFrameRecognizer(object):
def __init__(self, *, config: dict = None) -> None:
self.logger = config.get("logger", logging.getLogger())
self.config = config

def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox]:
if len(image.shape) < 3: # check if an image is already converted to grayscale
thresh, img_bin = cv2.threshold(image, 225, 255, cv2.THRESH_BINARY)
else:
thresh, img_bin = cv2.threshold(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), 225, 255, cv2.THRESH_BINARY)
lines_bin = detect_lines(255 - img_bin, self.config, "tables")
contours, hierarchy = cv2.findContours(lines_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=self.config)

img_area = image.shape[0] * image.shape[1]
has_gost_frame, main_box = self._analyze_table_on_frame(tree_table, img_area)
if has_gost_frame:
return BBox.crop_image_by_box(image, main_box), main_box
return image, BBox(0, 0, image.shape[1], image.shape[0])

def _analyze_table_on_frame(self, tree_table: "TableTree", img_area: "int") -> Tuple[bool, Optional[BBox]]:
try:
sub_bboxes = tree_table.children[0].children
for box in sub_bboxes:
if box.cell_box.square / img_area > 0.7:
return True, box.cell_box
return False, None
except Exception as ex:
self.logger.warning(ex)
return False, None


class TableRecognizer(object):

def __init__(self, *, config: dict = None) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An
if config.get("debug_mode", False):
cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "image_bin.jpg"), img_bin)
# step 2
img_final_bin = __detect_horizontal_and_vertical_lines(img_bin, config, "tables")
img_final_bin = detect_horizontal_and_vertical_lines(img_bin, config, "tables")
# step 3
img_final_bin_houph, angle_alignment = __apply_houph_lines_and_detect_angle(img_final_bin, config)

Expand Down Expand Up @@ -182,7 +182,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np
return img_final_bin_houph, angle_alignment


def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray:
def detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray:
# Defining a kernel length

if task == "orientation":
Expand Down
7 changes: 7 additions & 0 deletions docs/source/dedoc_api_usage/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,13 @@ Api parameters description
* **true** -- if any text is detected in a PDF file, Dedoc assumes that textual layer is detected and it is correct. Much faster but less accurate.
* **false** -- use the textual layer classifier to detect textual layer and prove its correctness.

* - need_gost_frame_analysis
- True, False
- False
- This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images.
The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and
ignore the GOST frame in the document. It allows :class:`dedoc.readers.PdfImageReader` to properly process the content
of a document containing a GOST frame. Currently works only when ``pdf_with_text_layer="false"``.

* - language
- rus, eng, rus+eng, fra, spa
Expand Down
11 changes: 11 additions & 0 deletions docs/source/parameters/pdf_handling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,17 @@ PDF and images handling
If the document has a textual layer, it is recommended to use :class:`dedoc.readers.PdfTabbyReader`,
in this case tables will be parsed much easier and faster.

* - need_gost_frame_analysis
- True, False
- False
- * :meth:`dedoc.DedocManager.parse`
* :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
* :meth:`dedoc.readers.ReaderComposition.read`
- This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images.
The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and
ignore the GOST frame in the document. It allows :class:`dedoc.readers.PdfImageReader` to properly process the content
of a document containing a GOST frame. Currently works only when ``pdf_with_text_layer="false"``.

* - orient_analysis_cells
- True, False
- False
Expand Down
Binary file added tests/data/tables/gost_multipage_table.pdf
Binary file not shown.
40 changes: 24 additions & 16 deletions tests/unit_tests/test_module_gost_frame_recognizer.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import os.path
import unittest
from typing import Optional

import cv2
import numpy as np

import dedoc.utils.parameter_utils as param_utils
from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc
from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import GOSTFrameRecognizer
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer
from tests.test_utils import get_test_config


Expand All @@ -17,20 +18,9 @@ class TestGOSTFrameRecognizer(unittest.TestCase):
test_data_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "tables"))
pdf_image_reader = PdfImageReader(config=get_test_config())

def test_gost_frame_recognition(self) -> None:
image_names = [
"gost_frame_1.png", "gost_frame_2.png", "gost_frame_3.png", "example_with_table6.png", "example_with_table5.png", "example_with_table3.png"
]
gt = [True, True, True, False, False, False]
for index, image_name in enumerate(image_names):
path_image = os.path.join(self.test_data_folder, image_name)
image = cv2.imread(path_image)
result_image, result_bbox = self.gost_frame_recognizer.rec_and_clean_frame(image)
self.assertEqual(not np.array_equal(result_image, image), gt[index]) # check if we cut something from original image or not

def test_coordinates_shift(self) -> None:
file_path = os.path.join(self.test_data_folder, "gost_frame_2.png")
parameters = {"need_gost_frame_analysis": "True"}
def _get_params_for_parse(self, parameters: Optional[dict], file_path: Optional[str]) -> ParametersForParseDoc:
parameters = parameters if parameters else {}
file_path = file_path if file_path else ""
params_for_parse = ParametersForParseDoc(
language=param_utils.get_param_language(parameters),
orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters),
Expand All @@ -46,8 +36,26 @@ def test_coordinates_shift(self) -> None:
with_attachments=param_utils.get_param_with_attachments(parameters),
attachments_dir=param_utils.get_param_attachments_dir(parameters, file_path),
need_content_analysis=param_utils.get_param_need_content_analysis(parameters),
need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters)
need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters),
pdf_with_txt_layer=param_utils.get_param_pdf_with_txt_layer(parameters)
)
return params_for_parse

def test_gost_frame_recognition(self) -> None:
    """Check that the recognizer changes only the images that are expected to contain a GOST frame."""
    # image name -> whether the recognizer is expected to cut something from the original image
    frame_expected_by_image = {
        "gost_frame_1.png": True,
        "gost_frame_2.png": True,
        "gost_frame_3.png": True,
        "example_with_table6.png": False,
        "example_with_table5.png": False,
        "example_with_table3.png": False,
    }
    for image_name, frame_expected in frame_expected_by_image.items():
        image = cv2.imread(os.path.join(self.test_data_folder, image_name))
        result_image, _ = self.gost_frame_recognizer.rec_and_clean_frame(image)
        image_was_cut = not np.array_equal(result_image, image)
        self.assertEqual(image_was_cut, frame_expected)

def test_coordinates_shift(self) -> None:
file_path = os.path.join(self.test_data_folder, "gost_frame_2.png")
parameters = {"need_gost_frame_analysis": "True", "pdf_with_text_layer": "false"}
params_for_parse = self._get_params_for_parse(parameters=parameters, file_path=file_path)
result = self.pdf_image_reader._parse_document(path=file_path, parameters=params_for_parse)
self.assertTrue(len(result[0]) > 0)
self.assertTrue(abs(result[0][0].location.bbox.x_top_left - 365) < 10)
Expand Down

0 comments on commit f925a65

Please sign in to comment.