Skip to content

Commit

Permalink
fixed code according to mr comments
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander Golodkov committed Sep 11, 2024
1 parent 25ace9a commit 9744f54
Show file tree
Hide file tree
Showing 10 changed files with 42 additions and 12 deletions.
4 changes: 2 additions & 2 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
last_page = math.inf if parameters.last_page is None else parameters.last_page
images = self._get_images(path, first_page, last_page)

if parameters.need_gost_frame_analysis and parameters.pdf_with_txt_layer == "false":
if parameters.need_gost_frame_analysis and type(self).__name__ == "PdfImageReader":
gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box) in
Expand Down Expand Up @@ -152,7 +152,7 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links)
if page_angles:
metadata["rotated_page_angles"] = page_angles
if parameters.need_gost_frame_analysis and parameters.pdf_with_txt_layer == "false":
if parameters.need_gost_frame_analysis and type(self).__name__ == "PdfImageReader":
self._shift_all_contents(lines=all_lines_with_paragraphs, mp_tables=mp_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata

Expand Down
10 changes: 4 additions & 6 deletions docs/source/dedoc_api_usage/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -225,12 +225,10 @@ Api parameters description
* **false** -- use the textual layer classifier to detect textual layer and prove its correctness.

* - need_gost_frame_analysis
- True, False
- False
- This option is used to enable GOST(Russian government standard) frame recognition for PDF documents or images.
The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and
ignore the GOST frame on the document. It allows :class:`dedoc.readers.PdfImageReader` to properly process the content
of the document containing GOST frame. Currently works only when ``pdf_with_text_layer="false"``.
- true, false
- false
- This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images.
The GOST frame recognizer is used to recognize and ignore the GOST frame on images and PDF documents without a correct textual layer.

* - language
- rus, eng, rus+eng, fra, spa
Expand Down
4 changes: 2 additions & 2 deletions docs/source/parameters/pdf_handling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,10 @@ PDF and images handling
- * :meth:`dedoc.DedocManager.parse`
* :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`
* :meth:`dedoc.readers.ReaderComposition.read`
- This option is used to enable GOST(Russian government standard) frame recognition for PDF documents or images.
- This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images.
The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and
ignore the GOST frame on the document. It allows :class:`dedoc.readers.PdfImageReader` to properly process the content
of the document containing GOST frame. Currently works only when ``pdf_with_text_layer="false"``.
of the document containing GOST frame.

* - orient_analysis_cells
- True, False
Expand Down
10 changes: 10 additions & 0 deletions tests/api_tests/test_api_module_table_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,13 @@ def test_detect_small_table(self) -> None:
result = self._send_request(file_name, data={"language": "rus"})
tables = result["content"]["tables"]
self.assertEqual(2, len(tables))

def test_multipage_gost_table(self) -> None:
    """Check the last row of the first table extracted from ``gost_multipage_table.pdf``.

    ``pdf_with_text_layer`` is deliberately not passed in the request, so that the
    condition in PdfBaseReader is checked without an explicit value of this parameter.
    """
    file_name = "gost_multipage_table.pdf"
    result = self._send_request(file_name, data={"need_gost_frame_analysis": "True"})
    cells = result["content"]["tables"][0]["cells"]
    self.assertGreater(len(cells), 35)

    # The last row of the multipage table must be found in the first table
    last_row = cells[-1]
    expected_fragments = ["KR13", "R13.1", "Испытание по проверке", "3.6", "7.4.9"]
    for column, fragment in enumerate(expected_fragments):
        self.assertIn(fragment, last_row[column]["lines"][0]["text"])
Binary file added tests/data/tables/gost_frame_1.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file removed tests/data/tables/gost_frame_1.png
Binary file not shown.
Binary file added tests/data/tables/gost_frame_3.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file removed tests/data/tables/gost_frame_3.png
Binary file not shown.
Binary file added tests/data/tables/not_gost_frame.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
26 changes: 24 additions & 2 deletions tests/unit_tests/test_module_gost_frame_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import dedoc.utils.parameter_utils as param_utils
from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc
from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer
from tests.test_utils import get_test_config

Expand All @@ -17,6 +18,7 @@ class TestGOSTFrameRecognizer(unittest.TestCase):
gost_frame_recognizer = GOSTFrameRecognizer(config=get_test_config())
test_data_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "tables"))
pdf_image_reader = PdfImageReader(config=get_test_config())
pdf_auto_reader = PdfAutoReader(config=get_test_config())

def _get_params_for_parse(self, parameters: Optional[dict], file_path: Optional[str]) -> ParametersForParseDoc:
parameters = parameters if parameters else {}
Expand All @@ -43,7 +45,7 @@ def _get_params_for_parse(self, parameters: Optional[dict], file_path: Optional[

def test_gost_frame_recognition(self) -> None:
image_names = [
"gost_frame_1.png", "gost_frame_2.png", "gost_frame_3.png", "example_with_table6.png", "example_with_table5.png", "example_with_table3.png"
"gost_frame_1.jpg", "gost_frame_2.png", "gost_frame_3.jpg", "example_with_table6.png", "example_with_table5.png", "example_with_table3.png"
]
gt = [True, True, True, False, False, False]
for index, image_name in enumerate(image_names):
Expand All @@ -52,9 +54,18 @@ def test_gost_frame_recognition(self) -> None:
result_image, result_bbox = self.gost_frame_recognizer.rec_and_clean_frame(image)
self.assertEqual(not np.array_equal(result_image, image), gt[index]) # check if we cut something from original image or not

def test_not_gost_frame(self) -> None:
    """Check the bounding box that the recognizer returns for ``not_gost_frame.jpg``."""
    path_image = os.path.join(self.test_data_folder, "not_gost_frame.jpg")
    image = cv2.imread(path_image)
    # cv2.imread returns None instead of raising when the file cannot be read
    self.assertIsNotNone(image, f"Cannot read test image {path_image}")
    _, result_bbox = self.gost_frame_recognizer.rec_and_clean_frame(image)

    tolerance = 10  # allowed absolute deviation from the expected value
    actual_and_expected = (
        (result_bbox.x_top_left, 26),
        (result_bbox.y_top_left, 26),
        (result_bbox.width, 722),
        (result_bbox.height, 969),
    )
    for actual, expected in actual_and_expected:
        self.assertLess(abs(actual - expected), tolerance)

def test_coordinates_shift(self) -> None:
file_path = os.path.join(self.test_data_folder, "gost_frame_2.png")
parameters = {"need_gost_frame_analysis": "True", "pdf_with_text_layer": "false"}
parameters = {"need_gost_frame_analysis": "True"}
params_for_parse = self._get_params_for_parse(parameters=parameters, file_path=file_path)
result = self.pdf_image_reader._parse_document(path=file_path, parameters=params_for_parse)
self.assertTrue(len(result[0]) > 0)
Expand All @@ -65,3 +76,14 @@ def test_coordinates_shift(self) -> None:
self.assertTrue(len(result[1]) > 0)
self.assertTrue(abs(result[1][0].location.bbox.x_top_left - 81) < 10)
self.assertTrue(abs(result[1][0].location.bbox.y_top_left - 49) < 10)

def test_pdf_auto_reader(self) -> None:
    """Read ``gost_frame_2.png`` with the auto reader and GOST frame analysis enabled.

    Checks the number of extracted tables, the header cells and row count of the table,
    and the text of the first extracted line.
    """
    file_path = os.path.join(self.test_data_folder, "gost_frame_2.png")
    parameters = {"need_gost_frame_analysis": "True"}
    result = self.pdf_auto_reader.read(file_path=file_path, parameters=parameters)

    self.assertEqual(len(result.tables), 1)
    table_cells = result.tables[0].cells
    self.assertEqual(table_cells[0][1].get_text(), "Колонка 2")
    self.assertEqual(table_cells[0][2].get_text(), "Колонка 3")
    self.assertEqual(len(table_cells), 22)
    self.assertIn("Названне таблицы (продолженне)", result.lines[0].line)

0 comments on commit 9744f54

Please sign in to comment.