Skip to content

Commit

Permalink
code fix
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander Golodkov committed Sep 19, 2024
1 parent a4f12b9 commit 90fb752
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 18 deletions.
33 changes: 18 additions & 15 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,21 +114,8 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
last_page = math.inf if parameters.last_page is None else parameters.last_page
images = self._get_images(path, first_page, last_page)

if parameters.need_gost_frame_analysis and (isinstance(self, PdfImageReader) or isinstance(self, PdfTxtlayerReader)):
gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
page_range = range(first_page, first_page + len(gost_analyzed_images))
gost_analyzed_images = dict(zip(page_range, gost_analyzed_images))
if isinstance(self, PdfImageReader):
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in
gost_analyzed_images.items()
)
else:
self.gost_frame_boxes = dict(zip(page_range, [item[1] for item in gost_analyzed_images.values()]))
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in
gost_analyzed_images.items()
)
if parameters.need_gost_frame_analysis and isinstance(self, (PdfImageReader, PdfTxtlayerReader)):
result, gost_analyzed_images = self._process_document_with_gost_frame(images=images, first_page=first_page, parameters=parameters, path=path)
else:
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, image in enumerate(images, start=first_page)
Expand Down Expand Up @@ -166,6 +153,22 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
metadata["rotated_page_angles"] = page_angles
return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata

def _process_document_with_gost_frame(self, images: Iterator[np.ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \
Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]]:
from joblib import Parallel, delayed
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader

gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
page_range = range(first_page, first_page + len(gost_analyzed_images))
gost_analyzed_images = dict(zip(page_range, gost_analyzed_images))
if isinstance(self, PdfTxtlayerReader):
self.gost_frame_boxes = dict(zip(page_range, [item[1] for item in gost_analyzed_images.values()]))
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in
gost_analyzed_images.items()
)
return result, gost_analyzed_images

def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment],
gost_analyzed_images: Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]) -> None:
# shift unref_tables
Expand Down
2 changes: 1 addition & 1 deletion docs/source/dedoc_api_usage/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ Api parameters description
- true, false
- false
- This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images.
The GOST frame recognizer is used recognize and ignore GOST frame on images and PDF documents without correct textual layer.
The GOST frame recognizer is used recognize and ignore GOST frame on images and PDF documents.

* - language
- rus, eng, rus+eng, fra, spa
Expand Down
4 changes: 2 additions & 2 deletions docs/source/parameters/pdf_handling.rst
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,8 @@ PDF and images handling
* :meth:`dedoc.readers.ReaderComposition.read`
- This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images.
The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and
ignore the GOST frame on the document. It allows :class:`dedoc.readers.PdfImageReader` to properly process the content
of the document containing GOST frame.
ignore the GOST frame on the document. It allows :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader`
to properly process the content of the document containing GOST frame.

* - orient_analysis_cells
- True, False
Expand Down

0 comments on commit 90fb752

Please sign in to comment.