Skip to content

Commit

Permalink
Refactored GOST table recognizer and added tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
Alexander Golodkov committed Sep 5, 2024
1 parent b643a1b commit 9b8087a
Show file tree
Hide file tree
Showing 11 changed files with 117 additions and 126 deletions.
24 changes: 6 additions & 18 deletions dedoc/data_structures/line_with_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,23 +181,11 @@ def to_api_schema(self) -> ApiLineWithMeta:
annotations = [annotation.to_api_schema() for annotation in self.annotations]
return ApiLineWithMeta(text=self._line, annotations=annotations)

@staticmethod
def shift_line_with_meta(line_with_meta: "LineWithMeta", shift_x: int, shift_y: int, image_width: int, image_height: int) -> "LineWithMeta":
def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
import json
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedocutils.data_structures import BBox
new_annotations = []
for annotation in line_with_meta.annotations:
if annotation.name != "bounding box":
new_annotations.append(annotation)
else:
for i_ann, annotation in enumerate(self.annotations):
if annotation.name == "bounding box":
bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(annotation.value)
new_bbox = BBox.shift_bbox(bbox, shift_x, shift_y)
new_annotations.append(BBoxAnnotation(start=annotation.start,
end=annotation.end,
value=new_bbox,
page_width=image_width,
page_height=image_height))
return LineWithMeta(line=line_with_meta.line,
metadata=line_with_meta.metadata,
annotations=new_annotations,
uid=line_with_meta.uid)
bbox.shift(shift_x, shift_y)
self.annotations[i_ann].value = json.dumps(bbox.to_relative_dict(image_width, image_height))
29 changes: 7 additions & 22 deletions dedoc/readers/pdf_reader/data_classes/line_with_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,15 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati
self.order = order
super().__init__(line, metadata, annotations, uid)

@staticmethod
def shift_line_with_location(line_with_location: "LineWithLocation", shift_x: int, shift_y: int, image_width: int, image_height: int) -> "LineWithLocation":
def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
import json
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
from dedocutils.data_structures import BBox
new_annotations = []
for i_ann, annotation in enumerate(line_with_location.annotations):
if line_with_location.annotations[i_ann].name != "bounding box":
new_annotations.append(annotation)
else:
for i_ann, annotation in enumerate(self.annotations):
if self.annotations[i_ann].name == "bounding box":
bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(annotation.value)
new_bbox = BBox.shift_bbox(bbox, shift_x, shift_y)
new_bbox_annotation = BBoxAnnotation(start=annotation.start,
end=annotation.end,
value=new_bbox,
page_width=image_width,
page_height=image_height)
new_annotations.append(new_bbox_annotation)
new_location = Location.shift_location(line_with_location.location, shift_x, shift_y)
return LineWithLocation(line=line_with_location.line,
metadata=line_with_location.metadata,
annotations=new_annotations,
location=new_location,
uid=line_with_location.uid,
order=line_with_location.order)
bbox.shift(shift_x, shift_y)
self.annotations[i_ann].value = json.dumps(bbox.to_relative_dict(image_width, image_height))
self.location.shift(shift_x, shift_y)

def __repr__(self) -> str:
parent_repr = super().__repr__()
Expand Down
35 changes: 10 additions & 25 deletions dedoc/readers/pdf_reader/data_classes/tables/cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,31 +30,16 @@ def copy_from(cell: "Cell",
uid=cell.cell_uid,
contour_coord=cell.con_coord)

@staticmethod
def shift_cell(cell: "Cell", shift_x: int, shift_y: int, image_width: int, image_height: int) -> "Cell":
if cell.lines and len(cell.lines) >= 1:
shifted_lines = []
for line in cell.lines:
shifted_lines.append(LineWithMeta.shift_line_with_meta(line_with_meta=line,
shift_x=shift_x,
shift_y=shift_y,
image_width=image_width,
image_height=image_height))
else:
shifted_lines = cell.lines
return Cell(x_top_left=cell.x_top_left + shift_x,
x_bottom_right=cell.x_bottom_right + shift_x,
y_top_left=cell.y_top_left + shift_y,
y_bottom_right=cell.y_bottom_right + shift_y,
id_con=cell.id_con,
lines=shifted_lines,
is_attribute=cell.is_attribute,
is_attribute_required=cell.is_attribute_required,
rotated_angle=cell.rotated_angle,
uid=cell.cell_uid,
contour_coord=BBox.shift_bbox(bbox=cell.con_coord,
shift_x=shift_x,
shift_y=shift_y)) if cell.con_coord else None
def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None:
if self.lines and len(self.lines) >= 1:
for i_lin, _line in enumerate(self.lines):
self.lines[i_lin].shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
self.x_top_left += shift_x
self.x_bottom_right += shift_x
self.y_top_left += shift_y
self.y_bottom_right += shift_y
if self.con_coord:
self.con_coord.shift(shift_x=shift_x, shift_y=shift_y)

def __init__(self,
x_top_left: int,
Expand Down
8 changes: 2 additions & 6 deletions dedoc/readers/pdf_reader/data_classes/tables/location.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,8 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
self.name = name
self.rotated_angle = rotated_angle

@staticmethod
def shift_location(location: "Location", shift_x: int, shift_y: int):
return Location(page_number=location.page_number,
bbox=BBox.shift_bbox(location.bbox, shift_x, shift_y),
name=location.name,
rotated_angle=location.rotated_angle)
def shift(self, shift_x: int, shift_y: int) -> None:
self.bbox.shift(shift_x, shift_y)

def to_dict(self) -> Dict[str, Any]:
from collections import OrderedDict
Expand Down
78 changes: 26 additions & 52 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from collections import namedtuple
from typing import Iterator, List, Optional, Set, Tuple

import numpy as np
from dedocutils.data_structures.bbox import BBox
from numpy import ndarray

from dedoc.common.exceptions.bad_file_error import BadFileFormatError
Expand Down Expand Up @@ -107,14 +109,14 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
images = self._get_images(path, first_page, last_page)

if parameters.need_gost_frame_analysis:
from dedocutils.data_structures import BBox
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import GOSTFrameRecognizer
self.gost_frame_recognizer = GOSTFrameRecognizer(config=self.config)
gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images
)
result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box) in enumerate(gost_analyzed_images, start=first_page)
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box) in
enumerate(gost_analyzed_images, start=first_page)
)
else:
result = Parallel(n_jobs=self.config["n_jobs"])(
Expand Down Expand Up @@ -150,77 +152,49 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
if page_angles:
metadata["rotated_page_angles"] = page_angles
if parameters.need_gost_frame_analysis:
shifted_lines, shifted_scan_tables, shifted_attachments = self._shift_all_contents(all_lines_with_paragraphs=all_lines_with_paragraphs,
mp_tables=mp_tables,
attachments=attachments,
gost_analyzed_images=gost_analyzed_images)
return shifted_lines, shifted_scan_tables, shifted_attachments, warnings, metadata
self._shift_all_contents(all_lines_with_paragraphs=all_lines_with_paragraphs,
mp_tables=mp_tables,
attachments=attachments,
gost_analyzed_images=gost_analyzed_images)
return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata

def _shift_all_contents(self, all_lines_with_paragraphs, mp_tables, attachments, gost_analyzed_images) \
-> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]:
from dedocutils.data_structures import BBox
from dedoc.readers.pdf_reader.data_classes.tables.location import Location
from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
def _shift_all_contents(self, all_lines_with_paragraphs: List[LineWithMeta], mp_tables: List[ScanTable], attachments: List[PdfImageAttachment],
gost_analyzed_images: List[Tuple[np.ndarray, BBox]]) -> None:
# shift mp_tables
shifted_scan_tables = []
for scan_table in mp_tables:
shifted_table_locations = []
for location in scan_table.locations:
for i_loc, location in enumerate(scan_table.locations):
table_page_number = location.page_number
shifted_table_locations.append(Location.shift_location(location=location,
shift_x=gost_analyzed_images[table_page_number][1].x_top_left,
shift_y=gost_analyzed_images[table_page_number][1].y_top_left))
shifted_matrix_cells = []
scan_table.locations[i_loc].shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left,
shift_y=gost_analyzed_images[table_page_number][1].y_top_left)
for row in scan_table.matrix_cells:
shifted_matrix_cells_row = []
row_page_number = scan_table.page_number
for cell in row: # check page number information in the current table row, because table can be located on multiple pages
if cell.lines and len(cell.lines) >= 1:
row_page_number = cell.lines[0].metadata.page_id
break
for cell in row: # if cell doesn't contain page number information we use row_page_number
for i_cel, cell in enumerate(row): # if cell doesn't contain page number information we use row_page_number
page_number = cell.lines[0].metadata.page_id if cell.lines and len(cell.lines) >= 1 else row_page_number
image_width, image_height = gost_analyzed_images[page_number][0].shape[1], gost_analyzed_images[page_number][0].shape[0]
shift_x, shift_y = gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left
shifted_cell = Cell.shift_cell(cell=cell,
shift_x=shift_x,
shift_y=shift_y,
image_width=image_width,
image_height=image_height)
shifted_matrix_cells_row.append(shifted_cell)
shifted_matrix_cells.append(shifted_matrix_cells_row)
shifted_scan_table = ScanTable(page_number=scan_table.page_number,
name=scan_table.name,
order=scan_table.order,
matrix_cells=shifted_matrix_cells,
bbox=BBox(0, 0, 0, 0)) # pass empty box since we overwrite locations in the next line anyway
shifted_scan_table.locations = shifted_table_locations # ScanTable doesn't use bbox for anything else but locations and now we overwrite them
shifted_scan_tables.append(shifted_scan_table)
row[i_cel].shift(shift_x=shift_x,
shift_y=shift_y,
image_width=image_width,
image_height=image_height)

# shift attachments
shifted_attachments = []
for attachment in attachments:
for i_att, attachment in enumerate(attachments):
attachment_page_number = attachment.location.page_number
shift_x, shift_y = gost_analyzed_images[attachment_page_number][1].x_top_left, gost_analyzed_images[attachment_page_number][1].y_top_left
new_attachment_location = Location.shift_location(attachment.location, shift_x, shift_y)
shifted_attachments.append(PdfImageAttachment(original_name=attachment.original_name,
tmp_file_path=attachment.tmp_file_path,
need_content_analysis=attachment.need_content_analysis,
location=new_attachment_location,
uid=attachment.uid))
attachments[i_att].location.shift(shift_x, shift_y)

# shift lines
shifted_lines = []
for line in all_lines_with_paragraphs:
for i_lin, line in enumerate(all_lines_with_paragraphs):
page_number = line.metadata.page_id
image_width, image_height = gost_analyzed_images[page_number][0].shape[1], gost_analyzed_images[page_number][0].shape[0]
shifted_line = LineWithLocation.shift_line_with_location(line_with_location=line,
shift_x=gost_analyzed_images[page_number][1].x_top_left,
shift_y=gost_analyzed_images[page_number][1].y_top_left,
image_width=image_width,
image_height=image_height)
shifted_lines.append(shifted_line)
return shifted_lines, shifted_scan_tables, shifted_attachments
all_lines_with_paragraphs[i_lin].shift(shift_x=gost_analyzed_images[page_number][1].x_top_left,
shift_y=gost_analyzed_images[page_number][1].y_top_left,
image_width=image_width,
image_height=image_height)

@abstractmethod
def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ def __init__(self, *, config: dict = None) -> None:
self.config = config

def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox]:
thresh, img_bin = cv2.threshold(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), 225, 255, cv2.THRESH_BINARY)
if len(image.shape) < 3: # check if an image is already converted to grayscale
thresh, img_bin = cv2.threshold(image, 225, 255, cv2.THRESH_BINARY)
else:
thresh, img_bin = cv2.threshold(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), 225, 255, cv2.THRESH_BINARY)
lines_bin = detect_lines(255 - img_bin, self.config, "tables")
contours, hierarchy = cv2.findContours(lines_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS)
tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=self.config)
Expand All @@ -37,7 +40,7 @@ def rec_and_clean_frame(self, image: np.ndarray) -> Tuple[np.ndarray, BBox]:
return BBox.crop_image_by_box(image, main_box), main_box
return image, BBox(0, 0, image.shape[1], image.shape[0])

def _analyze_table_on_frame(self, tree_table, img_area) -> Tuple[bool, Optional[BBox]]:
def _analyze_table_on_frame(self, tree_table: "TableTree", img_area: "int") -> Tuple[bool, Optional[BBox]]:
try:
sub_bboxes = tree_table.children[0].children
for box in sub_bboxes:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
beautifulsoup4>=4.10.0,<=4.12.2
charset-normalizer>=2.0.12,<=3.2.0
Cython>=0.29.28,<=3.0.2
dedoc-utils==0.3.7
dedoc-utils==0.3.8
fastapi>=0.77.0,<1.0
huggingface-hub>=0.14.1,<1.0
imutils==0.5.4
Expand Down
Binary file added tests/data/tables/gost_frame_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/tables/gost_frame_2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/tables/gost_frame_3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 9b8087a

Please sign in to comment.