Skip to content

Commit

Permalink
TLDR-419 add confidence annotation (#301)
Browse files Browse the repository at this point in the history
* add new annotation

* add confidence extracting

* add test for confidence annotation

* add confidence annotation to documentation

* fix flake

* add mergeable field for annotation

* review fixes
  • Loading branch information
dronperminov authored Jul 31, 2023
1 parent 9a5704c commit 28fd511
Show file tree
Hide file tree
Showing 9 changed files with 69 additions and 10 deletions.
4 changes: 3 additions & 1 deletion dedoc/data_structures/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class Annotation(Serializable):
Look to the concrete kind of annotations to get more examples.
"""

def __init__(self, start: int, end: int, name: str, value: str) -> None:
def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bool = True) -> None:
"""
Some kind of text information about symbols between start and end.
For example Annotation(1, 13, "italic", "True") says that text between 1st and 13th symbol was written in italic.
Expand All @@ -21,11 +21,13 @@ def __init__(self, start: int, end: int, name: str, value: str) -> None:
:param end: end of the annotated text (end isn't included)
:param name: annotation's name
:param value: information about annotated text
:param is_mergeable: is it possible to merge annotations with the same value
"""
self.start = start
self.end = end
self.name = name
self.value = value
self.is_mergeable = is_mergeable

def __eq__(self, o: object) -> bool:
if not isinstance(o, Annotation):
Expand Down
7 changes: 4 additions & 3 deletions dedoc/data_structures/concrete_annotations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .bbox_annotation import BBoxAnnotation
from .bold_annotation import BoldAnnotation
from .color_annotation import ColorAnnotation
from .confidence_annotation import ConfidenceAnnotation
from .indentation_annotation import IndentationAnnotation
from .italic_annotation import ItalicAnnotation
from .linked_text_annotation import LinkedTextAnnotation
Expand All @@ -15,6 +16,6 @@
from .table_annotation import TableAnnotation
from .underlined_annotation import UnderlinedAnnotation

__all__ = ['AlignmentAnnotation', 'AttachAnnotation', 'BBoxAnnotation', 'BoldAnnotation', 'ColorAnnotation', 'IndentationAnnotation',
'ItalicAnnotation', 'LinkedTextAnnotation', 'SizeAnnotation', 'SpacingAnnotation', 'StrikeAnnotation', 'StyleAnnotation',
'SubscriptAnnotation', 'SuperscriptAnnotation', 'TableAnnotation', 'UnderlinedAnnotation']
__all__ = ['AlignmentAnnotation', 'AttachAnnotation', 'BBoxAnnotation', 'BoldAnnotation', 'ColorAnnotation', 'ConfidenceAnnotation',
'IndentationAnnotation', 'ItalicAnnotation', 'LinkedTextAnnotation', 'SizeAnnotation', 'SpacingAnnotation', 'StrikeAnnotation',
'StyleAnnotation', 'SubscriptAnnotation', 'SuperscriptAnnotation', 'TableAnnotation', 'UnderlinedAnnotation']
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from flask_restx import fields, Api, Model

from dedoc.data_structures.annotation import Annotation


class ConfidenceAnnotation(Annotation):
    """
    Confidence level of the text recognized by OCR inside the line.

    The annotation is created with ``is_mergeable=False``.
    """
    name = "confidence"

    def __init__(self, start: int, end: int, value: str) -> None:
        """
        :param start: start of the text
        :param end: end of the text (not included)
        :param value: confidence level in "percents" (float or integer number from 0 to 100)
        :raises ValueError: if the value cannot be converted to float or is outside the range [0, 100]
        """
        try:
            confidence = float(value)
        except ValueError:
            raise ValueError("the value of confidence annotation should be float value")

        # Explicit comparison instead of `assert`: assertions are removed when Python runs with -O,
        # which would silently disable the range validation.
        if not 0.0 <= confidence <= 100.0:
            raise ValueError("the value of confidence annotation should be in range [0, 100]")

        super().__init__(start=start, end=end, name=ConfidenceAnnotation.name, value=value, is_mergeable=False)

    @staticmethod
    def get_api_dict(api: Api) -> Model:
        """
        Build the API model describing this annotation.

        :param api: the Api object in which the model is registered
        :return: the model with the fields start, end and value
        """
        # The model is registered under this class's own name, so it does not clash with
        # the model of the bold annotation.
        return api.model('ConfidenceAnnotation', {
            'start': fields.Integer(description='annotation start index', required=True, example=0),
            'end': fields.Integer(description='annotation end index', required=True, example=4),
            'value': fields.String(description='confidence value', required=True, example="95")
        })
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def __split_image2bboxes(self,
output_dict = get_text_with_bbox_from_document_page(image, language, ocr_conf_thr)
else:
output_dict = get_text_with_bbox_from_cells(image, language, ocr_conf_threshold=0.0)
line_boxes = [TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num)
line_boxes = [TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_confidence())
for line_num, line in enumerate(output_dict.lines)]

return line_boxes
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import List

from dedoc.data_structures.bbox import BBox
from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_tuple import OcrElement
from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_word import OcrWord

Expand All @@ -19,6 +20,19 @@ def __init__(self, order: int, bbox: BBox, words: List[OcrWord]) -> None:
def text(self) -> str:
    """
    Text of the line: the non-empty words joined with single spaces and terminated with a line break.
    """
    non_empty_texts = [word.text for word in self.words if word.text != ""]
    joined = " ".join(non_empty_texts)
    return joined + "\n"

def get_confidence(self) -> List[ConfidenceAnnotation]:
    """
    Build one confidence annotation per non-empty word of the line.

    Each following word starts one position after the end of the previous one.

    :return: list of confidence annotations, one for each word with non-empty text
    """
    result = []
    offset = 0
    non_empty_words = (word for word in self.words if word.text != "")

    for word in non_empty_words:
        word_end = offset + len(word.text)
        result.append(ConfidenceAnnotation(offset, word_end, str(word.confidence)))
        # skip one extra position between consecutive words
        offset = word_end + 1

    return result

@staticmethod
def from_list(line: List[OcrElement], ocr_conf_thr: float) -> "OcrLine":

Expand All @@ -32,5 +46,5 @@ def from_list(line: List[OcrElement], ocr_conf_thr: float) -> "OcrLine":
words.append(element)
line = sorted(line, key=lambda word: word.line_num)
line = list(filter(lambda word: float(word.conf) >= ocr_conf_thr, line))
ocr_words = [OcrWord(bbox=word.bbox, text=word.text, order=word.word_num) for word in line]
ocr_words = [OcrWord(bbox=word.bbox, text=word.text, confidence=word.conf, order=word.word_num) for word in line]
return OcrLine(order=head.line_num, words=ocr_words, bbox=head.bbox)
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
class OcrWord:
level = 5

def __init__(self, text: str, bbox: BBox, order: int) -> None:
def __init__(self, text: str, bbox: BBox, confidence: float, order: int) -> None:
"""
Single word from ocr.
:param text: extracted text
Expand All @@ -14,4 +14,5 @@ def __init__(self, text: str, bbox: BBox, order: int) -> None:
super().__init__()
self.text = text.replace("—", " ")
self.bbox = bbox
self.confidence = confidence
self.order = order
2 changes: 1 addition & 1 deletion dedoc/utils/annotation_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def _merge_one_group(self, annotations: List[Annotation], spaces: List[Space]) -
"""
Merge one group of annotations, assuming that all annotations have the same name and value
"""
if len(annotations) <= 1:
if len(annotations) <= 1 or not annotations[0].is_mergeable:
return annotations
self.__check_annotations_group(annotations)
result = []
Expand Down
6 changes: 6 additions & 0 deletions docs/source/modules/data_structures.rst
Original file line number Diff line number Diff line change
Expand Up @@ -201,3 +201,9 @@ Concrete annotations
:special-members: __init__

.. autoattribute:: name

.. autoclass:: dedoc.data_structures.ConfidenceAnnotation
:show-inheritance:
:special-members: __init__

.. autoattribute:: name
7 changes: 5 additions & 2 deletions tests/api_tests/test_api_format_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation
from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
from dedoc.utils import supported_image_types

Expand All @@ -15,8 +16,10 @@ def __check_example_file(self, result: dict) -> None:
content = result["content"]["structure"]["subparagraphs"]
self._check_similarity("Пример документа", content[0]["text"].strip().split("\n")[0])
annotations = content[0]["annotations"]
self.assertIn(BoldAnnotation.name, [annotation["name"] for annotation in annotations])
self.assertIn(SpacingAnnotation.name, [annotation["name"] for annotation in annotations])
annotation_names = {annotation["name"] for annotation in annotations}
self.assertIn(BoldAnnotation.name, annotation_names)
self.assertIn(SpacingAnnotation.name, annotation_names)
self.assertIn(ConfidenceAnnotation.name, annotation_names)
self._check_similarity("1.2.1 Поясним за непонятное", content[3]["subparagraphs"][0]["text"])

def __check_metainfo(self, metainfo: dict, actual_type: str, actual_name: str) -> None:
Expand Down

0 comments on commit 28fd511

Please sign in to comment.