mindee · felixdittrich92 · Oct 10, 2024 · Oct 10, 2024 · Oct 10, 2024 · Oct 10, 2024
diff --git a/doctr/io/elements.py b/doctr/io/elements.py
@@ -310,6 +310,10 @@ def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **
     def synthesize(self, **kwargs) -> np.ndarray:
         """Synthesize the page from the predictions
 
+        Args:
+        ----
+            **kwargs: keyword arguments passed to the `synthesize_page` function
+
         Returns
         -------
             synthesized page
@@ -493,7 +497,7 @@ def synthesize(self, **kwargs) -> np.ndarray:
 
         Args:
         ----
-            **kwargs: keyword arguments passed to the matplotlib.pyplot.show method
+            **kwargs: keyword arguments passed to the `synthesize_kie_page` function
 
         Returns:
         -------
@@ -603,11 +607,15 @@ def show(self, **kwargs) -> None:
     def synthesize(self, **kwargs) -> List[np.ndarray]:
         """Synthesize all pages from their predictions
 
+        Args:
+        ----
+            **kwargs: keyword arguments passed to the `Page.synthesize` method
+
         Returns
         -------
             list of synthesized pages
         """
-        return [page.synthesize() for page in self.pages]
+        return [page.synthesize(**kwargs) for page in self.pages]
 
     def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
         """Export the document as XML (hOCR-format)

diff --git a/doctr/utils/reconstitution.py b/doctr/utils/reconstitution.py
@@ -2,6 +2,7 @@
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+import logging
 from typing import Any, Dict, Optional
 
 import numpy as np
@@ -13,61 +14,163 @@
 __all__ = ["synthesize_page", "synthesize_kie_page"]
 
 
+# Global variable to avoid multiple warnings
+ROTATION_WARNING = False
+
+
+def _warn_rotation(entry: Dict[str, Any]) -> None:  # pragma: no cover
+    global ROTATION_WARNING  # module-level flag, so the warning is logged at most once
+    if not ROTATION_WARNING and len(entry["geometry"]) == 4:  # only for 4-point (polygon) geometries
+        logging.warning("Polygons with larger rotations will lead to inaccurate rendering")
+        ROTATION_WARNING = True
+
+
+def _synthesize(
+    response: Image.Image,
+    entry: Dict[str, Any],
+    w: int,
+    h: int,
+    draw_proba: bool = False,
+    font_family: Optional[str] = None,
+    smoothing_factor: float = 0.75,
+    min_font_size: int = 6,
+    max_font_size: int = 50,
+) -> Image.Image:
+    """Draw one entry (a word, or a line holding "words") onto `response` and return it.
+
+    The font shrinks by `smoothing_factor` until the text fits its box or `min_font_size` is reached.
+    """
+    if len(entry["geometry"]) == 2:
+        (xmin, ymin), (xmax, ymax) = entry["geometry"]
+        polygon = [(xmin, ymin), (xmax, ymin), (xmax, ymax), (xmin, ymax)]
+    else:
+        polygon = entry["geometry"]
+
+    # Calculate the bounding box of the word
+    x_coords, y_coords = zip(*polygon)
+    xmin, ymin, xmax, ymax = (
+        int(round(w * min(x_coords))),
+        int(round(h * min(y_coords))),
+        int(round(w * max(x_coords))),
+        int(round(h * max(y_coords))),
+    )
+    word_width = xmax - xmin
+    word_height = ymax - ymin
+
+    # If lines are provided instead of words, concatenate the word entries
+    if "words" in entry:
+        word_text = " ".join(word["value"] for word in entry["words"])
+    else:
+        word_text = entry["value"]
+    # Find the optimal font size; every step strictly decreases it, so the loop always terminates
+    try:
+        font_size = min(word_height, max_font_size)
+        font = get_font(font_family, font_size)
+        text_width, text_height = font.getbbox(word_text)[2:4]
+
+        while (text_width > word_width or text_height > word_height) and font_size > min_font_size:
+            font_size = max(min(int(font_size * smoothing_factor), font_size - 1), min_font_size)
+            font = get_font(font_family, font_size)
+            text_width, text_height = font.getbbox(word_text)[2:4]
+    except ValueError:
+        font = get_font(font_family, min_font_size)
+
+    # Draw the word text
+    d = ImageDraw.Draw(response)
+    try:
+        try:
+            d.text((xmin, ymin), word_text, font=font, fill=(0, 0, 0), anchor="lt")
+        except UnicodeEncodeError:
+            d.text((xmin, ymin), anyascii(word_text), font=font, fill=(0, 0, 0), anchor="lt")
+    # Catch generic exceptions to avoid crashing the whole rendering
+    except Exception:  # pragma: no cover
+        logging.warning("Could not render word: %s", word_text)
+
+    if draw_proba:
+        confidence = (
+            entry["confidence"]
+            if "confidence" in entry
+            else sum(word["confidence"] for word in entry["words"]) / len(entry["words"])
+        )
+        p = int(255 * confidence)
+        color = (255 - p, 0, p)  # Red to blue gradient based on probability
+        d.rectangle([(xmin, ymin), (xmax, ymax)], outline=color, width=2)
+
+        prob_font = get_font(font_family, 20)
+        prob_text = f"{confidence:.2f}"
+        prob_text_width, prob_text_height = prob_font.getbbox(prob_text)[2:4]
+
+        # Position the probability slightly above the bounding box
+        prob_x_offset = (word_width - prob_text_width) // 2
+        prob_y_offset = ymin - prob_text_height - 2
+        prob_y_offset = max(0, prob_y_offset)
+
+        d.text((xmin + prob_x_offset, prob_y_offset), prob_text, font=prob_font, fill=color, anchor="lt")
+
+    return response
+
+
 def synthesize_page(
     page: Dict[str, Any],
     draw_proba: bool = False,
     font_family: Optional[str] = None,
+    smoothing_factor: float = 0.95,
+    min_font_size: int = 8,
+    max_font_size: int = 50,
 ) -> np.ndarray:
     """Draw a the content of the element page (OCR response) on a blank page.
 
     Args:
     ----
         page: exported Page object to represent
         draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
-        font_size: size of the font, default font = 13
         font_family: family of the font
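+        smoothing_factor: multiplicative factor (expected below 1) applied repeatedly to shrink the font until it fits
+        min_font_size: smallest font size the shrinking may reach
+        max_font_size: largest font size used, even when the box is taller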
 
     Returns:
     -------
         the synthesized page
     """
     # Draw template
     h, w = page["dimensions"]
-    response = 255 * np.ones((h, w, 3), dtype=np.int32)
+    response = Image.new("RGB", (w, h), color=(255, 255, 255))
 
-    # Draw each word
     for block in page["blocks"]:
-        for line in block["lines"]:
-            for word in line["words"]:
-                # Get absolute word geometry
-                (xmin, ymin), (xmax, ymax) = word["geometry"]
-                xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
-                ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
-
-                # White drawing context adapted to font size, 0.75 factor to convert pts --> pix
-                font = get_font(font_family, int(0.75 * (ymax - ymin)))
-                img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
-                d = ImageDraw.Draw(img)
-                # Draw in black the value of the word
-                try:
-                    d.text((0, 0), word["value"], font=font, fill=(0, 0, 0))
-                except UnicodeEncodeError:
-                    # When character cannot be encoded, use its anyascii version
-                    d.text((0, 0), anyascii(word["value"]), font=font, fill=(0, 0, 0))
-
-                # Colorize if draw_proba
-                if draw_proba:
-                    p = int(255 * word["confidence"])
-                    mask = np.where(np.array(img) == 0, 1, 0)
-                    proba: np.ndarray = np.array([255 - p, 0, p])
-                    color = mask * proba[np.newaxis, np.newaxis, :]
-                    white_mask = 255 * (1 - mask)
-                    img = color + white_mask
-
-                # Write to response page
-                response[ymin:ymax, xmin:xmax, :] = np.array(img)
-
-    return response
+        # If lines are provided use these to get better rendering results
+        if len(block["lines"]) > 1:
+            for line in block["lines"]:
+                _warn_rotation(block)  # pragma: no cover
+                response = _synthesize(
+                    response=response,
+                    entry=line,
+                    w=w,
+                    h=h,
+                    draw_proba=draw_proba,
+                    font_family=font_family,
+                    smoothing_factor=smoothing_factor,
+                    min_font_size=min_font_size,
+                    max_font_size=max_font_size,
+                )
+        # Otherwise, draw each word
+        else:
+            for line in block["lines"]:
+                _warn_rotation(block)  # pragma: no cover
+                for word in line["words"]:
+                    response = _synthesize(
+                        response=response,
+                        entry=word,
+                        w=w,
+                        h=h,
+                        draw_proba=draw_proba,
+                        font_family=font_family,
+                        smoothing_factor=smoothing_factor,
+                        min_font_size=min_font_size,
+                        max_font_size=max_font_size,
+                    )
+
+    return np.array(response, dtype=np.uint8)
 
 
 def synthesize_kie_page(
@@ -81,46 +184,29 @@
     ----
         page: exported Page object to represent
         draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
-        font_size: size of the font, default font = 13
         font_family: family of the font
+        smoothing_factor: factor to smooth the font size
+        min_font_size: minimum font size
+        max_font_size: maximum font size
 
     Returns:
     -------
         the synthesized page
     """
     # Draw template
     h, w = page["dimensions"]
-    response = 255 * np.ones((h, w, 3), dtype=np.int32)
+    response = Image.new("RGB", (w, h), color=(255, 255, 255))
 
     # Draw each word
     for predictions in page["predictions"].values():
         for prediction in predictions:
-            # Get aboslute word geometry
-            (xmin, ymin), (xmax, ymax) = prediction["geometry"]
-            xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
-            ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
-
-            # White drawing context adapted to font size, 0.75 factor to convert pts --> pix
-            font = get_font(font_family, int(0.75 * (ymax - ymin)))
-            img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
-            d = ImageDraw.Draw(img)
-            # Draw in black the value of the word
-            try:
-                d.text((0, 0), prediction["value"], font=font, fill=(0, 0, 0))
-            except UnicodeEncodeError:
-                # When character cannot be encoded, use its anyascii version
-                d.text((0, 0), anyascii(prediction["value"]), font=font, fill=(0, 0, 0))
-
-            # Colorize if draw_proba
-            if draw_proba:
-                p = int(255 * prediction["confidence"])
-                mask = np.where(np.array(img) == 0, 1, 0)
-                proba: np.ndarray = np.array([255 - p, 0, p])
-                color = mask * proba[np.newaxis, np.newaxis, :]
-                white_mask = 255 * (1 - mask)
-                img = color + white_mask
-
-            # Write to response page
-            response[ymin:ymax, xmin:xmax, :] = np.array(img)
-
-    return response
+            _warn_rotation(prediction)  # pragma: no cover
+            response = _synthesize(
+                response=response,
+                entry=prediction,
+                w=w,
+                h=h,
+                draw_proba=draw_proba,
+                font_family=font_family,
+            )
+    return np.array(response, dtype=np.uint8)
diff --git a/tests/common/test_utils_reconstitution.py b/tests/common/test_utils_reconstitution.py
@@ -1,12 +1,37 @@
 import numpy as np
-from test_io_elements import _mock_pages
+from test_io_elements import _mock_kie_pages, _mock_pages
 
 from doctr.utils import reconstitution
 
 
 def test_synthesize_page():
     pages = _mock_pages()
-    reconstitution.synthesize_page(pages[0].export(), draw_proba=False)
-    render = reconstitution.synthesize_page(pages[0].export(), draw_proba=True)
-    assert isinstance(render, np.ndarray)
-    assert render.shape == (*pages[0].dimensions, 3)
+    # Test without probability rendering
+    render_no_proba = reconstitution.synthesize_page(pages[0].export(), draw_proba=False)
+    assert isinstance(render_no_proba, np.ndarray)
+    assert render_no_proba.shape == (*pages[0].dimensions, 3)
+
+    # Test with probability rendering
+    render_with_proba = reconstitution.synthesize_page(pages[0].export(), draw_proba=True)
+    assert isinstance(render_with_proba, np.ndarray)
+    assert render_with_proba.shape == (*pages[0].dimensions, 3)
+
+    # Test with only one line
+    pages_one_line = pages[0].export()
+    pages_one_line["blocks"][0]["lines"] = [pages_one_line["blocks"][0]["lines"][0]]
+    render_one_line = reconstitution.synthesize_page(pages_one_line, draw_proba=True)
+    assert isinstance(render_one_line, np.ndarray)
+    assert render_one_line.shape == (*pages[0].dimensions, 3)
+
+
+def test_synthesize_kie_page():
+    """Rendering a KIE page yields an 8-bit RGB array of the page size, with or without scores."""
+    pages = _mock_kie_pages()
+    exported = pages[0].export()
+    expected_shape = (*pages[0].dimensions, 3)
+    # Exercise both rendering modes: plain text, then text with the probability overlay
+    for draw_proba in (False, True):
+        render = reconstitution.synthesize_kie_page(exported, draw_proba=draw_proba)
+        assert isinstance(render, np.ndarray)
+        assert render.dtype == np.uint8
+        assert render.shape == expected_shape