From 7864134a765c3164fe142bdfa7fe09553abd9070 Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Mon, 25 Mar 2024 16:13:43 +0100
Subject: [PATCH 01/11] update api

---
 api/README.md                        | 43 +++++++++++++++++------
 api/app/routes/detection.py          | 26 ++++++++++----
 api/app/routes/kie.py                | 52 +++++++++++++++++++---------
 api/app/routes/ocr.py                | 40 ++++++++++++++-------
 api/app/routes/recognition.py        | 28 +++++++++++----
 api/app/schemas.py                   | 28 +++++++++++----
 api/app/vision.py                    |  4 +--
 api/docker-compose.yml               |  2 +-
 api/pyproject.toml                   |  3 +-
 api/tests/conftest.py                |  7 ++++
 api/tests/routes/test_detection.py   | 13 ++++---
 api/tests/routes/test_kie.py         | 15 +++++---
 api/tests/routes/test_ocr.py         | 15 +++++---
 api/tests/routes/test_recognition.py |  8 +++--
 14 files changed, 205 insertions(+), 79 deletions(-)

diff --git a/api/README.md b/api/README.md
index 426e191bf2..200c2a164a 100644
--- a/api/README.md
+++ b/api/README.md
@@ -37,14 +37,21 @@ with this snippet:
 import requests
 with open('/path/to/your/img.jpg', 'rb') as f:
     data = f.read()
-print(requests.post("http://localhost:8080/detection", files={'file': data}).json())
+print(requests.post("http://localhost:8080/detection", files={'files': [data]}).json())
 ```
 
 should yield
 
 ```json
-[{'box': [0.826171875, 0.185546875, 0.90234375, 0.201171875]},
- {'box': [0.75390625, 0.185546875, 0.8173828125, 0.201171875]}]
+[
+  {
+      "name": "invitation.png",
+      "boxes": [
+        [0.50390625, 0.712890625, 0.5185546875, 0.720703125],
+        [0.4716796875, 0.712890625, 0.48828125, 0.720703125]
+      ]
+  },
+]
 ```
 
 #### Text recognition
@@ -58,13 +65,18 @@ with this snippet:
 import requests
 with open('/path/to/your/img.jpg', 'rb') as f:
     data = f.read()
-print(requests.post("http://localhost:8080/recognition", files={'file': data}).json())
+print(requests.post("http://localhost:8080/recognition", files={'files': [data]}).json())
 ```
 
 should yield
 
 ```json
-{'value': 'invite'}
+[
+  {
+      "name": "invitation.png",
+      "value": "invite"
+  },
+]
 ```
 
 #### End-to-end OCR
@@ -78,14 +90,25 @@ with this snippet:
 import requests
 with open('/path/to/your/img.jpg', 'rb') as f:
     data = f.read()
-print(requests.post("http://localhost:8080/ocr", files={'file': data}).json())
+print(requests.post("http://localhost:8080/ocr", files={'files': [data]}).json())
 ```
 
 should yield
 
 ```json
-[{'box': [0.75390625, 0.185546875, 0.8173828125, 0.201171875],
-  'value': 'Hello'},
- {'box': [0.826171875, 0.185546875, 0.90234375, 0.201171875],
-  'value': 'world!'}]
+[
+  {
+      "name": "hello_world.jpg",
+      "items": [
+      {
+          "value": "Hello",
+          "box": [0.005859375, 0.003312938981562763, 0.0205078125, 0.0332854340430202]
+      },
+      {
+          "value": "world!",
+          "box": [0.005859375, 0.003312938981562763, 0.0205078125, 0.0332854340430202]
+      },
+      ],
+  }
+]
 ```
diff --git a/api/app/routes/detection.py b/api/app/routes/detection.py
index 71c64a7c1c..2e9216639e 100644
--- a/api/app/routes/detection.py
+++ b/api/app/routes/detection.py
@@ -5,19 +5,33 @@
 
 from typing import List
 
-from fastapi import APIRouter, File, UploadFile, status
+from fastapi import APIRouter, File, HTTPException, UploadFile, status
 
 from app.schemas import DetectionOut
 from app.vision import det_predictor
 from doctr.file_utils import CLASS_NAME
-from doctr.io import decode_img_as_tensor
+from doctr.io import DocumentFile
 
 router = APIRouter()
 
 
 @router.post("/", response_model=List[DetectionOut], status_code=status.HTTP_200_OK, summary="Perform text detection")
-async def text_detection(file: UploadFile = File(...)):
+async def text_detection(files: List[UploadFile] = [File(...)]):
     """Runs docTR text detection model to analyze the input image"""
-    img = decode_img_as_tensor(file.file.read())
-    boxes = det_predictor([img])[0]
-    return [DetectionOut(box=box.tolist()) for box in boxes[CLASS_NAME][:, :-1]]
+    boxes: List[DetectionOut] = []
+    for file in files:
+        mime_type = file.content_type
+        if mime_type in ["image/jpeg", "image/png"]:
+            content = DocumentFile.from_images([await file.read()])
+        elif mime_type == "application/pdf":
+            content = DocumentFile.from_pdf(await file.read())
+        else:
+            raise HTTPException(status_code=400, detail=f"Unsupported file format for detection endpoint: {mime_type}")
+
+        boxes.append(
+            DetectionOut(
+                name=file.filename or "", boxes=[box.tolist() for box in det_predictor(content)[0][CLASS_NAME][:, :-1]]
+            )
+        )
+
+    return boxes
diff --git a/api/app/routes/kie.py b/api/app/routes/kie.py
index 2ef4cce4c8..2d947cc49e 100644
--- a/api/app/routes/kie.py
+++ b/api/app/routes/kie.py
@@ -3,27 +3,47 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Dict, List
+from typing import List
 
-from fastapi import APIRouter, File, UploadFile, status
+from fastapi import APIRouter, File, HTTPException, UploadFile, status
 
-from app.schemas import OCROut
+from app.schemas import KIEElement, KIEOut
 from app.vision import kie_predictor
-from doctr.io import decode_img_as_tensor
+from doctr.io import DocumentFile
 
 router = APIRouter()
 
 
-@router.post("/", response_model=Dict[str, List[OCROut]], status_code=status.HTTP_200_OK, summary="Perform KIE")
-async def perform_kie(file: UploadFile = File(...)):
+@router.post("/", response_model=List[KIEOut], status_code=status.HTTP_200_OK, summary="Perform KIE")
+async def perform_kie(files: List[UploadFile] = [File(...)]):
     """Runs docTR KIE model to analyze the input image"""
-    img = decode_img_as_tensor(file.file.read())
-    out = kie_predictor([img])
-
-    return {
-        class_name: [
-            OCROut(box=(*prediction.geometry[0], *prediction.geometry[1]), value=prediction.value)
-            for prediction in out.pages[0].predictions[class_name]
-        ]
-        for class_name in out.pages[0].predictions.keys()
-    }
+    results: List[KIEOut] = []
+    for file in files:
+        mime_type = file.content_type
+        if mime_type in ["image/jpeg", "image/png"]:
+            content = DocumentFile.from_images([await file.read()])
+        elif mime_type == "application/pdf":
+            content = DocumentFile.from_pdf(await file.read())
+        else:
+            raise HTTPException(status_code=400, detail=f"Unsupported file format for KIE endpoint: {mime_type}")
+
+        out = kie_predictor(content)
+
+        for page in out.pages:
+            results.append(
+                KIEOut(
+                    name=file.filename or "",
+                    predictions=[
+                        KIEElement(
+                            class_name=class_name,
+                            items=[
+                                dict(value=prediction.value, box=(*prediction.geometry[0], *prediction.geometry[1]))
+                                for prediction in page.predictions[class_name]
+                            ],
+                        )
+                        for class_name in page.predictions.keys()
+                    ],
+                )
+            )
+
+    return results
diff --git a/api/app/routes/ocr.py b/api/app/routes/ocr.py
index 37bb05e85a..484898daae 100644
--- a/api/app/routes/ocr.py
+++ b/api/app/routes/ocr.py
@@ -5,24 +5,40 @@
 
 from typing import List
 
-from fastapi import APIRouter, File, UploadFile, status
+from fastapi import APIRouter, File, HTTPException, UploadFile, status
 
 from app.schemas import OCROut
 from app.vision import predictor
-from doctr.io import decode_img_as_tensor
+from doctr.io import DocumentFile
 
 router = APIRouter()
 
 
 @router.post("/", response_model=List[OCROut], status_code=status.HTTP_200_OK, summary="Perform OCR")
-async def perform_ocr(file: UploadFile = File(...)):
+async def perform_ocr(files: List[UploadFile] = [File(...)]):
     """Runs docTR OCR model to analyze the input image"""
-    img = decode_img_as_tensor(file.file.read())
-    out = predictor([img])
-
-    return [
-        OCROut(box=(*word.geometry[0], *word.geometry[1]), value=word.value)
-        for block in out.pages[0].blocks
-        for line in block.lines
-        for word in line.words
-    ]
+    results: List[OCROut] = []
+    for file in files:
+        mime_type = file.content_type
+        if mime_type in ["image/jpeg", "image/png"]:
+            content = DocumentFile.from_images([await file.read()])
+        elif mime_type == "application/pdf":
+            content = DocumentFile.from_pdf(await file.read())
+        else:
+            raise HTTPException(status_code=400, detail=f"Unsupported file format for OCR endpoint: {mime_type}")
+
+        out = predictor(content)
+        for page in out.pages:
+            results.append(
+                OCROut(
+                    name=file.filename or "",
+                    items=[
+                        dict(value=word.value, box=(*word.geometry[0], *word.geometry[1]))
+                        for block in page.blocks
+                        for line in block.lines
+                        for word in line.words
+                    ],
+                )
+            )
+
+    return results
diff --git a/api/app/routes/recognition.py b/api/app/routes/recognition.py
index 9727424995..e8bf4610e4 100644
--- a/api/app/routes/recognition.py
+++ b/api/app/routes/recognition.py
@@ -3,18 +3,32 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from fastapi import APIRouter, File, UploadFile, status
+from typing import List
+
+from fastapi import APIRouter, File, HTTPException, UploadFile, status
 
 from app.schemas import RecognitionOut
 from app.vision import reco_predictor
-from doctr.io import decode_img_as_tensor
+from doctr.io import DocumentFile
 
 router = APIRouter()
 
 
-@router.post("/", response_model=RecognitionOut, status_code=status.HTTP_200_OK, summary="Perform text recognition")
-async def text_recognition(file: UploadFile = File(...)):
+@router.post(
+    "/", response_model=List[RecognitionOut], status_code=status.HTTP_200_OK, summary="Perform text recognition"
+)
+async def text_recognition(files: List[UploadFile] = [File(...)]):
     """Runs docTR text recognition model to analyze the input image"""
-    img = decode_img_as_tensor(file.file.read())
-    out = reco_predictor([img])
-    return RecognitionOut(value=out[0][0])
+    words: List[RecognitionOut] = []
+    for file in files:
+        mime_type = file.content_type
+        if mime_type in ["image/jpeg", "image/png"]:
+            content = DocumentFile.from_images([await file.read()])
+        else:
+            raise HTTPException(
+                status_code=400, detail=f"Unsupported file format for recognition endpoint: {mime_type}"
+            )
+
+        words.append(RecognitionOut(name=file.filename or "", value=reco_predictor(content)[0][0]))
+
+    return words
diff --git a/api/app/schemas.py b/api/app/schemas.py
index a5bef9cef8..ad9ea1dd35 100644
--- a/api/app/schemas.py
+++ b/api/app/schemas.py
@@ -3,19 +3,35 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Tuple
+from typing import Dict, List, Tuple, Union
 
 from pydantic import BaseModel, Field
 
 
-# Recognition output
 class RecognitionOut(BaseModel):
-    value: str = Field(..., example="Hello")
+    name: str = Field(..., examples=["example.jpg"])
+    value: str = Field(..., examples=["Hello"])
 
 
 class DetectionOut(BaseModel):
-    box: Tuple[float, float, float, float]
+    name: str = Field(..., examples=["example.jpg"])
+    boxes: List[Tuple[float, float, float, float]]
 
 
-class OCROut(RecognitionOut, DetectionOut):
-    pass
+class OCROut(BaseModel):
+    name: str = Field(..., examples=["example.jpg"])
+    items: List[Dict[str, Union[str, Tuple[float, float, float, float]]]] = Field(
+        ..., examples=[{"value": "example", "box": [0.0, 0.0, 0.0, 0.0]}]
+    )
+
+
+class KIEElement(BaseModel):
+    class_name: str = Field(..., examples=["example"])
+    items: List[Dict[str, Union[str, Tuple[float, float, float, float]]]] = Field(
+        ..., examples=[{"value": "example", "box": [0.0, 0.0, 0.0, 0.0]}]
+    )
+
+
+class KIEOut(BaseModel):
+    name: str = Field(..., examples=["example.jpg"])
+    predictions: List[KIEElement]
diff --git a/api/app/vision.py b/api/app/vision.py
index c3e5f7560a..0ec3f73d5e 100644
--- a/api/app/vision.py
+++ b/api/app/vision.py
@@ -11,7 +11,7 @@
 
 from doctr.models import kie_predictor, ocr_predictor
 
-predictor = ocr_predictor(pretrained=True)
+predictor = ocr_predictor(pretrained=True, assume_straight_pages=True)
 det_predictor = predictor.det_predictor
 reco_predictor = predictor.reco_predictor
-kie_predictor = kie_predictor(pretrained=True)
+kie_predictor = kie_predictor(pretrained=True, assume_straight_pages=True)
diff --git a/api/docker-compose.yml b/api/docker-compose.yml
index cc85ef841b..4140ed9cbb 100644
--- a/api/docker-compose.yml
+++ b/api/docker-compose.yml
@@ -1,4 +1,4 @@
-version: '3.7'
+version: '3.8'
 
 services:
   web:
diff --git a/api/pyproject.toml b/api/pyproject.toml
index cb76a5c648..a96a3eeea2 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -10,8 +10,7 @@ authors = ["Mindee <contact@mindee.com>"]
 license = "Apache-2.0"
 
 [tool.poetry.dependencies]
-python = ">=3.9,<3.12"
-tensorflow = ">=2.11.0,<2.16.0"  # cf. https://github.com/mindee/doctr/pull/1461
+python = ">=3.8.2,<3.11"  # pypdfium2 needs a python version above 3.8.2
 python-doctr = {git = "https://github.com/mindee/doctr.git", extras = ['tf'], branch = "main" }
 # Fastapi: minimum version required to avoid pydantic error
 # cf. https://github.com/tiangolo/fastapi/issues/4168
diff --git a/api/tests/conftest.py b/api/tests/conftest.py
index 5fb7340c18..c482a7bb00 100644
--- a/api/tests/conftest.py
+++ b/api/tests/conftest.py
@@ -17,6 +17,13 @@ def mock_detection_image(tmpdir_factory):
     return requests.get(url).content
 
 
+@pytest_asyncio.fixture(scope="session")
+def mock_txt_file(tmpdir_factory):
+    txt_file = tmpdir_factory.mktemp("data").join("mock.txt")
+    txt_file.write("mock text")
+    return txt_file.read("rb")
+
+
 @pytest_asyncio.fixture(scope="function")
 async def test_app_asyncio():
     # for httpx>=20, follow_redirects=True (cf. https://github.com/encode/httpx/releases/tag/0.20.0)
diff --git a/api/tests/routes/test_detection.py b/api/tests/routes/test_detection.py
index db3c17c5e7..5c6852d1eb 100644
--- a/api/tests/routes/test_detection.py
+++ b/api/tests/routes/test_detection.py
@@ -6,8 +6,8 @@
 
 
 @pytest.mark.asyncio
-async def test_text_detection(test_app_asyncio, mock_detection_image):
-    response = await test_app_asyncio.post("/detection", files={"file": mock_detection_image})
+async def test_text_detection(test_app_asyncio, mock_detection_image, mock_txt_file):
+    response = await test_app_asyncio.post("/detection", files={"files": [mock_detection_image] * 2})
     assert response.status_code == 200
     json_response = response.json()
 
@@ -16,9 +16,14 @@ async def test_text_detection(test_app_asyncio, mock_detection_image):
     gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] / 2339
 
     # Check that IoU with GT if reasonable
-    assert isinstance(json_response, list) and len(json_response) == gt_boxes.shape[0]
-    pred_boxes = np.array([elt["box"] for elt in json_response])
+    assert isinstance(json_response, list) and len(json_response) == 2
+    first_pred = json_response[0]
+    assert isinstance(first_pred, dict) and len(first_pred["boxes"]) == gt_boxes.shape[0]
+    pred_boxes = np.array(first_pred["boxes"])
     iou_mat = box_iou(gt_boxes, pred_boxes)
     gt_idxs, pred_idxs = linear_sum_assignment(-iou_mat)
     is_kept = iou_mat[gt_idxs, pred_idxs] >= 0.8
     assert gt_idxs[is_kept].shape[0] == gt_boxes.shape[0]
+
+    response = await test_app_asyncio.post("/detection", files={"files": [mock_txt_file]})
+    assert response.status_code == 400
diff --git a/api/tests/routes/test_kie.py b/api/tests/routes/test_kie.py
index cf3c5678a5..60fcec7e0a 100644
--- a/api/tests/routes/test_kie.py
+++ b/api/tests/routes/test_kie.py
@@ -6,8 +6,8 @@
 
 
 @pytest.mark.asyncio
-async def test_perform_kie(test_app_asyncio, mock_detection_image):
-    response = await test_app_asyncio.post("/kie", files={"file": mock_detection_image})
+async def test_perform_kie(test_app_asyncio, mock_detection_image, mock_txt_file):
+    response = await test_app_asyncio.post("/kie", files={"files": [mock_detection_image] * 2})
     assert response.status_code == 200
     json_response = response.json()
 
@@ -17,12 +17,17 @@ async def test_perform_kie(test_app_asyncio, mock_detection_image):
     gt_labels = ["Hello", "world!"]
 
     # Check that IoU with GT if reasonable
-    assert isinstance(json_response, dict) and len(list(json_response.values())[0]) == gt_boxes.shape[0]
-    pred_boxes = np.array([elt["box"] for json_out in json_response.values() for elt in json_out])
-    pred_labels = np.array([elt["value"] for json_out in json_response.values() for elt in json_out])
+    assert isinstance(json_response, list) and len(json_response) == 2
+    first_pred = json_response[0]
+    assert isinstance(first_pred, dict) and len(first_pred["predictions"]["items"]) == gt_boxes.shape[0]
+    pred_boxes = np.array([elt["box"] for elt in first_pred["predictions"]["items"]])
+    pred_labels = np.array([elt["value"] for elt in first_pred["predictions"]["items"]])
     iou_mat = box_iou(gt_boxes, pred_boxes)
     gt_idxs, pred_idxs = linear_sum_assignment(-iou_mat)
     is_kept = iou_mat[gt_idxs, pred_idxs] >= 0.8
     gt_idxs, pred_idxs = gt_idxs[is_kept], pred_idxs[is_kept]
     assert gt_idxs.shape[0] == gt_boxes.shape[0]
     assert all(gt_labels[gt_idx] == pred_labels[pred_idx] for gt_idx, pred_idx in zip(gt_idxs, pred_idxs))
+
+    response = await test_app_asyncio.post("/kie", files={"files": [mock_txt_file]})
+    assert response.status_code == 400
diff --git a/api/tests/routes/test_ocr.py b/api/tests/routes/test_ocr.py
index 3d7b3df3b9..a896181948 100644
--- a/api/tests/routes/test_ocr.py
+++ b/api/tests/routes/test_ocr.py
@@ -6,8 +6,8 @@
 
 
 @pytest.mark.asyncio
-async def test_perform_ocr(test_app_asyncio, mock_detection_image):
-    response = await test_app_asyncio.post("/ocr", files={"file": mock_detection_image})
+async def test_perform_ocr(test_app_asyncio, mock_detection_image, mock_txt_file):
+    response = await test_app_asyncio.post("/ocr", files={"files": [mock_detection_image] * 2})
     assert response.status_code == 200
     json_response = response.json()
 
@@ -17,12 +17,17 @@ async def test_perform_ocr(test_app_asyncio, mock_detection_image):
     gt_labels = ["Hello", "world!"]
 
     # Check that IoU with GT if reasonable
-    assert isinstance(json_response, list) and len(json_response) == gt_boxes.shape[0]
-    pred_boxes = np.array([elt["box"] for elt in json_response])
-    pred_labels = np.array([elt["value"] for elt in json_response])
+    assert isinstance(json_response, list) and len(json_response) == 2
+    first_pred = json_response[0]
+    assert isinstance(first_pred, dict) and len(first_pred["items"]) == gt_boxes.shape[0]
+    pred_boxes = np.array([elt["box"] for elt in first_pred["items"]])
+    pred_labels = np.array([elt["value"] for elt in first_pred["items"]])
     iou_mat = box_iou(gt_boxes, pred_boxes)
     gt_idxs, pred_idxs = linear_sum_assignment(-iou_mat)
     is_kept = iou_mat[gt_idxs, pred_idxs] >= 0.8
     gt_idxs, pred_idxs = gt_idxs[is_kept], pred_idxs[is_kept]
     assert gt_idxs.shape[0] == gt_boxes.shape[0]
     assert all(gt_labels[gt_idx] == pred_labels[pred_idx] for gt_idx, pred_idx in zip(gt_idxs, pred_idxs))
+
+    response = await test_app_asyncio.post("/ocr", files={"files": [mock_txt_file]})
+    assert response.status_code == 400
diff --git a/api/tests/routes/test_recognition.py b/api/tests/routes/test_recognition.py
index 95467758a8..990d9fb900 100644
--- a/api/tests/routes/test_recognition.py
+++ b/api/tests/routes/test_recognition.py
@@ -2,8 +2,10 @@
 
 
 @pytest.mark.asyncio
-async def test_text_recognition(test_app_asyncio, mock_recognition_image):
-    response = await test_app_asyncio.post("/recognition", files={"file": mock_recognition_image})
+async def test_text_recognition(test_app_asyncio, mock_recognition_image, mock_txt_file):
+    response = await test_app_asyncio.post("/recognition", files={"files": [mock_recognition_image] * 2})
     assert response.status_code == 200
+    assert response.json() == [{"value": "invite"}, {"value": "invite"}]
 
-    assert response.json() == {"value": "invite"}
+    response = await test_app_asyncio.post("/recognition", files={"files": [mock_txt_file]})
+    assert response.status_code == 400

From 73e3faa7f07b5121878527fa1f5294a45ce83f90 Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Mon, 1 Apr 2024 12:06:53 +0200
Subject: [PATCH 02/11] rebase

---
 api/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/pyproject.toml b/api/pyproject.toml
index a96a3eeea2..9824f0442a 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -10,7 +10,7 @@ authors = ["Mindee <contact@mindee.com>"]
 license = "Apache-2.0"
 
 [tool.poetry.dependencies]
-python = ">=3.8.2,<3.11"  # pypdfium2 needs a python version above 3.8.2
+python = ">=3.9,<3.12"
 python-doctr = {git = "https://github.com/mindee/doctr.git", extras = ['tf'], branch = "main" }
 # Fastapi: minimum version required to avoid pydantic error
 # cf. https://github.com/tiangolo/fastapi/issues/4168

From f0412ece3fd4aaefef3fd9eb4141f71ed821ee3d Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Wed, 10 Apr 2024 14:30:18 +0200
Subject: [PATCH 03/11] update with included config

---
 api/README.md                      |  91 +++++++++++++++++++-----
 api/app/routes/detection.py        |  40 +++++------
 api/app/routes/kie.py              |  62 +++++++++--------
 api/app/routes/ocr.py              |  70 ++++++++++++-------
 api/app/routes/recognition.py      |  32 ++++-----
 api/app/schemas.py                 | 107 +++++++++++++++++++++++++++--
 api/app/utils.py                   |  49 +++++++++++++
 api/app/vision.py                  |  38 ++++++++--
 api/tests/routes/test_detection.py |   6 +-
 api/tests/routes/test_kie.py       |  10 ++-
 api/tests/routes/test_ocr.py       |   2 +-
 api/tests/utils/test_utils.py      |  26 +++++++
 api/tests/utils/test_vision.py     |  13 ++++
 13 files changed, 416 insertions(+), 130 deletions(-)
 create mode 100644 api/app/utils.py
 create mode 100644 api/tests/utils/test_utils.py
 create mode 100644 api/tests/utils/test_vision.py

diff --git a/api/README.md b/api/README.md
index 200c2a164a..a9501e2542 100644
--- a/api/README.md
+++ b/api/README.md
@@ -45,12 +45,22 @@ should yield
 ```json
 [
   {
-      "name": "invitation.png",
-      "boxes": [
-        [0.50390625, 0.712890625, 0.5185546875, 0.720703125],
-        [0.4716796875, 0.712890625, 0.48828125, 0.720703125]
+    "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
+    "geometries": [
+      [
+        0.724609375,
+        0.1787109375,
+        0.7900390625,
+        0.2080078125
+      ],
+      [
+        0.6748046875,
+        0.1796875,
+        0.7314453125,
+        0.20703125
       ]
-  },
+    ]
+  }
 ]
 ```
 
@@ -73,9 +83,10 @@ should yield
 ```json
 [
   {
-      "name": "invitation.png",
-      "value": "invite"
-  },
+    "name": "117133599-c073fa00-ada4-11eb-831b-412de4d28341.jpeg",
+    "value": "invite",
+    "confidence": 1.0
+  }
 ]
 ```
 
@@ -98,17 +109,61 @@ should yield
 ```json
 [
   {
-      "name": "hello_world.jpg",
-      "items": [
+    "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
+    "orientation": {
+      "value": 0,
+      "confidence": null
+    },
+    "language": {
+      "value": null,
+      "confidence": null
+    },
+    "items": [
       {
-          "value": "Hello",
-          "box": [0.005859375, 0.003312938981562763, 0.0205078125, 0.0332854340430202]
-      },
-      {
-          "value": "world!",
-          "box": [0.005859375, 0.003312938981562763, 0.0205078125, 0.0332854340430202]
-      },
-      ],
+        "blocks": [
+          {
+            "geometry": [
+              0.7471996155154171,
+              0.1787109375,
+              0.9101580212741838,
+              0.2080078125
+            ],
+            "lines": [
+              {
+                "geometry": [
+                  0.7471996155154171,
+                  0.1787109375,
+                  0.9101580212741838,
+                  0.2080078125
+                ],
+                "words": [
+                  {
+                    "value": "Hello",
+                    "geometry": [
+                      0.7471996155154171,
+                      0.1796875,
+                      0.8272978149561669,
+                      0.20703125
+                    ],
+                    "confidence": 1.0
+                  },
+                  {
+                    "value": "world!",
+                    "geometry": [
+                      0.8176307908857315,
+                      0.1787109375,
+                      0.9101580212741838,
+                      0.2080078125
+                    ],
+                    "confidence": 1.0
+                  }
+                ]
+              }
+            ]
+          }
+        ]
+      }
+    ]
   }
 ]
 ```
diff --git a/api/app/routes/detection.py b/api/app/routes/detection.py
index 2e9216639e..e044d1f815 100644
--- a/api/app/routes/detection.py
+++ b/api/app/routes/detection.py
@@ -5,33 +5,31 @@
 
 from typing import List
 
-from fastapi import APIRouter, File, HTTPException, UploadFile, status
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
 
-from app.schemas import DetectionOut
-from app.vision import det_predictor
+from app.schemas import DetectionIn, DetectionOut
+from app.utils import get_documents, resolve_geometry
+from app.vision import init_predictor
 from doctr.file_utils import CLASS_NAME
-from doctr.io import DocumentFile
 
 router = APIRouter()
 
 
 @router.post("/", response_model=List[DetectionOut], status_code=status.HTTP_200_OK, summary="Perform text detection")
-async def text_detection(files: List[UploadFile] = [File(...)]):
+async def text_detection(request: DetectionIn = Depends(), files: List[UploadFile] = [File(...)]):
     """Runs docTR text detection model to analyze the input image"""
-    boxes: List[DetectionOut] = []
-    for file in files:
-        mime_type = file.content_type
-        if mime_type in ["image/jpeg", "image/png"]:
-            content = DocumentFile.from_images([await file.read()])
-        elif mime_type == "application/pdf":
-            content = DocumentFile.from_pdf(await file.read())
-        else:
-            raise HTTPException(status_code=400, detail=f"Unsupported file format for detection endpoint: {mime_type}")
-
-        boxes.append(
-            DetectionOut(
-                name=file.filename or "", boxes=[box.tolist() for box in det_predictor(content)[0][CLASS_NAME][:, :-1]]
-            )
+    try:
+        predictor = init_predictor(request)
+        content, filenames = await get_documents(files)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    return [
+        DetectionOut(
+            name=filename,
+            geometries=[
+                geom[:-1].tolist() if len(geom) == 5 else resolve_geometry(geom.tolist()) for geom in doc[CLASS_NAME]
+            ],
         )
-
-    return boxes
+        for doc, filename in zip(predictor(content), filenames)
+    ]
diff --git a/api/app/routes/kie.py b/api/app/routes/kie.py
index 2d947cc49e..ece3e1a8cb 100644
--- a/api/app/routes/kie.py
+++ b/api/app/routes/kie.py
@@ -5,45 +5,47 @@
 
 from typing import List
 
-from fastapi import APIRouter, File, HTTPException, UploadFile, status
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
 
-from app.schemas import KIEElement, KIEOut
-from app.vision import kie_predictor
-from doctr.io import DocumentFile
+from app.schemas import KIEElement, KIEIn, KIEOut
+from app.utils import get_documents, resolve_geometry
+from app.vision import init_predictor
 
 router = APIRouter()
 
 
 @router.post("/", response_model=List[KIEOut], status_code=status.HTTP_200_OK, summary="Perform KIE")
-async def perform_kie(files: List[UploadFile] = [File(...)]):
+async def perform_kie(request: KIEIn = Depends(), files: List[UploadFile] = [File(...)]):
     """Runs docTR KIE model to analyze the input image"""
-    results: List[KIEOut] = []
-    for file in files:
-        mime_type = file.content_type
-        if mime_type in ["image/jpeg", "image/png"]:
-            content = DocumentFile.from_images([await file.read()])
-        elif mime_type == "application/pdf":
-            content = DocumentFile.from_pdf(await file.read())
-        else:
-            raise HTTPException(status_code=400, detail=f"Unsupported file format for KIE endpoint: {mime_type}")
-
-        out = kie_predictor(content)
-
-        for page in out.pages:
-            results.append(
-                KIEOut(
-                    name=file.filename or "",
-                    predictions=[
-                        KIEElement(
-                            class_name=class_name,
-                            items=[
-                                dict(value=prediction.value, box=(*prediction.geometry[0], *prediction.geometry[1]))
-                                for prediction in page.predictions[class_name]
-                            ],
+    try:
+        predictor = init_predictor(request)
+        content, filenames = await get_documents(files)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    out = predictor(content)
+
+    results = [
+        KIEOut(
+            name=filenames[i],
+            orientation=page.orientation,
+            language=page.language,
+            predictions=[
+                KIEElement(
+                    class_name=class_name,
+                    items=[
+                        dict(
+                            value=prediction.value,
+                            geometry=resolve_geometry(prediction.geometry),
+                            confidence=round(prediction.confidence, 2),
                         )
-                        for class_name in page.predictions.keys()
+                        for prediction in page.predictions[class_name]
                     ],
                 )
-            )
+                for class_name in page.predictions.keys()
+            ],
+        )
+        for i, page in enumerate(out.pages)
+    ]
 
     return results
diff --git a/api/app/routes/ocr.py b/api/app/routes/ocr.py
index 484898daae..dc18af795c 100644
--- a/api/app/routes/ocr.py
+++ b/api/app/routes/ocr.py
@@ -5,40 +5,58 @@
 
 from typing import List
 
-from fastapi import APIRouter, File, HTTPException, UploadFile, status
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
 
-from app.schemas import OCROut
-from app.vision import predictor
-from doctr.io import DocumentFile
+from app.schemas import OCRBlock, OCRIn, OCRLine, OCROut, OCRPage, OCRWord
+from app.utils import get_documents, resolve_geometry
+from app.vision import init_predictor
 
 router = APIRouter()
 
 
 @router.post("/", response_model=List[OCROut], status_code=status.HTTP_200_OK, summary="Perform OCR")
-async def perform_ocr(files: List[UploadFile] = [File(...)]):
+async def perform_ocr(request: OCRIn = Depends(), files: List[UploadFile] = [File(...)]):
     """Runs docTR OCR model to analyze the input image"""
-    results: List[OCROut] = []
-    for file in files:
-        mime_type = file.content_type
-        if mime_type in ["image/jpeg", "image/png"]:
-            content = DocumentFile.from_images([await file.read()])
-        elif mime_type == "application/pdf":
-            content = DocumentFile.from_pdf(await file.read())
-        else:
-            raise HTTPException(status_code=400, detail=f"Unsupported file format for OCR endpoint: {mime_type}")
-
-        out = predictor(content)
-        for page in out.pages:
-            results.append(
-                OCROut(
-                    name=file.filename or "",
-                    items=[
-                        dict(value=word.value, box=(*word.geometry[0], *word.geometry[1]))
+    try:
+        # get_documents raises ValueError for unsupported MIME types; it is reported below as HTTP 400
+        content, filenames = await get_documents(files)
+        predictor = init_predictor(request)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    out = predictor(content)
+
+    results = [
+        OCROut(
+            name=filenames[i],
+            orientation=page.orientation,
+            language=page.language,
+            items=[
+                OCRPage(
+                    blocks=[
+                        OCRBlock(
+                            geometry=resolve_geometry(block.geometry),
+                            lines=[
+                                OCRLine(
+                                    geometry=resolve_geometry(line.geometry),
+                                    words=[
+                                        OCRWord(
+                                            value=word.value,
+                                            geometry=resolve_geometry(word.geometry),
+                                            confidence=round(word.confidence, 2),
+                                        )
+                                        for word in line.words
+                                    ],
+                                )
+                                for line in block.lines
+                            ],
+                        )
                         for block in page.blocks
-                        for line in block.lines
-                        for word in line.words
-                    ],
+                    ]
                 )
-            )
+            ],
+        )
+        for i, page in enumerate(out.pages)
+    ]
 
     return results
diff --git a/api/app/routes/recognition.py b/api/app/routes/recognition.py
index e8bf4610e4..65de3e07ba 100644
--- a/api/app/routes/recognition.py
+++ b/api/app/routes/recognition.py
@@ -5,11 +5,11 @@
 
 from typing import List
 
-from fastapi import APIRouter, File, HTTPException, UploadFile, status
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
 
-from app.schemas import RecognitionOut
-from app.vision import reco_predictor
-from doctr.io import DocumentFile
+from app.schemas import RecognitionIn, RecognitionOut
+from app.utils import get_documents
+from app.vision import init_predictor
 
 router = APIRouter()
 
@@ -17,18 +17,14 @@
 @router.post(
     "/", response_model=List[RecognitionOut], status_code=status.HTTP_200_OK, summary="Perform text recognition"
 )
-async def text_recognition(files: List[UploadFile] = [File(...)]):
+async def text_recognition(request: RecognitionIn = Depends(), files: List[UploadFile] = [File(...)]):
     """Runs docTR text recognition model to analyze the input image"""
-    words: List[RecognitionOut] = []
-    for file in files:
-        mime_type = file.content_type
-        if mime_type in ["image/jpeg", "image/png"]:
-            content = DocumentFile.from_images([await file.read()])
-        else:
-            raise HTTPException(
-                status_code=400, detail=f"Unsupported file format for recognition endpoint: {mime_type}"
-            )
-
-        words.append(RecognitionOut(name=file.filename or "", value=reco_predictor(content)[0][0]))
-
-    return words
+    try:
+        predictor = init_predictor(request)
+        content, filenames = await get_documents(files)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+    return [
+        RecognitionOut(name=filename, value=res[0], confidence=round(res[1], 2))
+        for res, filename in zip(predictor(content), filenames)
+    ]
diff --git a/api/app/schemas.py b/api/app/schemas.py
index ad9ea1dd35..46a9cb0ac5 100644
--- a/api/app/schemas.py
+++ b/api/app/schemas.py
@@ -3,35 +3,130 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Union
 
 from pydantic import BaseModel, Field
 
 
+class KIEIn(BaseModel):
+    det_arch: str = Field(default="db_resnet50", examples=["db_resnet50"])
+    reco_arch: str = Field(default="crnn_vgg16_bn", examples=["crnn_vgg16_bn"])
+    assume_straight_pages: bool = Field(default=True, examples=[True])
+    preserve_aspect_ratio: bool = Field(default=True, examples=[True])
+    detect_orientation: bool = Field(default=False, examples=[False])
+    detect_language: bool = Field(default=False, examples=[False])
+    symmetric_pad: bool = Field(default=True, examples=[True])
+    straighten_pages: bool = Field(default=False, examples=[False])
+    det_bs: int = Field(default=2, examples=[2])
+    reco_bs: int = Field(default=128, examples=[128])
+    bin_thresh: float = Field(default=0.1, examples=[0.1])
+    box_thresh: float = Field(default=0.1, examples=[0.1])
+
+
+class OCRIn(KIEIn):
+    resolve_lines: bool = Field(default=True, examples=[True])
+    resolve_blocks: bool = Field(default=True, examples=[True])
+    paragraph_break: float = Field(default=0.0035, examples=[0.0035])
+
+
+class RecognitionIn(BaseModel):
+    reco_arch: str = Field(default="crnn_vgg16_bn", examples=["crnn_vgg16_bn"])
+    reco_bs: int = Field(default=128, examples=[128])
+
+
+class DetectionIn(BaseModel):
+    det_arch: str = Field(default="db_resnet50", examples=["db_resnet50"])
+    assume_straight_pages: bool = Field(default=True, examples=[True])
+    preserve_aspect_ratio: bool = Field(default=True, examples=[True])
+    symmetric_pad: bool = Field(default=True, examples=[True])
+    det_bs: int = Field(default=2, examples=[2])
+    bin_thresh: float = Field(default=0.1, examples=[0.1])
+    box_thresh: float = Field(default=0.1, examples=[0.1])
+
+
 class RecognitionOut(BaseModel):
     name: str = Field(..., examples=["example.jpg"])
     value: str = Field(..., examples=["Hello"])
+    confidence: float = Field(..., examples=[0.99])
 
 
 class DetectionOut(BaseModel):
     name: str = Field(..., examples=["example.jpg"])
-    boxes: List[Tuple[float, float, float, float]]
+    geometries: List[List[float]] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
+
+
+class OCRWord(BaseModel):
+    value: str = Field(..., examples=["example"])
+    geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
+    confidence: float = Field(..., examples=[0.99])
+
+
+class OCRLine(BaseModel):
+    geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
+    words: List[OCRWord] = Field(
+        ..., examples=[{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}]
+    )
+
+
+class OCRBlock(BaseModel):
+    geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
+    lines: List[OCRLine] = Field(
+        ...,
+        examples=[
+            {
+                "geometry": [0.0, 0.0, 0.0, 0.0],
+                "words": [{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}],
+            }
+        ],
+    )
+
+
+class OCRPage(BaseModel):
+    blocks: List[OCRBlock] = Field(
+        ...,
+        examples=[
+            {
+                "geometry": [0.0, 0.0, 0.0, 0.0],
+                "lines": [
+                    {
+                        "geometry": [0.0, 0.0, 0.0, 0.0],
+                        "words": [{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}],
+                    }
+                ],
+            }
+        ],
+    )
 
 
 class OCROut(BaseModel):
     name: str = Field(..., examples=["example.jpg"])
-    items: List[Dict[str, Union[str, Tuple[float, float, float, float]]]] = Field(
-        ..., examples=[{"value": "example", "box": [0.0, 0.0, 0.0, 0.0]}]
+    orientation: Dict[str, Union[float, None]] = Field(..., examples=[{"value": 0.0, "confidence": 0.99}])
+    language: Dict[str, Union[str, float, None]] = Field(..., examples=[{"value": "en", "confidence": 0.99}])
+    items: List[OCRPage] = Field(
+        ...,
+        examples=[
+            {
+                "geometry": [0.0, 0.0, 0.0, 0.0],
+                "lines": [
+                    {
+                        "geometry": [0.0, 0.0, 0.0, 0.0],
+                        "words": [{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}],
+                    }
+                ],
+            }
+        ],
     )
 
 
 class KIEElement(BaseModel):
     class_name: str = Field(..., examples=["example"])
-    items: List[Dict[str, Union[str, Tuple[float, float, float, float]]]] = Field(
-        ..., examples=[{"value": "example", "box": [0.0, 0.0, 0.0, 0.0]}]
+    items: List[Dict[str, Union[str, List[float], float]]] = Field(
+        ..., examples=[{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}]
     )
 
 
 class KIEOut(BaseModel):
     name: str = Field(..., examples=["example.jpg"])
+    orientation: Dict[str, Union[float, None]] = Field(..., examples=[{"value": 0.0, "confidence": 0.99}])
+    language: Dict[str, Union[str, float, None]] = Field(..., examples=[{"value": "en", "confidence": 0.99}])
     predictions: List[KIEElement]
diff --git a/api/app/utils.py b/api/app/utils.py
new file mode 100644
index 0000000000..d1897f51b1
--- /dev/null
+++ b/api/app/utils.py
@@ -0,0 +1,49 @@
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+
+from typing import Any, List, Tuple, Union
+
+import numpy as np
+from fastapi import UploadFile
+
+from doctr.io import DocumentFile
+
+
+def resolve_geometry(
+    geom: Any,
+) -> Union[Tuple[float, float, float, float], Tuple[float, float, float, float, float, float, float, float]]:
+    if len(geom) == 4:
+        return (*geom[0], *geom[1], *geom[2], *geom[3])
+    return (*geom[0], *geom[1])
+
+
+async def get_documents(files: List[UploadFile]) -> Tuple[List[np.ndarray], List[str]]:  # pragma: no cover
+    """Decode uploaded files into page arrays, with one filename entry per decoded page
+
+    Args:
+    ----
+        files: uploaded images (JPEG / PNG) or PDFs; any other MIME type raises a ValueError
+
+    Returns:
+    -------
+        Tuple[List[np.ndarray], List[str]]: the decoded pages and, index-aligned, their source filenames
+
+    """
+    filenames: List[str] = []
+    docs: List[np.ndarray] = []
+    for file in files:
+        mime_type = file.content_type
+        if mime_type in ["image/jpeg", "image/png"]:
+            docs.extend(DocumentFile.from_images([await file.read()]))
+            filenames.append(file.filename or "")
+        elif mime_type == "application/pdf":
+            pdf_content = DocumentFile.from_pdf(await file.read())
+            docs.extend(pdf_content)
+            filenames.extend([file.filename or ""] * len(pdf_content))  # callers index filenames per page
+        else:
+            raise ValueError(f"Unsupported file format: {mime_type} for file {file.filename}")
+
+    return docs, filenames
diff --git a/api/app/vision.py b/api/app/vision.py
index 0ec3f73d5e..005c8d1548 100644
--- a/api/app/vision.py
+++ b/api/app/vision.py
@@ -3,15 +3,45 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
+
 import tensorflow as tf
 
 gpu_devices = tf.config.experimental.list_physical_devices("GPU")
 if any(gpu_devices):
     tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 
+from typing import Callable, Union
+
 from doctr.models import kie_predictor, ocr_predictor
 
-predictor = ocr_predictor(pretrained=True, assume_straight_pages=True)
-det_predictor = predictor.det_predictor
-reco_predictor = predictor.reco_predictor
-kie_predictor = kie_predictor(pretrained=True, assume_straight_pages=True)
+from .schemas import DetectionIn, KIEIn, OCRIn, RecognitionIn
+
+
+def init_predictor(request: Union[KIEIn, OCRIn, RecognitionIn, DetectionIn]) -> Callable:
+    """Build the docTR predictor matching the type of the incoming request
+
+    Args:
+    ----
+        request: request schema whose dumped fields (minus the thresholds) become predictor keyword arguments
+
+    Returns:
+    -------
+        Callable: the OCR or KIE predictor, or its detection / recognition sub-predictor
+    """
+    params = request.model_dump()
+    thresholds = {name: params.pop(name, None) for name in ("bin_thresh", "box_thresh")}
+    # OCRIn subclasses KIEIn, so the OCR family has to be matched before the KIE fallback
+    if isinstance(request, (OCRIn, RecognitionIn, DetectionIn)):
+        predictor = ocr_predictor(pretrained=True, **params)
+    elif isinstance(request, KIEIn):
+        predictor = kie_predictor(pretrained=True, **params)
+    else:
+        raise ValueError(f"Unsupported request type: {type(request).__name__}")
+    for name, value in thresholds.items():
+        if value is not None:  # RecognitionIn has no threshold fields: keep the model defaults
+            setattr(predictor.det_predictor.model.postprocessor, name, value)
+    if isinstance(request, DetectionIn):
+        return predictor.det_predictor
+    if isinstance(request, RecognitionIn):
+        return predictor.reco_predictor
+    return predictor
diff --git a/api/tests/routes/test_detection.py b/api/tests/routes/test_detection.py
index 5c6852d1eb..05f54a11e9 100644
--- a/api/tests/routes/test_detection.py
+++ b/api/tests/routes/test_detection.py
@@ -17,9 +17,9 @@ async def test_text_detection(test_app_asyncio, mock_detection_image, mock_txt_f
 
     # Check that IoU with GT if reasonable
     assert isinstance(json_response, list) and len(json_response) == 2
-    first_pred = json_response[0]
-    assert isinstance(first_pred, dict) and len(first_pred["boxes"]) == gt_boxes.shape[0]
-    pred_boxes = np.array(first_pred["boxes"])
+    first_pred = json_response[0]  # it's enough to test for the first file because the same image is used twice
+    assert isinstance(first_pred, dict) and len(first_pred["geometries"]) == gt_boxes.shape[0]
+    pred_boxes = np.array(first_pred["geometries"])
     iou_mat = box_iou(gt_boxes, pred_boxes)
     gt_idxs, pred_idxs = linear_sum_assignment(-iou_mat)
     is_kept = iou_mat[gt_idxs, pred_idxs] >= 0.8
diff --git a/api/tests/routes/test_kie.py b/api/tests/routes/test_kie.py
index 60fcec7e0a..2b0c9b3b38 100644
--- a/api/tests/routes/test_kie.py
+++ b/api/tests/routes/test_kie.py
@@ -18,9 +18,13 @@ async def test_perform_kie(test_app_asyncio, mock_detection_image, mock_txt_file
 
     # Check that IoU with GT if reasonable
     assert isinstance(json_response, list) and len(json_response) == 2
-    first_pred = json_response[0]
-    assert isinstance(first_pred, dict) and len(first_pred["predictions"]["items"]) == gt_boxes.shape[0]
-    pred_boxes = np.array([elt["box"] for elt in first_pred["predictions"]["items"]])
+    first_pred = json_response[0]  # it's enough to test for the first file because the same image is used twice
+    assert (
+        isinstance(first_pred, dict)
+        and len(first_pred["predictions"]["items"]) == gt_boxes.shape[0]
+        and isinstance(first_pred["predictions"]["class_name"], str)
+    )
+    pred_boxes = np.array([elt["geometry"] for elt in first_pred["predictions"]["items"]])
     pred_labels = np.array([elt["value"] for elt in first_pred["predictions"]["items"]])
     iou_mat = box_iou(gt_boxes, pred_boxes)
     gt_idxs, pred_idxs = linear_sum_assignment(-iou_mat)
diff --git a/api/tests/routes/test_ocr.py b/api/tests/routes/test_ocr.py
index a896181948..aa678c27ee 100644
--- a/api/tests/routes/test_ocr.py
+++ b/api/tests/routes/test_ocr.py
@@ -18,7 +18,7 @@ async def test_perform_ocr(test_app_asyncio, mock_detection_image, mock_txt_file
 
     # Check that IoU with GT if reasonable
     assert isinstance(json_response, list) and len(json_response) == 2
-    first_pred = json_response[0]
+    first_pred = json_response[0]  # it's enough to test for the first file because the same image is used twice
     assert isinstance(first_pred, dict) and len(first_pred["items"]) == gt_boxes.shape[0]
     pred_boxes = np.array([elt["box"] for elt in first_pred["items"]])
     pred_labels = np.array([elt["value"] for elt in first_pred["items"]])
diff --git a/api/tests/utils/test_utils.py b/api/tests/utils/test_utils.py
new file mode 100644
index 0000000000..b346565feb
--- /dev/null
+++ b/api/tests/utils/test_utils.py
@@ -0,0 +1,26 @@
+from app.utils import resolve_geometry
+
+
+def test_resolve_geometry():
+    dummy_box = [(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)]
+    dummy_polygon = [(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)]
+
+    assert resolve_geometry(dummy_box) == (0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0)
+    assert resolve_geometry(dummy_polygon) == [
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        1.0,
+        1.0,
+        0.0,
+        1.0,
+        0.0,
+        0.0,
+        1.0,
+        0.0,
+        1.0,
+        1.0,
+        0.0,
+        1.0,
+    ]
diff --git a/api/tests/utils/test_vision.py b/api/tests/utils/test_vision.py
new file mode 100644
index 0000000000..04050268f7
--- /dev/null
+++ b/api/tests/utils/test_vision.py
@@ -0,0 +1,13 @@
+from app.schemas import DetectionIn, KIEIn, OCRIn, RecognitionIn
+from app.vision import init_predictor
+from doctr.models.detection.predictor import DetectionPredictor
+from doctr.models.kie_predictor import KIEPredictor
+from doctr.models.predictor import OCRPredictor
+from doctr.models.recognition.predictor import RecognitionPredictor
+
+
+def test_vision():
+    assert isinstance(init_predictor(OCRIn), OCRPredictor)
+    assert isinstance(init_predictor(DetectionIn), DetectionPredictor)
+    assert isinstance(init_predictor(RecognitionIn), RecognitionPredictor)
+    assert isinstance(init_predictor(KIEIn), KIEPredictor)

From 0525b69b0e34dfb4da9af8fa1ea7a7cc955de918 Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Wed, 10 Apr 2024 15:12:23 +0200
Subject: [PATCH 04/11] update mypy + tests

---
 api/tests/routes/test_ocr.py     |  6 +++---
 doctr/datasets/generator/base.py | 11 ++++++-----
 doctr/io/image/pytorch.py        |  2 +-
 doctr/io/image/tensorflow.py     |  2 +-
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/api/tests/routes/test_ocr.py b/api/tests/routes/test_ocr.py
index aa678c27ee..731117bc19 100644
--- a/api/tests/routes/test_ocr.py
+++ b/api/tests/routes/test_ocr.py
@@ -19,9 +19,9 @@ async def test_perform_ocr(test_app_asyncio, mock_detection_image, mock_txt_file
     # Check that IoU with GT if reasonable
     assert isinstance(json_response, list) and len(json_response) == 2
     first_pred = json_response[0]  # it's enough to test for the first file because the same image is used twice
-    assert isinstance(first_pred, dict) and len(first_pred["items"]) == gt_boxes.shape[0]
-    pred_boxes = np.array([elt["box"] for elt in first_pred["items"]])
-    pred_labels = np.array([elt["value"] for elt in first_pred["items"]])
+    assert isinstance(first_pred, dict) and len(first_pred["items"]["blocks"]["lines"]["words"]) == gt_boxes.shape[0]
+    pred_boxes = np.array([elt["geometry"] for elt in first_pred["items"]["blocks"]["lines"]["words"]])
+    pred_labels = np.array([elt["value"] for elt in first_pred["items"]["blocks"]["lines"]["words"]])
     iou_mat = box_iou(gt_boxes, pred_boxes)
     gt_idxs, pred_idxs = linear_sum_assignment(-iou_mat)
     is_kept = iou_mat[gt_idxs, pred_idxs] >= 0.8
diff --git a/doctr/datasets/generator/base.py b/doctr/datasets/generator/base.py
index 71a09abd85..424f59563d 100644
--- a/doctr/datasets/generator/base.py
+++ b/doctr/datasets/generator/base.py
@@ -20,7 +20,7 @@ def synthesize_text_img(
     font_family: Optional[str] = None,
     background_color: Optional[Tuple[int, int, int]] = None,
     text_color: Optional[Tuple[int, int, int]] = None,
-) -> Image:
+) -> Image.Image:
     """Generate a synthetic text image
 
     Args:
@@ -81,7 +81,7 @@ def __init__(
         self._data: List[Image.Image] = []
         if cache_samples:
             self._data = [
-                (synthesize_text_img(char, font_family=font), idx)
+                (synthesize_text_img(char, font_family=font), idx)  # type: ignore[misc]
                 for idx, char in enumerate(self.vocab)
                 for font in self.font_family
             ]
@@ -93,7 +93,7 @@ def _read_sample(self, index: int) -> Tuple[Any, int]:
         # Samples are already cached
         if len(self._data) > 0:
             idx = index % len(self._data)
-            pil_img, target = self._data[idx]
+            pil_img, target = self._data[idx]  # type: ignore[misc]
         else:
             target = index % len(self.vocab)
             pil_img = synthesize_text_img(self.vocab[target], font_family=random.choice(self.font_family))
@@ -132,7 +132,8 @@ def __init__(
         if cache_samples:
             _words = [self._generate_string(*self.wordlen_range) for _ in range(num_samples)]
             self._data = [
-                (synthesize_text_img(text, font_family=random.choice(self.font_family)), text) for text in _words
+                (synthesize_text_img(text, font_family=random.choice(self.font_family)), text)  # type: ignore[misc]
+                for text in _words
             ]
 
     def _generate_string(self, min_chars: int, max_chars: int) -> str:
@@ -145,7 +146,7 @@ def __len__(self) -> int:
     def _read_sample(self, index: int) -> Tuple[Any, str]:
         # Samples are already cached
         if len(self._data) > 0:
-            pil_img, target = self._data[index]
+            pil_img, target = self._data[index]  # type: ignore[misc]
         else:
             target = self._generate_string(*self.wordlen_range)
             pil_img = synthesize_text_img(target, font_family=random.choice(self.font_family))
diff --git a/doctr/io/image/pytorch.py b/doctr/io/image/pytorch.py
index 26e3e76f95..2e8450e840 100644
--- a/doctr/io/image/pytorch.py
+++ b/doctr/io/image/pytorch.py
@@ -16,7 +16,7 @@
 __all__ = ["tensor_from_pil", "read_img_as_tensor", "decode_img_as_tensor", "tensor_from_numpy", "get_img_shape"]
 
 
-def tensor_from_pil(pil_img: Image, dtype: torch.dtype = torch.float32) -> torch.Tensor:
+def tensor_from_pil(pil_img: Image.Image, dtype: torch.dtype = torch.float32) -> torch.Tensor:
     """Convert a PIL Image to a PyTorch tensor
 
     Args:
diff --git a/doctr/io/image/tensorflow.py b/doctr/io/image/tensorflow.py
index dbfc55b4be..28fb2fadd5 100644
--- a/doctr/io/image/tensorflow.py
+++ b/doctr/io/image/tensorflow.py
@@ -15,7 +15,7 @@
 __all__ = ["tensor_from_pil", "read_img_as_tensor", "decode_img_as_tensor", "tensor_from_numpy", "get_img_shape"]
 
 
-def tensor_from_pil(pil_img: Image, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
+def tensor_from_pil(pil_img: Image.Image, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
     """Convert a PIL Image to a TensorFlow tensor
 
     Args:

From f7c653d2ac87aacd5bffa2dd451b5995f72f99cf Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Wed, 10 Apr 2024 15:15:35 +0200
Subject: [PATCH 05/11] update ci

---
 .github/workflows/docker.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index f19c87e358..88b021de14 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -29,9 +29,9 @@ jobs:
           python-version: ${{ matrix.python }}
           architecture: x64
       - name: Build & run docker
-        run: cd api && docker-compose up -d --build
+        run: cd api && docker compose up -d --build
       - name: Ping server
         run: wget --spider --tries=12 http://localhost:8080/docs
       - name: Run docker test
         run: |
-          docker-compose -f api/docker-compose.yml exec --no-TTY web pytest tests/
+          docker compose -f api/docker-compose.yml exec --no-TTY web pytest tests/

From 37699f601241c0d68d1f785f293899fc62520f4f Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Wed, 10 Apr 2024 16:10:18 +0200
Subject: [PATCH 06/11] ci

---
 .github/workflows/docker.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 88b021de14..7409385424 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -34,4 +34,4 @@ jobs:
         run: wget --spider --tries=12 http://localhost:8080/docs
       - name: Run docker test
         run: |
-          docker compose -f api/docker-compose.yml exec --no-TTY web pytest tests/
+          docker-compose -f api/docker-compose.yml exec --no-TTY web pytest tests/

From 6b1ed8b7549a4edca79c5c72db9358ff4e422a9b Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Wed, 10 Apr 2024 16:43:08 +0200
Subject: [PATCH 07/11] update

---
 api/Makefile                  |  4 ++--
 api/app/schemas.py            |  2 +-
 api/tests/utils/test_utils.py | 14 +++-----------
 3 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/api/Makefile b/api/Makefile
index 689931dd29..27ef5584c9 100644
--- a/api/Makefile
+++ b/api/Makefile
@@ -18,8 +18,8 @@ stop:
 # Run tests for the library
 test:
 	docker compose up -d --build
-	docker cp requirements-dev.txt api_web_1:/app/requirements-dev.txt
+	docker cp requirements-dev.txt api_web:/app/requirements-dev.txt
 	docker compose exec -T web pip install -r requirements-dev.txt
-	docker cp tests api_web_1:/app/tests
+	docker cp tests api_web:/app/tests
 	docker compose exec -T web pytest tests/
 	docker compose down
diff --git a/api/app/schemas.py b/api/app/schemas.py
index 46a9cb0ac5..1dac0d26ea 100644
--- a/api/app/schemas.py
+++ b/api/app/schemas.py
@@ -23,7 +23,7 @@ class KIEIn(BaseModel):
     box_thresh: float = Field(default=0.1, examples=[0.1])
 
 
-class OCRIn(KIEIn):
+class OCRIn(KIEIn, BaseModel):
     resolve_lines: bool = Field(default=True, examples=[True])
     resolve_blocks: bool = Field(default=True, examples=[True])
     paragraph_break: float = Field(default=0.0035, examples=[0.0035])
diff --git a/api/tests/utils/test_utils.py b/api/tests/utils/test_utils.py
index b346565feb..6474993aa9 100644
--- a/api/tests/utils/test_utils.py
+++ b/api/tests/utils/test_utils.py
@@ -2,10 +2,10 @@
 
 
 def test_resolve_geometry():
-    dummy_box = [(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)]
-    dummy_polygon = [(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0), (0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)]
+    dummy_box = [(0.0, 0.0), (1.0, 0.0)]
+    dummy_polygon = [(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)]
 
-    assert resolve_geometry(dummy_box) == (0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0)
+    assert resolve_geometry(dummy_box) == (0.0, 0.0, 1.0, 0.0)
     assert resolve_geometry(dummy_polygon) == [
         0.0,
         0.0,
@@ -15,12 +15,4 @@ def test_resolve_geometry():
         1.0,
         0.0,
         1.0,
-        0.0,
-        0.0,
-        1.0,
-        0.0,
-        1.0,
-        1.0,
-        0.0,
-        1.0,
     ]

From 43079b453e60d075c848dd02cc49547d10048c20 Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Thu, 11 Apr 2024 09:12:37 +0200
Subject: [PATCH 08/11] update api tests

---
 .github/workflows/docker.yml         |   5 +-
 api/Makefile                         |   2 +-
 api/tests/conftest.py                | 215 +++++++++++++++++++++++++++
 api/tests/routes/test_detection.py   |  65 +++++---
 api/tests/routes/test_kie.py         |  82 ++++++----
 api/tests/routes/test_ocr.py         |  76 +++++++---
 api/tests/routes/test_recognition.py |  25 +++-
 api/tests/utils/test_utils.py        |  11 +-
 api/tests/utils/test_vision.py       |   8 +-
 9 files changed, 404 insertions(+), 85 deletions(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 7409385424..e65b1452ac 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -29,9 +29,8 @@ jobs:
           python-version: ${{ matrix.python }}
           architecture: x64
       - name: Build & run docker
-        run: cd api && docker compose up -d --build
+        run: cd api && make lock && make run
       - name: Ping server
         run: wget --spider --tries=12 http://localhost:8080/docs
       - name: Run docker test
-        run: |
-          docker-compose -f api/docker-compose.yml exec --no-TTY web pytest tests/
+        run: make test
diff --git a/api/Makefile b/api/Makefile
index 27ef5584c9..1a71619f02 100644
--- a/api/Makefile
+++ b/api/Makefile
@@ -21,5 +21,5 @@ test:
 	docker cp requirements-dev.txt api_web:/app/requirements-dev.txt
 	docker compose exec -T web pip install -r requirements-dev.txt
 	docker cp tests api_web:/app/tests
-	docker compose exec -T web pytest tests/
+	docker compose exec -T web pytest tests/ -vv
 	docker compose down
diff --git a/api/tests/conftest.py b/api/tests/conftest.py
index c482a7bb00..d5316b18bd 100644
--- a/api/tests/conftest.py
+++ b/api/tests/conftest.py
@@ -29,3 +29,218 @@ async def test_app_asyncio():
     # for httpx>=20, follow_redirects=True (cf. https://github.com/encode/httpx/releases/tag/0.20.0)
     async with AsyncClient(app=app, base_url="http://test", follow_redirects=True) as ac:
         yield ac  # testing happens here
+
+
+@pytest_asyncio.fixture(scope="function")
+def mock_detection_response():
+    return {
+        "box": {
+            "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
+            "geometries": [
+                [0.724609375, 0.1787109375, 0.7900390625, 0.2080078125],
+                [0.6748046875, 0.1796875, 0.7314453125, 0.20703125],
+            ],
+        },
+        "poly": {
+            "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
+            "geometries": [
+                [
+                    0.7873152494430542,
+                    0.17740710079669952,
+                    0.7884310483932495,
+                    0.20474515855312347,
+                    0.7244035005569458,
+                    0.20735852420330048,
+                    0.7232877016067505,
+                    0.18002046644687653,
+                ],
+                [
+                    0.7286394834518433,
+                    0.17740298807621002,
+                    0.7298480272293091,
+                    0.2027825564146042,
+                    0.6746810674667358,
+                    0.20540954172611237,
+                    0.67347252368927,
+                    0.1800299733877182,
+                ],
+            ],
+        },
+    }
+
+
+@pytest_asyncio.fixture(scope="function")
+def mock_kie_response():
+    return {
+        "box": {
+            "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
+            "orientation": {"value": None, "confidence": None},
+            "language": {"value": None, "confidence": None},
+            "predictions": [
+                {
+                    "class_name": "words",
+                    "items": [
+                        {
+                            "value": "Hello",
+                            "geometry": [0.7471996155154171, 0.1796875, 0.8272978149561669, 0.20703125],
+                            "confidence": 1,
+                        },
+                        {
+                            "value": "world!",
+                            "geometry": [0.8176307908857315, 0.1787109375, 0.9101580212741838, 0.2080078125],
+                            "confidence": 1,
+                        },
+                    ],
+                }
+            ],
+        },
+        "poly": {
+            "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
+            "orientation": {"value": None, "confidence": None},
+            "language": {"value": None, "confidence": None},
+            "predictions": [
+                {
+                    "class_name": "words",
+                    "items": [
+                        {
+                            "value": "Hello",
+                            "geometry": [
+                                0.7453157305717468,
+                                0.1800299733877182,
+                                0.8233299851417542,
+                                0.17740298807621002,
+                                0.8250390291213989,
+                                0.2027825564146042,
+                                0.7470247745513916,
+                                0.20540954172611237,
+                            ],
+                            "confidence": 0.99,
+                        },
+                        {
+                            "value": "world!",
+                            "geometry": [
+                                0.8157618045806885,
+                                0.18002046644687653,
+                                0.9063061475753784,
+                                0.17740710079669952,
+                                0.9078840017318726,
+                                0.20474515855312347,
+                                0.8173396587371826,
+                                0.20735852420330048,
+                            ],
+                            "confidence": 1,
+                        },
+                    ],
+                }
+            ],
+        },
+    }
+
+
+@pytest_asyncio.fixture(scope="function")
+def mock_ocr_response():
+    return {
+        "box": {
+            "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
+            "orientation": {"value": None, "confidence": None},
+            "language": {"value": None, "confidence": None},
+            "items": [
+                {
+                    "blocks": [
+                        {
+                            "geometry": [0.7471996155154171, 0.1787109375, 0.9101580212741838, 0.2080078125],
+                            "lines": [
+                                {
+                                    "geometry": [0.7471996155154171, 0.1787109375, 0.9101580212741838, 0.2080078125],
+                                    "words": [
+                                        {
+                                            "value": "Hello",
+                                            "geometry": [0.7471996155154171, 0.1796875, 0.8272978149561669, 0.20703125],
+                                            "confidence": 1,
+                                        },
+                                        {
+                                            "value": "world!",
+                                            "geometry": [
+                                                0.8176307908857315,
+                                                0.1787109375,
+                                                0.9101580212741838,
+                                                0.2080078125,
+                                            ],
+                                            "confidence": 1,
+                                        },
+                                    ],
+                                }
+                            ],
+                        }
+                    ]
+                }
+            ],
+        },
+        "poly": {
+            "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
+            "orientation": {"value": None, "confidence": None},
+            "language": {"value": None, "confidence": None},
+            "items": [
+                {
+                    "blocks": [
+                        {
+                            "geometry": [
+                                0.7451040148735046,
+                                0.17927837371826172,
+                                0.9062581658363342,
+                                0.17407986521720886,
+                                0.9072266221046448,
+                                0.2041015625,
+                                0.7460724711418152,
+                                0.20930007100105286,
+                            ],
+                            "lines": [
+                                {
+                                    "geometry": [
+                                        0.7451040148735046,
+                                        0.17927837371826172,
+                                        0.9062581658363342,
+                                        0.17407986521720886,
+                                        0.9072266221046448,
+                                        0.2041015625,
+                                        0.7460724711418152,
+                                        0.20930007100105286,
+                                    ],
+                                    "words": [
+                                        {
+                                            "value": "Hello",
+                                            "geometry": [
+                                                0.7453157305717468,
+                                                0.1800299733877182,
+                                                0.8233299851417542,
+                                                0.17740298807621002,
+                                                0.8250390291213989,
+                                                0.2027825564146042,
+                                                0.7470247745513916,
+                                                0.20540954172611237,
+                                            ],
+                                            "confidence": 0.99,
+                                        },
+                                        {
+                                            "value": "world!",
+                                            "geometry": [
+                                                0.8157618045806885,
+                                                0.18002046644687653,
+                                                0.9063061475753784,
+                                                0.17740710079669952,
+                                                0.9078840017318726,
+                                                0.20474515855312347,
+                                                0.8173396587371826,
+                                                0.20735852420330048,
+                                            ],
+                                            "confidence": 1,
+                                        },
+                                    ],
+                                }
+                            ],
+                        }
+                    ]
+                }
+            ],
+        },
+    }
diff --git a/api/tests/routes/test_detection.py b/api/tests/routes/test_detection.py
index 05f54a11e9..51672fd962 100644
--- a/api/tests/routes/test_detection.py
+++ b/api/tests/routes/test_detection.py
@@ -1,29 +1,58 @@
 import numpy as np
 import pytest
-from scipy.optimize import linear_sum_assignment
 
-from doctr.utils.metrics import box_iou
+
+def common_test(json_response, expected_response):
+    assert isinstance(json_response, list) and len(json_response) == 2
+    first_pred = json_response[0]  # it's enough to test for the first file because the same image is used twice
+
+    assert isinstance(first_pred["name"], str)
+    np.testing.assert_allclose(first_pred["geometries"], expected_response["geometries"], rtol=1e-2)
 
 
 @pytest.mark.asyncio
-async def test_text_detection(test_app_asyncio, mock_detection_image, mock_txt_file):
-    response = await test_app_asyncio.post("/detection", files={"files": [mock_detection_image] * 2})
+async def test_text_detection_box(test_app_asyncio, mock_detection_image, mock_detection_response):
+    headers = {
+        "accept": "application/json",
+    }
+    params = {"det_arch": "db_resnet50"}
+    files = [
+        ("files", ("test.jpg", mock_detection_image, "image/jpeg")),
+        ("files", ("test2.jpg", mock_detection_image, "image/jpeg")),
+    ]
+    response = await test_app_asyncio.post("/detection", params=params, files=files, headers=headers)
     assert response.status_code == 200
     json_response = response.json()
 
-    gt_boxes = np.array([[1240, 430, 1355, 470], [1360, 430, 1495, 470]], dtype=np.float32)
-    gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] / 1654
-    gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] / 2339
+    expected_box_response = mock_detection_response["box"]
+    common_test(json_response, expected_box_response)
 
-    # Check that IoU with GT if reasonable
-    assert isinstance(json_response, list) and len(json_response) == 2
-    first_pred = json_response[0]  # it's enough to test for the first file because the same image is used twice
-    assert isinstance(first_pred, dict) and len(first_pred["geometries"]) == gt_boxes.shape[0]
-    pred_boxes = np.array(first_pred["geometries"])
-    iou_mat = box_iou(gt_boxes, pred_boxes)
-    gt_idxs, pred_idxs = linear_sum_assignment(-iou_mat)
-    is_kept = iou_mat[gt_idxs, pred_idxs] >= 0.8
-    assert gt_idxs[is_kept].shape[0] == gt_boxes.shape[0]
-
-    response = await test_app_asyncio.post("/detection", files={"files": [mock_txt_file]})
+
+@pytest.mark.asyncio
+async def test_text_detection_poly(test_app_asyncio, mock_detection_image, mock_detection_response):
+    headers = {
+        "accept": "application/json",
+    }
+    params = {"det_arch": "db_resnet50", "assume_straight_pages": False}
+    files = [
+        ("files", ("test.jpg", mock_detection_image, "image/jpeg")),
+        ("files", ("test2.jpg", mock_detection_image, "image/jpeg")),
+    ]
+    response = await test_app_asyncio.post("/detection", params=params, files=files, headers=headers)
+    assert response.status_code == 200
+    json_response = response.json()
+
+    expected_poly_response = mock_detection_response["poly"]
+    common_test(json_response, expected_poly_response)
+
+
+@pytest.mark.asyncio
+async def test_text_detection_invalid_file(test_app_asyncio, mock_txt_file):
+    headers = {
+        "accept": "application/json",
+    }
+    files = [
+        ("files", ("test.txt", mock_txt_file)),
+    ]
+    response = await test_app_asyncio.post("/detection", files=files, headers=headers)
     assert response.status_code == 400
diff --git a/api/tests/routes/test_kie.py b/api/tests/routes/test_kie.py
index 2b0c9b3b38..00411120b9 100644
--- a/api/tests/routes/test_kie.py
+++ b/api/tests/routes/test_kie.py
@@ -1,37 +1,69 @@
 import numpy as np
 import pytest
-from scipy.optimize import linear_sum_assignment
 
-from doctr.utils.metrics import box_iou
+
+def common_test(json_response, expected_response):
+    first_pred = json_response[0]  # it's enough to test for the first file because the same image is used twice
+    assert isinstance(first_pred["name"], str)
+    assert isinstance(first_pred["predictions"], list)
+    assert isinstance(expected_response["predictions"], list)
+
+    for pred, expected_pred in zip(first_pred["predictions"], expected_response["predictions"]):
+        assert pred["class_name"] == expected_pred["class_name"]
+        assert isinstance(pred["items"], list)
+        assert isinstance(expected_pred["items"], list)
+
+        for pred_item, expected_pred_item in zip(pred["items"], expected_pred["items"]):
+            assert isinstance(pred_item["value"], str) and pred_item["value"] == expected_pred_item["value"]
+            assert isinstance(pred_item["confidence"], (int, float))
+            np.testing.assert_allclose(pred_item["geometry"], expected_pred_item["geometry"], rtol=1e-2)
 
 
 @pytest.mark.asyncio
-async def test_perform_kie(test_app_asyncio, mock_detection_image, mock_txt_file):
-    response = await test_app_asyncio.post("/kie", files={"files": [mock_detection_image] * 2})
+async def test_kie_box(test_app_asyncio, mock_detection_image, mock_kie_response):
+    headers = {
+        "accept": "application/json",
+    }
+    params = {"det_arch": "db_resnet50", "reco_arch": "crnn_vgg16_bn"}
+    files = [
+        ("files", ("test.jpg", mock_detection_image, "image/jpeg")),
+        ("files", ("test2.jpg", mock_detection_image, "image/jpeg")),
+    ]
+    response = await test_app_asyncio.post("/kie", params=params, files=files, headers=headers)
     assert response.status_code == 200
     json_response = response.json()
 
-    gt_boxes = np.array([[1240, 430, 1355, 470], [1360, 430, 1495, 470]], dtype=np.float32)
-    gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] / 1654
-    gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] / 2339
-    gt_labels = ["Hello", "world!"]
+    expected_box_response = mock_kie_response["box"]
+    assert isinstance(json_response, list) and len(json_response) == 2
+    common_test(json_response, expected_box_response)
+
+
+@pytest.mark.asyncio
+async def test_kie_poly(test_app_asyncio, mock_detection_image, mock_kie_response):
+    headers = {
+        "accept": "application/json",
+    }
+    params = {"det_arch": "db_resnet50", "reco_arch": "crnn_vgg16_bn", "assume_straight_pages": False}
+    files = [
+        ("files", ("test.jpg", mock_detection_image, "image/jpeg")),
+        ("files", ("test2.jpg", mock_detection_image, "image/jpeg")),
+    ]
+    response = await test_app_asyncio.post("/kie", params=params, files=files, headers=headers)
+    assert response.status_code == 200
+    json_response = response.json()
 
-    # Check that IoU with GT if reasonable
+    expected_poly_response = mock_kie_response["poly"]
     assert isinstance(json_response, list) and len(json_response) == 2
-    first_pred = json_response[0]  # it's enough to test for the first file because the same image is used twice
-    assert (
-        isinstance(first_pred, dict)
-        and len(first_pred["predictions"]["items"]) == gt_boxes.shape[0]
-        and isinstance(first_pred["predictions"]["class_name"], str)
-    )
-    pred_boxes = np.array([elt["geometry"] for elt in first_pred["predictions"]["items"]])
-    pred_labels = np.array([elt["value"] for elt in first_pred["predictions"]["items"]])
-    iou_mat = box_iou(gt_boxes, pred_boxes)
-    gt_idxs, pred_idxs = linear_sum_assignment(-iou_mat)
-    is_kept = iou_mat[gt_idxs, pred_idxs] >= 0.8
-    gt_idxs, pred_idxs = gt_idxs[is_kept], pred_idxs[is_kept]
-    assert gt_idxs.shape[0] == gt_boxes.shape[0]
-    assert all(gt_labels[gt_idx] == pred_labels[pred_idx] for gt_idx, pred_idx in zip(gt_idxs, pred_idxs))
-
-    response = await test_app_asyncio.post("/kie", files={"files": [mock_txt_file]})
+    common_test(json_response, expected_poly_response)
+
+
+@pytest.mark.asyncio
+async def test_kie_invalid_file(test_app_asyncio, mock_txt_file):
+    headers = {
+        "accept": "application/json",
+    }
+    files = [
+        ("files", ("test.txt", mock_txt_file)),
+    ]
+    response = await test_app_asyncio.post("/kie", files=files, headers=headers)
     assert response.status_code == 400
diff --git a/api/tests/routes/test_ocr.py b/api/tests/routes/test_ocr.py
index 731117bc19..f30587bac2 100644
--- a/api/tests/routes/test_ocr.py
+++ b/api/tests/routes/test_ocr.py
@@ -1,33 +1,67 @@
 import numpy as np
 import pytest
-from scipy.optimize import linear_sum_assignment
 
-from doctr.utils.metrics import box_iou
+
+def common_test(json_response, expected_response):
+    first_pred = json_response[0]  # it's enough to test for the first file because the same image is used twice
+
+    assert isinstance(first_pred["name"], str)
+    for item, expected_item in zip(first_pred["items"], expected_response["items"]):
+        for block, expected_block in zip(item["blocks"], expected_item["blocks"]):
+            np.testing.assert_allclose(block["geometry"], expected_block["geometry"], rtol=1e-2)
+            for line, expected_line in zip(block["lines"], expected_block["lines"]):
+                np.testing.assert_allclose(line["geometry"], expected_line["geometry"], rtol=1e-2)
+                for word, expected_word in zip(line["words"], expected_line["words"]):
+                    np.testing.assert_allclose(word["geometry"], expected_word["geometry"], rtol=1e-2)
+                    assert isinstance(word["value"], str) and word["value"] == expected_word["value"]
+                    assert isinstance(word["confidence"], (int, float))
 
 
 @pytest.mark.asyncio
-async def test_perform_ocr(test_app_asyncio, mock_detection_image, mock_txt_file):
-    response = await test_app_asyncio.post("/ocr", files={"files": [mock_detection_image] * 2})
+async def test_ocr_box(test_app_asyncio, mock_detection_image, mock_ocr_response):
+    headers = {
+        "accept": "application/json",
+    }
+    params = {"det_arch": "db_resnet50", "reco_arch": "crnn_vgg16_bn"}
+    files = [
+        ("files", ("test.jpg", mock_detection_image, "image/jpeg")),
+        ("files", ("test2.jpg", mock_detection_image, "image/jpeg")),
+    ]
+    response = await test_app_asyncio.post("/ocr", params=params, files=files, headers=headers)
     assert response.status_code == 200
     json_response = response.json()
 
-    gt_boxes = np.array([[1240, 430, 1355, 470], [1360, 430, 1495, 470]], dtype=np.float32)
-    gt_boxes[:, [0, 2]] = gt_boxes[:, [0, 2]] / 1654
-    gt_boxes[:, [1, 3]] = gt_boxes[:, [1, 3]] / 2339
-    gt_labels = ["Hello", "world!"]
+    expected_box_response = mock_ocr_response["box"]
+    assert isinstance(json_response, list) and len(json_response) == 2
+    common_test(json_response, expected_box_response)
 
-    # Check that IoU with GT if reasonable
+
+@pytest.mark.asyncio
+async def test_ocr_poly(test_app_asyncio, mock_detection_image, mock_ocr_response):
+    headers = {
+        "accept": "application/json",
+    }
+    params = {"det_arch": "db_resnet50", "reco_arch": "crnn_vgg16_bn", "assume_straight_pages": False}
+    files = [
+        ("files", ("test.jpg", mock_detection_image, "image/jpeg")),
+        ("files", ("test2.jpg", mock_detection_image, "image/jpeg")),
+    ]
+    response = await test_app_asyncio.post("/ocr", params=params, files=files, headers=headers)
+    assert response.status_code == 200
+    json_response = response.json()
+
+    expected_poly_response = mock_ocr_response["poly"]
     assert isinstance(json_response, list) and len(json_response) == 2
-    first_pred = json_response[0]  # it's enough to test for the first file because the same image is used twice
-    assert isinstance(first_pred, dict) and len(first_pred["items"]["blocks"]["lines"]["words"]) == gt_boxes.shape[0]
-    pred_boxes = np.array([elt["geometry"] for elt in first_pred["items"]["blocks"]["lines"]["words"]])
-    pred_labels = np.array([elt["value"] for elt in first_pred["items"]["blocks"]["lines"]["words"]])
-    iou_mat = box_iou(gt_boxes, pred_boxes)
-    gt_idxs, pred_idxs = linear_sum_assignment(-iou_mat)
-    is_kept = iou_mat[gt_idxs, pred_idxs] >= 0.8
-    gt_idxs, pred_idxs = gt_idxs[is_kept], pred_idxs[is_kept]
-    assert gt_idxs.shape[0] == gt_boxes.shape[0]
-    assert all(gt_labels[gt_idx] == pred_labels[pred_idx] for gt_idx, pred_idx in zip(gt_idxs, pred_idxs))
-
-    response = await test_app_asyncio.post("/ocr", files={"files": [mock_txt_file]})
+    common_test(json_response, expected_poly_response)
+
+
+@pytest.mark.asyncio
+async def test_ocr_invalid_file(test_app_asyncio, mock_txt_file):
+    headers = {
+        "accept": "application/json",
+    }
+    files = [
+        ("files", ("test.txt", mock_txt_file)),
+    ]
+    response = await test_app_asyncio.post("/ocr", files=files, headers=headers)
     assert response.status_code == 400
diff --git a/api/tests/routes/test_recognition.py b/api/tests/routes/test_recognition.py
index 990d9fb900..61c6561133 100644
--- a/api/tests/routes/test_recognition.py
+++ b/api/tests/routes/test_recognition.py
@@ -3,9 +3,28 @@
 
 @pytest.mark.asyncio
 async def test_text_recognition(test_app_asyncio, mock_recognition_image, mock_txt_file):
-    response = await test_app_asyncio.post("/recognition", files={"files": [mock_recognition_image] * 2})
+    headers = {
+        "accept": "application/json",
+    }
+    params = {"reco_arch": "crnn_vgg16_bn"}
+    files = [
+        ("files", ("test.jpg", mock_recognition_image, "image/jpeg")),
+        ("files", ("test2.jpg", mock_recognition_image, "image/jpeg")),
+    ]
+    response = await test_app_asyncio.post("/recognition", params=params, files=files, headers=headers)
     assert response.status_code == 200
-    assert response.json() == [{"value": "invite"}, {"value": "invite"}]
+    json_response = response.json()
+    assert isinstance(json_response, list) and len(json_response) == 2
+    for item in json_response:
+        assert isinstance(item["name"], str)
+        assert isinstance(item["value"], str) and item["value"] == "invite"
+        assert isinstance(item["confidence"], (int, float)) and item["confidence"] >= 0.8
 
-    response = await test_app_asyncio.post("/recognition", files={"files": [mock_txt_file]})
+    headers = {
+        "accept": "application/json",
+    }
+    files = [
+        ("files", ("test.txt", mock_txt_file)),
+    ]
+    response = await test_app_asyncio.post("/recognition", files=files, headers=headers)
     assert response.status_code == 400
diff --git a/api/tests/utils/test_utils.py b/api/tests/utils/test_utils.py
index 6474993aa9..09b3a2eb7a 100644
--- a/api/tests/utils/test_utils.py
+++ b/api/tests/utils/test_utils.py
@@ -6,13 +6,4 @@ def test_resolve_geometry():
     dummy_polygon = [(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0)]
 
     assert resolve_geometry(dummy_box) == (0.0, 0.0, 1.0, 0.0)
-    assert resolve_geometry(dummy_polygon) == [
-        0.0,
-        0.0,
-        1.0,
-        0.0,
-        1.0,
-        1.0,
-        0.0,
-        1.0,
-    ]
+    assert resolve_geometry(dummy_polygon) == (0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0)
diff --git a/api/tests/utils/test_vision.py b/api/tests/utils/test_vision.py
index 04050268f7..4375322a65 100644
--- a/api/tests/utils/test_vision.py
+++ b/api/tests/utils/test_vision.py
@@ -7,7 +7,7 @@
 
 
 def test_vision():
-    assert isinstance(init_predictor(OCRIn), OCRPredictor)
-    assert isinstance(init_predictor(DetectionIn), DetectionPredictor)
-    assert isinstance(init_predictor(RecognitionIn), RecognitionPredictor)
-    assert isinstance(init_predictor(KIEIn), KIEPredictor)
+    assert isinstance(init_predictor(OCRIn()), OCRPredictor)
+    assert isinstance(init_predictor(DetectionIn()), DetectionPredictor)
+    assert isinstance(init_predictor(RecognitionIn()), RecognitionPredictor)
+    assert isinstance(init_predictor(KIEIn()), KIEPredictor)

From f5cdafbd9ccb8136b6a464b43be571d6a4e598de Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Thu, 11 Apr 2024 09:21:45 +0200
Subject: [PATCH 09/11] update

---
 api/Dockerfile | 2 +-
 api/Makefile   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/api/Dockerfile b/api/Dockerfile
index a158e44721..8038ed28c8 100644
--- a/api/Dockerfile
+++ b/api/Dockerfile
@@ -15,7 +15,7 @@ RUN apt-get update \
 COPY pyproject.toml  /app/pyproject.toml
 COPY Makefile /app/Makefile
 
-RUN pip install --upgrade pip setuptools wheel poetry \
+RUN pip install --upgrade pip setuptools wheel \
     && make lock \
     && pip install -r /app/requirements.txt \
     && pip cache purge \
diff --git a/api/Makefile b/api/Makefile
index 1a71619f02..09e9841e91 100644
--- a/api/Makefile
+++ b/api/Makefile
@@ -3,6 +3,7 @@
 .PHONY: lock run stop test
 # Pin the dependencies
 lock:
+	pip install "poetry>=1.0"
 	poetry lock
 	poetry export -f requirements.txt --without-hashes --output requirements.txt
 	poetry export -f requirements.txt --without-hashes --with dev --output requirements-dev.txt

From 2f3ebedab975864ce0efe73a52c8aabe09a42d83 Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Thu, 11 Apr 2024 09:33:37 +0200
Subject: [PATCH 10/11] update readme

---
 .github/workflows/docker.yml |  2 +-
 api/README.md                | 30 ++++++++++++++++++++++++------
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index e65b1452ac..0aa5a44976 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -33,4 +33,4 @@ jobs:
       - name: Ping server
         run: wget --spider --tries=12 http://localhost:8080/docs
       - name: Run docker test
-        run: make test
+        run: cd api && make test
diff --git a/api/README.md b/api/README.md
index a9501e2542..09708c916c 100644
--- a/api/README.md
+++ b/api/README.md
@@ -35,9 +35,15 @@ with this snippet:
 
 ```python
 import requests
+
+headers = {"accept": "application/json"}
+params = {"det_arch": "db_resnet50"}
+
 with open('/path/to/your/img.jpg', 'rb') as f:
-    data = f.read()
-print(requests.post("http://localhost:8080/detection", files={'files': [data]}).json())
+    files = [  # application/pdf, image/jpeg, image/png supported
+        ("files", ("117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg", f.read(), "image/jpeg")),
+    ]
+print(requests.post("http://localhost:8080/detection", headers=headers, params=params, files=files).json())
 ```
 
 should yield
@@ -73,9 +79,15 @@ with this snippet:
 
 ```python
 import requests
+
+headers = {"accept": "application/json"}
+params = {"reco_arch": "crnn_vgg16_bn"}
+
 with open('/path/to/your/img.jpg', 'rb') as f:
-    data = f.read()
-print(requests.post("http://localhost:8080/recognition", files={'files': [data]}).json())
+    files = [  # application/pdf, image/jpeg, image/png supported
+        ("files", ("117133599-c073fa00-ada4-11eb-831b-412de4d28341.jpeg", f.read(), "image/jpeg")),
+    ]
+print(requests.post("http://localhost:8080/recognition", headers=headers, params=params, files=files).json())
 ```
 
 should yield
@@ -99,9 +111,15 @@ with this snippet:
 
 ```python
 import requests
+
+headers = {"accept": "application/json"}
+params = {"det_arch": "db_resnet50", "reco_arch": "crnn_vgg16_bn"}
+
 with open('/path/to/your/img.jpg', 'rb') as f:
-    data = f.read()
-print(requests.post("http://localhost:8080/ocr", files={'files': [data]}).json())
+    files = [  # application/pdf, image/jpeg, image/png supported
+        ("files", ("117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg", f.read(), "image/jpeg")),
+    ]
+print(requests.post("http://localhost:8080/ocr", headers=headers, params=params, files=files).json())
 ```
 
 should yield

From 9137f09d0f84a39846a9f1268af5a780d5a44b91 Mon Sep 17 00:00:00 2001
From: felix <felixdittrich92@gmail.com>
Date: Thu, 11 Apr 2024 10:26:19 +0200
Subject: [PATCH 11/11] add missing dimensions

---
 api/README.md                | 1 +
 api/app/routes/kie.py        | 1 +
 api/app/routes/ocr.py        | 1 +
 api/app/schemas.py           | 4 +++-
 api/app/utils.py             | 2 +-
 api/tests/conftest.py        | 4 ++++
 api/tests/routes/test_kie.py | 5 +++++
 api/tests/routes/test_ocr.py | 5 +++++
 8 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/api/README.md b/api/README.md
index 09708c916c..4126e808c5 100644
--- a/api/README.md
+++ b/api/README.md
@@ -136,6 +136,7 @@ should yield
       "value": null,
       "confidence": null
     },
+    "dimensions": [2339, 1654],
     "items": [
       {
         "blocks": [
diff --git a/api/app/routes/kie.py b/api/app/routes/kie.py
index ece3e1a8cb..46b2d92be1 100644
--- a/api/app/routes/kie.py
+++ b/api/app/routes/kie.py
@@ -30,6 +30,7 @@ async def perform_kie(request: KIEIn = Depends(), files: List[UploadFile] = [Fil
             name=filenames[i],
             orientation=page.orientation,
             language=page.language,
+            dimensions=page.dimensions,
             predictions=[
                 KIEElement(
                     class_name=class_name,
diff --git a/api/app/routes/ocr.py b/api/app/routes/ocr.py
index dc18af795c..4c766e9f35 100644
--- a/api/app/routes/ocr.py
+++ b/api/app/routes/ocr.py
@@ -31,6 +31,7 @@ async def perform_ocr(request: OCRIn = Depends(), files: List[UploadFile] = [Fil
             name=filenames[i],
             orientation=page.orientation,
             language=page.language,
+            dimensions=page.dimensions,
             items=[
                 OCRPage(
                     blocks=[
diff --git a/api/app/schemas.py b/api/app/schemas.py
index 1dac0d26ea..8fe3fce38f 100644
--- a/api/app/schemas.py
+++ b/api/app/schemas.py
@@ -3,7 +3,7 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Dict, List, Union
+from typing import Dict, List, Tuple, Union
 
 from pydantic import BaseModel, Field
 
@@ -102,6 +102,7 @@ class OCROut(BaseModel):
     name: str = Field(..., examples=["example.jpg"])
     orientation: Dict[str, Union[float, None]] = Field(..., examples=[{"value": 0.0, "confidence": 0.99}])
     language: Dict[str, Union[str, float, None]] = Field(..., examples=[{"value": "en", "confidence": 0.99}])
+    dimensions: Tuple[int, int] = Field(..., examples=[(100, 100)])
     items: List[OCRPage] = Field(
         ...,
         examples=[
@@ -129,4 +130,5 @@ class KIEOut(BaseModel):
     name: str = Field(..., examples=["example.jpg"])
     orientation: Dict[str, Union[float, None]] = Field(..., examples=[{"value": 0.0, "confidence": 0.99}])
     language: Dict[str, Union[str, float, None]] = Field(..., examples=[{"value": "en", "confidence": 0.99}])
+    dimensions: Tuple[int, int] = Field(..., examples=[(100, 100)])
     predictions: List[KIEElement]
diff --git a/api/app/utils.py b/api/app/utils.py
index d1897f51b1..511a75ad9e 100644
--- a/api/app/utils.py
+++ b/api/app/utils.py
@@ -42,7 +42,7 @@ async def get_documents(files: List[UploadFile]) -> Tuple[List[np.ndarray], List
         elif mime_type == "application/pdf":
             pdf_content = DocumentFile.from_pdf(await file.read())
             docs.extend(pdf_content)
-            filenames.append(file.filename or "" * len(pdf_content))
+            filenames.extend([file.filename or ""] * len(pdf_content))  # one name per PDF page; "" if unnamed
         else:
             raise ValueError(f"Unsupported file format: {mime_type} for file {file.filename}")
 
diff --git a/api/tests/conftest.py b/api/tests/conftest.py
index d5316b18bd..41872b47ec 100644
--- a/api/tests/conftest.py
+++ b/api/tests/conftest.py
@@ -76,6 +76,7 @@ def mock_kie_response():
             "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
             "orientation": {"value": None, "confidence": None},
             "language": {"value": None, "confidence": None},
+            "dimensions": [2339, 1654],
             "predictions": [
                 {
                     "class_name": "words",
@@ -98,6 +99,7 @@ def mock_kie_response():
             "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
             "orientation": {"value": None, "confidence": None},
             "language": {"value": None, "confidence": None},
+            "dimensions": [2339, 1654],
             "predictions": [
                 {
                     "class_name": "words",
@@ -144,6 +146,7 @@ def mock_ocr_response():
             "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
             "orientation": {"value": None, "confidence": None},
             "language": {"value": None, "confidence": None},
+            "dimensions": [2339, 1654],
             "items": [
                 {
                     "blocks": [
@@ -180,6 +183,7 @@ def mock_ocr_response():
             "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
             "orientation": {"value": None, "confidence": None},
             "language": {"value": None, "confidence": None},
+            "dimensions": [2339, 1654],
             "items": [
                 {
                     "blocks": [
diff --git a/api/tests/routes/test_kie.py b/api/tests/routes/test_kie.py
index 00411120b9..36ca4b5b62 100644
--- a/api/tests/routes/test_kie.py
+++ b/api/tests/routes/test_kie.py
@@ -5,6 +5,11 @@
 def common_test(json_response, expected_response):
     first_pred = json_response[0]  # it's enough to test for the first file because the same image is used twice
     assert isinstance(first_pred["name"], str)
+    assert (
+        isinstance(first_pred["dimensions"], (tuple, list))
+        and len(first_pred["dimensions"]) == 2
+        and all(isinstance(dim, int) for dim in first_pred["dimensions"])
+    )
     assert isinstance(first_pred["predictions"], list)
     assert isinstance(expected_response["predictions"], list)
 
diff --git a/api/tests/routes/test_ocr.py b/api/tests/routes/test_ocr.py
index f30587bac2..c702084447 100644
--- a/api/tests/routes/test_ocr.py
+++ b/api/tests/routes/test_ocr.py
@@ -6,6 +6,11 @@ def common_test(json_response, expected_response):
     first_pred = json_response[0]  # it's enough to test for the first file because the same image is used twice
 
     assert isinstance(first_pred["name"], str)
+    assert (
+        isinstance(first_pred["dimensions"], (tuple, list))
+        and len(first_pred["dimensions"]) == 2
+        and all(isinstance(dim, int) for dim in first_pred["dimensions"])
+    )
     for item, expected_item in zip(first_pred["items"], expected_response["items"]):
         for block, expected_block in zip(item["blocks"], expected_item["blocks"]):
             np.testing.assert_allclose(block["geometry"], expected_block["geometry"], rtol=1e-2)