mindee · felixdittrich92 · Apr 11, 2024 · Mar 25, 2024 · Apr 1, 2024 · Apr 10, 2024
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
@@ -29,9 +29,8 @@ jobs:
           python-version: ${{ matrix.python }}
           architecture: x64
       - name: Build & run docker
-        run: cd api && docker-compose up -d --build
+        run: cd api && make lock && make run
       - name: Ping server
         run: wget --spider --tries=12 http://localhost:8080/docs
       - name: Run docker test
-        run: |
-          docker-compose -f api/docker-compose.yml exec --no-TTY web pytest tests/
+        run: cd api && make test
diff --git a/api/Dockerfile b/api/Dockerfile
@@ -15,7 +15,7 @@ RUN apt-get update \
 COPY pyproject.toml  /app/pyproject.toml
 COPY Makefile /app/Makefile
 
-RUN pip install --upgrade pip setuptools wheel poetry \
+RUN pip install --upgrade pip setuptools wheel \
     && make lock \
     && pip install -r /app/requirements.txt \
     && pip cache purge \

diff --git a/api/Makefile b/api/Makefile
@@ -3,6 +3,7 @@
 .PHONY: lock run stop test
 # Pin the dependencies
 lock:
+	pip install "poetry>=1.0"
 	poetry lock
 	poetry export -f requirements.txt --without-hashes --output requirements.txt
 	poetry export -f requirements.txt --without-hashes --with dev --output requirements-dev.txt
@@ -18,8 +19,8 @@ stop:
 # Run tests for the library
 test:
 	docker compose up -d --build
-	docker cp requirements-dev.txt api_web_1:/app/requirements-dev.txt
+	docker cp requirements-dev.txt api_web:/app/requirements-dev.txt
 	docker compose exec -T web pip install -r requirements-dev.txt
-	docker cp tests api_web_1:/app/tests
-	docker compose exec -T web pytest tests/
+	docker cp tests api_web:/app/tests
+	docker compose exec -T web pytest tests/ -vv
 	docker compose down
diff --git a/api/README.md b/api/README.md
@@ -35,16 +35,39 @@ with this snippet:
 
 ```python
 import requests
+
+headers = {"accept": "application/json"}
+params = {"det_arch": "db_resnet50"}
+
 with open('/path/to/your/img.jpg', 'rb') as f:
-    data = f.read()
-print(requests.post("http://localhost:8080/detection", files={'file': data}).json())
+    files = [  # application/pdf, image/jpeg, image/png supported
+        ("files", ("117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg", f.read(), "image/jpeg")),
+    ]
+print(requests.post("http://localhost:8080/detection", headers=headers, params=params, files=files).json())
 ```
 
 should yield
 
 ```json
-[{'box': [0.826171875, 0.185546875, 0.90234375, 0.201171875]},
- {'box': [0.75390625, 0.185546875, 0.8173828125, 0.201171875]}]
+[
+  {
+    "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
+    "geometries": [
+      [
+        0.724609375,
+        0.1787109375,
+        0.7900390625,
+        0.2080078125
+      ],
+      [
+        0.6748046875,
+        0.1796875,
+        0.7314453125,
+        0.20703125
+      ]
+    ]
+  }
+]
 ```
 
 #### Text recognition
@@ -56,15 +79,27 @@ with this snippet:
 
 ```python
 import requests
+
+headers = {"accept": "application/json"}
+params = {"reco_arch": "crnn_vgg16_bn"}
+
 with open('/path/to/your/img.jpg', 'rb') as f:
-    data = f.read()
-print(requests.post("http://localhost:8080/recognition", files={'file': data}).json())
+    files = [  # application/pdf, image/jpeg, image/png supported
+        ("files", ("117133599-c073fa00-ada4-11eb-831b-412de4d28341.jpeg", f.read(), "image/jpeg")),
+    ]
+print(requests.post("http://localhost:8080/recognition", headers=headers, params=params, files=files).json())
 ```
 
 should yield
 
 ```json
-{'value': 'invite'}
+[
+  {
+    "name": "117133599-c073fa00-ada4-11eb-831b-412de4d28341.jpeg",
+    "value": "invite",
+    "confidence": 1.0
+  }
+]
 ```
 
 #### End-to-end OCR
@@ -76,16 +111,78 @@ with this snippet:
 
 ```python
 import requests
+
+headers = {"accept": "application/json"}
+params = {"det_arch": "db_resnet50", "reco_arch": "crnn_vgg16_bn"}
+
 with open('/path/to/your/img.jpg', 'rb') as f:
-    data = f.read()
-print(requests.post("http://localhost:8080/ocr", files={'file': data}).json())
+    files = [  # application/pdf, image/jpeg, image/png supported
+        ("files", ("117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg", f.read(), "image/jpeg")),
+    ]
+print(requests.post("http://localhost:8080/ocr", headers=headers, params=params, files=files).json())
 ```
 
 should yield
 
 ```json
-[{'box': [0.75390625, 0.185546875, 0.8173828125, 0.201171875],
-  'value': 'Hello'},
- {'box': [0.826171875, 0.185546875, 0.90234375, 0.201171875],
-  'value': 'world!'}]
+[
+  {
+    "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
+    "orientation": {
+      "value": 0,
+      "confidence": null
+    },
+    "language": {
+      "value": null,
+      "confidence": null
+    },
+    "dimensions": [2339, 1654],
+    "items": [
+      {
+        "blocks": [
+          {
+            "geometry": [
+              0.7471996155154171,
+              0.1787109375,
+              0.9101580212741838,
+              0.2080078125
+            ],
+            "lines": [
+              {
+                "geometry": [
+                  0.7471996155154171,
+                  0.1787109375,
+                  0.9101580212741838,
+                  0.2080078125
+                ],
+                "words": [
+                  {
+                    "value": "Hello",
+                    "geometry": [
+                      0.7471996155154171,
+                      0.1796875,
+                      0.8272978149561669,
+                      0.20703125
+                    ],
+                    "confidence": 1.0
+                  },
+                  {
+                    "value": "world!",
+                    "geometry": [
+                      0.8176307908857315,
+                      0.1787109375,
+                      0.9101580212741838,
+                      0.2080078125
+                    ],
+                    "confidence": 1.0
+                  }
+                ]
+              }
+            ]
+          }
+        ]
+      }
+    ]
+  }
+]
 ```
diff --git a/api/app/routes/detection.py b/api/app/routes/detection.py
@@ -5,19 +5,31 @@
 
 from typing import List
 
-from fastapi import APIRouter, File, UploadFile, status
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
 
-from app.schemas import DetectionOut
-from app.vision import det_predictor
+from app.schemas import DetectionIn, DetectionOut
+from app.utils import get_documents, resolve_geometry
+from app.vision import init_predictor
 from doctr.file_utils import CLASS_NAME
-from doctr.io import decode_img_as_tensor
 
 router = APIRouter()
 
 
 @router.post("/", response_model=List[DetectionOut], status_code=status.HTTP_200_OK, summary="Perform text detection")
-async def text_detection(file: UploadFile = File(...)):
+async def text_detection(request: DetectionIn = Depends(), files: List[UploadFile] = [File(...)]):
     """Runs docTR text detection model to analyze the input image"""
-    img = decode_img_as_tensor(file.file.read())
-    boxes = det_predictor([img])[0]
-    return [DetectionOut(box=box.tolist()) for box in boxes[CLASS_NAME][:, :-1]]
+    try:
+        predictor = init_predictor(request)
+        content, filenames = await get_documents(files)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    return [
+        DetectionOut(
+            name=filename,
+            geometries=[
+                geom[:-1].tolist() if len(geom) == 5 else resolve_geometry(geom.tolist()) for geom in doc[CLASS_NAME]
+            ],
+        )
+        for doc, filename in zip(predictor(content), filenames)
+    ]
diff --git a/api/app/routes/kie.py b/api/app/routes/kie.py
@@ -3,27 +3,50 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Dict, List
+from typing import List
 
-from fastapi import APIRouter, File, UploadFile, status
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
 
-from app.schemas import OCROut
-from app.vision import kie_predictor
-from doctr.io import decode_img_as_tensor
+from app.schemas import KIEElement, KIEIn, KIEOut
+from app.utils import get_documents, resolve_geometry
+from app.vision import init_predictor
 
 router = APIRouter()
 
 
-@router.post("/", response_model=Dict[str, List[OCROut]], status_code=status.HTTP_200_OK, summary="Perform KIE")
-async def perform_kie(file: UploadFile = File(...)):
+@router.post("/", response_model=List[KIEOut], status_code=status.HTTP_200_OK, summary="Perform KIE")
+async def perform_kie(request: KIEIn = Depends(), files: List[UploadFile] = [File(...)]):
     """Runs docTR KIE model to analyze the input image"""
-    img = decode_img_as_tensor(file.file.read())
-    out = kie_predictor([img])
-
-    return {
-        class_name: [
-            OCROut(box=(*prediction.geometry[0], *prediction.geometry[1]), value=prediction.value)
-            for prediction in out.pages[0].predictions[class_name]
-        ]
-        for class_name in out.pages[0].predictions.keys()
-    }
+    try:
+        predictor = init_predictor(request)
+        content, filenames = await get_documents(files)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    out = predictor(content)
+
+    results = [
+        KIEOut(
+            name=filenames[i],
+            orientation=page.orientation,
+            language=page.language,
+            dimensions=page.dimensions,
+            predictions=[
+                KIEElement(
+                    class_name=class_name,
+                    items=[
+                        dict(
+                            value=prediction.value,
+                            geometry=resolve_geometry(prediction.geometry),
+                            confidence=round(prediction.confidence, 2),
+                        )
+                        for prediction in page.predictions[class_name]
+                    ],
+                )
+                for class_name in page.predictions.keys()
+            ],
+        )
+        for i, page in enumerate(out.pages)
+    ]
+
+    return results
diff --git a/api/app/routes/ocr.py b/api/app/routes/ocr.py
@@ -5,24 +5,59 @@
 
 from typing import List
 
-from fastapi import APIRouter, File, UploadFile, status
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
 
-from app.schemas import OCROut
-from app.vision import predictor
-from doctr.io import decode_img_as_tensor
+from app.schemas import OCRBlock, OCRIn, OCRLine, OCROut, OCRPage, OCRWord
+from app.utils import get_documents, resolve_geometry
+from app.vision import init_predictor
 
 router = APIRouter()
 
 
 @router.post("/", response_model=List[OCROut], status_code=status.HTTP_200_OK, summary="Perform OCR")
-async def perform_ocr(file: UploadFile = File(...)):
+async def perform_ocr(request: OCRIn = Depends(), files: List[UploadFile] = [File(...)]):
     """Runs docTR OCR model to analyze the input image"""
-    img = decode_img_as_tensor(file.file.read())
-    out = predictor([img])
-
-    return [
-        OCROut(box=(*word.geometry[0], *word.geometry[1]), value=word.value)
-        for block in out.pages[0].blocks
-        for line in block.lines
-        for word in line.words
+    try:
+        # generator object to list
+        content, filenames = await get_documents(files)
+        predictor = init_predictor(request)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    out = predictor(content)
+
+    results = [
+        OCROut(
+            name=filenames[i],
+            orientation=page.orientation,
+            language=page.language,
+            dimensions=page.dimensions,
+            items=[
+                OCRPage(
+                    blocks=[
+                        OCRBlock(
+                            geometry=resolve_geometry(block.geometry),
+                            lines=[
+                                OCRLine(
+                                    geometry=resolve_geometry(line.geometry),
+                                    words=[
+                                        OCRWord(
+                                            value=word.value,
+                                            geometry=resolve_geometry(word.geometry),
+                                            confidence=round(word.confidence, 2),
+                                        )
+                                        for word in line.words
+                                    ],
+                                )
+                                for line in block.lines
+                            ],
+                        )
+                        for block in page.blocks
+                    ]
+                )
+            ],
+        )
+        for i, page in enumerate(out.pages)
     ]
+
+    return results