Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[datasets] Allow detection task for built-in datasets #1717

Merged
merged 4 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ quality:

# this target runs checks on all files and potentially modifies some of them
style:
ruff check --fix .
ruff format .
ruff check --fix .

# Run tests for the library
test:
Expand Down
19 changes: 17 additions & 2 deletions docs/source/using_doctr/using_datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ This datasets contains the information to train or validate a text detection mod

from doctr.datasets import CORD
# Load straight boxes
train_set = CORD(train=True, download=True)
train_set = CORD(train=True, download=True, detection_task=True)
# Load rotated boxes
train_set = CORD(train=True, download=True, use_polygons=True)
train_set = CORD(train=True, download=True, use_polygons=True, detection_task=True)
img, target = train_set[0]


Expand Down Expand Up @@ -99,6 +99,21 @@ This datasets contains the information to train or validate a text recognition m
img, target = train_set[0]


OCR
^^^

The same datasets as for detection, but the target contains both the bounding boxes and their text labels.

.. code:: python3

from doctr.datasets import CORD
# Load straight boxes
train_set = CORD(train=True, download=True)
# Load rotated boxes
train_set = CORD(train=True, download=True, use_polygons=True)
img, target = train_set[0]


Object Detection
^^^^^^^^^^^^^^^^

Expand Down
11 changes: 10 additions & 1 deletion doctr/datasets/cord.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class CORD(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -53,6 +54,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
url, sha256, name = self.TRAIN if train else self.TEST
Expand All @@ -64,10 +66,15 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

# List images
tmp_root = os.path.join(self.root, "image")
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
self.train = train
np_dtype = np.float32
for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))):
Expand Down Expand Up @@ -109,6 +116,8 @@ def __init__(
)
for crop, label in zip(crops, list(text_targets)):
self.data.append((crop, label))
elif detection_task:
self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0)))
else:
self.data.append((
img_path,
Expand Down
12 changes: 11 additions & 1 deletion doctr/datasets/funsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class FUNSD(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -45,6 +46,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
super().__init__(
Expand All @@ -55,6 +57,12 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this part can be moved in VisionDataset or even in an abstraction above as this configuration is always forbidden.
It'll also reduce the number of copy paste

Copy link
Contributor

@felixT2K felixT2K Sep 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem here is that all datasets inherit from AbstractDataset, but not all datasets provide the functionality to be used for recognition and/or detection — for example, MJSynth is a pure recognition dataset :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So recognition_task / detection_task is only available on the top level .. We could do something like raise_for on VisionDataset but not sure if we really want something 😅

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, so if you don't want to be able to pass recognition_task or detection_task to all AbstractDataset, the code can stay like this, I'm fine with it. My goal was to move the logic "if both variables are set to True, then raise an error, as it's never possible".

self.train = train
np_dtype = np.float32

Expand All @@ -63,7 +71,7 @@ def __init__(

# # List images
tmp_root = os.path.join(self.root, subfolder, "images")
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))):
# File existence check
if not os.path.exists(os.path.join(tmp_root, img_path)):
Expand Down Expand Up @@ -100,6 +108,8 @@ def __init__(
# filter labels with unknown characters
if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]):
self.data.append((crop, label))
elif detection_task:
self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
else:
self.data.append((
img_path,
Expand Down
12 changes: 11 additions & 1 deletion doctr/datasets/ic03.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class IC03(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -51,6 +52,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
url, sha256, file_name = self.TRAIN if train else self.TEST
Expand All @@ -62,8 +64,14 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)

self.train = train
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
np_dtype = np.float32

# Load xml data
Expand Down Expand Up @@ -117,6 +125,8 @@ def __init__(
for crop, label in zip(crops, labels):
if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
self.data.append((crop, label))
elif detection_task:
self.data.append((name.text, boxes))
else:
self.data.append((name.text, dict(boxes=boxes, labels=labels)))

Expand Down
11 changes: 10 additions & 1 deletion doctr/datasets/ic13.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class IC13(AbstractDataset):
label_folder: folder with all annotation files for the images
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `AbstractDataset`.
"""

Expand All @@ -47,19 +48,25 @@ def __init__(
label_folder: str,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
super().__init__(
img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)

# File existence check
if not os.path.exists(label_folder) or not os.path.exists(img_folder):
raise FileNotFoundError(
f"unable to locate {label_folder if not os.path.exists(label_folder) else img_folder}"
)

self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
np_dtype = np.float32

img_names = os.listdir(img_folder)
Expand Down Expand Up @@ -95,5 +102,7 @@ def __init__(
crops = crop_bboxes_from_image(img_path=img_path, geoms=box_targets)
for crop, label in zip(crops, labels):
self.data.append((crop, label))
elif detection_task:
self.data.append((img_path, box_targets))
else:
self.data.append((img_path, dict(boxes=box_targets, labels=labels)))
42 changes: 26 additions & 16 deletions doctr/datasets/iiit5k.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class IIIT5K(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -45,6 +46,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
super().__init__(
Expand All @@ -55,14 +57,20 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)

self.train = train

# Load mat data
tmp_root = os.path.join(self.root, "IIIT5K") if self.SHA256 else self.root
mat_file = "trainCharBound" if self.train else "testCharBound"
mat_data = sio.loadmat(os.path.join(tmp_root, f"{mat_file}.mat"))[mat_file][0]

self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
np_dtype = np.float32

for img_path, label, box_targets in tqdm(iterable=mat_data, desc="Unpacking IIIT5K", total=len(mat_data)):
Expand All @@ -73,24 +81,26 @@ def __init__(
if not os.path.exists(os.path.join(tmp_root, _raw_path)):
raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, _raw_path)}")

if use_polygons:
odulcy-mindee marked this conversation as resolved.
Show resolved Hide resolved
# (x, y) coordinates of top left, top right, bottom right, bottom left corners
box_targets = [
[
[box[0], box[1]],
[box[0] + box[2], box[1]],
[box[0] + box[2], box[1] + box[3]],
[box[0], box[1] + box[3]],
]
for box in box_targets
]
else:
# xmin, ymin, xmax, ymax
box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]

if recognition_task:
self.data.append((_raw_path, _raw_label))
elif detection_task:
self.data.append((_raw_path, np.asarray(box_targets, dtype=np_dtype)))
else:
if use_polygons:
# (x, y) coordinates of top left, top right, bottom right, bottom left corners
box_targets = [
[
[box[0], box[1]],
[box[0] + box[2], box[1]],
[box[0] + box[2], box[1] + box[3]],
[box[0], box[1] + box[3]],
]
for box in box_targets
]
else:
# xmin, ymin, xmax, ymax
box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]

# label are casted to list where each char corresponds to the character's bounding box
self.data.append((
_raw_path,
Expand Down
11 changes: 10 additions & 1 deletion doctr/datasets/imgur5k.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class IMGUR5K(AbstractDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `AbstractDataset`.
"""

Expand All @@ -56,17 +57,23 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
super().__init__(
img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)

# File existence check
if not os.path.exists(label_path) or not os.path.exists(img_folder):
raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")

self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
self.train = train
np_dtype = np.float32

Expand Down Expand Up @@ -132,6 +139,8 @@ def __init__(
tmp_img = Image.fromarray(crop)
tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
reco_images_counter += 1
elif detection_task:
self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
else:
self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=labels)))

Expand Down
12 changes: 11 additions & 1 deletion doctr/datasets/sroie.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class SROIE(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -52,6 +53,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
url, sha256, name = self.TRAIN if train else self.TEST
Expand All @@ -63,10 +65,16 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)

self.train = train

tmp_root = os.path.join(self.root, "images")
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
np_dtype = np.float32

for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))):
Expand Down Expand Up @@ -94,6 +102,8 @@ def __init__(
for crop, label in zip(crops, labels):
if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
self.data.append((crop, label))
elif detection_task:
self.data.append((img_path, coords))
else:
self.data.append((img_path, dict(boxes=coords, labels=labels)))

Expand Down
Loading
Loading