Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[datasets] Allow detection task for built-in datasets #1717

Merged
merged 4 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ quality:

# this target runs checks on all files and potentially modifies some of them
style:
ruff check --fix .
ruff format .
ruff check --fix .

# Run tests for the library
test:
Expand Down
19 changes: 17 additions & 2 deletions docs/source/using_doctr/using_datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ This datasets contains the information to train or validate a text detection mod

from doctr.datasets import CORD
# Load straight boxes
train_set = CORD(train=True, download=True)
train_set = CORD(train=True, download=True, detection_task=True)
# Load rotated boxes
train_set = CORD(train=True, download=True, use_polygons=True)
train_set = CORD(train=True, download=True, use_polygons=True, detection_task=True)
img, target = train_set[0]


Expand Down Expand Up @@ -99,6 +99,21 @@ This datasets contains the information to train or validate a text recognition m
img, target = train_set[0]


OCR
^^^

The same datasets as for detection, but the target contains both the bounding boxes and their text labels.

.. code:: python3

from doctr.datasets import CORD
# Load straight boxes
train_set = CORD(train=True, download=True)
# Load rotated boxes
train_set = CORD(train=True, download=True, use_polygons=True)
img, target = train_set[0]


Object Detection
^^^^^^^^^^^^^^^^

Expand Down
11 changes: 10 additions & 1 deletion doctr/datasets/cord.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class CORD(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -53,6 +54,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
url, sha256, name = self.TRAIN if train else self.TEST
Expand All @@ -64,10 +66,15 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)
felixdittrich92 marked this conversation as resolved.
Show resolved Hide resolved

# List images
tmp_root = os.path.join(self.root, "image")
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
self.train = train
np_dtype = np.float32
for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))):
Expand Down Expand Up @@ -109,6 +116,8 @@ def __init__(
)
for crop, label in zip(crops, list(text_targets)):
self.data.append((crop, label))
elif detection_task:
self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0)))
else:
self.data.append((
img_path,
Expand Down
12 changes: 11 additions & 1 deletion doctr/datasets/funsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class FUNSD(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -45,6 +46,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
super().__init__(
Expand All @@ -55,6 +57,12 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this part can be moved in VisionDataset or even in an abstraction above as this configuration is always forbidden.
It'll also reduce the number of copy paste

Copy link
Contributor

@felixT2K felixT2K Sep 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem here is that all datasets inherit from AbstractDataset, but not all datasets provide the functionality to be used for recognition and/or detection — for example, MJSynth is a pure recognition dataset :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So recognition_task / detection_task is only available on the top level .. We could do something like raise_for on VisionDataset but not sure if we really want something 😅

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, so if you don't want to be able to pass recognition_task or detection_task to all AbstractDataset, the code can stay like this, I'm fine with it. My goal was to move the logic "if both variables are set to True, then raise an error, as it's never possible".

self.train = train
np_dtype = np.float32

Expand All @@ -63,7 +71,7 @@ def __init__(

# # List images
tmp_root = os.path.join(self.root, subfolder, "images")
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))):
# File existence check
if not os.path.exists(os.path.join(tmp_root, img_path)):
Expand Down Expand Up @@ -100,6 +108,8 @@ def __init__(
# filter labels with unknown characters
if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]):
self.data.append((crop, label))
elif detection_task:
self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
else:
self.data.append((
img_path,
Expand Down
12 changes: 11 additions & 1 deletion doctr/datasets/ic03.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class IC03(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -51,6 +52,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
url, sha256, file_name = self.TRAIN if train else self.TEST
Expand All @@ -62,8 +64,14 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)

self.train = train
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
np_dtype = np.float32

# Load xml data
Expand Down Expand Up @@ -117,6 +125,8 @@ def __init__(
for crop, label in zip(crops, labels):
if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
self.data.append((crop, label))
elif detection_task:
self.data.append((name.text, boxes))
else:
self.data.append((name.text, dict(boxes=boxes, labels=labels)))

Expand Down
11 changes: 10 additions & 1 deletion doctr/datasets/ic13.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class IC13(AbstractDataset):
label_folder: folder with all annotation files for the images
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `AbstractDataset`.
"""

Expand All @@ -47,19 +48,25 @@ def __init__(
label_folder: str,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
super().__init__(
img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)

# File existence check
if not os.path.exists(label_folder) or not os.path.exists(img_folder):
raise FileNotFoundError(
f"unable to locate {label_folder if not os.path.exists(label_folder) else img_folder}"
)

self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
np_dtype = np.float32

img_names = os.listdir(img_folder)
Expand Down Expand Up @@ -95,5 +102,7 @@ def __init__(
crops = crop_bboxes_from_image(img_path=img_path, geoms=box_targets)
for crop, label in zip(crops, labels):
self.data.append((crop, label))
elif detection_task:
self.data.append((img_path, box_targets))
else:
self.data.append((img_path, dict(boxes=box_targets, labels=labels)))
42 changes: 26 additions & 16 deletions doctr/datasets/iiit5k.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class IIIT5K(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -45,6 +46,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
super().__init__(
Expand All @@ -55,14 +57,20 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)

self.train = train

# Load mat data
tmp_root = os.path.join(self.root, "IIIT5K") if self.SHA256 else self.root
mat_file = "trainCharBound" if self.train else "testCharBound"
mat_data = sio.loadmat(os.path.join(tmp_root, f"{mat_file}.mat"))[mat_file][0]

self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
np_dtype = np.float32

for img_path, label, box_targets in tqdm(iterable=mat_data, desc="Unpacking IIIT5K", total=len(mat_data)):
Expand All @@ -73,24 +81,26 @@ def __init__(
if not os.path.exists(os.path.join(tmp_root, _raw_path)):
raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, _raw_path)}")

if use_polygons:
odulcy-mindee marked this conversation as resolved.
Show resolved Hide resolved
# (x, y) coordinates of top left, top right, bottom right, bottom left corners
box_targets = [
[
[box[0], box[1]],
[box[0] + box[2], box[1]],
[box[0] + box[2], box[1] + box[3]],
[box[0], box[1] + box[3]],
]
for box in box_targets
]
else:
# xmin, ymin, xmax, ymax
box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]

if recognition_task:
self.data.append((_raw_path, _raw_label))
elif detection_task:
self.data.append((_raw_path, np.asarray(box_targets, dtype=np_dtype)))
else:
if use_polygons:
# (x, y) coordinates of top left, top right, bottom right, bottom left corners
box_targets = [
[
[box[0], box[1]],
[box[0] + box[2], box[1]],
[box[0] + box[2], box[1] + box[3]],
[box[0], box[1] + box[3]],
]
for box in box_targets
]
else:
# xmin, ymin, xmax, ymax
box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]

# label are casted to list where each char corresponds to the character's bounding box
self.data.append((
_raw_path,
Expand Down
11 changes: 10 additions & 1 deletion doctr/datasets/imgur5k.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class IMGUR5K(AbstractDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `AbstractDataset`.
"""

Expand All @@ -56,17 +57,23 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
super().__init__(
img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)

# File existence check
if not os.path.exists(label_path) or not os.path.exists(img_folder):
raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")

self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
self.train = train
np_dtype = np.float32

Expand Down Expand Up @@ -132,6 +139,8 @@ def __init__(
tmp_img = Image.fromarray(crop)
tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
reco_images_counter += 1
elif detection_task:
self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
else:
self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=labels)))

Expand Down
12 changes: 11 additions & 1 deletion doctr/datasets/sroie.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class SROIE(VisionDataset):
train: whether the subset should be the training one
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
recognition_task: whether the dataset should be used for recognition task
detection_task: whether the dataset should be used for detection task
**kwargs: keyword arguments from `VisionDataset`.
"""

Expand All @@ -52,6 +53,7 @@ def __init__(
train: bool = True,
use_polygons: bool = False,
recognition_task: bool = False,
detection_task: bool = False,
**kwargs: Any,
) -> None:
url, sha256, name = self.TRAIN if train else self.TEST
Expand All @@ -63,10 +65,16 @@ def __init__(
pre_transforms=convert_target_to_relative if not recognition_task else None,
**kwargs,
)
if recognition_task and detection_task:
raise ValueError(
"recognition_task and detection_task cannot be set to True simultaneously "
+ "to get the whole dataset with boxes and labels leave both to False"
)

self.train = train

tmp_root = os.path.join(self.root, "images")
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
np_dtype = np.float32

for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))):
Expand Down Expand Up @@ -94,6 +102,8 @@ def __init__(
for crop, label in zip(crops, labels):
if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
self.data.append((crop, label))
elif detection_task:
self.data.append((img_path, coords))
else:
self.data.append((img_path, dict(boxes=coords, labels=labels)))

Expand Down
Loading
Loading