diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle
index ab09d33220..2fac877fd9 100644
Binary files a/.doctrees/environment.pickle and b/.doctrees/environment.pickle differ
diff --git a/latest/_modules/doctr/io/image/tensorflow.html b/latest/_modules/doctr/io/image/tensorflow.html
index d13b1c8932..f9faeeab1c 100644
--- a/latest/_modules/doctr/io/image/tensorflow.html
+++ b/latest/_modules/doctr/io/image/tensorflow.html
@@ -302,8 +302,8 @@

Source code for doctr.io.image.tensorflow

 
 import numpy as np
 import tensorflow as tf
-from keras.utils import img_to_array
 from PIL import Image
+from tensorflow.keras.utils import img_to_array
 
 from doctr.utils.common_types import AbstractPath
 
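Every hunk in this diff swaps a `from keras ...` import for its `tensorflow.keras` counterpart, so the documented code resolves to the Keras bundled with TensorFlow 2.x even when a standalone Keras 3 package is on the path. A minimal sketch of the migrated import in use (the 32x32 placeholder image is an assumption for illustration, not taken from the docs):

```python
# Minimal sketch of the tensorflow.keras import path used throughout this diff.
# The blank 32x32 RGB image is a placeholder; any PIL image works the same way.
import tensorflow as tf
from PIL import Image
from tensorflow.keras.utils import img_to_array  # was: from keras.utils import img_to_array

img = Image.new("RGB", (32, 32), color=(255, 255, 255))
arr = img_to_array(img)                  # float32 array of shape (32, 32, 3)
tensor = tf.convert_to_tensor(arr) / 255.0
print(tensor.shape, tensor.dtype)        # (32, 32, 3) <dtype: 'float32'>
```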
diff --git a/latest/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/latest/_modules/doctr/models/classification/magc_resnet/tensorflow.html
index f850c994bc..e181ef6a1f 100644
--- a/latest/_modules/doctr/models/classification/magc_resnet/tensorflow.html
+++ b/latest/_modules/doctr/models/classification/magc_resnet/tensorflow.html
@@ -304,8 +304,8 @@ 

Source code for doctr.models.classification.magc_resnet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import activations, layers
-from keras.models import Sequential
+from tensorflow.keras import activations, layers
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/latest/_modules/doctr/models/classification/mobilenet/tensorflow.html b/latest/_modules/doctr/models/classification/mobilenet/tensorflow.html
index 02fc8802d6..c9545166e7 100644
--- a/latest/_modules/doctr/models/classification/mobilenet/tensorflow.html
+++ b/latest/_modules/doctr/models/classification/mobilenet/tensorflow.html
@@ -304,8 +304,8 @@

Source code for doctr.models.classification.mobilenet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple, Union
 import tensorflow as tf
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential
 from ....datasets import VOCABS
 from ...utils import conv_sequence, load_pretrained_params
diff --git a/latest/_modules/doctr/models/classification/resnet/tensorflow.html b/latest/_modules/doctr/models/classification/resnet/tensorflow.html
index f4bcd65452..620d4f0635 100644
--- a/latest/_modules/doctr/models/classification/resnet/tensorflow.html
+++ b/latest/_modules/doctr/models/classification/resnet/tensorflow.html
@@ -302,9 +302,9 @@

Source code for doctr.models.classification.resnet.tensorflow

 from typing import Any, Callable, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import layers
-from keras.applications import ResNet50
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.applications import ResNet50
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/latest/_modules/doctr/models/classification/textnet/tensorflow.html b/latest/_modules/doctr/models/classification/textnet/tensorflow.html
index 8f38b3470e..407e480818 100644
--- a/latest/_modules/doctr/models/classification/textnet/tensorflow.html
+++ b/latest/_modules/doctr/models/classification/textnet/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.classification.textnet.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 from doctr.datasets import VOCABS
diff --git a/latest/_modules/doctr/models/classification/vgg/tensorflow.html b/latest/_modules/doctr/models/classification/vgg/tensorflow.html
index d6142a8376..66ee6dcdd8 100644
--- a/latest/_modules/doctr/models/classification/vgg/tensorflow.html
+++ b/latest/_modules/doctr/models/classification/vgg/tensorflow.html
@@ -301,8 +301,8 @@

Source code for doctr.models.classification.vgg.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/latest/_modules/doctr/models/classification/vit/tensorflow.html b/latest/_modules/doctr/models/classification/vit/tensorflow.html
index 81ef3d9dcf..7059d1f1d8 100644
--- a/latest/_modules/doctr/models/classification/vit/tensorflow.html
+++ b/latest/_modules/doctr/models/classification/vit/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.classification.vit.tensorflow

 from typing import Any, Dict, Optional, Tuple
 import tensorflow as tf
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import EncoderBlock
diff --git a/latest/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/latest/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
index ba12ebbdf0..1a9eef2bfd 100644
--- a/latest/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
+++ b/latest/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
@@ -305,8 +305,8 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers, losses
-from keras.applications import ResNet50
+from tensorflow.keras import Model, Sequential, layers, losses
+from tensorflow.keras.applications import ResNet50
 from doctr.file_utils import CLASS_NAME
 from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, conv_sequence, load_pretrained_params
diff --git a/latest/_modules/doctr/models/detection/fast/tensorflow.html b/latest/_modules/doctr/models/detection/fast/tensorflow.html
index dc7d8f50f2..af51a9abeb 100644
--- a/latest/_modules/doctr/models/detection/fast/tensorflow.html
+++ b/latest/_modules/doctr/models/detection/fast/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.detection.fast.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers
+from tensorflow.keras import Model, Sequential, layers
 from doctr.file_utils import CLASS_NAME
 from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params
diff --git a/latest/_modules/doctr/models/detection/linknet/tensorflow.html b/latest/_modules/doctr/models/detection/linknet/tensorflow.html
index 14f36e18ed..b0133aee6b 100644
--- a/latest/_modules/doctr/models/detection/linknet/tensorflow.html
+++ b/latest/_modules/doctr/models/detection/linknet/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.detection.linknet.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers, losses
+from tensorflow.keras import Model, Sequential, layers, losses
 from doctr.file_utils import CLASS_NAME
 from doctr.models.classification import resnet18, resnet34, resnet50
diff --git a/latest/_modules/doctr/models/recognition/crnn/tensorflow.html b/latest/_modules/doctr/models/recognition/crnn/tensorflow.html
index b38a069159..efee5f0f8e 100644
--- a/latest/_modules/doctr/models/recognition/crnn/tensorflow.html
+++ b/latest/_modules/doctr/models/recognition/crnn/tensorflow.html
@@ -302,8 +302,8 @@

Source code for doctr.models.recognition.crnn.tensorflow

 from typing import Any, Dict, List, Optional, Tuple, Union
 import tensorflow as tf
-from keras import layers
-from keras.models import Model, Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Model, Sequential
 from doctr.datasets import VOCABS
diff --git a/latest/_modules/doctr/models/recognition/master/tensorflow.html b/latest/_modules/doctr/models/recognition/master/tensorflow.html
index 1b11a994ef..4895fffe08 100644
--- a/latest/_modules/doctr/models/recognition/master/tensorflow.html
+++ b/latest/_modules/doctr/models/recognition/master/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.master.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
 from doctr.models.classification import magc_resnet31
diff --git a/latest/_modules/doctr/models/recognition/parseq/tensorflow.html b/latest/_modules/doctr/models/recognition/parseq/tensorflow.html
index 1bbbf829b1..93a3b2ea81 100644
--- a/latest/_modules/doctr/models/recognition/parseq/tensorflow.html
+++ b/latest/_modules/doctr/models/recognition/parseq/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
@@ -462,7 +462,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)
-    @tf.function
     def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor:
         # Generates permutations of the target sequence.
         # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
@@ -509,7 +508,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         )
         return combined
-    @tf.function
     def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         # Generate source and target mask for the decoder attention.
         sz = permutation.shape[0]
@@ -529,7 +527,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         target_mask = mask[1:, :-1]
         return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool)
-    @tf.function
     def decode(
         self,
         target: tf.Tensor,
diff --git a/latest/_modules/doctr/models/recognition/sar/tensorflow.html b/latest/_modules/doctr/models/recognition/sar/tensorflow.html
index 3f7318d360..b5531f0910 100644
--- a/latest/_modules/doctr/models/recognition/sar/tensorflow.html
+++ b/latest/_modules/doctr/models/recognition/sar/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.sar.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import Model, Sequential, layers
+from tensorflow.keras import Model, Sequential, layers
 from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
diff --git a/latest/_modules/doctr/models/recognition/vitstr/tensorflow.html b/latest/_modules/doctr/models/recognition/vitstr/tensorflow.html
index 23730f6227..aecde3662a 100644
--- a/latest/_modules/doctr/models/recognition/vitstr/tensorflow.html
+++ b/latest/_modules/doctr/models/recognition/vitstr/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
diff --git a/latest/_modules/doctr/transforms/modules/tensorflow.html b/latest/_modules/doctr/transforms/modules/tensorflow.html
index ae4f26eb78..6e825f2776 100644
--- a/latest/_modules/doctr/transforms/modules/tensorflow.html
+++ b/latest/_modules/doctr/transforms/modules/tensorflow.html
@@ -729,7 +729,6 @@

Source code for doctr.transforms.modules.tensorflow

     def extra_repr(self) -> str:
         return f"kernel_shape={self.kernel_shape}, std={self.std}"
-    @tf.function
     def __call__(self, img: tf.Tensor) -> tf.Tensor:
         return tf.squeeze(
             _gaussian_filter(
diff --git a/latest/_sources/using_doctr/using_model_export.rst.txt b/latest/_sources/using_doctr/using_model_export.rst.txt
index 48f570f699..c62c36169b 100644
--- a/latest/_sources/using_doctr/using_model_export.rst.txt
+++ b/latest/_sources/using_doctr/using_model_export.rst.txt
@@ -31,7 +31,7 @@ Advantages:
 .. code:: python3
     import tensorflow as tf
-    from keras import mixed_precision
+    from tensorflow.keras import mixed_precision
     mixed_precision.set_global_policy('mixed_float16')
     predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
diff --git a/latest/using_doctr/using_model_export.html b/latest/using_doctr/using_model_export.html
index d467663403..75c81caa7c 100644
--- a/latest/using_doctr/using_model_export.html
+++ b/latest/using_doctr/using_model_export.html
@@ -316,7 +316,7 @@

Half-precision
import tensorflow as tf
-from keras import mixed_precision
+from tensorflow.keras import mixed_precision
 mixed_precision.set_global_policy('mixed_float16')
 predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
 
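Besides the import migration, the hunks above drop the `@tf.function` decorators from the PARSeq helpers and the Gaussian-blur transform (those methods now run eagerly unless traced by an enclosing `tf.function`), and the half-precision guide now imports `mixed_precision` from `tensorflow.keras`. A rough sketch, assuming only that the policy is set before any model is built, of how the resulting policy can be checked:

```python
# Hedged sketch: set the global policy as in the guide above, then inspect it.
# Under "mixed_float16", layer computations run in float16 while variables stay float32.
import tensorflow as tf
from tensorflow.keras import mixed_precision

mixed_precision.set_global_policy("mixed_float16")
policy = mixed_precision.global_policy()
print(policy.name)              # mixed_float16
print(policy.compute_dtype)     # float16
print(policy.variable_dtype)    # float32

# Any Keras layer created after this point picks up the policy automatically.
layer = tf.keras.layers.Dense(8)
print(layer.dtype_policy.name)  # mixed_float16
```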
diff --git a/v0.1.0/_modules/doctr/io/image/tensorflow.html b/v0.1.0/_modules/doctr/io/image/tensorflow.html
index d13b1c8932..f9faeeab1c 100644
--- a/v0.1.0/_modules/doctr/io/image/tensorflow.html
+++ b/v0.1.0/_modules/doctr/io/image/tensorflow.html
@@ -302,8 +302,8 @@

Source code for doctr.io.image.tensorflow

 
 import numpy as np
 import tensorflow as tf
-from keras.utils import img_to_array
 from PIL import Image
+from tensorflow.keras.utils import img_to_array
 
 from doctr.utils.common_types import AbstractPath
 
diff --git a/v0.1.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.1.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
index f850c994bc..e181ef6a1f 100644
--- a/v0.1.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
@@ -304,8 +304,8 @@ 

Source code for doctr.models.classification.magc_resnet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import activations, layers
-from keras.models import Sequential
+from tensorflow.keras import activations, layers
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.1.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.1.0/_modules/doctr/models/classification/mobilenet/tensorflow.html
index 02fc8802d6..c9545166e7 100644
--- a/v0.1.0/_modules/doctr/models/classification/mobilenet/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/classification/mobilenet/tensorflow.html
@@ -304,8 +304,8 @@

Source code for doctr.models.classification.mobilenet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple, Union
 import tensorflow as tf
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential
 from ....datasets import VOCABS
 from ...utils import conv_sequence, load_pretrained_params
diff --git a/v0.1.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.1.0/_modules/doctr/models/classification/resnet/tensorflow.html
index f4bcd65452..620d4f0635 100644
--- a/v0.1.0/_modules/doctr/models/classification/resnet/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/classification/resnet/tensorflow.html
@@ -302,9 +302,9 @@

Source code for doctr.models.classification.resnet.tensorflow

 from typing import Any, Callable, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import layers
-from keras.applications import ResNet50
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.applications import ResNet50
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.1.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.1.0/_modules/doctr/models/classification/textnet/tensorflow.html
index 8f38b3470e..407e480818 100644
--- a/v0.1.0/_modules/doctr/models/classification/textnet/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/classification/textnet/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.classification.textnet.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 from doctr.datasets import VOCABS
diff --git a/v0.1.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.1.0/_modules/doctr/models/classification/vgg/tensorflow.html
index d6142a8376..66ee6dcdd8 100644
--- a/v0.1.0/_modules/doctr/models/classification/vgg/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/classification/vgg/tensorflow.html
@@ -301,8 +301,8 @@

Source code for doctr.models.classification.vgg.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.1.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.1.0/_modules/doctr/models/classification/vit/tensorflow.html
index 81ef3d9dcf..7059d1f1d8 100644
--- a/v0.1.0/_modules/doctr/models/classification/vit/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/classification/vit/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.classification.vit.tensorflow

 from typing import Any, Dict, Optional, Tuple
 import tensorflow as tf
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import EncoderBlock
diff --git a/v0.1.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.1.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
index ba12ebbdf0..1a9eef2bfd 100644
--- a/v0.1.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
@@ -305,8 +305,8 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers, losses
-from keras.applications import ResNet50
+from tensorflow.keras import Model, Sequential, layers, losses
+from tensorflow.keras.applications import ResNet50
 from doctr.file_utils import CLASS_NAME
 from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, conv_sequence, load_pretrained_params
diff --git a/v0.1.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.1.0/_modules/doctr/models/detection/fast/tensorflow.html
index dc7d8f50f2..af51a9abeb 100644
--- a/v0.1.0/_modules/doctr/models/detection/fast/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/detection/fast/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.detection.fast.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers
+from tensorflow.keras import Model, Sequential, layers
 from doctr.file_utils import CLASS_NAME
 from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params
diff --git a/v0.1.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.1.0/_modules/doctr/models/detection/linknet/tensorflow.html
index 14f36e18ed..b0133aee6b 100644
--- a/v0.1.0/_modules/doctr/models/detection/linknet/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/detection/linknet/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.detection.linknet.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers, losses
+from tensorflow.keras import Model, Sequential, layers, losses
 from doctr.file_utils import CLASS_NAME
 from doctr.models.classification import resnet18, resnet34, resnet50
diff --git a/v0.1.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.1.0/_modules/doctr/models/recognition/crnn/tensorflow.html
index b38a069159..efee5f0f8e 100644
--- a/v0.1.0/_modules/doctr/models/recognition/crnn/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/recognition/crnn/tensorflow.html
@@ -302,8 +302,8 @@

Source code for doctr.models.recognition.crnn.tensorflow

 from typing import Any, Dict, List, Optional, Tuple, Union
 import tensorflow as tf
-from keras import layers
-from keras.models import Model, Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Model, Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.1.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.1.0/_modules/doctr/models/recognition/master/tensorflow.html
index 1b11a994ef..4895fffe08 100644
--- a/v0.1.0/_modules/doctr/models/recognition/master/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/recognition/master/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.master.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
 from doctr.models.classification import magc_resnet31
diff --git a/v0.1.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.1.0/_modules/doctr/models/recognition/parseq/tensorflow.html
index 1bbbf829b1..93a3b2ea81 100644
--- a/v0.1.0/_modules/doctr/models/recognition/parseq/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/recognition/parseq/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
@@ -462,7 +462,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)
-    @tf.function
     def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor:
         # Generates permutations of the target sequence.
         # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
@@ -509,7 +508,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         )
         return combined
-    @tf.function
     def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         # Generate source and target mask for the decoder attention.
         sz = permutation.shape[0]
@@ -529,7 +527,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         target_mask = mask[1:, :-1]
         return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool)
-    @tf.function
     def decode(
         self,
         target: tf.Tensor,
diff --git a/v0.1.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.1.0/_modules/doctr/models/recognition/sar/tensorflow.html
index 3f7318d360..b5531f0910 100644
--- a/v0.1.0/_modules/doctr/models/recognition/sar/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/recognition/sar/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.sar.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import Model, Sequential, layers
+from tensorflow.keras import Model, Sequential, layers
 from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
diff --git a/v0.1.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.1.0/_modules/doctr/models/recognition/vitstr/tensorflow.html
index 23730f6227..aecde3662a 100644
--- a/v0.1.0/_modules/doctr/models/recognition/vitstr/tensorflow.html
+++ b/v0.1.0/_modules/doctr/models/recognition/vitstr/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
diff --git a/v0.1.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.1.0/_modules/doctr/transforms/modules/tensorflow.html
index ae4f26eb78..6e825f2776 100644
--- a/v0.1.0/_modules/doctr/transforms/modules/tensorflow.html
+++ b/v0.1.0/_modules/doctr/transforms/modules/tensorflow.html
@@ -729,7 +729,6 @@

Source code for doctr.transforms.modules.tensorflow

     def extra_repr(self) -> str:
         return f"kernel_shape={self.kernel_shape}, std={self.std}"
-    @tf.function
     def __call__(self, img: tf.Tensor) -> tf.Tensor:
         return tf.squeeze(
             _gaussian_filter(
diff --git a/v0.1.0/_sources/using_doctr/using_model_export.rst.txt b/v0.1.0/_sources/using_doctr/using_model_export.rst.txt
index 48f570f699..c62c36169b 100644
--- a/v0.1.0/_sources/using_doctr/using_model_export.rst.txt
+++ b/v0.1.0/_sources/using_doctr/using_model_export.rst.txt
@@ -31,7 +31,7 @@ Advantages:
 .. code:: python3
     import tensorflow as tf
-    from keras import mixed_precision
+    from tensorflow.keras import mixed_precision
     mixed_precision.set_global_policy('mixed_float16')
     predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
diff --git a/v0.1.0/using_doctr/using_model_export.html b/v0.1.0/using_doctr/using_model_export.html
index d467663403..75c81caa7c 100644
--- a/v0.1.0/using_doctr/using_model_export.html
+++ b/v0.1.0/using_doctr/using_model_export.html
@@ -316,7 +316,7 @@

Half-precision
import tensorflow as tf
-from keras import mixed_precision
+from tensorflow.keras import mixed_precision
 mixed_precision.set_global_policy('mixed_float16')
 predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
 
diff --git a/v0.1.1/_modules/doctr/io/image/tensorflow.html b/v0.1.1/_modules/doctr/io/image/tensorflow.html
index d13b1c8932..f9faeeab1c 100644
--- a/v0.1.1/_modules/doctr/io/image/tensorflow.html
+++ b/v0.1.1/_modules/doctr/io/image/tensorflow.html
@@ -302,8 +302,8 @@

Source code for doctr.io.image.tensorflow

 
 import numpy as np
 import tensorflow as tf
-from keras.utils import img_to_array
 from PIL import Image
+from tensorflow.keras.utils import img_to_array
 
 from doctr.utils.common_types import AbstractPath
 
diff --git a/v0.1.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.1.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html
index f850c994bc..e181ef6a1f 100644
--- a/v0.1.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html
@@ -304,8 +304,8 @@ 

Source code for doctr.models.classification.magc_resnet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import activations, layers
-from keras.models import Sequential
+from tensorflow.keras import activations, layers
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.1.1/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.1.1/_modules/doctr/models/classification/mobilenet/tensorflow.html
index 02fc8802d6..c9545166e7 100644
--- a/v0.1.1/_modules/doctr/models/classification/mobilenet/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/classification/mobilenet/tensorflow.html
@@ -304,8 +304,8 @@

Source code for doctr.models.classification.mobilenet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple, Union
 import tensorflow as tf
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential
 from ....datasets import VOCABS
 from ...utils import conv_sequence, load_pretrained_params
diff --git a/v0.1.1/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.1.1/_modules/doctr/models/classification/resnet/tensorflow.html
index f4bcd65452..620d4f0635 100644
--- a/v0.1.1/_modules/doctr/models/classification/resnet/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/classification/resnet/tensorflow.html
@@ -302,9 +302,9 @@

Source code for doctr.models.classification.resnet.tensorflow

 from typing import Any, Callable, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import layers
-from keras.applications import ResNet50
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.applications import ResNet50
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.1.1/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.1.1/_modules/doctr/models/classification/textnet/tensorflow.html
index 8f38b3470e..407e480818 100644
--- a/v0.1.1/_modules/doctr/models/classification/textnet/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/classification/textnet/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.classification.textnet.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 from doctr.datasets import VOCABS
diff --git a/v0.1.1/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.1.1/_modules/doctr/models/classification/vgg/tensorflow.html
index d6142a8376..66ee6dcdd8 100644
--- a/v0.1.1/_modules/doctr/models/classification/vgg/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/classification/vgg/tensorflow.html
@@ -301,8 +301,8 @@

Source code for doctr.models.classification.vgg.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.1.1/_modules/doctr/models/classification/vit/tensorflow.html b/v0.1.1/_modules/doctr/models/classification/vit/tensorflow.html
index 81ef3d9dcf..7059d1f1d8 100644
--- a/v0.1.1/_modules/doctr/models/classification/vit/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/classification/vit/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.classification.vit.tensorflow

 from typing import Any, Dict, Optional, Tuple
 import tensorflow as tf
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import EncoderBlock
diff --git a/v0.1.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.1.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
index ba12ebbdf0..1a9eef2bfd 100644
--- a/v0.1.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
@@ -305,8 +305,8 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers, losses
-from keras.applications import ResNet50
+from tensorflow.keras import Model, Sequential, layers, losses
+from tensorflow.keras.applications import ResNet50
 from doctr.file_utils import CLASS_NAME
 from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, conv_sequence, load_pretrained_params
diff --git a/v0.1.1/_modules/doctr/models/detection/fast/tensorflow.html b/v0.1.1/_modules/doctr/models/detection/fast/tensorflow.html
index dc7d8f50f2..af51a9abeb 100644
--- a/v0.1.1/_modules/doctr/models/detection/fast/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/detection/fast/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.detection.fast.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers
+from tensorflow.keras import Model, Sequential, layers
 from doctr.file_utils import CLASS_NAME
 from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params
diff --git a/v0.1.1/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.1.1/_modules/doctr/models/detection/linknet/tensorflow.html
index 14f36e18ed..b0133aee6b 100644
--- a/v0.1.1/_modules/doctr/models/detection/linknet/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/detection/linknet/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.detection.linknet.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers, losses
+from tensorflow.keras import Model, Sequential, layers, losses
 from doctr.file_utils import CLASS_NAME
 from doctr.models.classification import resnet18, resnet34, resnet50
diff --git a/v0.1.1/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.1.1/_modules/doctr/models/recognition/crnn/tensorflow.html
index b38a069159..efee5f0f8e 100644
--- a/v0.1.1/_modules/doctr/models/recognition/crnn/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/recognition/crnn/tensorflow.html
@@ -302,8 +302,8 @@

Source code for doctr.models.recognition.crnn.tensorflow

 from typing import Any, Dict, List, Optional, Tuple, Union
 import tensorflow as tf
-from keras import layers
-from keras.models import Model, Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Model, Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.1.1/_modules/doctr/models/recognition/master/tensorflow.html b/v0.1.1/_modules/doctr/models/recognition/master/tensorflow.html
index 1b11a994ef..4895fffe08 100644
--- a/v0.1.1/_modules/doctr/models/recognition/master/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/recognition/master/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.master.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
 from doctr.models.classification import magc_resnet31
diff --git a/v0.1.1/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.1.1/_modules/doctr/models/recognition/parseq/tensorflow.html
index 1bbbf829b1..93a3b2ea81 100644
--- a/v0.1.1/_modules/doctr/models/recognition/parseq/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/recognition/parseq/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
@@ -462,7 +462,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)
-    @tf.function
     def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor:
         # Generates permutations of the target sequence.
         # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
@@ -509,7 +508,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         )
         return combined
-    @tf.function
     def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         # Generate source and target mask for the decoder attention.
         sz = permutation.shape[0]
@@ -529,7 +527,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         target_mask = mask[1:, :-1]
         return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool)
-    @tf.function
     def decode(
         self,
         target: tf.Tensor,
diff --git a/v0.1.1/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.1.1/_modules/doctr/models/recognition/sar/tensorflow.html
index 3f7318d360..b5531f0910 100644
--- a/v0.1.1/_modules/doctr/models/recognition/sar/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/recognition/sar/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.sar.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import Model, Sequential, layers
+from tensorflow.keras import Model, Sequential, layers
 from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
diff --git a/v0.1.1/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.1.1/_modules/doctr/models/recognition/vitstr/tensorflow.html
index 23730f6227..aecde3662a 100644
--- a/v0.1.1/_modules/doctr/models/recognition/vitstr/tensorflow.html
+++ b/v0.1.1/_modules/doctr/models/recognition/vitstr/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
diff --git a/v0.1.1/_modules/doctr/transforms/modules/tensorflow.html b/v0.1.1/_modules/doctr/transforms/modules/tensorflow.html
index ae4f26eb78..6e825f2776 100644
--- a/v0.1.1/_modules/doctr/transforms/modules/tensorflow.html
+++ b/v0.1.1/_modules/doctr/transforms/modules/tensorflow.html
@@ -729,7 +729,6 @@

Source code for doctr.transforms.modules.tensorflow

     def extra_repr(self) -> str:
         return f"kernel_shape={self.kernel_shape}, std={self.std}"
-    @tf.function
     def __call__(self, img: tf.Tensor) -> tf.Tensor:
         return tf.squeeze(
             _gaussian_filter(
diff --git a/v0.1.1/_sources/using_doctr/using_model_export.rst.txt b/v0.1.1/_sources/using_doctr/using_model_export.rst.txt
index 48f570f699..c62c36169b 100644
--- a/v0.1.1/_sources/using_doctr/using_model_export.rst.txt
+++ b/v0.1.1/_sources/using_doctr/using_model_export.rst.txt
@@ -31,7 +31,7 @@ Advantages:
 .. code:: python3
     import tensorflow as tf
-    from keras import mixed_precision
+    from tensorflow.keras import mixed_precision
     mixed_precision.set_global_policy('mixed_float16')
     predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
diff --git a/v0.1.1/using_doctr/using_model_export.html b/v0.1.1/using_doctr/using_model_export.html
index d467663403..75c81caa7c 100644
--- a/v0.1.1/using_doctr/using_model_export.html
+++ b/v0.1.1/using_doctr/using_model_export.html
@@ -316,7 +316,7 @@

Half-precision
import tensorflow as tf
-from keras import mixed_precision
+from tensorflow.keras import mixed_precision
 mixed_precision.set_global_policy('mixed_float16')
 predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
 
diff --git a/v0.2.0/_modules/doctr/io/image/tensorflow.html b/v0.2.0/_modules/doctr/io/image/tensorflow.html
index d13b1c8932..f9faeeab1c 100644
--- a/v0.2.0/_modules/doctr/io/image/tensorflow.html
+++ b/v0.2.0/_modules/doctr/io/image/tensorflow.html
@@ -302,8 +302,8 @@

Source code for doctr.io.image.tensorflow

 
 import numpy as np
 import tensorflow as tf
-from keras.utils import img_to_array
 from PIL import Image
+from tensorflow.keras.utils import img_to_array
 
 from doctr.utils.common_types import AbstractPath
 
diff --git a/v0.2.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.2.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
index f850c994bc..e181ef6a1f 100644
--- a/v0.2.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
@@ -304,8 +304,8 @@ 

Source code for doctr.models.classification.magc_resnet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import activations, layers
-from keras.models import Sequential
+from tensorflow.keras import activations, layers
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.2.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.2.0/_modules/doctr/models/classification/mobilenet/tensorflow.html
index 02fc8802d6..c9545166e7 100644
--- a/v0.2.0/_modules/doctr/models/classification/mobilenet/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/classification/mobilenet/tensorflow.html
@@ -304,8 +304,8 @@

Source code for doctr.models.classification.mobilenet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple, Union
 import tensorflow as tf
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential
 from ....datasets import VOCABS
 from ...utils import conv_sequence, load_pretrained_params
diff --git a/v0.2.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.2.0/_modules/doctr/models/classification/resnet/tensorflow.html
index f4bcd65452..620d4f0635 100644
--- a/v0.2.0/_modules/doctr/models/classification/resnet/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/classification/resnet/tensorflow.html
@@ -302,9 +302,9 @@

Source code for doctr.models.classification.resnet.tensorflow

 from typing import Any, Callable, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import layers
-from keras.applications import ResNet50
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.applications import ResNet50
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.2.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.2.0/_modules/doctr/models/classification/textnet/tensorflow.html
index 8f38b3470e..407e480818 100644
--- a/v0.2.0/_modules/doctr/models/classification/textnet/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/classification/textnet/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.classification.textnet.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 from doctr.datasets import VOCABS
diff --git a/v0.2.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.2.0/_modules/doctr/models/classification/vgg/tensorflow.html
index d6142a8376..66ee6dcdd8 100644
--- a/v0.2.0/_modules/doctr/models/classification/vgg/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/classification/vgg/tensorflow.html
@@ -301,8 +301,8 @@

Source code for doctr.models.classification.vgg.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.2.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.2.0/_modules/doctr/models/classification/vit/tensorflow.html
index 81ef3d9dcf..7059d1f1d8 100644
--- a/v0.2.0/_modules/doctr/models/classification/vit/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/classification/vit/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.classification.vit.tensorflow

 from typing import Any, Dict, Optional, Tuple
 import tensorflow as tf
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import EncoderBlock
diff --git a/v0.2.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.2.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
index ba12ebbdf0..1a9eef2bfd 100644
--- a/v0.2.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
@@ -305,8 +305,8 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers, losses
-from keras.applications import ResNet50
+from tensorflow.keras import Model, Sequential, layers, losses
+from tensorflow.keras.applications import ResNet50
 from doctr.file_utils import CLASS_NAME
 from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, conv_sequence, load_pretrained_params
diff --git a/v0.2.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.2.0/_modules/doctr/models/detection/fast/tensorflow.html
index dc7d8f50f2..af51a9abeb 100644
--- a/v0.2.0/_modules/doctr/models/detection/fast/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/detection/fast/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.detection.fast.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers
+from tensorflow.keras import Model, Sequential, layers
 from doctr.file_utils import CLASS_NAME
 from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params
diff --git a/v0.2.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.2.0/_modules/doctr/models/detection/linknet/tensorflow.html
index 14f36e18ed..b0133aee6b 100644
--- a/v0.2.0/_modules/doctr/models/detection/linknet/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/detection/linknet/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.detection.linknet.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers, losses
+from tensorflow.keras import Model, Sequential, layers, losses
 from doctr.file_utils import CLASS_NAME
 from doctr.models.classification import resnet18, resnet34, resnet50
diff --git a/v0.2.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.2.0/_modules/doctr/models/recognition/crnn/tensorflow.html
index b38a069159..efee5f0f8e 100644
--- a/v0.2.0/_modules/doctr/models/recognition/crnn/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/recognition/crnn/tensorflow.html
@@ -302,8 +302,8 @@

Source code for doctr.models.recognition.crnn.tensorflow

 from typing import Any, Dict, List, Optional, Tuple, Union
 import tensorflow as tf
-from keras import layers
-from keras.models import Model, Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Model, Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.2.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.2.0/_modules/doctr/models/recognition/master/tensorflow.html
index 1b11a994ef..4895fffe08 100644
--- a/v0.2.0/_modules/doctr/models/recognition/master/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/recognition/master/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.master.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
 from doctr.models.classification import magc_resnet31
diff --git a/v0.2.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.2.0/_modules/doctr/models/recognition/parseq/tensorflow.html
index 1bbbf829b1..93a3b2ea81 100644
--- a/v0.2.0/_modules/doctr/models/recognition/parseq/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/recognition/parseq/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
@@ -462,7 -462,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)
-    @tf.function
     def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor:
         # Generates permutations of the target sequence.
         # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
@@ -509,7 +508,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         )
         return combined
-    @tf.function
     def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         # Generate source and target mask for the decoder attention.
         sz = permutation.shape[0]
@@ -529,7 +527,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         target_mask = mask[1:, :-1]
         return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool)
-    @tf.function
     def decode(
         self,
         target: tf.Tensor,
diff --git a/v0.2.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.2.0/_modules/doctr/models/recognition/sar/tensorflow.html
index 3f7318d360..b5531f0910 100644
--- a/v0.2.0/_modules/doctr/models/recognition/sar/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/recognition/sar/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.sar.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import Model, Sequential, layers
+from tensorflow.keras import Model, Sequential, layers
 from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
diff --git a/v0.2.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.2.0/_modules/doctr/models/recognition/vitstr/tensorflow.html
index 23730f6227..aecde3662a 100644
--- a/v0.2.0/_modules/doctr/models/recognition/vitstr/tensorflow.html
+++ b/v0.2.0/_modules/doctr/models/recognition/vitstr/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
diff --git a/v0.2.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.2.0/_modules/doctr/transforms/modules/tensorflow.html
index ae4f26eb78..6e825f2776 100644
--- a/v0.2.0/_modules/doctr/transforms/modules/tensorflow.html
+++ b/v0.2.0/_modules/doctr/transforms/modules/tensorflow.html
@@ -729,7 +729,6 @@

Source code for doctr.transforms.modules.tensorflow

     def extra_repr(self) -> str:
         return f"kernel_shape={self.kernel_shape}, std={self.std}"
-    @tf.function
     def __call__(self, img: tf.Tensor) -> tf.Tensor:
         return tf.squeeze(
             _gaussian_filter(
diff --git a/v0.2.0/_sources/using_doctr/using_model_export.rst.txt b/v0.2.0/_sources/using_doctr/using_model_export.rst.txt
index 48f570f699..c62c36169b 100644
--- a/v0.2.0/_sources/using_doctr/using_model_export.rst.txt
+++ b/v0.2.0/_sources/using_doctr/using_model_export.rst.txt
@@ -31,7 +31,7 @@ Advantages:
 .. code:: python3
     import tensorflow as tf
-    from keras import mixed_precision
+    from tensorflow.keras import mixed_precision
     mixed_precision.set_global_policy('mixed_float16')
     predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
diff --git a/v0.2.0/using_doctr/using_model_export.html b/v0.2.0/using_doctr/using_model_export.html
index d467663403..75c81caa7c 100644
--- a/v0.2.0/using_doctr/using_model_export.html
+++ b/v0.2.0/using_doctr/using_model_export.html
@@ -316,7 +316,7 @@

Half-precision
import tensorflow as tf
-from keras import mixed_precision
+from tensorflow.keras import mixed_precision
 mixed_precision.set_global_policy('mixed_float16')
 predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
 
diff --git a/v0.2.1/_modules/doctr/io/image/tensorflow.html b/v0.2.1/_modules/doctr/io/image/tensorflow.html
index d13b1c8932..f9faeeab1c 100644
--- a/v0.2.1/_modules/doctr/io/image/tensorflow.html
+++ b/v0.2.1/_modules/doctr/io/image/tensorflow.html
@@ -302,8 +302,8 @@

Source code for doctr.io.image.tensorflow

 
 import numpy as np
 import tensorflow as tf
-from keras.utils import img_to_array
 from PIL import Image
+from tensorflow.keras.utils import img_to_array
 
 from doctr.utils.common_types import AbstractPath
 
diff --git a/v0.2.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.2.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html
index f850c994bc..e181ef6a1f 100644
--- a/v0.2.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html
@@ -304,8 +304,8 @@ 

Source code for doctr.models.classification.magc_resnet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import activations, layers
-from keras.models import Sequential
+from tensorflow.keras import activations, layers
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.2.1/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.2.1/_modules/doctr/models/classification/mobilenet/tensorflow.html
index 02fc8802d6..c9545166e7 100644
--- a/v0.2.1/_modules/doctr/models/classification/mobilenet/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/classification/mobilenet/tensorflow.html
@@ -304,8 +304,8 @@

Source code for doctr.models.classification.mobilenet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple, Union
 import tensorflow as tf
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential
 from ....datasets import VOCABS
 from ...utils import conv_sequence, load_pretrained_params
diff --git a/v0.2.1/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.2.1/_modules/doctr/models/classification/resnet/tensorflow.html
index f4bcd65452..620d4f0635 100644
--- a/v0.2.1/_modules/doctr/models/classification/resnet/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/classification/resnet/tensorflow.html
@@ -302,9 +302,9 @@

Source code for doctr.models.classification.resnet.tensorflow

 from typing import Any, Callable, Dict, List, Optional, Tuple
 import tensorflow as tf
-from keras import layers
-from keras.applications import ResNet50
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.applications import ResNet50
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.2.1/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.2.1/_modules/doctr/models/classification/textnet/tensorflow.html
index 8f38b3470e..407e480818 100644
--- a/v0.2.1/_modules/doctr/models/classification/textnet/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/classification/textnet/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.classification.textnet.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 from doctr.datasets import VOCABS
diff --git a/v0.2.1/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.2.1/_modules/doctr/models/classification/vgg/tensorflow.html
index d6142a8376..66ee6dcdd8 100644
--- a/v0.2.1/_modules/doctr/models/classification/vgg/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/classification/vgg/tensorflow.html
@@ -301,8 +301,8 @@

Source code for doctr.models.classification.vgg.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential
 from doctr.datasets import VOCABS
diff --git a/v0.2.1/_modules/doctr/models/classification/vit/tensorflow.html b/v0.2.1/_modules/doctr/models/classification/vit/tensorflow.html
index 81ef3d9dcf..7059d1f1d8 100644
--- a/v0.2.1/_modules/doctr/models/classification/vit/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/classification/vit/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.classification.vit.tensorflow

 from typing import Any, Dict, Optional, Tuple
 import tensorflow as tf
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import EncoderBlock
diff --git a/v0.2.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.2.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
index ba12ebbdf0..1a9eef2bfd 100644
--- a/v0.2.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
@@ -305,8 +305,8 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers, losses
-from keras.applications import ResNet50
+from tensorflow.keras import Model, Sequential, layers, losses
+from tensorflow.keras.applications import ResNet50
 from doctr.file_utils import CLASS_NAME
 from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, conv_sequence, load_pretrained_params
diff --git a/v0.2.1/_modules/doctr/models/detection/fast/tensorflow.html b/v0.2.1/_modules/doctr/models/detection/fast/tensorflow.html
index dc7d8f50f2..af51a9abeb 100644
--- a/v0.2.1/_modules/doctr/models/detection/fast/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/detection/fast/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.detection.fast.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers
+from tensorflow.keras import Model, Sequential, layers

 from doctr.file_utils import CLASS_NAME
 from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params
diff --git a/v0.2.1/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.2.1/_modules/doctr/models/detection/linknet/tensorflow.html
index 14f36e18ed..b0133aee6b 100644
--- a/v0.2.1/_modules/doctr/models/detection/linknet/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/detection/linknet/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.detection.linknet.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers, losses
+from tensorflow.keras import Model, Sequential, layers, losses

 from doctr.file_utils import CLASS_NAME
 from doctr.models.classification import resnet18, resnet34, resnet50
diff --git a/v0.2.1/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.2.1/_modules/doctr/models/recognition/crnn/tensorflow.html
index b38a069159..efee5f0f8e 100644
--- a/v0.2.1/_modules/doctr/models/recognition/crnn/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/recognition/crnn/tensorflow.html
@@ -302,8 +302,8 @@

Source code for doctr.models.recognition.crnn.tensorflow

 from typing import Any, Dict, List, Optional, Tuple, Union

 import tensorflow as tf
-from keras import layers
-from keras.models import Model, Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Model, Sequential

 from doctr.datasets import VOCABS
diff --git a/v0.2.1/_modules/doctr/models/recognition/master/tensorflow.html b/v0.2.1/_modules/doctr/models/recognition/master/tensorflow.html
index 1b11a994ef..4895fffe08 100644
--- a/v0.2.1/_modules/doctr/models/recognition/master/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/recognition/master/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.master.tensorflow

 from typing import Any, Dict, List, Optional, Tuple

 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers

 from doctr.datasets import VOCABS
 from doctr.models.classification import magc_resnet31
diff --git a/v0.2.1/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.2.1/_modules/doctr/models/recognition/parseq/tensorflow.html
index 1bbbf829b1..93a3b2ea81 100644
--- a/v0.2.1/_modules/doctr/models/recognition/parseq/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/recognition/parseq/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers

 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
@@ -462,7 +462,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)

-    @tf.function
     def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor:
         # Generates permutations of the target sequence.
         # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
@@ -509,7 +508,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         )
         return combined

-    @tf.function
     def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         # Generate source and target mask for the decoder attention.
         sz = permutation.shape[0]
@@ -529,7 +527,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         target_mask = mask[1:, :-1]
         return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool)

-    @tf.function
     def decode(
         self,
         target: tf.Tensor,
diff --git a/v0.2.1/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.2.1/_modules/doctr/models/recognition/sar/tensorflow.html
index 3f7318d360..b5531f0910 100644
--- a/v0.2.1/_modules/doctr/models/recognition/sar/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/recognition/sar/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.sar.tensorflow

 from typing import Any, Dict, List, Optional, Tuple

 import tensorflow as tf
-from keras import Model, Sequential, layers
+from tensorflow.keras import Model, Sequential, layers

 from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
diff --git a/v0.2.1/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.2.1/_modules/doctr/models/recognition/vitstr/tensorflow.html
index 23730f6227..aecde3662a 100644
--- a/v0.2.1/_modules/doctr/models/recognition/vitstr/tensorflow.html
+++ b/v0.2.1/_modules/doctr/models/recognition/vitstr/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

 from typing import Any, Dict, List, Optional, Tuple

 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers

 from doctr.datasets import VOCABS
diff --git a/v0.2.1/_modules/doctr/transforms/modules/tensorflow.html b/v0.2.1/_modules/doctr/transforms/modules/tensorflow.html
index ae4f26eb78..6e825f2776 100644
--- a/v0.2.1/_modules/doctr/transforms/modules/tensorflow.html
+++ b/v0.2.1/_modules/doctr/transforms/modules/tensorflow.html
@@ -729,7 +729,6 @@

Source code for doctr.transforms.modules.tensorflow

     def extra_repr(self) -> str:
         return f"kernel_shape={self.kernel_shape}, std={self.std}"

-    @tf.function
     def __call__(self, img: tf.Tensor) -> tf.Tensor:
         return tf.squeeze(
             _gaussian_filter(
diff --git a/v0.2.1/_sources/using_doctr/using_model_export.rst.txt b/v0.2.1/_sources/using_doctr/using_model_export.rst.txt
index 48f570f699..c62c36169b 100644
--- a/v0.2.1/_sources/using_doctr/using_model_export.rst.txt
+++ b/v0.2.1/_sources/using_doctr/using_model_export.rst.txt
@@ -31,7 +31,7 @@ Advantages:
 .. code:: python3

     import tensorflow as tf
-    from keras import mixed_precision
+    from tensorflow.keras import mixed_precision
     mixed_precision.set_global_policy('mixed_float16')
     predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
diff --git a/v0.2.1/using_doctr/using_model_export.html b/v0.2.1/using_doctr/using_model_export.html
index d467663403..75c81caa7c 100644
--- a/v0.2.1/using_doctr/using_model_export.html
+++ b/v0.2.1/using_doctr/using_model_export.html
@@ -316,7 +316,7 @@

Half-precision
import tensorflow as tf
-from keras import mixed_precision
+from tensorflow.keras import mixed_precision
 mixed_precision.set_global_policy('mixed_float16')
 predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
 
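The recurring change in these pages is the same one-line import migration: imports now target the Keras API bundled with TensorFlow rather than the standalone keras package. A minimal sketch of the pattern, assuming TensorFlow 2.x with its bundled Keras (the module names below are illustrative, not taken from a specific hunk):

    # Before: standalone-keras style import
    # from keras import layers, mixed_precision

    # After: Keras as shipped with TensorFlow
    from tensorflow.keras import layers, mixed_precision

    mixed_precision.set_global_policy('mixed_float16')
    print(mixed_precision.global_policy())  # -> mixed_float16
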
diff --git a/v0.3.0/_modules/doctr/io/image/tensorflow.html b/v0.3.0/_modules/doctr/io/image/tensorflow.html
index d13b1c8932..f9faeeab1c 100644
--- a/v0.3.0/_modules/doctr/io/image/tensorflow.html
+++ b/v0.3.0/_modules/doctr/io/image/tensorflow.html
@@ -302,8 +302,8 @@

Source code for doctr.io.image.tensorflow

 
 import numpy as np
 import tensorflow as tf
-from keras.utils import img_to_array
 from PIL import Image
+from tensorflow.keras.utils import img_to_array
 
 from doctr.utils.common_types import AbstractPath
 
diff --git a/v0.3.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.3.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
index f850c994bc..e181ef6a1f 100644
--- a/v0.3.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
+++ b/v0.3.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
@@ -304,8 +304,8 @@ 

Source code for doctr.models.classification.magc_resnet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple

 import tensorflow as tf
-from keras import activations, layers
-from keras.models import Sequential
+from tensorflow.keras import activations, layers
+from tensorflow.keras.models import Sequential

 from doctr.datasets import VOCABS
diff --git a/v0.3.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.3.0/_modules/doctr/models/classification/mobilenet/tensorflow.html
index 02fc8802d6..c9545166e7 100644
--- a/v0.3.0/_modules/doctr/models/classification/mobilenet/tensorflow.html
+++ b/v0.3.0/_modules/doctr/models/classification/mobilenet/tensorflow.html
@@ -304,8 +304,8 @@

Source code for doctr.models.classification.mobilenet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple, Union

 import tensorflow as tf
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential

 from ....datasets import VOCABS
 from ...utils import conv_sequence, load_pretrained_params
diff --git a/v0.3.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.3.0/_modules/doctr/models/classification/resnet/tensorflow.html
index f4bcd65452..620d4f0635 100644
--- a/v0.3.0/_modules/doctr/models/classification/resnet/tensorflow.html
+++ b/v0.3.0/_modules/doctr/models/classification/resnet/tensorflow.html
@@ -302,9 +302,9 @@

Source code for doctr.models.classification.resnet.tensorflow

 from typing import Any, Callable, Dict, List, Optional, Tuple

 import tensorflow as tf
-from keras import layers
-from keras.applications import ResNet50
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.applications import ResNet50
+from tensorflow.keras.models import Sequential

 from doctr.datasets import VOCABS
diff --git a/v0.3.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.3.0/_modules/doctr/models/classification/textnet/tensorflow.html
index 8f38b3470e..407e480818 100644
--- a/v0.3.0/_modules/doctr/models/classification/textnet/tensorflow.html
+++ b/v0.3.0/_modules/doctr/models/classification/textnet/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.classification.textnet.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple

-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers

 from doctr.datasets import VOCABS
diff --git a/v0.3.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.3.0/_modules/doctr/models/classification/vgg/tensorflow.html
index d6142a8376..66ee6dcdd8 100644
--- a/v0.3.0/_modules/doctr/models/classification/vgg/tensorflow.html
+++ b/v0.3.0/_modules/doctr/models/classification/vgg/tensorflow.html
@@ -301,8 +301,8 @@

Source code for doctr.models.classification.vgg.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple

-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential

 from doctr.datasets import VOCABS
diff --git a/v0.3.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.3.0/_modules/doctr/models/classification/vit/tensorflow.html
index 81ef3d9dcf..7059d1f1d8 100644
--- a/v0.3.0/_modules/doctr/models/classification/vit/tensorflow.html
+++ b/v0.3.0/_modules/doctr/models/classification/vit/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.classification.vit.tensorflow

 from typing import Any, Dict, Optional, Tuple

 import tensorflow as tf
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers

 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import EncoderBlock
diff --git a/v0.3.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.3.0/_modules/doctr/models/detection/fast/tensorflow.html
index dc7d8f50f2..af51a9abeb 100644
--- a/v0.3.0/_modules/doctr/models/detection/fast/tensorflow.html
+++ b/v0.3.0/_modules/doctr/models/detection/fast/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.detection.fast.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers
+from tensorflow.keras import Model, Sequential, layers

 from doctr.file_utils import CLASS_NAME
 from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params
diff --git a/v0.3.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.3.0/_modules/doctr/models/recognition/parseq/tensorflow.html
index 1bbbf829b1..93a3b2ea81 100644
--- a/v0.3.0/_modules/doctr/models/recognition/parseq/tensorflow.html
+++ b/v0.3.0/_modules/doctr/models/recognition/parseq/tensorflow.html
@@ -305,7 +305,7 @@

Source code for doctr.models.recognition.parseq.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers

 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
@@ -462,7 +462,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)

-    @tf.function
     def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor:
         # Generates permutations of the target sequence.
         # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
@@ -509,7 +508,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         )
         return combined

-    @tf.function
     def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         # Generate source and target mask for the decoder attention.
         sz = permutation.shape[0]
@@ -529,7 +527,6 @@

Source code for doctr.models.recognition.parseq.tensorflow

         target_mask = mask[1:, :-1]
         return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool)

-    @tf.function
     def decode(
         self,
         target: tf.Tensor,
diff --git a/v0.3.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.3.0/_modules/doctr/models/recognition/vitstr/tensorflow.html
index 23730f6227..aecde3662a 100644
--- a/v0.3.0/_modules/doctr/models/recognition/vitstr/tensorflow.html
+++ b/v0.3.0/_modules/doctr/models/recognition/vitstr/tensorflow.html
@@ -302,7 +302,7 @@

Source code for doctr.models.recognition.vitstr.tensorflow

 from typing import Any, Dict, List, Optional, Tuple

 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers

 from doctr.datasets import VOCABS
diff --git a/v0.3.0/_sources/using_doctr/using_model_export.rst.txt b/v0.3.0/_sources/using_doctr/using_model_export.rst.txt
index 48f570f699..c62c36169b 100644
--- a/v0.3.0/_sources/using_doctr/using_model_export.rst.txt
+++ b/v0.3.0/_sources/using_doctr/using_model_export.rst.txt
@@ -31,7 +31,7 @@ Advantages:
 .. code:: python3

     import tensorflow as tf
-    from keras import mixed_precision
+    from tensorflow.keras import mixed_precision
     mixed_precision.set_global_policy('mixed_float16')
     predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
diff --git a/v0.3.0/using_doctr/using_model_export.html b/v0.3.0/using_doctr/using_model_export.html
index d467663403..75c81caa7c 100644
--- a/v0.3.0/using_doctr/using_model_export.html
+++ b/v0.3.0/using_doctr/using_model_export.html
@@ -316,7 +316,7 @@

Half-precision
import tensorflow as tf
-from keras import mixed_precision
+from tensorflow.keras import mixed_precision
 mixed_precision.set_global_policy('mixed_float16')
 predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
 
diff --git a/v0.3.1/_modules/doctr/datasets/classification/tensorflow.html b/v0.3.1/_modules/doctr/datasets/classification/tensorflow.html
deleted file mode 100644
index 40da1ffe4c..0000000000
--- a/v0.3.1/_modules/doctr/datasets/classification/tensorflow.html
+++ /dev/null
@@ -1,359 +0,0 @@

Source code for doctr.datasets.classification.tensorflow

-# Copyright (C) 2021, Mindee.
-
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
-
-import tensorflow as tf
-from .base import _CharacterGenerator
-
-
-__all__ = ['CharacterGenerator']
-
-
-
-[docs]
-class CharacterGenerator(_CharacterGenerator):
-    """Implements a character image generation dataset
-
-    Example::
-        >>> from doctr.datasets import CharacterGenerator
-        >>> ds = CharacterGenerator(vocab='abdef')
-        >>> img, target = ds[0]
-
-    Args:
-        vocab: vocabulary to take the character from
-        num_samples: number of samples that will be generated iterating over the dataset
-        cache_samples: whether generated images should be cached firsthand
-        sample_transforms: composable transformations that will be applied to each image
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-
-    @staticmethod
-    def collate_fn(samples):
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, tf.convert_to_tensor(targets)
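
The collate_fn removed above batches samples by stacking the image tensors and converting the targets; a minimal, self-contained sketch of that batching logic (the sample shapes below are made up for illustration):

    import tensorflow as tf

    # Two fake (image, target) samples with identical spatial shapes
    samples = [(tf.zeros((32, 32, 3)), 0), (tf.ones((32, 32, 3)), 1)]

    images, targets = zip(*samples)
    batch = tf.stack(images, axis=0)        # shape (2, 32, 32, 3)
    labels = tf.convert_to_tensor(targets)  # shape (2,)
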
\ No newline at end of file
diff --git a/v0.3.1/_modules/doctr/datasets/cord.html b/v0.3.1/_modules/doctr/datasets/cord.html
index 5679c787e7..3b89955bd8 100644
--- a/v0.3.1/_modules/doctr/datasets/cord.html
+++ b/v0.3.1/_modules/doctr/datasets/cord.html
@@ -236,7 +236,7 @@

Package Reference

  • doctr.datasets
  • -
  • doctr.io
  • +
  • doctr.documents
  • doctr.models
  • doctr.transforms
  • doctr.utils
  • @@ -327,18 +327,17 @@

    Source code for doctr.datasets.cord

             super().__init__(url, None, sha256, True, **kwargs)
     
             # # List images
    -        tmp_root = os.path.join(self.root, 'image')
    +        self.root = os.path.join(self._root, 'image')
             self.data: List[Tuple[str, Dict[str, Any]]] = []
    -        np_dtype = np.float16 if self.fp16 else np.float32
             self.train = train
             self.sample_transforms = sample_transforms
    -        for img_path in os.listdir(tmp_root):
    +        for img_path in os.listdir(self.root):
                 # File existence check
    -            if not os.path.exists(os.path.join(tmp_root, img_path)):
    -                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
    +            if not os.path.exists(os.path.join(self.root, img_path)):
    +                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
                 stem = Path(img_path).stem
                 _targets = []
    -            with open(os.path.join(self.root, 'json', f"{stem}.json"), 'rb') as f:
    +            with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f:
                     label = json.load(f)
                     for line in label["valid_line"]:
                         for word in line["words"]:
    @@ -351,7 +350,7 @@ 

    Source code for doctr.datasets.cord

                                         [x[1], y[1]],
                                         [x[2], y[2]],
                                         [x[3], y[3]],
    -                                ], dtype=np_dtype)))
    +                                ], dtype=np.float32)))
                                 else:
                                     # Reduce 8 coords to 4
                                     box = [min(x), min(y), max(x), max(y)]
    @@ -363,7 +362,6 @@ 

    Source code for doctr.datasets.cord

                     img_path,
                     dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets)
                 ))
    -        self.root = tmp_root
     
         def extra_repr(self) -> str:
             return f"train={self.train}"
    @@ -400,7 +398,7 @@

    Source code for doctr.datasets.cord

           
         
       
    -
    +
diff --git a/v0.3.1/_modules/doctr/datasets/datasets/tensorflow.html b/v0.3.1/_modules/doctr/datasets/datasets/tensorflow.html
index 8a191ecfc7..fddca20034 100644
--- a/v0.3.1/_modules/doctr/datasets/datasets/tensorflow.html
+++ b/v0.3.1/_modules/doctr/datasets/datasets/tensorflow.html
@@ -236,7 +236,7 @@

    Package Reference

    • doctr.datasets
    • -
    • doctr.io
    • +
    • doctr.documents
    • doctr.models
    • doctr.transforms
    • doctr.utils
    • @@ -284,7 +284,6 @@

      Source code for doctr.datasets.datasets.tensorflow

 from typing import List, Any, Tuple

 import tensorflow as tf
-from doctr.io import read_img_as_tensor

 from .base import _AbstractDataset, _VisionDataset
@@ -293,14 +292,11 @@

      Source code for doctr.datasets.datasets.tensorflow

 class AbstractDataset(_AbstractDataset):

-    @staticmethod
-    def _get_img_shape(img: Any) -> Tuple[int, int]:
-        return img.shape[:2]
-
     def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
         img_name, target = self.data[index]
         # Read image
-        img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float16 if self.fp16 else tf.float32)
+        img = tf.io.read_file(os.path.join(self.root, img_name))
+        img = tf.image.decode_jpeg(img, channels=3)

         return img, target
@@ -350,7 +346,7 @@

      Source code for doctr.datasets.datasets.tensorflow

      +
diff --git a/v0.3.1/_modules/doctr/datasets/funsd.html b/v0.3.1/_modules/doctr/datasets/funsd.html
index 6ff6059aef..2f5494dc2a 100644
--- a/v0.3.1/_modules/doctr/datasets/funsd.html
+++ b/v0.3.1/_modules/doctr/datasets/funsd.html
@@ -236,7 +236,7 @@

      Package Reference

      • doctr.datasets
      • -
      • doctr.io
      • +
      • doctr.documents
      • doctr.models
      • doctr.transforms
      • doctr.utils
      • @@ -329,14 +329,14 @@

        Source code for doctr.datasets.funsd

                 subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data')
         
                 # # List images
        -        tmp_root = os.path.join(self.root, subfolder, 'images')
        +        self.root = os.path.join(self._root, subfolder, 'images')
                 self.data: List[Tuple[str, Dict[str, Any]]] = []
        -        for img_path in os.listdir(tmp_root):
        +        for img_path in os.listdir(self.root):
                     # File existence check
        -            if not os.path.exists(os.path.join(tmp_root, img_path)):
        -                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
        +            if not os.path.exists(os.path.join(self.root, img_path)):
        +                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
                     stem = Path(img_path).stem
        -            with open(os.path.join(self.root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f:
        +            with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f:
                         data = json.load(f)
         
                     _targets = [(word['text'], word['box']) for block in data['form']
        @@ -352,8 +352,6 @@ 

        Source code for doctr.datasets.funsd

         
                     self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=int), labels=text_targets)))
         
        -        self.root = tmp_root
        -
             def extra_repr(self) -> str:
                 return f"train={self.train}"
        @@ -389,7 +387,7 @@

        Source code for doctr.datasets.funsd

               
             
           
        -
        +
diff --git a/v0.3.1/_modules/doctr/datasets/loader.html b/v0.3.1/_modules/doctr/datasets/loader.html
index 0547a9778b..ba5bc217e0 100644
--- a/v0.3.1/_modules/doctr/datasets/loader.html
+++ b/v0.3.1/_modules/doctr/datasets/loader.html
@@ -236,7 +236,7 @@

      Package Reference

      • doctr.datasets
      • -
      • doctr.io
      • +
      • doctr.documents
      • doctr.models
      • doctr.transforms
      • doctr.utils
      • @@ -283,9 +283,9 @@

        Source code for doctr.datasets.loader

         import math
         import tensorflow as tf
         import numpy as np
        -from typing import Optional, Callable
        +from typing import Optional
         
        -from doctr.utils.multithreading import multithread_exec
        +from .multithreading import multithread_exec
         
         __all__ = ["DataLoader"]
         
        @@ -334,23 +334,16 @@ 

        Source code for doctr.datasets.loader

                 batch_size: int = 1,
                 drop_last: bool = False,
                 workers: Optional[int] = None,
        -        collate_fn: Optional[Callable] = None,
             ) -> None:
                 self.dataset = dataset
                 self.shuffle = shuffle
                 self.batch_size = batch_size
                 nb = len(self.dataset) / batch_size
                 self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
        -        if collate_fn is None:
        -            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
        -        else:
        -            self.collate_fn = collate_fn
        +        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
                 self.workers = workers
                 self.reset()
         
        -    def __len__(self) -> int:
        -        return self.num_batches
        -
             def reset(self) -> None:
                 # Updates indices after each epoch
                 self._num_yielded = 0
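
For context, the loader above is consumed by plain iteration; a rough usage sketch based only on the signature visible in this hunk (the toy dataset, the import path, and the printed shapes are assumptions, not taken from the diff):

    import tensorflow as tf
    from doctr.datasets.loader import DataLoader

    class ToyDataset:
        # Any object exposing __len__ and __getitem__ should do here
        def __len__(self):
            return 8

        def __getitem__(self, idx):
            return tf.zeros((32, 32, 3)), tf.constant(idx)

    loader = DataLoader(ToyDataset(), batch_size=4, shuffle=True, drop_last=False)
    for images, targets in loader:
        # Batches are assembled by the collate function (default_collate here)
        print(images.shape, targets.shape)
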
        @@ -409,7 +402,7 @@ 

        Source code for doctr.datasets.loader

               
             
           
        -
        +
diff --git a/v0.3.1/_modules/doctr/datasets/ocr.html b/v0.3.1/_modules/doctr/datasets/ocr.html
index 6e1da9ce3e..2c4fb1b838 100644
--- a/v0.3.1/_modules/doctr/datasets/ocr.html
+++ b/v0.3.1/_modules/doctr/datasets/ocr.html
@@ -236,7 +236,7 @@

        Package Reference

        • doctr.datasets
        • -
        • doctr.io
        • +
        • doctr.documents
        • doctr.models
        • doctr.transforms
        • doctr.utils
        • @@ -290,7 +290,7 @@

          Source code for doctr.datasets.ocr

           from doctr.utils.geometry import fit_rbbox
           
           
          -__all__ = ['OCRDataset', 'DocDataset']
          +__all__ = ['OCRDataset']
           
           
           
          @@ -314,12 +314,12 @@

          Source code for doctr.datasets.ocr

                   rotated_bbox: bool = False,
                   **kwargs: Any,
               ) -> None:
          -        super().__init__(img_folder, **kwargs)
          +
                   self.sample_transforms = sample_transforms
          +        self.root = img_folder
           
                   # List images
                   self.data: List[Tuple[str, Dict[str, Any]]] = []
          -        np_dtype = np.float16 if self.fp16 else np.float32
                   with open(label_file, 'rb') as f:
                       data = json.load(f)
           
          @@ -333,13 +333,13 @@ 

          Source code for doctr.datasets.ocr

                       # handle empty images
                       if (len(file_dic["coordinates"]) == 0 or
                          (len(file_dic["coordinates"]) == 1 and file_dic["coordinates"][0] == "N/A")):
          -                self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[])))
          +                self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[])))
                           continue
                       is_valid: List[bool] = []
                       box_targets: List[List[float]] = []
                       for box in file_dic["coordinates"]:
                           if rotated_bbox:
          -                    x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np_dtype))
          +                    x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np.float32))
                               box = [x, y, w, h, alpha]
                               is_valid.append(w > 0 and h > 0)
                           else:
          @@ -350,52 +350,8 @@ 

          Source code for doctr.datasets.ocr

                               box_targets.append(box)
           
                       text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid]
          -            self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=text_targets)))
          - - - -class DocDataset(AbstractDataset): - """Implements an OCR dataset - - Args: - img_folder: local path to image folder (all jpg at the root) - label_file: local path to the label file - sample_transforms: composable transformations that will be applied to each image - **kwargs: keyword arguments from `VisionDataset`. - """ - - def __init__( - self, - img_folder: str, - label_file: str, - sample_transforms: Optional[Callable[[Any], Any]] = None, - **kwargs: Any, - ) -> None: - super().__init__(img_folder, **kwargs) - self.sample_transforms = sample_transforms - - # List images - self.data: List[Tuple[str, Dict[str, Any]]] = [] - np_dtype = np.float16 if self.fp16 else np.float32 - with open(label_file, 'rb') as f: - data = json.load(f) - - for img_name, annotations in data.items(): - # Get image path - img_name = Path(img_name) - # File existence check - if not os.path.exists(os.path.join(self.root, img_name)): - raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}") - - # handle empty images - if len(annotations["typed_words"]) == 0: - self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[]))) - continue - # Unpack - box_targets = [tuple(map(float, obj['geometry'])) for obj in annotations['typed_words']] - text_targets = [obj['value'] for obj in annotations['typed_words']] + self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
          - self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=text_targets)))
          @@ -428,7 +384,7 @@

          Source code for doctr.datasets.ocr

                 
               
             
          -
          +
diff --git a/v0.3.1/_modules/doctr/datasets/sroie.html b/v0.3.1/_modules/doctr/datasets/sroie.html
index 5a1c9eaeb7..0425870abb 100644
--- a/v0.3.1/_modules/doctr/datasets/sroie.html
+++ b/v0.3.1/_modules/doctr/datasets/sroie.html
@@ -236,7 +236,7 @@

      Package Reference

      • doctr.datasets
      • -
      • doctr.io
      • +
      • doctr.documents
      • doctr.models
      • doctr.transforms
      • doctr.utils
      • @@ -331,16 +331,15 @@

        Source code for doctr.datasets.sroie

                     raise NotImplementedError
         
                 # # List images
        -        tmp_root = os.path.join(self.root, 'images')
        +        self.root = os.path.join(self._root, 'images')
                 self.data: List[Tuple[str, Dict[str, Any]]] = []
        -        np_dtype = np.float16 if self.fp16 else np.float32
        -        for img_path in os.listdir(tmp_root):
        +        for img_path in os.listdir(self.root):
                     # File existence check
        -            if not os.path.exists(os.path.join(tmp_root, img_path)):
        -                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
        +            if not os.path.exists(os.path.join(self.root, img_path)):
        +                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
                     stem = Path(img_path).stem
                     _targets = []
        -            with open(os.path.join(self.root, 'annotations', f"{stem}.txt"), encoding='latin') as f:
        +            with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f:
                         for row in csv.reader(f, delimiter=','):
                             # Safeguard for blank lines
                             if len(row) > 0:
        @@ -355,8 +354,7 @@ 

        Source code for doctr.datasets.sroie

         
                     text_targets, box_targets = zip(*_targets)
         
        -            self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=text_targets)))
        -        self.root = tmp_root
        +            self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
         
             def extra_repr(self) -> str:
                 return f"train={self.train}"
        @@ -393,7 +391,7 @@

        Source code for doctr.datasets.sroie

               
             
           
        -
        +
diff --git a/v0.3.1/_modules/doctr/datasets/utils.html b/v0.3.1/_modules/doctr/datasets/utils.html
index eea8ba01f4..499d3fff84 100644
--- a/v0.3.1/_modules/doctr/datasets/utils.html
+++ b/v0.3.1/_modules/doctr/datasets/utils.html
@@ -236,7 +236,7 @@

Package Reference

  • doctr.datasets
  • -
  • doctr.io
  • +
  • doctr.documents
  • doctr.models
  • doctr.transforms
  • doctr.utils
  • @@ -283,12 +283,11 @@

    Source code for doctr.datasets.utils

     import string
     import unicodedata
     import numpy as np
    -from functools import partial
     from typing import List, Optional, Any
     
     from .vocabs import VOCABS
     
    -__all__ = ['translate', 'encode_string', 'decode_sequence', 'encode_sequences']
    +__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
     
     
     def translate(
    @@ -325,7 +324,7 @@ 

    Source code for doctr.datasets.utils

         return translated
     
     
    -def encode_string(
    +def encode_sequence(
         input_string: str,
         vocab: str,
     ) -> List[int]:
    @@ -352,13 +351,12 @@ 

    Source code for doctr.datasets.utils

             mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
     
         Returns:
    -        A string, decoded from input_array
    -    """
    +        A string, decoded from input_array"""
     
         if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
             raise AssertionError("Input must be an array of int, with max less than mapping size")
    -
    -    return ''.join(map(mapping.__getitem__, input_array))
    +    decoded = ''.join(mapping[idx] for idx in input_array)
    +    return decoded
     
     
     
    @@ -370,7 +368,6 @@

    Source code for doctr.datasets.utils

         eos: int = -1,
         sos: Optional[int] = None,
         pad: Optional[int] = None,
    -    dynamic_seq_length: bool = False,
         **kwargs: Any,
     ) -> np.ndarray:
         """Encode character sequences using a given vocab as mapping
    @@ -382,7 +379,6 @@ 

    Source code for doctr.datasets.utils

             eos: encoding of End Of String
             sos: optional encoding of Start Of String
             pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
    -        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size
     
         Returns:
             the padded encoded data as a tensor
    @@ -391,32 +387,29 @@ 

    Source code for doctr.datasets.utils

         if 0 <= eos < len(vocab):
             raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
     
    -    if not isinstance(target_size, int) or dynamic_seq_length:
    -        # Maximum string length + EOS
    -        max_length = max(len(w) for w in sequences) + 1
    -        if isinstance(sos, int):
    -            max_length += 1
    -        if isinstance(pad, int):
    -            max_length += 1
    -        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)
    +    if not isinstance(target_size, int):
    +        target_size = max(len(w) for w in sequences)
    +        if sos:
    +            target_size += 1
    +        if pad:
    +            target_size += 1
     
         # Pad all sequences
    -    if isinstance(pad, int):  # pad with padding symbol
    +    if pad:  # pad with padding symbol
             if 0 <= pad < len(vocab):
                 raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
             # In that case, add EOS at the end of the word before padding
    -        default_symbol = pad
    +        encoded_data = np.full([len(sequences), target_size], pad, dtype=np.int32)
         else:  # pad with eos symbol
    -        default_symbol = eos
    -    encoded_data = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)
    +        encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32)
     
    -    # Encode the strings
    -    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
    -        if isinstance(pad, int):  # add eos at the end of the sequence
    -            seq.append(eos)
    -        encoded_data[idx, :min(len(seq), target_size)] = seq[:min(len(seq), target_size)]
    +    for idx, seq in enumerate(sequences):
    +        encoded_seq = encode_sequence(seq, vocab)
    +        if pad:  # add eos at the end of the sequence
    +            encoded_seq.append(eos)
    +        encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)]
     
    -    if isinstance(sos, int):  # place sos symbol at the beginning of each sequence
    +    if sos:  # place eos symbol at the beginning of each sequence
             if 0 <= sos < len(vocab):
                 raise ValueError("argument 'sos' needs to be outside of vocab possible indices")
             encoded_data = np.roll(encoded_data, 1)
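
The hunk above only changes how the target length and the fill symbol are picked; the underlying padding scheme is unchanged. A simplified, self-contained sketch of that scheme (not docTR's exact function; the vocab and words below are made up):

    import numpy as np

    vocab = "abc"
    sequences = ["ab", "c"]
    eos, pad, target_size = len(vocab), len(vocab) + 1, 4

    # Fill with the padding symbol, then write each encoded word followed by EOS
    encoded = np.full((len(sequences), target_size), pad, dtype=np.int32)
    for i, word in enumerate(sequences):
        seq = [vocab.index(char) for char in word] + [eos]
        encoded[i, :len(seq)] = seq

    print(encoded)
    # [[0 1 3 4]
    #  [2 3 4 4]]
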
    @@ -456,7 +449,7 @@ 

    Source code for doctr.datasets.utils

           
         
       
    -
    +
diff --git a/v0.3.1/_modules/doctr/io/elements.html b/v0.3.1/_modules/doctr/io/elements.html
index 8b0d270287..73dbca5877 100644
--- a/v0.3.1/_modules/doctr/io/elements.html
+++ b/v0.3.1/_modules/doctr/io/elements.html
@@ -226,20 +226,38 @@

    Source code for doctr.io.elements

    -# Copyright (C) 2021, Mindee.
    +# Copyright (C) 2021-2024, Mindee.
    +
    +# This program is licensed under the Apache License 2.0.
    +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
    +
    +from typing import Any, Dict, List, Optional, Tuple, Union
     
    -# This program is licensed under the Apache License version 2.
    -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
    +from defusedxml import defuse_stdlib
    +
    +defuse_stdlib()
    +from xml.etree import ElementTree as ET
    +from xml.etree.ElementTree import Element as ETElement
    +from xml.etree.ElementTree import SubElement
     
     import numpy as np
    -import matplotlib.pyplot as plt
    -from typing import Tuple, Dict, List, Any, Optional, Union
     
    +import doctr
    +from doctr.file_utils import requires_package
    +from doctr.utils.common_types import BoundingBox
     from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
    -from doctr.utils.visualization import visualize_page
    -from doctr.utils.common_types import BoundingBox, RotatedBbox
    +from doctr.utils.reconstitution import synthesize_kie_page, synthesize_page
     from doctr.utils.repr import NestedObject
     
    -__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
    +try:  # optional dependency for visualization
    +    from doctr.utils.visualization import visualize_kie_page, visualize_page
    +except ModuleNotFoundError:
    +    pass
    +
    +__all__ = ["Element", "Word", "Artefact", "Line", "Prediction", "Block", "Page", "KIEPage", "Document"]
     
     
     class Element(NestedObject):
    @@ -307,10 +339,14 @@ 

    Source code for doctr.io.elements

     
         def export(self) -> Dict[str, Any]:
             """Exports the object into a nested dict format"""
    -
             export_dict = {k: getattr(self, k) for k in self._exported_keys}
             for children_name in self._children_names:
    -            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
    +            if children_name in ["predictions"]:
    +                export_dict[children_name] = {
    +                    k: [item.export() for item in c] for k, c in getattr(self, children_name).items()
    +                }
    +            else:
    +                export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
     
             return export_dict
     
    @@ -323,25 +359,37 @@ 

    Source code for doctr.io.elements

     
     
     
    -[docs] +[docs] class Word(Element): """Implements a word element Args: + ---- value: the text string of the word confidence: the confidence associated with the text prediction geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page's size + objectness_score: the objectness score of the detection + crop_orientation: the general orientation of the crop in degrees and its confidence """ - _exported_keys: List[str] = ["value", "confidence", "geometry"] + _exported_keys: List[str] = ["value", "confidence", "geometry", "objectness_score", "crop_orientation"] _children_names: List[str] = [] - def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, RotatedBbox]) -> None: + def __init__( + self, + value: str, + confidence: float, + geometry: Union[BoundingBox, np.ndarray], + objectness_score: float, + crop_orientation: Dict[str, Any], + ) -> None: super().__init__() self.value = value self.confidence = confidence self.geometry = geometry + self.objectness_score = objectness_score + self.crop_orientation = crop_orientation def render(self) -> str: """Renders the full text of the element""" @@ -358,11 +406,12 @@

    Source code for doctr.io.elements

     
     
     
    -[docs] +[docs] class Artefact(Element): """Implements a non-textual element Args: + ---- artefact_type: the type of artefact confidence: the confidence of the type prediction geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to @@ -393,34 +442,40 @@

    Source code for doctr.io.elements

     
     
     
    -[docs] +[docs] class Line(Element): """Implements a line element as a collection of words Args: + ---- words: list of word elements geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing all words in it. """ - _exported_keys: List[str] = ["geometry"] - _children_names: List[str] = ['words'] + _exported_keys: List[str] = ["geometry", "objectness_score"] + _children_names: List[str] = ["words"] words: List[Word] = [] def __init__( self, words: List[Word], - geometry: Optional[Union[BoundingBox, RotatedBbox]] = None, + geometry: Optional[Union[BoundingBox, np.ndarray]] = None, + objectness_score: Optional[float] = None, ) -> None: + # Compute the objectness score of the line + if objectness_score is None: + objectness_score = float(np.mean([w.objectness_score for w in words])) # Resolve the geometry using the smallest enclosing bounding box if geometry is None: # Check whether this is a rotated or straight box - box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 5 else resolve_enclosing_bbox - geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator, misc] + box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 4 else resolve_enclosing_bbox + geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[operator] super().__init__(words=words) self.geometry = geometry + self.objectness_score = objectness_score def render(self) -> str: """Renders the full text of the element""" @@ -430,18 +485,30 @@

    Source code for doctr.io.elements

         def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
             kwargs = {k: save_dict[k] for k in cls._exported_keys}
             kwargs.update({
    -            'words': [Word.from_dict(_dict) for _dict in save_dict['words']],
    +            "words": [Word.from_dict(_dict) for _dict in save_dict["words"]],
             })
             return cls(**kwargs)
    +class Prediction(Word): + """Implements a prediction element""" + + def render(self) -> str: + """Renders the full text of the element""" + return self.value + + def extra_repr(self) -> str: + return f"value='{self.value}', confidence={self.confidence:.2}, bounding_box={self.geometry}" + +
    -[docs] +[docs] class Block(Element): """Implements a block element as a collection of lines and artefacts Args: + ---- lines: list of line elements artefacts: list of artefacts geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to @@ -449,8 +516,8 @@

    Source code for doctr.io.elements

                 all lines and artefacts in it.
         """
     
    -    _exported_keys: List[str] = ["geometry"]
    -    _children_names: List[str] = ['lines', 'artefacts']
    +    _exported_keys: List[str] = ["geometry", "objectness_score"]
    +    _children_names: List[str] = ["lines", "artefacts"]
         lines: List[Line] = []
         artefacts: List[Artefact] = []
     
    @@ -458,19 +525,26 @@ 

    Source code for doctr.io.elements

             self,
             lines: List[Line] = [],
             artefacts: List[Artefact] = [],
    -        geometry: Optional[Union[BoundingBox, RotatedBbox]] = None,
    +        geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
    +        objectness_score: Optional[float] = None,
         ) -> None:
    +        # Compute the objectness score of the line
    +        if objectness_score is None:
    +            objectness_score = float(np.mean([w.objectness_score for line in lines for w in line.words]))
             # Resolve the geometry using the smallest enclosing bounding box
             if geometry is None:
                 line_boxes = [word.geometry for line in lines for word in line.words]
                 artefact_boxes = [artefact.geometry for artefact in artefacts]
    -            box_resolution_fn = resolve_enclosing_rbbox if len(lines[0].geometry) == 5 else resolve_enclosing_bbox
    -            geometry = box_resolution_fn(line_boxes + artefact_boxes)  # type: ignore[operator, arg-type]
    +            box_resolution_fn = (
    +                resolve_enclosing_rbbox if isinstance(lines[0].geometry, np.ndarray) else resolve_enclosing_bbox
    +            )
    +            geometry = box_resolution_fn(line_boxes + artefact_boxes)  # type: ignore[operator]
     
             super().__init__(lines=lines, artefacts=artefacts)
             self.geometry = geometry
    +        self.objectness_score = objectness_score
     
    -    def render(self, line_break: str = '\n') -> str:
    +    def render(self, line_break: str = "\n") -> str:
             """Renders the full text of the element"""
             return line_break.join(line.render() for line in self.lines)
     
    @@ -478,32 +552,35 @@ 

    Source code for doctr.io.elements

         def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
             kwargs = {k: save_dict[k] for k in cls._exported_keys}
             kwargs.update({
    -            'lines': [Line.from_dict(_dict) for _dict in save_dict['lines']],
    -            'artefacts': [Artefact.from_dict(_dict) for _dict in save_dict['artefacts']],
    +            "lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
    +            "artefacts": [Artefact.from_dict(_dict) for _dict in save_dict["artefacts"]],
             })
             return cls(**kwargs)
    -[docs] +[docs] class Page(Element): """Implements a page element as a collection of blocks Args: + ---- + page: image encoded as a numpy array in uint8 blocks: list of block elements page_idx: the index of the page in the input raw document - dimensions: the page size in pixels in format (width, height) + dimensions: the page size in pixels in format (height, width) orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction language: a dictionary with the language value and confidence of the prediction """ _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] - _children_names: List[str] = ['blocks'] + _children_names: List[str] = ["blocks"] blocks: List[Block] = [] def __init__( self, + page: np.ndarray, blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], @@ -511,12 +588,13 @@

    Source code for doctr.io.elements

             language: Optional[Dict[str, Any]] = None,
         ) -> None:
             super().__init__(blocks=blocks)
    +        self.page = page
             self.page_idx = page_idx
             self.dimensions = dimensions
             self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
             self.language = language if isinstance(language, dict) else dict(value=None, confidence=None)
     
    -    def render(self, block_break: str = '\n\n') -> str:
    +    def render(self, block_break: str = "\n\n") -> str:
             """Renders the full text of the element"""
             return block_break.join(b.render() for b in self.blocks)
     
    @@ -524,38 +602,302 @@ 

    Source code for doctr.io.elements

             return f"dimensions={self.dimensions}"
     
     
    -[docs] - def show( - self, page: np.ndarray, interactive: bool = True, **kwargs - ) -> None: +[docs] + def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None: """Overlay the result on a given image Args: - page: image encoded as a numpy array in uint8 interactive: whether the display should be interactive + preserve_aspect_ratio: pass True if you passed True to the predictor + **kwargs: additional keyword arguments passed to the matplotlib.pyplot.show method """ - visualize_page(self.export(), page, interactive=interactive) + requires_package("matplotlib", "`.show()` requires matplotlib & mplcursors installed") + requires_package("mplcursors", "`.show()` requires matplotlib & mplcursors installed") + import matplotlib.pyplot as plt + + visualize_page(self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio) plt.show(**kwargs)
    + def synthesize(self, **kwargs) -> np.ndarray: + """Synthesize the page from the predictions + + Returns + ------- + synthesized page + """ + return synthesize_page(self.export(), **kwargs) + + def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]: + """Export the page as XML (hOCR-format) + convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md + + Args: + ---- + file_title: the title of the XML file + + Returns: + ------- + a tuple of the XML byte string, and its ElementTree + """ + p_idx = self.page_idx + block_count: int = 1 + line_count: int = 1 + word_count: int = 1 + height, width = self.dimensions + language = self.language if "language" in self.language.keys() else "en" + # Create the XML root element + page_hocr = ETElement("html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": str(language)}) + # Create the header / SubElements of the root element + head = SubElement(page_hocr, "head") + SubElement(head, "title").text = file_title + SubElement(head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html; charset=utf-8"}) + SubElement( + head, + "meta", + attrib={"name": "ocr-system", "content": f"python-doctr {doctr.__version__}"}, # type: ignore[attr-defined] + ) + SubElement( + head, + "meta", + attrib={"name": "ocr-capabilities", "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word"}, + ) + # Create the body + body = SubElement(page_hocr, "body") + SubElement( + body, + "div", + attrib={ + "class": "ocr_page", + "id": f"page_{p_idx + 1}", + "title": f"image; bbox 0 0 {width} {height}; ppageno 0", + }, + ) + # iterate over the blocks / lines / words and create the XML elements in body line by line with the attributes + for block in self.blocks: + if len(block.geometry) != 2: + raise TypeError("XML export is only available for straight bounding boxes for now.") + (xmin, ymin), (xmax, ymax) = block.geometry + block_div = SubElement( + body, + "div", + attrib={ + "class": "ocr_carea", + "id": f"block_{block_count}", + "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \ + {int(round(xmax * width))} {int(round(ymax * height))}", + }, + ) + paragraph = SubElement( + block_div, + "p", + attrib={ + "class": "ocr_par", + "id": f"par_{block_count}", + "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \ + {int(round(xmax * width))} {int(round(ymax * height))}", + }, + ) + block_count += 1 + for line in block.lines: + (xmin, ymin), (xmax, ymax) = line.geometry + # NOTE: baseline, x_size, x_descenders, x_ascenders is currently initalized to 0 + line_span = SubElement( + paragraph, + "span", + attrib={ + "class": "ocr_line", + "id": f"line_{line_count}", + "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \ + {int(round(xmax * width))} {int(round(ymax * height))}; \ + baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0", + }, + ) + line_count += 1 + for word in line.words: + (xmin, ymin), (xmax, ymax) = word.geometry + conf = word.confidence + word_div = SubElement( + line_span, + "span", + attrib={ + "class": "ocrx_word", + "id": f"word_{word_count}", + "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \ + {int(round(xmax * width))} {int(round(ymax * height))}; \ + x_wconf {int(round(conf * 100))}", + }, + ) + # set the text + word_div.text = word.value + word_count += 1 + + return (ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr)) + @classmethod def from_dict(cls, 
save_dict: Dict[str, Any], **kwargs): kwargs = {k: save_dict[k] for k in cls._exported_keys} - kwargs.update({'blocks': [Block.from_dict(block_dict) for block_dict in save_dict['blocks']]}) + kwargs.update({"blocks": [Block.from_dict(block_dict) for block_dict in save_dict["blocks"]]}) return cls(**kwargs)
    +class KIEPage(Element): + """Implements a KIE page element as a collection of predictions + + Args: + ---- + predictions: Dictionary with list of block elements for each detection class + page: image encoded as a numpy array in uint8 + page_idx: the index of the page in the input raw document + dimensions: the page size in pixels in format (height, width) + orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction + language: a dictionary with the language value and confidence of the prediction + """ + + _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"] + _children_names: List[str] = ["predictions"] + predictions: Dict[str, List[Prediction]] = {} + + def __init__( + self, + page: np.ndarray, + predictions: Dict[str, List[Prediction]], + page_idx: int, + dimensions: Tuple[int, int], + orientation: Optional[Dict[str, Any]] = None, + language: Optional[Dict[str, Any]] = None, + ) -> None: + super().__init__(predictions=predictions) + self.page = page + self.page_idx = page_idx + self.dimensions = dimensions + self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None) + self.language = language if isinstance(language, dict) else dict(value=None, confidence=None) + + def render(self, prediction_break: str = "\n\n") -> str: + """Renders the full text of the element""" + return prediction_break.join( + f"{class_name}: {p.render()}" for class_name, predictions in self.predictions.items() for p in predictions + ) + + def extra_repr(self) -> str: + return f"dimensions={self.dimensions}" + + def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None: + """Overlay the result on a given image + + Args: + interactive: whether the display should be interactive + preserve_aspect_ratio: pass True if you passed True to the predictor + **kwargs: keyword arguments passed to the matplotlib.pyplot.show method + """ + requires_package("matplotlib", "`.show()` requires matplotlib & mplcursors installed") + requires_package("mplcursors", "`.show()` requires matplotlib & mplcursors installed") + import matplotlib.pyplot as plt + + visualize_kie_page( + self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio + ) + plt.show(**kwargs) + + def synthesize(self, **kwargs) -> np.ndarray: + """Synthesize the page from the predictions + + Args: + ---- + **kwargs: keyword arguments passed to the matplotlib.pyplot.show method + + Returns: + ------- + synthesized page + """ + return synthesize_kie_page(self.export(), **kwargs) + + def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]: + """Export the page as XML (hOCR-format) + convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md + + Args: + ---- + file_title: the title of the XML file + + Returns: + ------- + a tuple of the XML byte string, and its ElementTree + """ + p_idx = self.page_idx + prediction_count: int = 1 + height, width = self.dimensions + language = self.language if "language" in self.language.keys() else "en" + # Create the XML root element + page_hocr = ETElement("html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": str(language)}) + # Create the header / SubElements of the root element + head = SubElement(page_hocr, "head") + SubElement(head, "title").text = file_title + SubElement(head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html; charset=utf-8"}) + 
SubElement( + head, + "meta", + attrib={"name": "ocr-system", "content": f"python-doctr {doctr.__version__}"}, # type: ignore[attr-defined] + ) + SubElement( + head, + "meta", + attrib={"name": "ocr-capabilities", "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word"}, + ) + # Create the body + body = SubElement(page_hocr, "body") + SubElement( + body, + "div", + attrib={ + "class": "ocr_page", + "id": f"page_{p_idx + 1}", + "title": f"image; bbox 0 0 {width} {height}; ppageno 0", + }, + ) + # iterate over the blocks / lines / words and create the XML elements in body line by line with the attributes + for class_name, predictions in self.predictions.items(): + for prediction in predictions: + if len(prediction.geometry) != 2: + raise TypeError("XML export is only available for straight bounding boxes for now.") + (xmin, ymin), (xmax, ymax) = prediction.geometry + prediction_div = SubElement( + body, + "div", + attrib={ + "class": "ocr_carea", + "id": f"{class_name}_prediction_{prediction_count}", + "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \ + {int(round(xmax * width))} {int(round(ymax * height))}", + }, + ) + prediction_div.text = prediction.value + prediction_count += 1 + + return ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr) + + @classmethod + def from_dict(cls, save_dict: Dict[str, Any], **kwargs): + kwargs = {k: save_dict[k] for k in cls._exported_keys} + kwargs.update({ + "predictions": [Prediction.from_dict(predictions_dict) for predictions_dict in save_dict["predictions"]] + }) + return cls(**kwargs) + +
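Editor's note: a KIEPage is normally produced by a KIE predictor rather than built by hand. A minimal usage sketch, assuming a placeholder image path and the `kie_predictor` factory exposed by `doctr.models` in the matching docTR release:

>>> from doctr.io import DocumentFile
>>> from doctr.models import kie_predictor
>>> pages = DocumentFile.from_images(["path/to/your/page.png"])  # placeholder path
>>> result = kie_predictor(pretrained=True)(pages)
>>> kie_page = result.pages[0]
>>> print(kie_page.render())                     # "class_name: value" entries joined by prediction_break
>>> xml_bytes, xml_tree = kie_page.export_as_xml()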
    -[docs] +[docs] class Document(Element): """Implements a document element as a collection of pages Args: + ---- pages: list of page elements """ - _children_names: List[str] = ['pages'] + _children_names: List[str] = ["pages"] pages: List[Page] = [] def __init__( @@ -564,28 +906,64 @@

    Source code for doctr.io.elements

         ) -> None:
             super().__init__(pages=pages)
     
    -    def render(self, page_break: str = '\n\n\n\n') -> str:
    +    def render(self, page_break: str = "\n\n\n\n") -> str:
             """Renders the full text of the element"""
             return page_break.join(p.render() for p in self.pages)
     
     
    -[docs] - def show(self, pages: List[np.ndarray], **kwargs) -> None: - """Overlay the result on a given image +[docs] + def show(self, **kwargs) -> None: + """Overlay the result on a given image""" + for result in self.pages: + result.show(**kwargs)
    - Args: - pages: list of images encoded as numpy arrays in uint8 + + def synthesize(self, **kwargs) -> List[np.ndarray]: + """Synthesize all pages from their predictions + + Returns + ------- + list of synthesized pages """ - for img, result in zip(pages, self.pages): - result.show(img, **kwargs)
    + return [page.synthesize() for page in self.pages] + + def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]: + """Export the document as XML (hOCR-format) + + Args: + ---- + **kwargs: additional keyword arguments passed to the Page.export_as_xml method + Returns: + ------- + list of tuple of (bytes, ElementTree) + """ + return [page.export_as_xml(**kwargs) for page in self.pages] @classmethod def from_dict(cls, save_dict: Dict[str, Any], **kwargs): kwargs = {k: save_dict[k] for k in cls._exported_keys} - kwargs.update({'pages': [Page.from_dict(page_dict) for page_dict in save_dict['pages']]}) + kwargs.update({"pages": [Page.from_dict(page_dict) for page_dict in save_dict["pages"]]}) return cls(**kwargs)
    + + +class KIEDocument(Document): + """Implements a document element as a collection of pages + + Args: + ---- + pages: list of page elements + """ + + _children_names: List[str] = ["pages"] + pages: List[KIEPage] = [] # type: ignore[assignment] + + def __init__( + self, + pages: List[KIEPage], + ) -> None: + super().__init__(pages=pages) # type: ignore[arg-type]
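Editor's note: a short sketch of how a Document is typically consumed once returned by an OCR predictor (the path is a placeholder, and the entry point is assumed to be `ocr_predictor` from `doctr.models`):

>>> from doctr.io import DocumentFile
>>> from doctr.models import ocr_predictor
>>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
>>> result = ocr_predictor(pretrained=True)(doc)
>>> full_text = result.render()               # page texts joined by page_break
>>> hocr_outputs = result.export_as_xml()     # one (bytes, ElementTree) tuple per page
>>> images = result.synthesize()              # one numpy array per page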
    @@ -618,7 +996,7 @@

    Source code for doctr.io.elements

           
         
       
    -
    +
    diff --git a/v0.3.1/_modules/doctr/io/html.html b/v0.3.1/_modules/doctr/io/html.html index 1dca6c97e4..d5495fcd8a 100644 --- a/v0.3.1/_modules/doctr/io/html.html +++ b/v0.3.1/_modules/doctr/io/html.html @@ -226,20 +226,38 @@

    Source code for doctr.io.html

    -# Copyright (C) 2021, Mindee.
    +# Copyright (C) 2021-2024, Mindee.
     
    -# This program is licensed under the Apache License version 2.
    -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
    +# This program is licensed under the Apache License 2.0.
    +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
     
    -from weasyprint import HTML
     from typing import Any
     
    -__all__ = ['read_html']
    +__all__ = ["read_html"]
     
     
     
-[docs] +[docs] def read_html(url: str, **kwargs: Any) -> bytes: """Read a web page at a given URL and convert it into a PDF byte stream - Example:: - >>> from doctr.documents import read_html - >>> doc = read_html("https://www.yoursite.com") + >>> from doctr.io import read_html + >>> doc = read_html("https://www.yoursite.com") Args: + ---- url: URL of the target web page + **kwargs: keyword arguments from `weasyprint.HTML` + Returns: + ------- decoded PDF file as a bytes stream """ + from weasyprint import HTML return HTML(url, **kwargs).write_pdf()
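Editor's note: since read_html returns PDF bytes, it can be chained with DocumentFile.from_pdf to obtain page images. A minimal sketch (the URL is a placeholder):

>>> from doctr.io import DocumentFile, read_html
>>> pdf_bytes = read_html("https://www.yoursite.com")
>>> pages = DocumentFile.from_pdf(pdf_bytes)  # list of numpy arrays, one per rendered page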
    @@ -335,7 +356,7 @@

    Source code for doctr.io.html

           
         
       
    -
    +
    diff --git a/v0.3.1/_modules/doctr/io/image/base.html b/v0.3.1/_modules/doctr/io/image/base.html index defcac7f86..1ba249a68a 100644 --- a/v0.3.1/_modules/doctr/io/image/base.html +++ b/v0.3.1/_modules/doctr/io/image/base.html @@ -226,20 +226,38 @@

    Source code for doctr.io.image.base

    -# Copyright (C) 2021, Mindee.
    +# Copyright (C) 2021-2024, Mindee.
     
    -# This program is licensed under the Apache License version 2.
    -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
    +# This program is licensed under the Apache License 2.0.
    +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
     
     from pathlib import Path
     from typing import Optional, Tuple
    -import numpy as np
    +
     import cv2
    +import numpy as np
    +
     from doctr.utils.common_types import AbstractFile
     
    -__all__ = ['read_img_as_numpy']
    +__all__ = ["read_img_as_numpy"]
     
     
     
    -[docs] +[docs] def read_img_as_numpy( file: AbstractFile, output_size: Optional[Tuple[int, int]] = None, @@ -298,25 +318,26 @@

    Source code for doctr.io.image.base

     ) -> np.ndarray:
         """Read an image file into numpy format
     
    -    Example::
    -        >>> from doctr.documents import read_img
    -        >>> page = read_img("path/to/your/doc.jpg")
    +    >>> from doctr.io import read_img_as_numpy
    +    >>> page = read_img_as_numpy("path/to/your/doc.jpg")
     
         Args:
    +    ----
             file: the path to the image file
             output_size: the expected output size of each page in format H x W
             rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
    +
         Returns:
    +    -------
             the page decoded as numpy ndarray of shape H x W x 3
         """
    -
         if isinstance(file, (str, Path)):
             if not Path(file).is_file():
                 raise FileNotFoundError(f"unable to access {file}")
             img = cv2.imread(str(file), cv2.IMREAD_COLOR)
         elif isinstance(file, bytes):
    -        file = np.frombuffer(file, np.uint8)
    -        img = cv2.imdecode(file, cv2.IMREAD_COLOR)
    +        _file: np.ndarray = np.frombuffer(file, np.uint8)
    +        img = cv2.imdecode(_file, cv2.IMREAD_COLOR)
         else:
             raise TypeError("unsupported object type for argument 'file'")
     
    @@ -363,7 +384,7 @@ 

    Source code for doctr.io.image.base

           
         
       
    -
    +
    diff --git a/v0.3.1/_modules/doctr/io/image/tensorflow.html b/v0.3.1/_modules/doctr/io/image/tensorflow.html index 64db7b45b4..f9faeeab1c 100644 --- a/v0.3.1/_modules/doctr/io/image/tensorflow.html +++ b/v0.3.1/_modules/doctr/io/image/tensorflow.html @@ -226,20 +226,38 @@

    Source code for doctr.io.image.tensorflow

    -# Copyright (C) 2021, Mindee.
    +# Copyright (C) 2021-2024, Mindee.
    +
    +# This program is licensed under the Apache License 2.0.
    +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
     
    -# This program is licensed under the Apache License version 2.
    -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
    +from typing import Tuple
     
     import numpy as np
    -from PIL import Image
     import tensorflow as tf
    -
    -if tf.__version__ >= '2.6.0':
    -    from tensorflow.keras.utils import img_to_array
    -else:
    -    from tensorflow.keras.preprocessing.image import img_to_array
    +from PIL import Image
    +from tensorflow.keras.utils import img_to_array
     
     from doctr.utils.common_types import AbstractPath
     
    -__all__ = ['tensor_from_pil', 'read_img_as_tensor', 'decode_img_as_tensor', 'tensor_from_numpy']
    +__all__ = ["tensor_from_pil", "read_img_as_tensor", "decode_img_as_tensor", "tensor_from_numpy", "get_img_shape"]
     
     
    -def tensor_from_pil(pil_img: Image, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
    +def tensor_from_pil(pil_img: Image.Image, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
         """Convert a PIL Image to a TensorFlow tensor
     
         Args:
    +    ----
             pil_img: a PIL image
             dtype: the output tensor data type
     
         Returns:
    +    -------
             decoded image as tensor
         """
    -
         npy_img = img_to_array(pil_img)
     
         return tensor_from_numpy(npy_img, dtype)
     
     
     
    -[docs] +[docs] def read_img_as_tensor(img_path: AbstractPath, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor: """Read an image file as a TensorFlow tensor Args: + ---- img_path: location of the image file dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255. Returns: + ------- decoded image as a tensor """ - if dtype not in (tf.uint8, tf.float16, tf.float32): raise ValueError("insupported value for dtype") @@ -338,18 +356,19 @@

    Source code for doctr.io.image.tensorflow

     
     
     
    -[docs] +[docs] def decode_img_as_tensor(img_content: bytes, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor: """Read a byte stream as a TensorFlow tensor Args: + ---- img_content: bytes of a decoded image dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255. Returns: + ------- decoded image as a tensor """ - if dtype not in (tf.uint8, tf.float16, tf.float32): raise ValueError("insupported value for dtype") @@ -367,13 +386,14 @@

    Source code for doctr.io.image.tensorflow

         """Read an image file as a TensorFlow tensor
     
         Args:
    -        img: image encoded as a numpy array of shape (H, W, C) in np.uint8
    +    ----
    +        npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
             dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
     
         Returns:
    +    -------
             same image as a tensor of shape (H, W, C)
         """
    -
         if dtype not in (tf.uint8, tf.float16, tf.float32):
             raise ValueError("insupported value for dtype")
     
    @@ -384,6 +404,11 @@ 

    Source code for doctr.io.image.tensorflow

             img = tf.clip_by_value(img, 0, 1)
     
         return img
    +
    +
    +def get_img_shape(img: tf.Tensor) -> Tuple[int, int]:
    +    """Get the shape of an image"""
    +    return img.shape[:2]
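Editor's note: a sketch of the tensor readers shown above (paths are placeholders; dtype must be one of tf.uint8, tf.float16 or tf.float32):

>>> import tensorflow as tf
>>> from doctr.io import read_img_as_tensor, decode_img_as_tensor
>>> img = read_img_as_tensor("path/to/your/doc.jpg", dtype=tf.float32)  # float values scaled to [0, 1]
>>> with open("path/to/your/doc.jpg", "rb") as f:
...     img2 = decode_img_as_tensor(f.read(), dtype=tf.uint8)
>>> img.shape[:2]  # the (H, W) pair also returned by get_img_shape(img)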
     
    @@ -416,7 +441,7 @@

    Source code for doctr.io.image.tensorflow

           
         
       
    -
    +
diff --git a/v0.3.1/_modules/doctr/io/pdf.html b/v0.3.1/_modules/doctr/io/pdf.html index 2d383b9e85..91baf96f7b 100644 --- a/v0.3.1/_modules/doctr/io/pdf.html +++ b/v0.3.1/_modules/doctr/io/pdf.html @@ -226,20 +226,38 @@

Source code for doctr.io.pdf

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+from typing import Any, List, Optional
 
 import numpy as np
-import cv2
-from pathlib import Path
-import fitz
-from typing import List, Tuple, Optional, Any, Dict
+import pypdfium2 as pdfium
 
-from doctr.utils.common_types import AbstractFile, Bbox
+from doctr.utils.common_types import AbstractFile
 
-__all__ = ['read_pdf', 'PDF']
+__all__ = ["read_pdf"]
 
 
 
-[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: +[docs] +def read_pdf( + file: AbstractFile, + scale: float = 2, + rgb_mode: bool = True, + password: Optional[str] = None, + **kwargs: Any, +) -> List[np.ndarray]: """Read a PDF file and convert it into an image in numpy format - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") + >>> from doctr.io import read_pdf + >>> doc = read_pdf("path/to/your/doc.pdf") Args: + ---- file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args: Dict[str, AbstractFile] = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
- - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. + scale: rendering scale (1 corresponds to 72dpi) + rgb_mode: if True, the output will be RGB, otherwise BGR + password: a password to unlock the document, if encrypted + **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render` Returns: - the rendered image in numpy format + ------- + the list of pages decoded as numpy ndarray of shape H x W x C """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
-[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
-[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
- - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
-[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
- - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) # type: ignore[misc] - for artefact in self.doc[idx].get_images(full=True)] - -
-[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
-
+ # Rasterise pages to numpy ndarrays with pypdfium2 + pdf = pdfium.PdfDocument(file, password=password) + try: + return [page.render(scale=scale, rev_byteorder=rgb_mode, **kwargs).to_numpy() for page in pdf] + finally: + pdf.close()
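Editor's note: a minimal sketch of the pypdfium2-backed reader (the path is a placeholder):

>>> from doctr.io import read_pdf
>>> pages = read_pdf("path/to/your/doc.pdf", scale=2, rgb_mode=True)
>>> len(pages), pages[0].shape  # one (H, W, 3) uint8 array per page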

@@ -467,7 +373,7 @@

Source code for doctr.io.pdf

       
     
   
-
+
diff --git a/v0.3.1/_modules/doctr/io/reader.html b/v0.3.1/_modules/doctr/io/reader.html index ac14a8ce45..49cdc7d152 100644 --- a/v0.3.1/_modules/doctr/io/reader.html +++ b/v0.3.1/_modules/doctr/io/reader.html @@ -226,20 +226,38 @@

Source code for doctr.io.reader

-# Copyright (C) 2021, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
-# This program is licensed under the Apache License version 2.
-# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-import numpy as np
 from pathlib import Path
-from typing import List, Union, Sequence
+from typing import List, Sequence, Union
+
+import numpy as np
+
+from doctr.file_utils import requires_package
 from doctr.utils.common_types import AbstractFile
-from .pdf import read_pdf, PDF
+
 from .html import read_html
 from .image import read_img_as_numpy
+from .pdf import read_pdf
 
-__all__ = ['DocumentFile']
+__all__ = ["DocumentFile"]
 
 
 
-[docs] +[docs] class DocumentFile: """Read a document from multiple extensions"""
-[docs] +[docs] @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: + def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]: """Read a PDF file - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") + >>> from doctr.io import DocumentFile + >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") Args: + ---- file: the path to the PDF file or a binary stream + **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render` + Returns: - a PDF document + ------- + the list of pages decoded as numpy ndarray of shape H x W x 3 """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
+ return read_pdf(file, **kwargs)
-[docs] +[docs] @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: + def from_url(cls, url: str, **kwargs) -> List[np.ndarray]: """Interpret a web page as a PDF document - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") + >>> from doctr.io import DocumentFile + >>> doc = DocumentFile.from_url("https://www.yoursite.com") Args: + ---- url: the URL of the target web page + **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render` + Returns: - a PDF document + ------- + the list of pages decoded as numpy ndarray of shape H x W x 3 """ + requires_package( + "weasyprint", + "`.from_url` requires weasyprint installed.\n" + + "Installation instructions: https://doc.courtbouillon.org/weasyprint/stable/first_steps.html#installation", + ) pdf_stream = read_html(url) return cls.from_pdf(pdf_stream, **kwargs)
-[docs] +[docs] @classmethod def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: """Read an image file (or a collection of image files) and convert it into an image in numpy format - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) + >>> from doctr.io import DocumentFile + >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) Args: + ---- files: the path to the image file or a binary stream, or a collection of those + **kwargs: additional parameters to :meth:`doctr.io.image.read_img_as_numpy` + Returns: + ------- the list of pages decoded as numpy ndarray of shape H x W x 3 """ if isinstance(files, (str, Path, bytes)): @@ -389,7 +422,7 @@
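Editor's note: the three constructors now all resolve to lists of numpy pages. A combined sketch (paths and URL are placeholders; from_url additionally requires weasyprint):

>>> from doctr.io import DocumentFile
>>> pdf_pages = DocumentFile.from_pdf("path/to/your/doc.pdf")
>>> img_pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
>>> web_pages = DocumentFile.from_url("https://www.yoursite.com")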

Source code for doctr.io.reader

       
     
   
-
+
diff --git a/v0.3.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.3.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html index f850c994bc..e181ef6a1f 100644 --- a/v0.3.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -304,8 +304,8 @@

Source code for doctr.models.classification.magc_resnet.tensorflow

from typing import Any, Dict, List, Optional, Tuple import tensorflow as tf -from keras import activations, layers -from keras.models import Sequential +from tensorflow.keras import activations, layers +from tensorflow.keras.models import Sequential from doctr.datasets import VOCABS diff --git a/v0.3.1/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.3.1/_modules/doctr/models/classification/mobilenet/tensorflow.html index 02fc8802d6..c9545166e7 100644 --- a/v0.3.1/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -304,8 +304,8 @@

Source code for doctr.models.classification.mobilenet.tensorflow

from typing import Any, Dict, List, Optional, Tuple, Union import tensorflow as tf -from keras import layers -from keras.models import Sequential +from tensorflow.keras import layers +from tensorflow.keras.models import Sequential from ....datasets import VOCABS from ...utils import conv_sequence, load_pretrained_params diff --git a/v0.3.1/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.3.1/_modules/doctr/models/classification/resnet/tensorflow.html index f4bcd65452..620d4f0635 100644 --- a/v0.3.1/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/classification/resnet/tensorflow.html @@ -302,9 +302,9 @@

Source code for doctr.models.classification.resnet.tensorflow

from typing import Any, Callable, Dict, List, Optional, Tuple import tensorflow as tf -from keras import layers -from keras.applications import ResNet50 -from keras.models import Sequential +from tensorflow.keras import layers +from tensorflow.keras.applications import ResNet50 +from tensorflow.keras.models import Sequential from doctr.datasets import VOCABS diff --git a/v0.3.1/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.3.1/_modules/doctr/models/classification/textnet/tensorflow.html index 8f38b3470e..407e480818 100644 --- a/v0.3.1/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/classification/textnet/tensorflow.html @@ -302,7 +302,7 @@

Source code for doctr.models.classification.textnet.tensorflow

from copy import deepcopy from typing import Any, Dict, List, Optional, Tuple -from keras import Sequential, layers +from tensorflow.keras import Sequential, layers from doctr.datasets import VOCABS diff --git a/v0.3.1/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.3.1/_modules/doctr/models/classification/vgg/tensorflow.html index d6142a8376..66ee6dcdd8 100644 --- a/v0.3.1/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/classification/vgg/tensorflow.html @@ -301,8 +301,8 @@

Source code for doctr.models.classification.vgg.tensorflow

from copy import deepcopy from typing import Any, Dict, List, Optional, Tuple -from keras import layers -from keras.models import Sequential +from tensorflow.keras import layers +from tensorflow.keras.models import Sequential from doctr.datasets import VOCABS diff --git a/v0.3.1/_modules/doctr/models/classification/vit/tensorflow.html b/v0.3.1/_modules/doctr/models/classification/vit/tensorflow.html index 81ef3d9dcf..7059d1f1d8 100644 --- a/v0.3.1/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/classification/vit/tensorflow.html @@ -302,7 +302,7 @@

Source code for doctr.models.classification.vit.tensorflow

from typing import Any, Dict, Optional, Tuple import tensorflow as tf -from keras import Sequential, layers +from tensorflow.keras import Sequential, layers from doctr.datasets import VOCABS from doctr.models.modules.transformer import EncoderBlock diff --git a/v0.3.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.3.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index 6e4b50d4ff..dc65e2ed03 100644 --- a/v0.3.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -236,7 +236,7 @@

Package Reference

  • doctr.datasets
  • -
  • doctr.io
  • +
  • doctr.documents
  • doctr.models
  • doctr.transforms
  • doctr.utils
  • @@ -286,34 +286,26 @@

    Source code for doctr.models.detection.differentiable_binarization.tensorflo import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers -from tensorflow.keras.applications import ResNet50 from typing import List, Tuple, Optional, Any, Dict -from ...backbones import mobilenet_v3_large from doctr.utils.repr import NestedObject from doctr.models.utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence from .base import DBPostProcessor, _DBNet -__all__ = ['DBNet', 'db_resnet50', 'db_mobilenet_v3_large'] +__all__ = ['DBNet', 'db_resnet50'] default_cfgs: Dict[str, Dict[str, Any]] = { 'db_resnet50': { 'mean': (0.798, 0.785, 0.772), 'std': (0.264, 0.2749, 0.287), - 'backbone': ResNet50, + 'backbone': 'ResNet50', 'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + 'fpn_channels': 128, 'input_shape': (1024, 1024, 3), + 'rotated_bbox': False, 'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip', }, - 'db_mobilenet_v3_large': { - 'mean': (0.798, 0.785, 0.772), - 'std': (0.264, 0.2749, 0.287), - 'backbone': mobilenet_v3_large, - 'fpn_layers': ["inverted_2", "inverted_5", "inverted_11", "final_block"], - 'input_shape': (1024, 1024, 3), - 'url': None, - }, } @@ -387,8 +379,6 @@

    Source code for doctr.models.detection.differentiable_binarization.tensorflo Args: feature extractor: the backbone serving as feature extractor fpn_channels: number of channels each extracted feature maps is mapped to - rotated_bbox: whether the segmentation map can include rotated bounding boxes - cfg: the configuration dict of the model """ _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor'] @@ -396,7 +386,7 @@

    Source code for doctr.models.detection.differentiable_binarization.tensorflo def __init__( self, feature_extractor: IntermediateLayerGetter, - fpn_channels: int = 128, # to be set to 256 to represent the author's initial idea + fpn_channels: int = 128, rotated_bbox: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: @@ -455,9 +445,9 @@

    Source code for doctr.models.detection.differentiable_binarization.tensorflo thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1])) seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) + seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - thresh_target = tf.convert_to_tensor(thresh_target, dtype=out_map.dtype) + thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32) thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool) # Compute balanced BCE loss for proba_map @@ -522,64 +512,30 @@

    Source code for doctr.models.detection.differentiable_binarization.tensorflo return out -def _db_resnet( - arch: str, - pretrained: bool, - pretrained_backbone: bool = False, - input_shape: Tuple[int, int, int] = None, - **kwargs: Any -) -> DBNet: - - pretrained_backbone = pretrained_backbone and not pretrained +def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet: # Patch the config _cfg = deepcopy(default_cfgs[arch]) _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels']) + _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) # Feature extractor - feat_extractor = IntermediateLayerGetter( - _cfg['backbone']( - include_top=False, - weights='imagenet' if pretrained_backbone else None, - input_shape=_cfg['input_shape'], - pooling=None, - ), - _cfg['fpn_layers'], + resnet = tf.keras.applications.__dict__[_cfg['backbone']]( + include_top=False, + weights=None, + input_shape=_cfg['input_shape'], + pooling=None, ) - # Build the model - model = DBNet(feat_extractor, cfg=_cfg, **kwargs) - # Load pretrained parameters - if pretrained: - load_pretrained_params(model, _cfg['url']) - - return model - - -def _db_mobilenet( - arch: str, - pretrained: bool, - pretrained_backbone: bool = True, - input_shape: Tuple[int, int, int] = None, - **kwargs: Any -) -> DBNet: - - pretrained_backbone = pretrained_backbone and not pretrained - - # Patch the config - _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - - # Feature extractor feat_extractor = IntermediateLayerGetter( - _cfg['backbone']( - input_shape=_cfg['input_shape'], - include_top=False, - pretrained=pretrained_backbone, - ), + resnet, _cfg['fpn_layers'], ) + kwargs['fpn_channels'] = _cfg['fpn_channels'] + kwargs['rotated_bbox'] = _cfg['rotated_bbox'] + # Build the model model = DBNet(feat_extractor, cfg=_cfg, **kwargs) # Load pretrained parameters @@ -611,30 +567,6 @@

    Source code for doctr.models.detection.differentiable_binarization.tensorflo return _db_resnet('db_resnet50', pretrained, **kwargs)
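Editor's note: a usage sketch for the db_resnet50 entry point, mirroring the example style used elsewhere in these docstrings:

>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)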

- - -
-[docs] -def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a mobilenet v3 large backbone. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import db_mobilenet_v3_large - >>> model = db_mobilenet_v3_large(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_mobilenet('db_mobilenet_v3_large', pretrained, **kwargs)
-

@@ -667,7 +599,7 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflo

-

+
diff --git a/v0.3.1/_modules/doctr/models/detection/fast/tensorflow.html b/v0.3.1/_modules/doctr/models/detection/fast/tensorflow.html index dc7d8f50f2..af51a9abeb 100644 --- a/v0.3.1/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/detection/fast/tensorflow.html @@ -305,7 +305,7 @@

Source code for doctr.models.detection.fast.tensorflow

import numpy as np import tensorflow as tf -from keras import Model, Sequential, layers +from tensorflow.keras import Model, Sequential, layers from doctr.file_utils import CLASS_NAME from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params diff --git a/v0.3.1/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.3.1/_modules/doctr/models/detection/linknet/tensorflow.html index 9e99d97e3f..9f836ce462 100644 --- a/v0.3.1/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/detection/linknet/tensorflow.html @@ -236,7 +236,7 @@

Package Reference

  • doctr.datasets
  • -
  • doctr.io
  • +
  • doctr.documents
  • doctr.models
  • doctr.transforms
  • doctr.utils
  • @@ -300,7 +300,9 @@

    Source code for doctr.models.detection.linknet.tensorflow

    'linknet16': { 'mean': (0.798, 0.785, 0.772), 'std': (0.264, 0.2749, 0.287), + 'num_classes': 1, 'input_shape': (1024, 1024, 3), + 'rotated_bbox': False, 'url': None, }, } @@ -433,7 +435,7 @@

    Source code for doctr.models.detection.linknet.tensorflow

    A loss tensor """ seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) + seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) @@ -461,7 +463,7 @@

    Source code for doctr.models.detection.linknet.tensorflow

    else: # Compute BCE loss with highlighted edges loss = tf.math.multiply( - 1 + (edge_factor - 1) * tf.cast(edge_mask, out_map.dtype), + 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), bce ) loss = tf.reduce_mean(loss) @@ -504,8 +506,12 @@

    Source code for doctr.models.detection.linknet.tensorflow

    # Patch the config _cfg = deepcopy(default_cfgs[arch]) _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) + _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + kwargs['num_classes'] = _cfg['num_classes'] kwargs['input_shape'] = _cfg['input_shape'] + kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model model = LinkNet(cfg=_cfg, **kwargs) # Load pretrained parameters @@ -569,7 +575,7 @@

    Source code for doctr.models.detection.linknet.tensorflow

    +
    diff --git a/v0.3.1/_modules/doctr/models/detection/zoo.html b/v0.3.1/_modules/doctr/models/detection/zoo.html index 9408bc2ac1..23a2f451e3 100644 --- a/v0.3.1/_modules/doctr/models/detection/zoo.html +++ b/v0.3.1/_modules/doctr/models/detection/zoo.html @@ -236,7 +236,7 @@

    Package Reference

    • doctr.datasets
    • -
    • doctr.io
    • +
    • doctr.documents
    • doctr.models
    • doctr.transforms
    • doctr.utils
    • @@ -292,9 +292,9 @@

      Source code for doctr.models.detection.zoo

       
       
       if is_tf_available():
      -    ARCHS = ['db_resnet50', 'db_mobilenet_v3_large', 'linknet16']
      +    ARCHS = ['db_resnet50', 'linknet16']
       elif is_torch_available():
      -    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3_large', 'linknet16']
      +    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
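Editor's note: either ARCHS list feeds the _predictor factory defined just below; for reference, a minimal sketch of the public entry point:

>>> from doctr.models import detection_predictor
>>> predictor = detection_predictor('db_resnet50', pretrained=True)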
       
       
       def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
      @@ -368,7 +368,7 @@ 

      Source code for doctr.models.detection.zoo

             
           
         
      -
      +
    diff --git a/v0.3.1/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.3.1/_modules/doctr/models/recognition/crnn/tensorflow.html index eac75dc098..7b8529c26d 100644 --- a/v0.3.1/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -236,7 +236,7 @@

    Package Reference

    • doctr.datasets
    • -
    • doctr.io
    • +
    • doctr.documents
    • doctr.models
    • doctr.transforms
    • doctr.utils
    • @@ -286,48 +286,31 @@

      Source code for doctr.models.recognition.crnn.tensorflow

      from tensorflow.keras.models import Sequential, Model from typing import Tuple, Dict, Any, Optional, List -from ...backbones import vgg16_bn, resnet31, mobilenet_v3_small, mobilenet_v3_large +from ... import backbones from ...utils import load_pretrained_params from ..core import RecognitionModel, RecognitionPostProcessor -from ....datasets import VOCABS -__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor', 'crnn_mobilenet_v3_small', - 'crnn_mobilenet_v3_large'] +__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor'] default_cfgs: Dict[str, Dict[str, Any]] = { 'crnn_vgg16_bn': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'backbone': vgg16_bn, 'rnn_units': 128, + 'mean': (.5, .5, .5), + 'std': (1., 1., 1.), + 'backbone': 'vgg16_bn', 'rnn_units': 128, 'input_shape': (32, 128, 3), - 'vocab': VOCABS['french'], - 'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/crnn_vgg16_bn-76b7f2c6.zip', + 'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-' + 'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'), + 'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip', }, 'crnn_resnet31': { 'mean': (0.694, 0.695, 0.693), 'std': (0.299, 0.296, 0.301), - 'backbone': resnet31, 'rnn_units': 128, + 'backbone': 'resnet31', 'rnn_units': 128, 'input_shape': (32, 128, 3), 'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-' 'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'), 'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip', }, - 'crnn_mobilenet_v3_small': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'backbone': mobilenet_v3_small, 'rnn_units': 128, - 'input_shape': (32, 128, 3), - 'vocab': VOCABS['french'], - 'url': None, - }, - 'crnn_mobilenet_v3_large': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'backbone': mobilenet_v3_large, 'rnn_units': 128, - 'input_shape': (32, 128, 3), - 'vocab': VOCABS['french'], - 'url': None, - }, } @@ -434,7 +417,7 @@
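Editor's note: a usage sketch matching the (32, 128, 3) input shape declared in the configuration above:

>>> import tensorflow as tf
>>> from doctr.models import crnn_vgg16_bn
>>> model = crnn_vgg16_bn(pretrained=True)
>>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)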

      Source code for doctr.models.recognition.crnn.tensorflow

      """ gt, seq_len = self.compute_target(target) batch_len = model_output.shape[0] - input_length = tf.fill((batch_len,), model_output.shape[1]) + input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -471,15 +454,7 @@

      Source code for doctr.models.recognition.crnn.tensorflow

      return out -def _crnn( - arch: str, - pretrained: bool, - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any -) -> CRNN: - - pretrained_backbone = pretrained_backbone and not pretrained +def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN: # Patch the config _cfg = deepcopy(default_cfgs[arch]) @@ -488,10 +463,9 @@

      Source code for doctr.models.recognition.crnn.tensorflow

      _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) # Feature extractor - feat_extractor = _cfg['backbone']( + feat_extractor = backbones.__dict__[_cfg['backbone']]( input_shape=_cfg['input_shape'], include_top=False, - pretrained=pretrained_backbone, ) kwargs['vocab'] = _cfg['vocab'] @@ -549,51 +523,6 @@

      Source code for doctr.models.recognition.crnn.tensorflow

      """ return _crnn('crnn_resnet31', pretrained, **kwargs) - - -def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_mobilenet_v3_small - >>> model = crnn_mobilenet_v3_small(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_mobilenet_v3_small', pretrained, **kwargs) - - -
      -[docs] -def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_mobilenet_v3_large - >>> model = crnn_mobilenet_v3_large(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_mobilenet_v3_large', pretrained, **kwargs)
      -
@@ -626,7 +555,7 @@

Source code for doctr.models.recognition.crnn.tensorflow

+
diff --git a/v0.3.1/_modules/doctr/models/recognition/master/tensorflow.html b/v0.3.1/_modules/doctr/models/recognition/master/tensorflow.html index 857cebe956..6d9bff4577 100644 --- a/v0.3.1/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/recognition/master/tensorflow.html @@ -236,7 +236,7 @@

Package Reference

  • doctr.datasets
  • -
  • doctr.io
  • +
  • doctr.documents
  • doctr.models
  • doctr.transforms
  • doctr.utils
  • @@ -280,7 +280,6 @@

    Source code for doctr.models.recognition.master.tensorflow

    # This program is licensed under the Apache License version 2. # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details. -import math import tensorflow as tf from tensorflow.keras import layers, Sequential, Model from typing import Tuple, List, Dict, Any, Optional @@ -299,11 +298,11 @@

    Source code for doctr.models.recognition.master.tensorflow

    default_cfgs: Dict[str, Dict[str, Any]] = { 'master': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'input_shape': (32, 128, 3), + 'mean': (.5, .5, .5), + 'std': (1., 1., 1.), + 'input_shape': (48, 160, 3), 'vocab': VOCABS['french'], - 'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/master-bade6eae.zip', + 'url': None, }, } @@ -323,9 +322,8 @@

    Source code for doctr.models.recognition.master.tensorflow

    def __init__( self, inplanes: int, - headers: int = 8, + headers: int = 1, att_scale: bool = False, - ratio: float = 0.0625, # bottleneck ratio of 1/16 as described in paper **kwargs ) -> None: super().__init__(**kwargs) @@ -333,7 +331,6 @@

    Source code for doctr.models.recognition.master.tensorflow

    self.headers = headers # h self.inplanes = inplanes # C self.att_scale = att_scale - self.planes = int(inplanes * ratio) self.single_header_inplanes = int(inplanes / headers) # C / h @@ -346,7 +343,7 @@

    Source code for doctr.models.recognition.master.tensorflow

    self.transform = tf.keras.Sequential( [ tf.keras.layers.Conv2D( - filters=self.planes, + filters=self.inplanes, kernel_size=1, kernel_initializer=tf.initializers.he_normal() ), @@ -361,6 +358,7 @@

    Source code for doctr.models.recognition.master.tensorflow

    name='transform' ) + @tf.function def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor: b, h, w, c = (tf.shape(inputs)[i] for i in range(4)) @@ -383,7 +381,7 @@

    Source code for doctr.models.recognition.master.tensorflow

    context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1)) # scale variance if self.att_scale and self.headers > 1: - context_mask = context_mask / math.sqrt(self.single_header_inplanes) + context_mask = context_mask / tf.sqrt(self.single_header_inplanes) # B*h, 1, H*W, 1 context_mask = tf.keras.activations.softmax(context_mask, axis=2) @@ -417,8 +415,8 @@

    Source code for doctr.models.recognition.master.tensorflow

    def __init__( self, - headers: int = 8, - input_shape: Tuple[int, int, int] = (32, 128, 3), + headers: int = 1, + input_shape: Tuple[int, int, int] = (48, 160, 3), ) -> None: _layers = [ # conv_1x @@ -467,13 +465,12 @@

    Source code for doctr.models.recognition.master.tensorflow

    self, vocab: str, d_model: int = 512, - headers: int = 8, # number of multi-aspect context + headers: int = 1, dff: int = 2048, - num_heads: int = 8, # number of heads in the transformer decoder + num_heads: int = 8, num_layers: int = 3, max_length: int = 50, - dropout: float = 0.2, - input_shape: Tuple[int, int, int] = (32, 128, 3), + input_shape: Tuple[int, int, int] = (48, 160, 3), cfg: Optional[Dict[str, Any]] = None, ) -> None: super().__init__() @@ -483,7 +480,7 @@

    Source code for doctr.models.recognition.master.tensorflow

    self.cfg = cfg self.vocab_size = len(vocab) - self.feat_extractor = MAGCResnet(headers=headers, input_shape=input_shape) + self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape) self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model) # 3 more classes: EOS/PAD/SOS self.decoder = Decoder( @@ -493,13 +490,13 @@

    Source code for doctr.models.recognition.master.tensorflow

    dff=dff, vocab_size=self.vocab_size, maximum_position_encoding=max_length, - dropout=dropout, ) self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model) self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform()) self.postprocessor = MASTERPostProcessor(vocab=self.vocab) + @tf.function def make_mask(self, target: tf.Tensor) -> tf.Tensor: look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1]) target_padding_mask = create_padding_mask(target, self.vocab_size + 2) # Pad symbol @@ -536,7 +533,7 @@

    Source code for doctr.models.recognition.master.tensorflow

    mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len - 1) # delete the last mask timestep as well masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) return tf.expand_dims(ce_loss, axis=1) @@ -561,7 +558,7 @@

    Source code for doctr.models.recognition.master.tensorflow

    """ # Encode - feature = self.feat_extractor(x, **kwargs) + feature = self.feature_extractor(x, **kwargs) b, h, w, c = (tf.shape(feature)[i] for i in range(4)) feature = tf.reshape(feature, shape=(b, h * w, c)) encoded = feature + self.feature_pe[:, :h * w, :] @@ -615,7 +612,7 @@

    Source code for doctr.models.recognition.master.tensorflow

    start_vector = tf.fill(dims=(b, 1), value=start_symbol) ys = tf.concat([start_vector, ys], axis=-1) - logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=encoded.dtype) # 3 symbols + logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32) # 3 symbols # max_len = len + 2 (sos + eos) for i in range(self.max_length - 1): ys_mask = self.make_mask(ys) @@ -731,7 +728,7 @@

    Source code for doctr.models.recognition.master.tensorflow

    +
    diff --git a/v0.3.1/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.3.1/_modules/doctr/models/recognition/parseq/tensorflow.html index 1bbbf829b1..93a3b2ea81 100644 --- a/v0.3.1/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -305,7 +305,7 @@

    Source code for doctr.models.recognition.parseq.tensorflow

    import numpy as np import tensorflow as tf -from keras import Model, layers +from tensorflow.keras import Model, layers from doctr.datasets import VOCABS from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward @@ -462,7 +462,6 @@

    Source code for doctr.models.recognition.parseq.tensorflow

    self.postprocessor = PARSeqPostProcessor(vocab=self.vocab) - @tf.function def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor: # Generates permutations of the target sequence. # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py @@ -509,7 +508,6 @@

    Source code for doctr.models.recognition.parseq.tensorflow

    ) return combined - @tf.function def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: # Generate source and target mask for the decoder attention. sz = permutation.shape[0] @@ -529,7 +527,6 @@

    Source code for doctr.models.recognition.parseq.tensorflow

    target_mask = mask[1:, :-1] return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool) - @tf.function def decode( self, target: tf.Tensor, diff --git a/v0.3.1/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.3.1/_modules/doctr/models/recognition/sar/tensorflow.html index edf6879e8b..3a9989ef30 100644 --- a/v0.3.1/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/recognition/sar/tensorflow.html @@ -236,7 +236,7 @@

    Package Reference

    • doctr.datasets
    • -
    • doctr.io
    • +
    • doctr.documents
    • doctr.models
    • doctr.transforms
    • doctr.utils
    • @@ -285,11 +285,10 @@

      Source code for doctr.models.recognition.sar.tensorflow

      from tensorflow.keras import Sequential, layers, Model from typing import Tuple, Dict, List, Any, Optional -from ...backbones import vgg16_bn, resnet31 +from ... import backbones from ...utils import load_pretrained_params from ..core import RecognitionModel, RecognitionPostProcessor from doctr.utils.repr import NestedObject -from ....datasets import VOCABS __all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31'] @@ -297,19 +296,20 @@

      Source code for doctr.models.recognition.sar.tensorflow

      'sar_vgg16_bn': { 'mean': (.5, .5, .5), 'std': (1., 1., 1.), - 'backbone': vgg16_bn, 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2, + 'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2, 'input_shape': (32, 128, 3), 'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-' 'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'), 'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip', }, 'sar_resnet31': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'backbone': resnet31, 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2, + 'mean': (.5, .5, .5), + 'std': (1., 1., 1.), + 'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2, 'input_shape': (32, 128, 3), - 'vocab': VOCABS['french'], - 'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/sar_resnet31-9ee49970.zip', + 'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-' + 'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'), + 'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip', }, } @@ -390,7 +390,7 @@
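Editor's note: a usage sketch for the sar_resnet31 configuration above:

>>> import tensorflow as tf
>>> from doctr.models import sar_resnet31
>>> model = sar_resnet31(pretrained=True)
>>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor)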

      Source code for doctr.models.recognition.sar.tensorflow

      super().__init__() self.vocab_size = vocab_size self.lstm_decoder = layers.StackedRNNCells( - [layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_layers)] + [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)] ) self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1)) self.attention_module = AttentionModule(attention_units) @@ -411,7 +411,7 @@

      Source code for doctr.models.recognition.sar.tensorflow

      # initialize states (each of shape (N, rnn_units)) states = self.lstm_decoder.get_initial_state( - inputs=None, batch_size=features.shape[0], dtype=features.dtype + inputs=None, batch_size=features.shape[0], dtype=tf.float32 ) # run first step of lstm # holistic: shape (N, rnn_units) @@ -526,7 +526,7 @@

      Source code for doctr.models.recognition.sar.tensorflow

      mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len) masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) return tf.expand_dims(ce_loss, axis=1) def call( @@ -591,15 +591,7 @@

      Source code for doctr.models.recognition.sar.tensorflow

      return list(zip(word_values, probs.numpy().tolist())) -def _sar( - arch: str, - pretrained: bool, - pretrained_backbone: bool = True, - input_shape: Tuple[int, int, int] = None, - **kwargs: Any -) -> SAR: - - pretrained_backbone = pretrained_backbone and not pretrained +def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR: # Patch the config _cfg = deepcopy(default_cfgs[arch]) @@ -612,9 +604,8 @@

      Source code for doctr.models.recognition.sar.tensorflow

      _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders']) # Feature extractor - feat_extractor = default_cfgs[arch]['backbone']( + feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']]( input_shape=_cfg['input_shape'], - pretrained=pretrained_backbone, include_top=False, ) @@ -712,7 +703,7 @@

      Source code for doctr.models.recognition.sar.tensorflow

      +
      diff --git a/v0.3.1/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.3.1/_modules/doctr/models/recognition/vitstr/tensorflow.html index 23730f6227..aecde3662a 100644 --- a/v0.3.1/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.3.1/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -302,7 +302,7 @@

      Source code for doctr.models.recognition.vitstr.tensorflow

      from typing import Any, Dict, List, Optional, Tuple import tensorflow as tf -from keras import Model, layers +from tensorflow.keras import Model, layers from doctr.datasets import VOCABS diff --git a/v0.3.1/_modules/doctr/models/recognition/zoo.html b/v0.3.1/_modules/doctr/models/recognition/zoo.html index eff472c9db..0f1bff8861 100644 --- a/v0.3.1/_modules/doctr/models/recognition/zoo.html +++ b/v0.3.1/_modules/doctr/models/recognition/zoo.html @@ -236,7 +236,7 @@

      Package Reference

      • doctr.datasets
      • -
      • doctr.io
      • +
      • doctr.documents
      • doctr.models
      • doctr.transforms
      • doctr.utils
      • @@ -282,7 +282,7 @@

        Source code for doctr.models.recognition.zoo

        from typing import Any
         
        -from doctr import is_tf_available
        +from doctr.file_utils import is_tf_available, is_torch_available
         from .core import RecognitionPredictor
         from ..preprocessor import PreProcessor
         from .. import recognition
        @@ -291,8 +291,10 @@ 

        Source code for doctr.models.recognition.zoo

        __all__ = ["recognition_predictor"]
         
         
        -ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'crnn_mobilenet_v3_small', 'crnn_mobilenet_v3_large',
        -         'sar_vgg16_bn', 'sar_resnet31', 'master']
        +if is_tf_available():
        +    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
        +elif is_torch_available():
        +    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
         
         
         def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
        @@ -304,9 +306,8 @@ 

        Source code for doctr.models.recognition.zoo

        kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
             kwargs['std'] = kwargs.get('std', _model.cfg['std'])
             kwargs['batch_size'] = kwargs.get('batch_size', 32)
        -    input_shape = _model.cfg['input_shape'][:2] if is_tf_available() else _model.cfg['input_shape'][-2:]
             predictor = RecognitionPredictor(
        -        PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs),
        +        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
                 _model
             )
         
        @@ -367,7 +368,7 @@ 

        Source code for doctr.models.recognition.zoo

           
        -
        +
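For context, the recognition_predictor entry point shown in the hunks above is typically driven as follows (a hedged sketch: the crop is a blank placeholder image and the architecture name is one of those listed in ARCHS above):

    import numpy as np
    from doctr.models import recognition_predictor

    predictor = recognition_predictor('crnn_vgg16_bn', pretrained=True)
    # the predictor consumes a list of word crops (H x W x 3 uint8 arrays)
    out = predictor([np.zeros((32, 128, 3), dtype=np.uint8)])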
        diff --git a/v0.3.1/_modules/doctr/transforms/modules/base.html b/v0.3.1/_modules/doctr/transforms/modules/base.html index 8d19a00ba7..e7b5ea10d9 100644 --- a/v0.3.1/_modules/doctr/transforms/modules/base.html +++ b/v0.3.1/_modules/doctr/transforms/modules/base.html @@ -236,7 +236,7 @@

        Package Reference

        • doctr.datasets
        • -
        • doctr.io
        • +
        • doctr.documents
        • doctr.models
        • doctr.transforms
        • doctr.utils
        • @@ -281,14 +281,13 @@

          Source code for doctr.transforms.modules.base

# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.

import random
-from typing import List, Any, Callable, Dict, Tuple
-import numpy as np
+from typing import List, Any, Callable

from doctr.utils.repr import NestedObject
from .. import functional as F

-__all__ = ['ColorInversion', 'OneOf', 'RandomApply', 'RandomRotate']
+__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
          @@ -372,30 +371,6 @@

          Source code for doctr.transforms.modules.base

          return self.transform(img) return img
          - - -
-[docs]
-class RandomRotate(NestedObject):
-    """Randomly rotate a tensor image
-
-    Args:
-        max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in
-            [-max_angle, max_angle]
-        expand: whether the image should be padded before the rotation
-    """
-    def __init__(self, max_angle: float = 25., expand: bool = False) -> None:
-        self.max_angle = max_angle
-        self.expand = expand
-
-    def extra_repr(self) -> str:
-        return f"max_angle={self.max_angle}, expand={self.expand}"
-
-    def __call__(self, img: Any, target: Dict[str, np.ndarray]) -> Tuple[Any, Dict[str, np.ndarray]]:
-        angle = random.uniform(-self.max_angle, self.max_angle)
-        img, target['boxes'] = F.rotate(img, target['boxes'], angle, self.expand)
-        return img, target
          -
      @@ -428,7 +403,7 @@

      Source code for doctr.transforms.modules.base

      -
      +
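The transforms that remain in __all__ above compose as usual; a hedged sketch of how they are typically chained (the parameter values are illustrative, not library defaults):

    import tensorflow as tf
    from doctr.transforms import ColorInversion, OneOf, RandomApply

    # invert colors half of the time, picking one of two strengths at random
    transfo = RandomApply(OneOf([ColorInversion(min_val=0.5), ColorInversion(min_val=0.8)]), p=.5)
    out = transfo(tf.random.uniform((32, 128, 3)))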
      diff --git a/v0.3.1/_modules/doctr/transforms/modules/tensorflow.html b/v0.3.1/_modules/doctr/transforms/modules/tensorflow.html index 6c092476b5..51b31b4fc4 100644 --- a/v0.3.1/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.3.1/_modules/doctr/transforms/modules/tensorflow.html @@ -236,7 +236,7 @@

      Package Reference

      • doctr.datasets
      • -
      • doctr.io
      • +
      • doctr.documents
      • doctr.models
      • doctr.transforms
      • doctr.utils
      • @@ -355,7 +355,6 @@

        Source code for doctr.transforms.modules.tensorflow

        return _repr

    def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        input_dtype = img.dtype
        img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio)
        if self.preserve_aspect_ratio:
            # pad width
@@ -366,7 +365,7 @@

        Source code for doctr.transforms.modules.tensorflow

            else:
                offset = (int((self.output_size[0] - img.shape[0]) / 2), 0)
            img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size)
-        return tf.cast(img, dtype=input_dtype)
+        return img
        @@ -386,15 +385,15 @@

        Source code for doctr.transforms.modules.tensorflow

        std: standard deviation per channel
    """
    def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None:
-        self.mean = tf.constant(mean)
-        self.std = tf.constant(std)
+        self.mean = tf.constant(mean, dtype=tf.float32)
+        self.std = tf.constant(std, dtype=tf.float32)

    def extra_repr(self) -> str:
        return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}"

    def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        img -= tf.cast(self.mean, dtype=img.dtype)
-        img /= tf.cast(self.std, dtype=img.dtype)
+        img -= self.mean
+        img /= self.std
        return img
        @@ -640,7 +639,7 @@

        Source code for doctr.transforms.modules.tensorflow

        +
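To see the two classes touched above in context, here is a hedged usage sketch (the mean/std values are placeholders rather than the library defaults):

    import tensorflow as tf
    from doctr.transforms import Normalize, Resize

    resize = Resize((32, 128), preserve_aspect_ratio=True)
    normalize = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301))

    img = tf.random.uniform((64, 256, 3))   # dummy float32 image in [0, 1]
    out = normalize(resize(img))            # resized, padded, then channel-wise normalized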
        diff --git a/v0.3.1/_modules/doctr/utils/visualization.html b/v0.3.1/_modules/doctr/utils/visualization.html index 99769898ea..21743f6182 100644 --- a/v0.3.1/_modules/doctr/utils/visualization.html +++ b/v0.3.1/_modules/doctr/utils/visualization.html @@ -236,7 +236,7 @@

        Package Reference

        • doctr.datasets
        • -
        • doctr.io
        • +
        • doctr.documents
        • doctr.models
        • doctr.transforms
        • doctr.utils
        • @@ -285,128 +285,65 @@

          Source code for doctr.utils.visualization

           import matplotlib.patches as patches
           import mplcursors
           from PIL import ImageFont, ImageDraw, Image
          -from copy import deepcopy
           import numpy as np
           import cv2
          -from typing import Tuple, List, Dict, Any, Union, Optional
          +from typing import Tuple, List, Dict, Any, Union
           
           from .common_types import BoundingBox, RotatedBbox
           
          -__all__ = ['visualize_page', 'synthetize_page', 'draw_boxes']
          +__all__ = ['visualize_page', 'synthetize_page']
           
           
          -def rect_patch(
          -    geometry: BoundingBox,
          +def create_rect_patch(
          +    geometry: Union[BoundingBox, RotatedBbox],
          +    label: str,
               page_dimensions: Tuple[int, int],
          -    label: Optional[str] = None,
          -    color: Tuple[float, float, float] = (0, 0, 0),
          +    color: Tuple[int, int, int],
               alpha: float = 0.3,
               linewidth: int = 2,
               fill: bool = True,
          -) -> patches.Rectangle:
          -    """Create a matplotlib rectangular patch for the element
          +) -> patches.Patch:
          +    """Create a matplotlib patch (rectangle) bounding the element
           
               Args:
                   geometry: bounding box of the element
          -        page_dimensions: dimensions of the Page
                   label: label to display when hovered
          -        color: color to draw box
          -        alpha: opacity parameter to fill the boxes, 0 = transparent
          -        linewidth: line width
          -        fill: whether the patch should be filled
          -
          -    Returns:
          -        a rectangular Patch
          -    """
          -
          -    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
          -        raise ValueError("invalid geometry format")
          -
          -    # Unpack
          -    height, width = page_dimensions
          -    (xmin, ymin), (xmax, ymax) = geometry
          -    # Switch to absolute coords
          -    xmin, w = xmin * width, (xmax - xmin) * width
          -    ymin, h = ymin * height, (ymax - ymin) * height
          -
          -    return patches.Rectangle(
          -        (xmin, ymin),
          -        w,
          -        h,
          -        fill=fill,
          -        linewidth=linewidth,
          -        edgecolor=(*color, alpha),
          -        facecolor=(*color, alpha),
          -        label=label,
          -    )
          -
          -
          -def polygon_patch(
          -    geometry: RotatedBbox,
          -    page_dimensions: Tuple[int, int],
          -    label: Optional[str] = None,
          -    color: Tuple[float, float, float] = (0, 0, 0),
          -    alpha: float = 0.3,
          -    linewidth: int = 2,
          -    fill: bool = True,
          -) -> patches.Polygon:
          -    """Create a matplotlib polygon patch for the element
          -
          -    Args:
          -        geometry: bounding box of the element
                   page_dimensions: dimensions of the Page
          -        label: label to display when hovered
                   color: color to draw box
                   alpha: opacity parameter to fill the boxes, 0 = transparent
                   linewidth: line width
          -        fill: whether the patch should be filled
           
               Returns:
          -        a polygon Patch
          +        a rectangular Patch
               """
          -
          -    if len(geometry) != 5 or any(not isinstance(elt, float) for elt in geometry):
          -        raise ValueError("invalid geometry format")
          -
          -    # Unpack
               height, width = page_dimensions
          -    x, y, w, h, a = geometry
          -    # Switch to absolute coords
          -    x, w = x * width, w * width
          -    y, h = y * height, h * height
          -    points = cv2.boxPoints(((x, y), (w, h), a))
          -
          -    return patches.Polygon(
          -        points,
          -        fill=fill,
          -        linewidth=linewidth,
          -        edgecolor=(*color, alpha),
          -        facecolor=(*color, alpha),
          -        label=label,
          -    )
          -
          -
          -def create_obj_patch(
          -    geometry: Union[BoundingBox, RotatedBbox],
          -    page_dimensions: Tuple[int, int],
          -    **kwargs: Any,
          -) -> patches.Patch:
          -    """Create a matplotlib patch for the element
          -
          -    Args:
          -        geometry: bounding box (straight or rotated) of the element
          -        page_dimensions: dimensions of the page
          -
          -    Returns:
          -        a matplotlib Patch
          -    """
          -    if isinstance(geometry, tuple):
          -        if len(geometry) == 2:
          -            return rect_patch(geometry, page_dimensions, **kwargs)  # type: ignore[arg-type]
          -        elif len(geometry) == 5:
          -            return polygon_patch(geometry, page_dimensions, **kwargs)  # type: ignore[arg-type]
          -
          -    raise ValueError("invalid geometry format")
          +    if len(geometry) == 5:
          +        x, y, w, h, a = geometry  # type: ignore[misc]
          +        x, w = x * width, w * width
          +        y, h = y * height, h * height
          +        points = cv2.boxPoints(((x, y), (w, h), a))
          +        return patches.Polygon(
          +            points,
          +            fill=fill,
          +            linewidth=linewidth,
          +            edgecolor=(*color, alpha),
          +            facecolor=(*color, alpha),
          +            label=label
          +        )
          +    else:
          +        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
          +        xmin, xmax = xmin * width, xmax * width
          +        ymin, ymax = ymin * height, ymax * height
          +        return patches.Rectangle(
          +            (xmin, ymin),
          +            xmax - xmin,
          +            ymax - ymin,
          +            fill=fill,
          +            linewidth=linewidth,
          +            edgecolor=(*color, alpha),
          +            facecolor=(*color, alpha),
          +            label=label
          +        )
           
           
           
          @@ -457,8 +394,7 @@

          Source code for doctr.utils.visualization

           
               for block in page['blocks']:
                   if not words_only:
          -            rect = create_obj_patch(block['geometry'], page['dimensions'],
          -                                    label='block', color=(0, 1, 0), linewidth=1, **kwargs)
          +            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
                       # add patch on figure
                       ax.add_patch(rect)
                       if interactive:
          @@ -467,16 +403,14 @@ 

          Source code for doctr.utils.visualization

           
                   for line in block['lines']:
                       if not words_only:
          -                rect = create_obj_patch(line['geometry'], page['dimensions'],
          -                                        label='line', color=(1, 0, 0), linewidth=1, **kwargs)
          +                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
                           ax.add_patch(rect)
                           if interactive:
                               artists.append(rect)
           
                       for word in line['words']:
          -                rect = create_obj_patch(word['geometry'], page['dimensions'],
          -                                        label=f"{word['value']} (confidence: {word['confidence']:.2%})",
          -                                        color=(0, 0, 1), **kwargs)
          +                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
          +                                         page['dimensions'], (0, 0, 1), **kwargs)
                           ax.add_patch(rect)
                           if interactive:
                               artists.append(rect)
          @@ -501,11 +435,11 @@ 

          Source code for doctr.utils.visualization

           
                   if display_artefacts:
                       for artefact in block['artefacts']:
          -                rect = create_obj_patch(
          +                rect = create_rect_patch(
                               artefact['geometry'],
          +                    'artefact',
                               page['dimensions'],
          -                    label='artefact',
          -                    color=(0.5, 0.5, 0.5),
          +                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
                               linewidth=1,
                               **kwargs
                           )
          @@ -575,37 +509,6 @@ 

          Source code for doctr.utils.visualization

                           response[ymin:ymax, xmin:xmax, :] = np.array(img)
           
               return response
          -
          -
          -def draw_boxes(
          -    boxes: np.ndarray,
          -    image: np.ndarray,
          -    color: Optional[Tuple] = None,
          -    **kwargs
          -) -> None:
          -    """Draw an array of relative straight boxes on an image
          -
          -    Args:
          -        boxes: array of relative boxes, of shape (*, 4)
          -        image: np array, float32 or uint8
          -    """
          -    h, w = image.shape[:2]
          -    # Convert boxes to absolute coords
          -    _boxes = deepcopy(boxes)
          -    _boxes[:, [0, 2]] *= w
          -    _boxes[:, [1, 3]] *= h
          -    _boxes = _boxes.astype(np.int32)
          -    for box in _boxes.tolist():
          -        xmin, ymin, xmax, ymax = box
          -        image = cv2.rectangle(
          -            image,
          -            (xmin, ymin),
          -            (xmax, ymax),
          -            color=color if isinstance(color, tuple) else (0, 0, 255),
          -            thickness=2
          -        )
          -    plt.imshow(image)
          -    plt.plot(**kwargs)
           
          @@ -638,7 +541,7 @@

          Source code for doctr.utils.visualization

                 
               
             
          -
          +
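As a usage note on the create_rect_patch helper shown above: it turns relative coordinates into an absolute matplotlib patch. A hedged sketch with made-up geometry and page dimensions:

    import matplotlib.pyplot as plt
    from doctr.utils.visualization import create_rect_patch

    fig, ax = plt.subplots()
    # relative ((xmin, ymin), (xmax, ymax)) box on a 600 x 800 page, drawn in blue
    patch = create_rect_patch(((0.1, 0.1), (0.4, 0.2)), 'word', (600, 800), (0, 0, 1), linewidth=1)
    ax.add_patch(patch)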
          diff --git a/v0.3.1/_modules/index.html b/v0.3.1/_modules/index.html index 7d41bde20c..c887b618c2 100644 --- a/v0.3.1/_modules/index.html +++ b/v0.3.1/_modules/index.html @@ -236,7 +236,7 @@

          Package Reference

          All modules for which code is available

          -
          -
      +
      diff --git a/v0.3.1/_sources/changelog.rst.txt b/v0.3.1/_sources/changelog.rst.txt index 60699905f0..430097d6c8 100644 --- a/v0.3.1/_sources/changelog.rst.txt +++ b/v0.3.1/_sources/changelog.rst.txt @@ -1,10 +1,6 @@ Changelog ========= -v0.3.0 (2021-07-02) -------------------- -Release note: `v0.3.0 `_ - v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.3.1/_sources/datasets.rst.txt b/v0.3.1/_sources/datasets.rst.txt index 4b2fe083c7..354122f1e5 100644 --- a/v0.3.1/_sources/datasets.rst.txt +++ b/v0.3.1/_sources/datasets.rst.txt @@ -22,7 +22,6 @@ Here are all datasets that are available through DocTR: .. autoclass:: SROIE .. autoclass:: CORD .. autoclass:: OCRDataset -.. autoclass:: CharacterGenerator Data Loading diff --git a/v0.3.1/_sources/index.rst.txt b/v0.3.1/_sources/index.rst.txt index 2e9ca31d01..fc3ff89fdf 100644 --- a/v0.3.1/_sources/index.rst.txt +++ b/v0.3.1/_sources/index.rst.txt @@ -77,7 +77,7 @@ Supported datasets :caption: Package Reference datasets - io + documents models transforms utils diff --git a/v0.3.1/_sources/installing.rst.txt b/v0.3.1/_sources/installing.rst.txt index f85fe6395c..5c8779dc1c 100644 --- a/v0.3.1/_sources/installing.rst.txt +++ b/v0.3.1/_sources/installing.rst.txt @@ -35,16 +35,6 @@ Install the last stable release of the package using pip: pip install python-doctr -We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows: - -.. code:: bash - - # for TensorFlow - pip install python-doctr[tf] - # for PyTorch - pip install python-doctr[torch] - - Via Git ======= @@ -54,12 +44,3 @@ Install the library in developper mode: git clone https://github.com/mindee/doctr.git pip install -e doctr/. - -Again, for framework-specific builds: -.. code:: bash - - git clone https://github.com/mindee/doctr.git - # for TensorFlow - pip install -e doctr/.[tf] - # for PyTorch - pip install -e doctr/.[torch] diff --git a/v0.3.1/_sources/io.rst.txt b/v0.3.1/_sources/io.rst.txt deleted file mode 100644 index d23e11bdb9..0000000000 --- a/v0.3.1/_sources/io.rst.txt +++ /dev/null @@ -1,92 +0,0 @@ -doctr.io -======== - - -.. currentmodule:: doctr.io - -The io module enables users to easily access content from documents and export analysis -results to structured formats. - -.. _document_structure: - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. 
- -.. autofunction:: read_pdf - -.. autofunction:: read_img_as_numpy - -.. autofunction:: read_img_as_tensor - -.. autofunction:: decode_img_as_tensor - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.3.1/_sources/models.rst.txt b/v0.3.1/_sources/models.rst.txt index 0c52482a45..9830c6c153 100644 --- a/v0.3.1/_sources/models.rst.txt +++ b/v0.3.1/_sources/models.rst.txt @@ -46,7 +46,6 @@ Detection models Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: .. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.db_mobilenet_v3_large .. autofunction:: doctr.models.detection.linknet16 Detection predictors @@ -73,15 +72,9 @@ Identifying strings in images * - crnn_vgg16_bn - (32, 128, 3) - 15.8M - - 87.17 - - 92.93 + - 86.02 + - 91.3 - 12.8 - * - master - - (32, 128, 3) - - - - 87.61 - - 93.28 - - * - sar_vgg16_bn - (32, 128, 3) - 21.5M @@ -91,8 +84,8 @@ Identifying strings in images * - sar_resnet31 - (32, 128, 3) - 53.1M - - **87.67** - - **93.41** + - **86.3** + - **92.1** - 2.7 All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). @@ -120,7 +113,6 @@ Models expect a TensorFlow tensor as input and produces one in return. DocTR inc .. autofunction:: doctr.models.recognition.crnn_vgg16_bn -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_large .. autofunction:: doctr.models.recognition.sar_vgg16_bn .. autofunction:: doctr.models.recognition.sar_resnet31 .. 
autofunction:: doctr.models.recognition.master @@ -142,13 +134,11 @@ Predictors that localize and identify text elements in images +=============================+============+===============+=========+============+===============+=========+ | **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | +-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 71.25 | 76.02 | 0.85 | 83.99 | 81.42 | 1.6 | -+-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + master | 71.26 | 76.03 | | 84.61 | 82.02 | | +| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | +-----------------------------+------------+---------------+---------+------------+---------------+---------+ | db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | +-----------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | 71.48 | 76.26 | 0.27 | 84.66 | **82.07** | 0.83 | +| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | +-----------------------------+------------+---------------+---------+------------+---------------+---------+ | Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | +-----------------------------+------------+---------------+---------+------------+---------------+---------+ @@ -169,21 +159,17 @@ We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform ex Results on private ocr datasets -+------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | US Tax Forms | -+====================================+============+===============+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | 78.56 | 80.94 | 65.79 | 70.10 | 49.35 | 50.84 | 78.99 | 92.73 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + master (ours) | **78.91** | **81.31** | 65.57 | 69.86 | 50.65 | 52.17 | 78.86 | 92.57 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + sar_resnet31 (ours) | 78.84 | 81.23 | 65.90 | **70.21** | **51.17** | **52.72** | 79.17 | 92.68 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. 
text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | -+------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ ++------------------------------------+----------------------------+----------------------------+----------------------------+ +| | Receipts | Invoices | IDs | ++====================================+============+===============+============+===============+============+===============+ +| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ Two-stage approaches @@ -192,74 +178,6 @@ Those architectures involve one stage of text detection, and one stage of text r .. autofunction:: doctr.models.zoo.ocr_predictor -Export model output -^^^^^^^^^^^^^^^^^^^^ - -The ocr_predictor returns a `Document` object with a nested structure (with `Page`, `Block`, `Line`, `Word`, `Artefact`). 
-To get a better understanding of our document model, check our :ref:`document_structure` section - -Here is a typical `Document` layout:: - - Document( - (pages): [Page( - dimensions=(340, 600) - (blocks): [Block( - (lines): [Line( - (words): [ - Word(value='No.', confidence=0.91), - Word(value='RECEIPT', confidence=0.99), - Word(value='DATE', confidence=0.96), - ] - )] - (artefacts): [] - )] - )] - ) - -You can also export them as a nested dict, more appropriate for JSON format:: - - json_output = result.export() - -For reference, here is the JSON export for the same `Document` as above:: - - { - 'pages': [ - { - 'page_idx': 0, - 'dimensions': (340, 600), - 'orientation': {'value': None, 'confidence': None}, - 'language': {'value': None, 'confidence': None}, - 'blocks': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'lines': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'words': [ - { - 'value': 'No.', - 'confidence': 0.914085328578949, - 'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)) - }, - { - 'value': 'RECEIPT', - 'confidence': 0.9949972033500671, - 'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)) - }, - { - 'value': 'DATE', - 'confidence': 0.9578408598899841, - 'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)) - } - ] - } - ], - 'artefacts': [] - } - ] - } - ] - } Model export ------------ diff --git a/v0.3.1/_sources/transforms.rst.txt b/v0.3.1/_sources/transforms.rst.txt index a17ad5d220..0230fe75f5 100644 --- a/v0.3.1/_sources/transforms.rst.txt +++ b/v0.3.1/_sources/transforms.rst.txt @@ -21,7 +21,6 @@ Here are all transformations that are available through DocTR: .. autoclass:: RandomHue .. autoclass:: RandomGamma .. autoclass:: RandomJpegQuality -.. autoclass:: RandomRotate Composing transformations diff --git a/v0.3.1/_sources/using_doctr/using_model_export.rst.txt b/v0.3.1/_sources/using_doctr/using_model_export.rst.txt index 48f570f699..c62c36169b 100644 --- a/v0.3.1/_sources/using_doctr/using_model_export.rst.txt +++ b/v0.3.1/_sources/using_doctr/using_model_export.rst.txt @@ -31,7 +31,7 @@ Advantages: .. code:: python3 import tensorflow as tf - from keras import mixed_precision + from tensorflow.keras import mixed_precision mixed_precision.set_global_policy('mixed_float16') predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True) diff --git a/v0.3.1/_static/documentation_options.js b/v0.3.1/_static/documentation_options.js index 3a3f8007a0..a7b5cbe04a 100644 --- a/v0.3.1/_static/documentation_options.js +++ b/v0.3.1/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.3.1a0-git', + VERSION: '0.3.0a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.3.1/changelog.html b/v0.3.1/changelog.html index 976cf392b1..6ed2620fb7 100644 --- a/v0.3.1/changelog.html +++ b/v0.3.1/changelog.html @@ -237,7 +237,7 @@
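For readers of the models.rst hunk above, the two-stage ocr_predictor is typically driven end to end as follows (a hedged sketch: the file path is a placeholder, and in the documentation version targeted by this diff the reading helpers live under doctr.documents rather than doctr.io):

    from doctr.documents import DocumentFile
    from doctr.models import ocr_predictor

    predictor = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
    pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()  # placeholder path
    result = predictor(pages)
    json_output = result.export()  # nested dict mirroring the Document/Page/Block/Line/Word layout above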

      Package Reference

      diff --git a/v0.3.1/datasets.html b/v0.3.1/datasets.html index c07fdd0da2..640791680a 100644 --- a/v0.3.1/datasets.html +++ b/v0.3.1/datasets.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.datasets - docTR documentation @@ -237,7 +237,7 @@

      Package Reference

    • Data Loading
        @@ -594,7 +568,7 @@

        Data Loading +

    • diff --git a/v0.3.1/genindex.html b/v0.3.1/genindex.html index 2812c96d00..10d0739337 100644 --- a/v0.3.1/genindex.html +++ b/v0.3.1/genindex.html @@ -235,7 +235,7 @@

      Package Reference

      • doctr.datasets
      • -
      • doctr.io
      • +
      • doctr.documents
      • doctr.models
      • doctr.transforms
      • doctr.utils
      • @@ -282,11 +282,11 @@

        Index

        A

        @@ -296,7 +296,7 @@

        A

        B

        @@ -306,8 +306,6 @@

        B

        C

        + + + + + + + + +
          -
        • CharacterGenerator (class in doctr.datasets) -
        • ColorInversion (class in doctr.transforms)
        • Compose (class in doctr.transforms) @@ -319,8 +317,6 @@

          C

        • convert_to_tflite() (in module doctr.models.export)
        • CORD (class in doctr.datasets) -
        • -
        • crnn_mobilenet_v3_large() (in module doctr.models.recognition)
        • crnn_vgg16_bn() (in module doctr.models.recognition)
        • @@ -333,20 +329,16 @@

          D

          @@ -366,13 +358,13 @@

          E

          F

          @@ -574,7 +562,7 @@

          V

          W

          @@ -612,7 +600,7 @@

          W

          - + diff --git a/v0.3.1/index.html b/v0.3.1/index.html index 2f4ec32286..b7be51df96 100644 --- a/v0.3.1/index.html +++ b/v0.3.1/index.html @@ -237,7 +237,7 @@

          Package Reference

          • doctr.datasets
          • -
          • doctr.io
          • +
          • doctr.documents
          • doctr.models
          • doctr.transforms
          • doctr.utils
          • @@ -357,7 +357,6 @@

            Supported datasetsNotes

            • Changelog
            • -
            • doctr.io
                -
              • Document structure
              • -
              • File reading
              • +
              • doctr.documents
              • doctr.models
                  @@ -473,7 +472,7 @@

                  Supported datasets + diff --git a/v0.3.1/installing.html b/v0.3.1/installing.html index 34b1354ec1..8068adc0ba 100644 --- a/v0.3.1/installing.html +++ b/v0.3.1/installing.html @@ -237,7 +237,7 @@

                  Package Reference

                  • doctr.datasets
                  • -
                  • doctr.io
                  • +
                  • doctr.documents
                  • doctr.models
                  • doctr.transforms
                  • doctr.utils
                  • @@ -304,13 +304,6 @@

                    Via Python Package
                    pip install python-doctr
                     
                    -

                    We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows:

                    -
                    # for TensorFlow
                    -pip install python-doctr[tf]
                    -# for PyTorch
                    -pip install python-doctr[torch]
                    -
                    -

                    Via Git

                    @@ -319,15 +312,6 @@

                    Via Git¶ pip install -e doctr/. -

                    Again, for framework-specific builds: -.. code:: bash

                    -
                    -

                    git clone https://github.com/mindee/doctr.git -# for TensorFlow -pip install -e doctr/.[tf] -# for PyTorch -pip install -e doctr/.[torch]

                    -

                    @@ -401,7 +385,7 @@

Via Git¶ - + diff --git a/v0.3.1/io.html b/v0.3.1/io.html deleted file mode 100644 index 1433bef6c7..0000000000 --- a/v0.3.1/io.html +++ /dev/null @@ -1,809 +0,0 @@ - - - doctr.io - docTR documentation -
                    -
                    -
                    - -
                    - -
                    -
                    - -
                    - -
                    -
                    - -
                    -
                    -
-
                    - -
                    - -
                    - -
                    -
                    -
                    -

                    doctr.io

                    -

                    The io module enables users to easily access content from documents and export analysis -results to structured formats.

                    -
                    -

                    Document structure

                    -

                    Structural organization of the documents.

                    -
                    -

                    Word

                    -

                    A Word is an uninterrupted sequence of characters.

                    -
                    -
                    -class doctr.io.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
                    -

                    Implements a word element

                    -
                    -
                    Parameters:
                    -
                      -
                    • value – the text string of the word

                    • -
                    • confidence – the confidence associated with the text prediction

                    • -
• geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page's size

                    • -
                    -
                    -
                    -
                    - -
                    -
                    -

                    Line

                    -

                    A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines).

                    -
                    -
                    -class doctr.io.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
                    -

                    Implements a line element as a collection of words

                    -
                    -
                    Parameters:
                    -
                      -
                    • words – list of word elements

                    • -
                    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

                    • -
                    -
                    -
                    -
                    - -
                    -
                    -

                    Artefact

                    -

                    An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

                    -
                    -
                    -class doctr.io.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
                    -

                    Implements a non-textual element

                    -
                    -
                    Parameters:
                    -
                      -
                    • artefact_type – the type of artefact

                    • -
                    • confidence – the confidence of the type prediction

                    • -
                    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

                    • -
                    -
                    -
                    -
                    - -
                    -
                    -

                    Block

                    -

                    A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

                    -
                    -
                    -class doctr.io.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
                    -

                    Implements a block element as a collection of lines and artefacts

                    -
                    -
                    Parameters:
                    -
                      -
                    • lines – list of line elements

                    • -
                    • artefacts – list of artefacts

                    • -
                    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

                    • -
                    -
                    -
                    -
                    - -
                    -
                    -

                    Page

                    -

                    A Page is a collection of Blocks that were on the same physical page.

                    -
                    -
                    -class doctr.io.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
                    -

                    Implements a page element as a collection of blocks

                    -
                    -
                    Parameters:
                    -
                      -
                    • blocks – list of block elements

                    • -
                    • page_idx – the index of the page in the input raw document

                    • -
                    • dimensions – the page size in pixels in format (width, height)

                    • -
• orientation – a dictionary with the value of the rotation angle in degrees and confidence of the prediction

                    • -
                    • language – a dictionary with the language value and confidence of the prediction

                    • -
                    -
                    -
                    -
                    -
                    -show(page: ndarray, interactive: bool = True, **kwargs) None[source]
                    -

                    Overlay the result on a given image

                    -
                    -
                    Parameters:
                    -
                      -
                    • page – image encoded as a numpy array in uint8

                    • -
                    • interactive – whether the display should be interactive

                    • -
                    -
                    -
                    -
                    - -
                    - -
                    -
                    -

                    Document

                    -

                    A Document is a collection of Pages.

                    -
                    -
                    -class doctr.io.Document(pages: List[Page])[source]
                    -

                    Implements a document element as a collection of pages

                    -
                    -
                    Parameters:
                    -

                    pages – list of page elements

                    -
                    -
                    -
                    -
                    -show(pages: List[ndarray], **kwargs) None[source]
                    -

                    Overlay the result on a given image

                    -
                    -
                    Parameters:
                    -

                    pages – list of images encoded as numpy arrays in uint8

                    -
                    -
                    -
                    - -
                    - -
                    -
                    -
                    -
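Putting the five element classes documented above together, the nested structure can also be built by hand; a hedged sketch with made-up values and relative coordinates (in the release this diff targets, the same classes are importable from doctr.documents):

    from doctr.documents import Word, Line, Block, Page, Document

    word = Word("RECEIPT", 0.99, ((0.14, 0.04), (0.51, 0.16)))   # value, confidence, relative box
    line = Line([word])                                           # geometry resolved from its words
    block = Block(lines=[line])
    page = Page([block], page_idx=0, dimensions=(340, 600))
    doc = Document([page])
    # doc.show(page_images) would overlay the result on the original page images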

                    File reading

                    -

                    High-performance file reading and conversion to processable structured data.

                    -
                    -
                    -doctr.io.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
                    -

                    Read a PDF file and convert it into an image in numpy format

                    -
                    -
                    Example::
                    >>> from doctr.documents import read_pdf
                    ->>> doc = read_pdf("path/to/your/doc.pdf")
                    -
                    -
                    -
                    -
                    -
                    -
                    Parameters:
                    -

                    file – the path to the PDF file

                    -
                    -
                    Returns:
                    -

                    the list of pages decoded as numpy ndarray of shape H x W x 3

                    -
                    -
                    -
                    - -
                    -
                    -doctr.io.read_img_as_numpy(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
                    -

                    Read an image file into numpy format

                    -
                    -
                    Example::
                    >>> from doctr.documents import read_img
                    ->>> page = read_img("path/to/your/doc.jpg")
                    -
                    -
                    -
                    -
                    -
                    -
                    Parameters:
                    -
                      -
                    • file – the path to the image file

                    • -
                    • output_size – the expected output size of each page in format H x W

                    • -
                    • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

                    • -
                    -
                    -
                    Returns:
                    -

                    the page decoded as numpy ndarray of shape H x W x 3

                    -
                    -
                    -
                    - -
                    -
                    -doctr.io.read_img_as_tensor(img_path: str | Path, dtype: DType = tf.float32) Tensor[source]
                    -

                    Read an image file as a TensorFlow tensor

                    -
                    -
                    Parameters:
                    -
                      -
                    • img_path – location of the image file

                    • -
                    • dtype – the desired data type of the output tensor. If it is float-related, values will be divided by 255.

                    • -
                    -
                    -
                    Returns:
                    -

                    decoded image as a tensor

                    -
                    -
                    -
                    - -
                    -
                    -doctr.io.decode_img_as_tensor(img_content: bytes, dtype: DType = tf.float32) Tensor[source]
                    -

                    Read a byte stream as a TensorFlow tensor

                    -
                    -
                    Parameters:
                    -
                      -
                    • img_content – bytes of a decoded image

                    • -
                    • dtype – the desired data type of the output tensor. If it is float-related, values will be divided by 255.

                    • -
                    -
                    -
                    Returns:
                    -

                    decoded image as a tensor

                    -
                    -
                    -
                    - -
                    -
                    -doctr.io.read_html(url: str, **kwargs: Any) bytes[source]
                    -

Read a web page and convert it into a PDF byte stream

                    -
                    -
                    Example::
                    >>> from doctr.documents import read_html
                    ->>> doc = read_html("https://www.yoursite.com")
                    -
                    -
                    -
                    -
                    -
                    -
                    Parameters:
                    -

                    url – URL of the target web page

                    -
                    -
                    Returns:
                    -

                    decoded PDF file as a bytes stream

                    -
                    -
                    -
                    - -
                    -
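A hedged sketch tying the readers above together, using the import path shown in their own examples (all paths and URLs are placeholders):

    from doctr.documents import read_img, read_pdf, read_html

    page = read_img("path/to/your/doc.jpg")              # page decoded as an H x W x 3 numpy array
    pdf_doc = read_pdf("path/to/your/doc.pdf")           # see the read_pdf entry above for the return type
    pdf_bytes = read_html("https://www.yoursite.com")    # web page rendered as a PDF byte stream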
                    -class doctr.io.DocumentFile[source]
                    -

                    Read a document from multiple extensions

                    -
                    -
                    -classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
                    -

                    Read a PDF file

                    -
                    -
                    Example::
                    >>> from doctr.documents import DocumentFile
                    ->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
                    -
                    -
                    -
                    -
                    -
                    -
                    Parameters:
                    -

                    file – the path to the PDF file or a binary stream

                    -
                    -
                    Returns:
                    -

                    a PDF document

                    -
                    -
                    -
                    - -
                    -
                    -classmethod from_url(url: str, **kwargs) PDF[source]
                    -

                    Interpret a web page as a PDF document

                    -
                    -
                    Example::
                    >>> from doctr.documents import DocumentFile
                    ->>> doc = DocumentFile.from_url("https://www.yoursite.com")
                    -
                    -
                    -
                    -
                    -
                    -
                    Parameters:
                    -

                    url – the URL of the target web page

                    -
                    -
                    Returns:
                    -

                    a PDF document

                    -
                    -
                    -
                    - -
                    -
                    -classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
                    -

                    Read an image file (or a collection of image files) and convert it into an image in numpy format

                    -
                    -
                    Example::
                    >>> from doctr.documents import DocumentFile
                    ->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
                    -
                    -
                    -
                    -
                    -
                    -
                    Parameters:
                    -

                    files – the path to the image file or a binary stream, or a collection of those

                    -
                    -
                    Returns:
                    -

                    the list of pages decoded as numpy ndarray of shape H x W x 3

                    -
                    -
                    -
                    - -
                    - -
                    -
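The three constructors above cover the common entry points; a hedged sketch with placeholder paths and URL:

    from doctr.documents import DocumentFile

    pdf = DocumentFile.from_pdf("path/to/your/doc.pdf")      # returns a PDF object
    web = DocumentFile.from_url("https://www.yoursite.com")  # web page interpreted as a PDF
    pages = DocumentFile.from_images(["path/to/page1.png", "path/to/page2.png"])  # list of numpy arrays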
                    -class doctr.io.PDF(doc: Document)[source]
                    -

                    PDF document template

                    -
                    -
                    Parameters:
                    -

                    doc – input PDF document

                    -
                    -
                    -
                    -
                    -as_images(**kwargs) List[ndarray][source]
                    -

                    Convert all document pages to images

                    -
                    -
                    Example::
                    >>> from doctr.documents import DocumentFile
                    ->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
                    -
                    -
                    -
                    -
                    -
                    -
                    Parameters:
                    -

                    kwargs – keyword arguments of convert_page_to_numpy

                    -
                    -
                    Returns:
                    -

                    the list of pages decoded as numpy ndarray of shape H x W x 3

                    -
                    -
                    -
                    - -
                    -
                    -get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
                    -

                    Get the annotations for all words in the document

                    -
                    -
                    Example::
                    >>> from doctr.documents import DocumentFile
                    ->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
                    -
                    -
                    -
                    -
                    -
                    -
                    Parameters:
                    -

                    kwargs – keyword arguments of fitz.Page.getTextWords

                    -
                    -
                    Returns:
                    -

                    the list of pages annotations, represented as a list of tuple (bounding box, value)

                    -
                    -
                    -
                    - -
                    -
                    -get_artefacts() List[List[Tuple[float, float, float, float]]][source]
                    -

                    Get the artefacts for the entire document

                    -
                    -
                    Example::
                    >>> from doctr.documents import DocumentFile
                    ->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
                    -
                    -
                    -
                    -
                    -
                    -
                    Returns:
                    -

                    the list of pages artefacts, represented as a list of bounding boxes

                    -
                    -
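And a hedged sketch of the PDF helpers documented above (placeholder path):

    from doctr.documents import DocumentFile

    pdf = DocumentFile.from_pdf("path/to/your/doc.pdf")
    images = pdf.as_images()         # one H x W x 3 numpy array per page
    words = pdf.get_words()          # per page: list of (bounding box, value) tuples
    artefacts = pdf.get_artefacts()  # per page: list of bounding boxes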
                    -
                    - -
                    - -
                    -
                    - -
                    -
                    - -
                    - -
                    -
                    - - - - - - - - \ No newline at end of file diff --git a/v0.3.1/models.html b/v0.3.1/models.html index e2fcab9620..270664068f 100644 --- a/v0.3.1/models.html +++ b/v0.3.1/models.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.models - docTR documentation @@ -237,7 +237,7 @@

                    Package Reference

                    • doctr.datasets
                    • -
                    • doctr.io
                    • +
                    • doctr.documents
                    • doctr.models
                    • doctr.transforms
                    • doctr.utils
                    • @@ -365,30 +365,6 @@

                      Detection models -
                      -doctr.models.detection.db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) DBNet[source]
                      -

                      DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a mobilenet v3 large backbone.

                      -
                      -
                      Example::
                      >>> import tensorflow as tf
                      ->>> from doctr.models import db_mobilenet_v3_large
                      ->>> model = db_mobilenet_v3_large(pretrained=True)
                      ->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
                      ->>> out = model(input_tensor)
                      -
                      -
                      -
                      -
                      -
                      -
                      Parameters:
                      -

                      pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

                      -
                      -
                      Returns:
                      -

                      text detection architecture

                      -
                      -
                      -
                      -
                      doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
                      @@ -449,9 +425,9 @@

                      Detection predictors

                      Text Recognition

                      Identifying strings in images

                      Text recognition model zoo

                      @@ -473,29 +449,22 @@

                      Recognition models

                      -doctr.models.recognition.crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) → CRNN   [source]
                      -
                      -    CRNN with a MobileNet V3 Large backbone as described in “An End-to-End Trainable Neural Network for Image-based
                      -    Sequence Recognition and Its Application to Scene Text Recognition”.
                      -
                      -    Example::
                      -        >>> import tensorflow as tf
                      -        >>> from doctr.models import crnn_mobilenet_v3_large
                      -        >>> model = crnn_mobilenet_v3_large(pretrained=True)
                      -        >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
                      -        >>> out = model(input_tensor)
                      -
                      -    Parameters:
                      -        pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset
                      -
                      -    Returns:
                      -        text recognition architecture
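
                      As with detection, recognition architectures are normally consumed through the recognition_predictor factory documented further down this page. A minimal sketch using the crnn_vgg16_bn architecture also listed here (the crop is random placeholder data, and the exact return format of the predictor may vary between releases):

                          import numpy as np

                          from doctr.models import recognition_predictor

                          reco_predictor = recognition_predictor('crnn_vgg16_bn', pretrained=True)

                          # A word crop is a plain numpy image; a random one stands in for a real crop
                          crop = (np.random.rand(32, 128, 3) * 255).astype(np.uint8)
                          out = reco_predictor([crop])  # one decoded word per input crop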
                      doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) SAR[source]
                      @@ -694,22 +638,14 @@

                      End-to-End OCR

                      Receipts

                      Export model output

                      -The ocr_predictor returns a Document object with a nested structure (with Page, Block, Line, Word, Artefact).
                      -To get a better understanding of our document model, check our Document structure section.

                      -Here is a typical Document layout:

                      -Document(
                      -  (pages): [Page(
                      -    dimensions=(340, 600)
                      -    (blocks): [Block(
                      -      (lines): [Line(
                      -        (words): [
                      -          Word(value='No.', confidence=0.91),
                      -          Word(value='RECEIPT', confidence=0.99),
                      -          Word(value='DATE', confidence=0.96),
                      -        ]
                      -      )]
                      -      (artefacts): []
                      -    )]
                      -  )]
                      -)
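
                      Every level of that layout is reachable as a plain attribute on the returned object. A minimal sketch of pulling the word values and confidences back out (file path and architecture choices are placeholders; attribute names follow the repr above):

                          from doctr.documents import DocumentFile
                          from doctr.models import ocr_predictor

                          # Assumed setup: rasterize a PDF and run the end-to-end predictor on it
                          pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
                          predictor = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
                          result = predictor(pages)

                          for page in result.pages:
                              for block in page.blocks:
                                  for line in block.lines:
                                      for word in line.words:
                                          print(word.value, word.confidence)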

                      -You can also export them as a nested dict, more appropriate for JSON format:

                      -json_output = result.export()

                      -For reference, here is the JSON export for the same Document as above:

                      -{
                      -  'pages': [
                      -      {
                      -          'page_idx': 0,
                      -          'dimensions': (340, 600),
                      -          'orientation': {'value': None, 'confidence': None},
                      -          'language': {'value': None, 'confidence': None},
                      -          'blocks': [
                      -              {
                      -                  'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
                      -                  'lines': [
                      -                      {
                      -                          'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
                      -                          'words': [
                      -                              {
                      -                                  'value': 'No.',
                      -                                  'confidence': 0.914085328578949,
                      -                                  'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875))
                      -                              },
                      -                              {
                      -                                  'value': 'RECEIPT',
                      -                                  'confidence': 0.9949972033500671,
                      -                                  'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375))
                      -                              },
                      -                              {
                      -                                  'value': 'DATE',
                      -                                  'confidence': 0.9578408598899841,
                      -                                  'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625))
                      -                              }
                      -                          ]
                      -                      }
                      -                  ],
                      -                  'artefacts': []
                      -              }
                      -          ]
                      -      }
                      -  ]
                      -}
                      -
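
                      Since the export is made of plain dicts, lists and tuples, it can be written straight to disk. A minimal sketch, continuing from the json_output = result.export() call above (the output filename is a placeholder):

                          import json

                          with open("ocr_output.json", "w") as f:
                              json.dump(json_output, f, indent=2)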
                      -
                      @@ -1055,14 +894,14 @@

                      Previous
                      -doctr.io
                      +doctr.documents

                    • Pre-processing for detection
                    • Detection models
                    • Pre-processing for recognition
                    • Recognition models
                    -• Export model output
                    • Model export
diff --git a/v0.3.1/objects.inv b/v0.3.1/objects.inv
index 40c317b3cb..a22d2ce821 100644
Binary files a/v0.3.1/objects.inv and b/v0.3.1/objects.inv differ
diff --git a/v0.3.1/search.html b/v0.3.1/search.html
index f2a845e56a..fea94ac955 100644
--- a/v0.3.1/search.html
+++ b/v0.3.1/search.html
@@ -237,7 +237,7 @@

                        Package Reference

                        • doctr.datasets
                        -• doctr.io
                        +• doctr.documents
                        • doctr.models
                        • doctr.transforms
                        • doctr.utils
                        • @@ -318,7 +318,7 @@ - + diff --git a/v0.3.1/searchindex.js b/v0.3.1/searchindex.js index 2ae0bac7b1..231483d7a6 100644 --- a/v0.3.1/searchindex.js +++ b/v0.3.1/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Artefact": [[4, "artefact"]], "Available Datasets": [[1, "available-datasets"]], "Block": [[4, "block"]], "Build & train your predictor": [[2, "build-train-your-predictor"]], "Changelog": [[0, null]], "Composing transformations": [[6, "composing-transformations"]], "Data Loading": [[1, "data-loading"]], "Detection models": [[5, "detection-models"]], "Detection predictors": [[5, "detection-predictors"]], "DocTR Vocabs": [[1, "id1"]], "DocTR: Document Text Recognition": [[2, null]], "Document": [[4, "document"]], "Document structure": [[4, "document-structure"]], "End-to-End OCR": [[5, "end-to-end-ocr"]], "Export model output": [[5, "export-model-output"]], "File reading": [[4, "file-reading"]], "Getting Started": [[2, "getting-started"]], "Installation": [[3, null]], "Line": [[4, "line"]], "Main Features": [[2, "main-features"]], "Model compression": [[5, "model-compression"]], "Model export": [[5, "model-export"]], "Model zoo": [[2, "model-zoo"]], "Notes": [[2, null]], "Package Reference": [[2, null]], "Page": [[4, "page"]], "Pre-processing for detection": [[5, "pre-processing-for-detection"]], "Pre-processing for recognition": [[5, "pre-processing-for-recognition"]], "Prerequisites": [[3, "prerequisites"]], "Recognition models": [[5, "recognition-models"]], "Recognition predictors": [[5, "recognition-predictors"]], "Supported Vocabs": [[1, "supported-vocabs"]], "Supported datasets": [[2, "supported-datasets"]], "Supported transformations": [[6, "supported-transformations"]], "Task evaluation": [[7, "task-evaluation"]], "Text Detection": [[5, "text-detection"]], "Text Recognition": [[5, "text-recognition"]], "Text detection models": [[2, "text-detection-models"]], "Text recognition model zoo": [[5, "id4"]], "Text recognition models": [[2, "text-recognition-models"]], "Two-stage approaches": [[5, "two-stage-approaches"]], "Using SavedModel": [[5, "using-savedmodel"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[7, "visualization"]], "Word": [[4, "word"]], "doctr.datasets": [[1, null]], "doctr.io": [[4, null]], "doctr.models": [[5, null]], "doctr.transforms": [[6, null]], "doctr.utils": [[7, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]]}, "docnames": ["changelog", "datasets", "index", "installing", "io", "models", "transforms", "utils"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "datasets.rst", "index.rst", "installing.rst", "io.rst", "models.rst", "transforms.rst", "utils.rst"], "indexentries": {"artefact (class in doctr.io)": [[4, "doctr.io.Artefact", false]], "as_images() (doctr.io.pdf method)": [[4, "doctr.io.PDF.as_images", false]], "block (class in doctr.io)": [[4, "doctr.io.Block", false]], "charactergenerator (class in doctr.datasets)": 
[[1, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[6, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[6, "doctr.transforms.Compose", false]], "convert_to_fp16() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_fp16", false]], "convert_to_tflite() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_tflite", false]], "cord (class in doctr.datasets)": [[1, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_vgg16_bn", false]], "dataloader (class in doctr.datasets.loader)": [[1, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[5, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[5, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[4, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[5, "doctr.models.detection.detection_predictor", false]], "document (class in doctr.io)": [[4, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[4, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[1, "doctr.datasets.encode_sequences", false]], "from_images() (doctr.io.documentfile class method)": [[4, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[4, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[4, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[1, "doctr.datasets.FUNSD", false]], "get_artefacts() (doctr.io.pdf method)": [[4, "doctr.io.PDF.get_artefacts", false]], "get_words() (doctr.io.pdf method)": [[4, "doctr.io.PDF.get_words", false]], "lambdatransformation (class in doctr.transforms)": [[6, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[4, "doctr.io.Line", false]], "linknet16() (in module doctr.models.detection)": [[5, "doctr.models.detection.linknet16", false]], "localizationconfusion (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.LocalizationConfusion", false]], "master() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.master", false]], "normalize (class in doctr.transforms)": [[6, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models.zoo)": [[5, "doctr.models.zoo.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[1, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[6, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[4, "doctr.io.Page", false]], "pdf (class in doctr.io)": [[4, "doctr.io.PDF", false]], "quantize_model() (in module doctr.models.export)": [[5, "doctr.models.export.quantize_model", false]], "randomapply (class in doctr.transforms)": [[6, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[6, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[6, "doctr.transforms.RandomContrast", false]], "randomgamma (class in doctr.transforms)": [[6, 
"doctr.transforms.RandomGamma", false]], "randomhue (class in doctr.transforms)": [[6, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[6, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[6, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[6, "doctr.transforms.RandomSaturation", false]], "read_html() (in module doctr.io)": [[4, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[4, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[4, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[4, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.recognition_predictor", false]], "resize (class in doctr.transforms)": [[6, "doctr.transforms.Resize", false]], "sar_resnet31() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_resnet31", false]], "sar_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_vgg16_bn", false]], "show() (doctr.io.document method)": [[4, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[4, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[1, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[7, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[7, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[7, "doctr.utils.metrics.TextMatch.summary", false]], "textmatch (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[6, "doctr.transforms.ToGray", false]], "visiondataset (class in doctr.datasets.datasets)": [[1, "doctr.datasets.datasets.VisionDataset", false]], "visualize_page() (in module doctr.utils.visualization)": [[7, "doctr.utils.visualization.visualize_page", false]], "word (class in doctr.io)": [[4, "doctr.io.Word", false]]}, "objects": {"doctr.datasets": [[1, 0, 1, "", "CORD"], [1, 0, 1, "", "CharacterGenerator"], [1, 0, 1, "", "FUNSD"], [1, 0, 1, "", "OCRDataset"], [1, 0, 1, "", "SROIE"], [1, 1, 1, "", "encode_sequences"]], "doctr.datasets.datasets": [[1, 0, 1, "", "VisionDataset"]], "doctr.datasets.loader": [[1, 0, 1, "", "DataLoader"]], "doctr.io": [[4, 0, 1, "", "Artefact"], [4, 0, 1, "", "Block"], [4, 0, 1, "", "Document"], [4, 0, 1, "", "DocumentFile"], [4, 0, 1, "", "Line"], [4, 0, 1, "", "PDF"], [4, 0, 1, "", "Page"], [4, 0, 1, "", "Word"], [4, 1, 1, "", "decode_img_as_tensor"], [4, 1, 1, "", "read_html"], [4, 1, 1, "", "read_img_as_numpy"], [4, 1, 1, "", "read_img_as_tensor"], [4, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[4, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[4, 2, 1, "", "from_images"], [4, 2, 1, "", "from_pdf"], [4, 2, 1, "", "from_url"]], "doctr.io.PDF": [[4, 2, 1, "", "as_images"], [4, 2, 1, "", "get_artefacts"], [4, 2, 1, "", "get_words"]], "doctr.io.Page": [[4, 2, 1, "", "show"]], "doctr.models.detection": [[5, 1, 1, "", "db_mobilenet_v3_large"], [5, 1, 1, "", "db_resnet50"], [5, 1, 1, "", "detection_predictor"], [5, 1, 1, "", "linknet16"]], "doctr.models.export": [[5, 1, 1, "", "convert_to_fp16"], [5, 1, 1, "", "convert_to_tflite"], [5, 1, 1, "", "quantize_model"]], "doctr.models.recognition": [[5, 1, 1, "", "crnn_mobilenet_v3_large"], [5, 
1, 1, "", "crnn_vgg16_bn"], [5, 1, 1, "", "master"], [5, 1, 1, "", "recognition_predictor"], [5, 1, 1, "", "sar_resnet31"], [5, 1, 1, "", "sar_vgg16_bn"]], "doctr.models.zoo": [[5, 1, 1, "", "ocr_predictor"]], "doctr.transforms": [[6, 0, 1, "", "ColorInversion"], [6, 0, 1, "", "Compose"], [6, 0, 1, "", "LambdaTransformation"], [6, 0, 1, "", "Normalize"], [6, 0, 1, "", "OneOf"], [6, 0, 1, "", "RandomApply"], [6, 0, 1, "", "RandomBrightness"], [6, 0, 1, "", "RandomContrast"], [6, 0, 1, "", "RandomGamma"], [6, 0, 1, "", "RandomHue"], [6, 0, 1, "", "RandomJpegQuality"], [6, 0, 1, "", "RandomRotate"], [6, 0, 1, "", "RandomSaturation"], [6, 0, 1, "", "Resize"], [6, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[7, 0, 1, "", "LocalizationConfusion"], [7, 0, 1, "", "OCRMetric"], [7, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.LocalizationConfusion": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.OCRMetric": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.TextMatch": [[7, 2, 1, "", "summary"]], "doctr.utils.visualization": [[7, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [4, 7], "0": [1, 2, 5, 6, 7], "00": 5, "01": [], "0123456789": 1, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "02": [2, 5], "02562": 5, "03": [2, 5], "035": [], "0361328125": 5, "04": [], "05": 2, "06": [], "06640625": 5, "07": [2, 5], "08": [], "09": [], "0966796875": 5, "1": [1, 2, 5, 6, 7], "10": [1, 5, 7], "100": [5, 6, 7], "1000": 5, "101": [], "1024": [5, 7], "104": [], "106": [], "108": [], "1095": [], "11": [2, 5], "110": 7, "1107": [], "114": [], "115": [], "1156": [], "116": [], "118": [], "11800h": [], "11th": [], "12": 5, "120": [], "123": [], "126": [], "1268": [], "128": 5, "13": 5, "130": [], "13068": [], "131": [], "1337891": [], "1357421875": 5, "1396484375": 5, "14": 5, "1420": [], "14470v1": [], "149": [], "15": 5, "150": 7, "154": 1, "1552": [], "16": 5, "160": 5, "1630859375": 5, "1684": [], "16x16": [], "17": 5, "1778": [], "1782": [], "18": 2, "185546875": 5, "19": [], "1900": [], "1910": 5, "19342": [], "19370": [], "195": [], "19598": [], "199": 5, "1999": [], "1m": 5, "2": [2, 5, 6], "20": 5, "200": 7, "2000": [], "2003": [], "2012": [], "2013": [], "2015": [], "2019": 2, "2021": 2, "207901": [], "21": 5, "2103": [], "2186": [], "21888": [], "22": [], "224": [5, 6], "225": 6, "22672": [], "229": 6, "23": 5, "233": [], "236": [], "24": [], "246": [], "249": [], "25": [5, 6], "2504": [], "255": [4, 5, 6, 7], "256": 5, "257": [], "26": 5, "26032": [], "264": [], "27": 5, "2700": [], "2710": [], "2749": [], "28": [2, 5], "287": [], "29": 5, "296": [], "299": [], "2d": [], "3": [2, 3, 4, 5, 6, 7], "30": 5, "300": [], "3000": [], "301": [], "30595": 5, "30ghz": [], "31": 5, "32": [1, 5, 6], "3232421875": 5, "33": [], "33402": [], "33608": [], "34": [], "340": 5, "3456": [], "35": 5, "3515625": 5, "36": [], "360": [], "37": [], "38": [], "39": 5, "4": [], "40": [], "406": 6, "41": 5, "42": 5, "43": 5, "44": [], "45": [], "456": 6, "46": 5, "47": 5, "472": [], "48": 5, "485": 6, "49": 5, "49377": [], "5": [1, 6, 7], "50": 5, "51": 5, "51171875": 5, "512": [], "52": [1, 5], "529": [], "53": 5, "533": [], "54": [], "540": [], "5478515625": 5, "55": [], "56": 5, "57": 5, "58": [], "580": [], "5810546875": 5, 
"583": [], "59": 5, "595": [], "597": [], "5k": [], "5m": 5, "6": [3, 5, 6], "60": 6, "600": [5, 7], "61": 5, "611": [], "62": 5, "625": [], "626": [], "629": [], "63": 5, "630": [], "64": [5, 6], "640": [], "641": [], "647": [], "65": 5, "66": 5, "660": [], "664": [], "666": [], "67": 5, "672": [], "68": 5, "689": [], "69": 5, "693": [], "694": [], "695": [], "6m": [], "7": 5, "70": [5, 7], "700": [], "701": [], "702": [], "707470": [], "71": 5, "7100000": [], "713": [], "7141797": [], "7149": [], "72": 5, "72dpi": [], "73": 5, "73257": [], "733": [], "74": [], "745": [], "75": 5, "753": [], "7581382": [], "76": 5, "77": 5, "772": [], "772875": [], "78": 5, "780": [], "781": [], "783": [], "785": [], "789": [], "79": 5, "793533": [], "796": [], "798": [], "7m": [], "8": [5, 6], "80": 5, "800": [5, 7], "81": 5, "817": [], "82": 5, "8275l": 5, "83": 5, "830": [], "84": 5, "849": [], "85": 5, "8564453125": 5, "857": [], "85875": [], "86": 5, "860": [], "8603515625": 5, "862": [], "863": [], "87": 5, "8707": [], "875": [], "88": [], "89": 5, "8m": 5, "9": [], "90": 5, "90k": [], "90kdict32px": [], "91": 5, "913": [], "914085328578949": 5, "917": [], "92": 5, "921": [], "93": 5, "94": 5, "95": 7, "9578408598899841": 5, "96": [1, 5], "97": [], "98": 5, "99": 5, "9949972033500671": 5, "A": [1, 2, 4, 5], "And": 5, "As": [], "Be": [], "Being": [], "By": [], "For": [3, 5], "If": [3, 4, 5], "In": [1, 5], "It": 6, "Its": 5, "No": 5, "Of": 1, "Or": [], "The": [1, 4, 5, 7], "Then": 5, "To": [3, 5], "_": [1, 5], "__call__": [], "_build": [], "_i": 7, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "abdef": 1, "abl": [], "about": 5, "abov": 5, "abstract": 1, "abstractdataset": [], "abus": [], "accent": [], "accept": [], "access": [1, 2, 4], "account": [], "accur": [], "accuraci": 7, "achiev": [], "act": [], "action": [], "activ": [], "ad": 6, "adapt": [], "add": [6, 7], "add_hook": [], "add_label": 7, "addit": [], "addition": 5, "address": 4, "adjust": 6, "advanc": [], "advantag": [], "advis": [], "aesthet": [], "affect": [], "after": [], "ag": [], "again": 3, "aggreg": [1, 7], "aggress": [], "align": 4, "all": [1, 2, 4, 5, 6, 7], "allow": [], "along": 5, "alreadi": [], "also": 5, "alwai": [], "an": [1, 2, 4, 5, 7], "analysi": [4, 5], "ancient_greek": [], "angl": [4, 6], "ani": [1, 2, 4, 5, 6, 7], "annot": 4, "anot": [], "anoth": [1, 3, 5], "answer": [], "anyascii": [], "anyon": 2, "anyth": [], "api": [], "apolog": [], "apologi": [], "app": [], "appear": [], "appli": [1, 6], "applic": 5, "appoint": [], "appreci": [], "appropri": 5, "ar": [1, 3, 4, 5, 6, 7], "arab": [], "arabic_diacrit": [], "arabic_lett": [], "arabic_punctu": [], "arbitrarili": [], "arch": 5, "architectur": [2, 5], "archiv": [], "area": [], "arg": 1, "argument": [1, 4], "around": 5, "arrai": [4, 7], "art": 2, "artefact": [5, 7], "artefact_typ": 4, "artifici": [], "arxiv": 5, "as_imag": 4, "asarrai": 7, "ascii_lett": 1, "aspect": [2, 6], "assess": 7, "assign": 7, "associ": 4, "assum": [], "assume_straight_pag": [], "astyp": [5, 7], "attack": [], "attend": [2, 5], "attent": [], "autoclass": [], "autom": 2, "automat": [], "autoregress": [], "avail": [2, 5, 6], "averag": [5, 6], "avoid": 3, "aw": [2, 5], "awar": [], "azur": [], "b": 7, "b_j": 7, "back": [], "backbon": 5, "backend": 5, "background": [], "bangla": [], "bar": [], "bar_cod": [], "base": 5, "baselin": 5, "bash": 3, "batch": [1, 5, 6], "batch_siz": 1, "bblanchon": [], "bbox": [], "becaus": [], "been": [5, 7], "befor": [1, 6], "begin": 7, "behavior": 
[], "being": [5, 7], "belong": [], "benchmark": [], "best": [], "beta": 2, "better": 5, "between": [6, 7], "bgr": 4, "bilinear": [5, 6], "bin_thresh": [], "binar": [2, 5], "binari": 4, "bit": [], "block": [5, 7], "block_1_1": [], "blur": [], "bmvc": [], "bn": [], "bodi": [], "bool": [1, 4, 5, 6, 7], "boolean": [], "both": [2, 5, 6], "bottom": [], "bound": [1, 4, 6, 7], "box": [1, 4, 7], "box_thresh": [], "brew": 3, "bright": 6, "browser": [], "build": 3, "built": [], "byte": [4, 5], "c": [], "c5": 5, "c_j": [], "cach": 1, "cache_sampl": 1, "cairo": 3, "call": [], "callabl": [1, 6], "can": [1, 3, 5], "capabl": 5, "case": [1, 7], "cf": 5, "cfg": [], "challeng": [], "challenge2_test_task12_imag": [], "challenge2_test_task1_gt": [], "challenge2_training_task12_imag": [], "challenge2_training_task1_gt": [], "chang": [], "changelog": 2, "channel": [4, 5, 6], "channel_prior": [], "channelshuffl": [], "charact": [1, 2, 4, 5, 7], "charactergener": 1, "characterist": [], "charg": 5, "charset": [], "chart": 4, "check": 5, "checkpoint": [], "chip": [], "ci": [], "clarifi": [], "clariti": [], "class": [1, 4, 6, 7], "class_nam": [], "classif": [], "classmethod": 4, "clear": [], "clone": 3, "close": [], "co": [], "code": [2, 3, 4], "codecov": [], "colab": [], "collate_fn": 1, "collect": 4, "color": 6, "colorinvers": 6, "column": 4, "com": [3, 4], "combin": 5, "command": [], "comment": [], "commit": [], "common": [6, 7], "commun": [], "compar": 2, "comparison": 7, "competit": 1, "compil": [], "complaint": [], "complementari": 7, "complet": [], "compon": 5, "compos": [1, 2, 5], "comprehens": [], "comput": [5, 7], "conf_threshold": [], "confid": [4, 5], "config": [], "configur": [], "confus": 7, "consecut": [5, 6], "consequ": [], "consid": [1, 4, 7], "consist": [], "consolid": [1, 2], "constant": 6, "construct": [], "contact": [], "contain": [], "content": [1, 4], "context": [], "contib": [], "continu": [], "contrast": 6, "contrast_factor": 6, "contrib": [], "contribut": [], "contributor": [], "conv_sequ": 5, "convers": 4, "convert": [4, 5, 6], "convert_page_to_numpi": 4, "convert_to_fp16": 5, "convert_to_tflit": 5, "convolut": 2, "coordin": 4, "cord": [1, 2, 5], "core": 7, "corner": [], "correct": 6, "correspond": [3, 5], "could": [], "counterpart": 7, "cover": [], "coverag": [], "cpu": [2, 5], "creat": [], "crnn": [2, 5], "crnn_mobilenet_v3_larg": 5, "crnn_mobilenet_v3_smal": [], "crnn_resnet31": 5, "crnn_vgg16_bn": 5, "crop": 5, "crop_orient": [], "crop_orientation_predictor": [], "crop_param": [], "cuda": [], "currenc": 1, "current": [], "custom": [], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": [], "czczup": [], "czech": [], "d": 1, "daili": 2, "danish": [], "data": [2, 4, 5, 6, 7], "dataload": 1, "dataset": 5, "dataset_info": [], "date": 5, "db": [], "db_crnn_resnet": 5, "db_crnn_vgg": 5, "db_mobilenet_v3_larg": 5, "db_resnet34": [], "db_resnet50": 5, "db_sar_resnet": 5, "db_sar_vgg": 5, "dbnet": [2, 5], "deal": [], "decis": [], "decod": 4, "decode_img_as_tensor": 4, "dedic": [], "deem": [], "deep": 5, "def": [], "default": [4, 5], "defer": 1, "defin": 7, "deform": 5, "degre": 6, "degress": 4, "delet": [], "delimit": [], "delta": 6, "demo": [], "demonstr": [], "depend": [2, 3], "deploi": [], "deploy": [], "derogatori": [], "describ": 5, "descript": [], "design": 6, "desir": 4, "det_arch": 5, "det_b": [], "det_model": [], "det_param": [], "det_predictor": [], "detail": [], "detect": [], "detect_languag": [], "detect_orient": [], 
"detection_predictor": 5, "detection_task": [], "detectiondataset": [], "detectionmetr": [], "detectionpredictor": 5, "detector": [], "deterior": [], "determin": [], "dev": [], "develop": 3, "developp": 3, "deviat": 6, "devic": [], "dict": [4, 5, 7], "dictionari": [4, 7], "differ": [], "differenti": [2, 5], "digit": 1, "dimens": [4, 5, 7], "dimension": 6, "direct": [], "directli": 5, "directori": [], "disabl": [], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 5, "discuss": [], "disk": [], "disparag": [], "displai": [4, 7], "display_artefact": 7, "distanc": [], "distribut": 6, "div": [], "divers": [], "divid": 4, "do": 3, "doc": [4, 5], "docartefact": [], "docstr": [], "doctr": 3, "doctr_cache_dir": [], "doctr_multiprocessing_dis": [], "document": [1, 5, 7], "documentbuild": [], "documentfil": 4, "doesn": [], "don": [], "done": 6, "download": 1, "downsiz": [], "draw": 6, "drop": 1, "drop_last": 1, "dtype": [4, 5], "dual": [], "dummi": [], "dummy_img": [], "dummy_input": [], "dure": [], "dutch": [], "dynam": 1, "dynamic_seq_length": 1, "e": [3, 4], "each": [1, 2, 4, 5, 6, 7], "eas": [], "easi": [2, 7], "easier": 5, "easili": [4, 5, 7], "econom": [], "edit": [], "educ": [], "effect": [], "effici": [1, 5], "either": 5, "element": [1, 4, 5], "els": [], "email": [], "empathi": [], "en": [], "enabl": [1, 4], "enclos": 4, "encod": [1, 4, 5], "encode_sequ": 1, "encount": [], "encrypt": [], "end": [1, 2, 7], "english": [], "enough": 5, "ensur": [], "entir": 4, "entri": [], "environ": [], "eo": 1, "equiv": [], "error": [], "estim": [], "etc": 4, "ethnic": [], "evalu": [1, 2, 5], "event": [], "everyon": [], "everyth": [], "exact": 7, "exactmatch": [], "exampl": [1, 4, 5, 6, 7], "exchang": [], "exclud": 5, "execut": [], "exist": [], "expand": 6, "expect": [4, 5, 6], "experi": 5, "explan": 5, "explicit": [], "exploit": 5, "export": [2, 4, 7], "export_as_straight_box": [], "export_as_xml": [], "export_model_to_onnx": [], "express": 6, "extens": 4, "extern": [], "extra": 3, "extract": [1, 2], "extract_arch": 1, "extractor": 5, "f_": 7, "f_a": 7, "factor": 6, "fair": [], "fairli": [], "fals": [1, 5, 6, 7], "faq": [], "fascan": [], "fast": 1, "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": [], "favorit": [], "featur": [3, 5, 7], "feed": 5, "feedback": [], "feel": [], "felix92": [], "few": 3, "figsiz": 7, "figur": 7, "file": [1, 2], "file_hash": 1, "file_nam": 1, "final": [], "find": 3, "fine": 2, "finnish": [], "first": [], "firsthand": 1, "fit": [], "fitz": 4, "flag": [], "flexibl": 7, "flip": [], "float": [4, 6, 7], "float32": [4, 5], "fn": 6, "focu": [], "focus": [], "folder": [1, 5], "follow": [1, 3, 5, 6, 7], "font": [], "font_famili": [], "foral": 7, "forc": [], "forg": [], "form": [1, 2, 5], "format": [4, 5], "forpost": [1, 2], "forum": [], "fp": 5, "fp16": [1, 5], "frac": 7, "frame": 5, "framework": [1, 3], "free": [], "french": [1, 5], "friendli": 2, "from": [1, 2, 4, 5, 6, 7], "from_hub": [], "from_imag": 4, "from_pdf": 4, "from_url": 4, "full": [1, 5, 7], "fulli": [], "function": [5, 6, 7], "funsd": [1, 2, 5], "further": [], "futur": [], "g": 4, "g_": 7, "g_x": 7, "gamma": 6, "gaussian": 6, "gaussianblur": [], "gaussiannois": [], "gdk": 3, "gen": [], "gender": [], "gener": 1, "generic_cyrillic_lett": [], "geometri": [4, 5], "geq": 7, "german": [], "get": [4, 5], "get_artefact": 4, "get_word": 4, "gettextword": 4, "git": 2, "github": 3, "give": [], "given": [1, 4, 5, 7], "global": [], "go": [], "good": [], "googl": [], 
"googlevis": 2, "gpu": 2, "gracefulli": [], "graph": 4, "grayscal": 6, "ground": 7, "groung": [], "group": [], "gt": [], "gt_box": [], "gt_label": [], "gtk": 3, "guid": [], "guidanc": [], "gvision": 5, "h": 4, "h_": 7, "ha": [1, 7], "half": 5, "handl": 1, "handwrit": [], "handwritten": [], "harass": [], "hardwar": [], "harm": [], "hat": 7, "have": [1, 5, 7], "head": [], "healthi": [], "hebrew": [], "height": 4, "hello": 7, "help": [], "here": [1, 3, 5, 6], "hf": [], "hf_hub_download": [], "high": 4, "higher": 3, "hindi": [], "hindi_digit": [], "hocr": [], "hook": [], "horizont": 4, "hous": [], "how": [], "howev": [], "hsv": 6, "html": [], "http": [3, 4, 5], "hub": [], "hue": 6, "huggingfac": [], "hw": [], "i": [1, 4, 5, 6, 7], "i7": [], "ic03": [], "ic13": [], "icdar": 2, "icdar2019": 1, "id": 5, "ident": [], "identifi": [2, 5], "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [], "iiit5k": [], "iiithw": [], "imag": [1, 4, 5, 6, 7], "imagenet": [], "imageri": [], "images_90k_norm": [], "img": [1, 6], "img_cont": 4, "img_fold": 1, "img_path": 4, "img_transform": [], "imgur5k": [], "imgur5k_annot": [], "imlist": [], "impact": [], "implement": [1, 4, 5, 6, 7], "import": [1, 4, 5, 6, 7], "improv": [], "inappropri": [], "incid": [], "includ": [3, 5], "inclus": [], "increas": 6, "independ": [], "index": 4, "indic": 7, "individu": [], "infer": [2, 6], "inform": [1, 2, 5], "inherit": [1, 5], "input": [4, 5, 6], "input_crop": [], "input_pag": [5, 7], "input_shap": 5, "input_t": 5, "input_tensor": 5, "inspir": 6, "instal": 2, "instanc": 5, "instanti": 5, "instead": [1, 4], "insult": [], "int": [1, 4, 5, 6, 7], "int64": [], "integ": 7, "integr": 2, "intel": [], "interact": [4, 7], "interfac": [], "interoper": [], "interpol": [5, 6], "interpret": [1, 4], "intersect": 7, "invert": 6, "investig": [], "invis": [], "invoic": 5, "involv": 5, "io": 2, "iou": 7, "iou_thresh": 7, "iou_threshold": [], "irregular": 5, "isn": 1, "issu": [], "italian": [], "iter": 1, "its": [1, 4, 5, 7], "itself": [], "j": 7, "job": [], "join": [], "jpeg": 6, "jpegqual": 6, "jpg": [1, 4], "json": 5, "json_output": 5, "jump": [], "just": 5, "kei": [], "kera": 5, "kernel": [], "kernel_s": 5, "kernel_shap": [], "keywoard": [], "keyword": [1, 4], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [], "know": [], "kwarg": [1, 4, 5, 7], "l": 7, "l_j": 7, "label": [1, 7], "label_fil": 1, "label_fold": [], "label_path": [], "labels_path": [], "ladder": [], "lambda": 6, "lambdatransform": 6, "lang": [], "languag": [2, 4, 5], "larg": 5, "largest": 7, "last": [1, 3, 5], "latenc": [], "later": [], "latest": 3, "latin": 1, "layer": [], "layout": 5, "lead": [], "leader": [], "learn": 5, "least": 3, "left": 7, "legacy_french": [], "length": 1, "less": [], "let": 5, "letter": [], "level": [5, 7], "levenshtein": [], "leverag": [], "lf": [], "libffi": 3, "librari": 3, "light": 2, "lightweight": [], "like": [], "limits_": 7, "line": [2, 5, 7], "line_1_1": [], "link": [], "linknet": [2, 5], "linknet16": 5, "linknet_resnet18": [], "linknet_resnet34": [], "linknet_resnet50": [], "linux": 3, "list": [1, 4, 6], "ll": 7, "load": [2, 5], "load_state_dict": [], "load_weight": [], "loader": 1, "loc_pr": [], "local": [1, 2, 5, 7], "localis": [], "localizationconfus": 7, "locat": 4, "login": [], "login_to_hub": [], "logo": 4, "love": [], "lower": [6, 7], "m": [5, 7], "m1": [], "macbook": [], "machin": [], "maco": 3, "made": 2, "magc_resnet31": [], "mai": [], "mail": [], "main": [], "maintain": 2, "mainten": [], "make": [5, 7], "mani": [], 
"manipul": [], "map": 1, "map_loc": [], "mask_shap": 7, "master": [2, 5], "match": [2, 7], "mathcal": 7, "matplotlib": 7, "max": 7, "max_angl": 6, "max_area": [], "max_char": [], "max_delta": 6, "max_dist": [], "max_gain": 6, "max_gamma": 6, "max_qual": 6, "max_ratio": [], "maximum": [1, 6], "maxval": [5, 6], "mbox": 7, "mean": [6, 7], "meaniou": 7, "meant": 4, "measur": 5, "media": [], "median": [], "meet": [], "member": [], "memori": [], "mention": [], "merg": [], "messag": [], "meta": [], "metadata": [], "metal": [], "method": 6, "metric": [5, 7], "middl": [], "might": 5, "min": [], "min_area": [], "min_char": [], "min_gain": 6, "min_gamma": 6, "min_qual": 6, "min_ratio": [], "min_val": 6, "minde": 3, "minim": [], "minimalist": [], "minimum": [3, 7], "minval": 6, "miss": 3, "mistak": [], "mix": 2, "mixed_float16": [], "mixed_precis": [], "mjsynth": [], "mnt": [], "mobilenet": 5, "mobilenet_v3_larg": [], "mobilenet_v3_large_r": [], "mobilenet_v3_smal": [], "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": [], "mobilenetv3": [], "modal": [], "mode": 3, "model": [1, 7], "model_nam": [], "model_path": [], "moder": [], "modif": [], "modifi": [], "modul": [4, 5, 6, 7], "more": 5, "most": 5, "mozilla": [], "multi": 2, "multilingu": [], "multipl": [1, 4, 6], "multipli": 6, "multiprocess": [], "my": [], "my_awesome_model": [], "my_hook": [], "n": [1, 5, 7], "na": [], "name": [1, 5], "nation": [], "natur": 2, "ndarrai": [1, 4, 7], "necessari": 3, "need": [3, 7], "neg": 6, "nest": 5, "nestedobject": [], "network": [2, 5], "neural": [2, 5], "new": [], "newer": [], "next": 1, "nois": [], "noisi": [1, 2], "non": [2, 4, 6, 7], "none": [1, 4, 5, 7], "normal": [5, 6], "norwegian": [], "note": 0, "now": 2, "np": [5, 7], "num_output_channel": [], "num_sampl": 1, "number": [1, 6, 7], "numpi": [4, 5, 7], "o": 3, "obb": [], "obj_detect": [], "object": [1, 5], "objectness_scor": [], "oblig": [], "obtain": [], "occupi": [], "ocr": [1, 2, 7], "ocr_carea": [], "ocr_db_crnn": 7, "ocr_lin": [], "ocr_pag": [], "ocr_par": [], "ocr_predictor": 5, "ocrdataset": 1, "ocrmetr": 7, "ocrpredictor": 5, "ocrx_word": [], "offens": [], "offici": [], "offlin": [], "offset": 6, "onc": 5, "one": [1, 5, 6], "oneof": 6, "ones": 1, "onli": [6, 7], "onlin": [], "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": [], "opacity_rang": [], "open": [], "opinion": [], "optic": [2, 5], "optim": 2, "option": 1, "order": [1, 4, 5], "org": 5, "organ": 4, "orient": [4, 5], "orientationpredictor": [], "other": [], "otherwis": 7, "our": 5, "out": [5, 6, 7], "outpout": [], "output": [4, 6], "output_s": [4, 6], "outsid": [], "over": [1, 3, 7], "overal": [], "overlai": 4, "overview": [], "overwrit": 1, "overwritten": [], "own": 2, "p": 6, "packag": 7, "pad": [1, 5, 6], "page": [3, 5, 7], "page1": 4, "page2": 4, "page_1": [], "page_idx": [4, 5], "page_orientation_predictor": [], "page_param": [], "pair": 7, "pango": 3, "paper": 5, "par_1_1": [], "paragraph": [], "paragraph_break": [], "param": [5, 6], "paramet": [1, 2, 4, 5, 6, 7], "pars": [1, 2], "parseq": [], "part": 6, "parti": 3, "partial": [], "particip": [], "pass": [1, 5], "password": [], "patch": [], "path": [1, 4, 5], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "pattern": [], "pdf": [4, 5], "pdfpage": [], "peopl": [], "per": [5, 6], "perform": [2, 4, 5, 6, 7], "period": [], "permiss": [], "permut": [], "persian_lett": [], "person": [], "phase": [], "photo": [], "physic": 4, "pick": 6, "pictur": 4, "pip": 3, 
"pipelin": [], "pixbuf": 3, "pixel": [4, 6], "platinum": 5, "pleas": [], "plot": 7, "plt": 7, "plug": [], "plugin": [], "png": 4, "point": [], "polici": [], "polish": [], "polit": [], "polygon": 1, "pool": [], "portugues": [], "posit": 7, "possibl": 7, "post": 5, "postprocessor": [], "potenti": 5, "power": 2, "ppageno": [], "pre": [], "precis": [5, 7], "pred": [], "pred_box": [], "pred_label": [], "predefin": 1, "predict": [4, 7], "predictor": [], "prefer": 1, "preinstal": [], "preprocessor": 5, "prerequisit": 2, "present": [], "preserv": 6, "preserve_aspect_ratio": 6, "pretrain": [2, 5, 7], "pretrained_backbon": [], "print": [], "prior": [], "privaci": [], "privat": 5, "probabl": 6, "problem": [], "procedur": 6, "process": [2, 4], "processor": 5, "produc": 5, "product": [], "profession": [], "project": [], "promptli": [], "proper": [], "properli": 1, "properti": 5, "provid": [2, 5], "public": 2, "publicli": [], "publish": [], "pull": [], "punctuat": 1, "pure": [], "purpos": [], "push_to_hf_hub": [], "py": [], "pypdfium2": [], "pyplot": 7, "python": 2, "python3": [], "pytorch": [2, 3], "q": [], "qr": 4, "qr_code": [], "qualiti": 6, "quantiz": 5, "quantize_model": 5, "question": [], "quickli": 2, "quicktour": [], "r": [], "race": [], "ramdisk": [], "rand": [5, 7], "random": [5, 6, 7], "randomappli": 6, "randombright": 6, "randomcontrast": 6, "randomcrop": [], "randomgamma": 6, "randomhorizontalflip": [], "randomhu": 6, "randomjpegqu": 6, "randomli": 6, "randomres": [], "randomrot": 6, "randomsatur": 6, "randomshadow": [], "rang": 6, "rassi": [], "ratio": 6, "raw": [4, 7], "re": [], "read": [2, 5], "read_html": 4, "read_img": 4, "read_img_as_numpi": 4, "read_img_as_tensor": 4, "read_pdf": 4, "readi": [], "real": [5, 6], "reason": [], "rebuild": [], "rebuilt": [], "recal": [5, 7], "receipt": [1, 2, 5], "reco_arch": 5, "reco_b": [], "reco_model": [], "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": 7, "recognition_predictor": 5, "recognition_task": [], "recognitiondataset": [], "recognitionpredictor": 5, "rectangular": [], "recurr": 2, "reduc": [3, 6], "refer": [3, 5], "regardless": [], "region": [], "regroup": 7, "regular": [], "reject": [], "rel": 4, "relat": 4, "releas": [0, 3], "relev": [], "religion": [], "relu": 5, "remov": [], "render": [], "repo": [], "repo_id": [], "report": [], "repositori": [], "repres": [4, 5], "represent": 5, "request": [], "requir": [3, 6], "research": 2, "residu": [], "resiz": [5, 6], "resnet": 5, "resnet18": [], "resnet31": [], "resnet34": [], "resnet50": [], "resolv": 4, "resolve_block": [], "resolve_lin": [], "resourc": [], "respect": [], "rest": [6, 7], "restrict": [], "result": [4, 5], "return": [1, 4, 5, 7], "reusabl": 5, "review": [], "rgb": [4, 6], "rgb_mode": [], "rgb_output": 4, "right": [5, 7], "robust": 2, "root": 1, "rotat": [1, 4, 6], "rotated_bbox": [1, 7], "run": 3, "same": [4, 5, 7], "sampl": 1, "sample_transform": 1, "sar": [2, 5], "sar_resnet31": 5, "sar_vgg16_bn": 5, "satur": 6, "save": [1, 5], "saved_model": 5, "scale": 7, "scale_rang": [], "scan": [1, 2], "scene": [2, 5], "scheme": 5, "score": 7, "scratch": 2, "script": [], "seamless": 2, "seamlessli": [], "search": [], "searchabl": [], "sec": [], "second": 5, "section": 5, "secur": [], "see": [], "seemlessli": 2, "seen": 5, "segment": 5, "self": [], "semant": 5, "send": [], "sens": 7, "sensit": [], "separ": 5, "sequenc": [1, 4, 5, 7], "sequenti": [5, 6], "seri": [], "serial": 5, "serialized_model": 5, "seriou": [], "set": [1, 5, 7], "set_global_polici": [], "sever": [4, 
6], "sex": [], "sexual": [], "sha256": [], "shade": [], "shape": [4, 5, 6, 7], "share": [], "shift": 6, "shm": [], "should": [1, 4, 6, 7], "show": [2, 4, 5, 7], "showcas": [], "shuffl": 1, "side": 7, "signatur": 4, "signific": 1, "simpl": 5, "simpler": [], "sinc": 1, "singl": [], "single_img_doc": [], "size": [1, 4, 5, 6], "skew": [], "slack": [], "slightli": [], "small": 2, "smallest": 4, "snapshot_download": [], "snippet": [], "so": [1, 3], "social": [], "socio": [], "some": 3, "someth": [], "somewher": [], "sort": [], "sourc": [1, 4, 5, 6, 7], "space": [], "span": [], "spanish": [], "spatial": 4, "special": 2, "specif": [1, 3, 5, 7], "specifi": [1, 4], "speed": [2, 5], "sphinx": [], "sroie": [1, 2], "stabl": 3, "stackoverflow": [], "stage": 2, "standard": 6, "start": 1, "state": 2, "static": 7, "statist": 5, "statu": [], "std": 6, "step": [], "still": [], "str": [1, 4, 5, 6, 7], "straight": 1, "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 4, "street": [], "strict": [], "strictli": 7, "string": [1, 4, 5, 7], "strive": 3, "strong": 5, "structur": [2, 5], "subset": [1, 5], "suggest": [], "sum": 7, "summari": 7, "support": 5, "sustain": [], "svhn": [], "svt": [], "swedish": [], "symbol": [], "symmetr": 6, "symmetric_pad": 6, "synthet": [], "synthtext": [], "system": [], "t": 1, "tabl": [], "take": 1, "target": [1, 4, 5, 6], "target_s": 1, "task": [1, 2, 5], "task2": [], "tax": 5, "team": [], "techminde": [], "templat": 4, "tensor": [1, 4, 5, 6], "tensorflow": [2, 3, 4, 5, 6], "tensorspec": [], "term": [], "test": [], "test_set": [], "text": [4, 7], "text_output": [], "textmatch": 7, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [2, 5], "textstylebrush": [], "textual": [1, 2, 4], "tf": [3, 4, 5, 6], "tf_model": 5, "tflite": 5, "than": [3, 7], "thank": [], "thei": [], "them": [1, 3, 5], "thi": [3, 5, 7], "thing": [], "third": 3, "those": [3, 4, 5], "threaten": [], "threshold": [], "through": [1, 6], "tilman": [], "time": [1, 5, 7], "tini": [], "titl": 4, "tm": [], "tmp": [], "togeth": [4, 5], "tograi": 6, "tool": [], "top": 7, "topic": [], "torch": 3, "torchvis": 6, "total": [], "toward": 3, "train": [1, 5, 6], "train_it": 1, "train_load": 1, "train_pytorch": [], "train_set": 1, "train_tensorflow": [], "trainabl": 5, "tranform": 6, "transcrib": [], "transfer": [], "transfo": 6, "transform": [1, 2], "translat": [], "troll": [], "true": [1, 4, 5, 6, 7], "truth": 7, "tune": 2, "tupl": [4, 5, 6, 7], "turn": [], "two": 4, "txt": [], "type": [4, 5], "typic": 5, "u": 5, "ucsd": [], "udac": [], "uint8": [4, 5, 7], "ukrainian": [], "unaccept": [], "underli": 1, "underneath": 4, "understand": [1, 2, 5], "unidecod": 7, "uniform": [5, 6], "uniformli": 6, "uninterrupt": 4, "union": 7, "unittest": [], "unlock": [], "unoffici": [], "unprofession": [], "unsolicit": [], "unsupervis": [], "unwelcom": [], "up": 5, "updat": 7, "upgrad": [], "upper": [1, 6], "uppercas": [], "url": [1, 4], "us": [1, 3, 7], "usabl": 5, "usag": 5, "use_polygon": [], "useabl": [], "user": [2, 3, 4], "utf": [], "util": [2, 5], "v0": 2, "v1": [], "v3": 5, "valid": [], "valu": [4, 5, 6], "valuabl": 2, "variabl": [], "varieti": [], "veri": [], "verifi": 1, "version": 5, "vgg": 5, "vgg16": 5, "vgg16_bn_r": [], "via": 2, "vietnames": [], "view": [], "viewpoint": [], "violat": [], "visibl": [], "vision": [], "visiondataset": 1, "visiontransform": [], "visual": 2, "visualize_pag": 7, "vit_": [], "vit_b": [], "vitstr": [], "vitstr_bas": [], "vitstr_smal": [], "viz": [], "vocab": 
[2, 5], "vocabulari": 1, "w": [4, 7], "w3": [], "wa": [], "wai": [1, 2, 5], "want": [], "warm": 5, "warmup": [], "wasn": [], "we": [2, 3, 4, 5, 6], "weasyprint": [], "web": 4, "websit": [], "welcom": 2, "well": [], "were": 4, "what": [], "when": [], "whenev": [], "where": [4, 7], "whether": [1, 4, 6, 7], "which": 5, "whichev": 3, "while": 6, "why": [], "width": 4, "wiki": [], "wildreceipt": [], "window": [3, 7], "wish": [], "within": [], "without": 5, "wonder": [], "word": [2, 5, 7], "word_1_1": [], "word_1_2": [], "word_1_3": [], "wordgener": [], "words_onli": 7, "work": [], "worker": 1, "workflow": [], "worklow": [], "world": 7, "worth": [], "wrap": [], "wrapper": [1, 6], "write": [], "written": 4, "www": 4, "x": [4, 6, 7], "x12larg": 5, "x_ascend": [], "x_descend": [], "x_i": 7, "x_size": [], "x_wconf": [], "xeon": 5, "xhtml": [], "xmax": 4, "xmin": 4, "xml": [], "xml_bytes_str": [], "xml_element": [], "xml_output": [], "xmln": [], "y": 7, "y_i": 7, "y_j": 7, "yet": [], "ymax": 4, "ymin": 4, "yolov8": [], "you": [3, 5], "your": [1, 4, 5, 7], "yoursit": 4, "zero": [5, 6], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 1, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": [], "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": [], "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": [], "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": [], "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": [], "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": [], 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": [], "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "\u067e\u0686\u06a2\u06a4\u06af": [], "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "doctr.datasets", "DocTR: Document Text Recognition", "Installation", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils"], "titleterms": {"": [], "0": 0, "01": [], "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": [], "09": [], "1": 0, "10": [], "11": 0, "12": [], "18": 0, "2": 0, "2021": 0, "2022": [], "2023": [], "2024": [], "22": [], "27": [], "28": 0, "29": [], "3": 0, "31": [], "4": [], "5": [], "6": [], "7": [], "8": [], "9": [], "advanc": [], "approach": 5, "architectur": [], "arg": [], "artefact": 4, "artefactdetect": [], "attribut": [], "avail": 1, "aw": [], "ban": [], "block": 4, "bug": [], "build": 2, "changelog": 0, "choos": [], "classif": [], "code": [], "codebas": [], "commit": [], "commun": [], "compos": 6, "compress": 5, "conda": [], "conduct": [], "connect": [], "content": [], "continu": [], "contrib": [], "contribut": [], "contributor": [], "convent": [], "correct": [], "coven": [], "custom": [], "data": 1, "dataload": [], "dataset": [1, 2], "detect": [2, 5], "develop": [], "do": [], "doctr": [1, 2, 4, 5, 6, 7], "document": [2, 4], "end": 5, "enforc": [], "evalu": 7, "export": 5, "factori": [], "featur": 2, "feedback": [], "file": 4, "from": [], "gener": [], "get": 2, "git": 3, "guidelin": [], "half": [], 
"hub": [], "huggingfac": [], "i": [], "implement": [], "infer": [], "instal": 3, "integr": [], "io": 4, "lambda": [], "let": [], "line": 4, "linux": [], "load": 1, "loader": [], "main": 2, "mode": [], "model": [2, 5], "modifi": [], "modul": [], "name": [], "note": 2, "notebook": [], "object": [], "ocr": 5, "onli": [], "onnx": [], "optim": [], "option": [], "orient": [], "our": [], "output": 5, "own": [], "packag": [2, 3], "page": 4, "perman": [], "pipelin": [], "pledg": [], "post": [], "pre": 5, "precis": [], "predictor": [2, 5], "prepar": [], "prerequisit": 3, "pretrain": [], "process": 5, "push": [], "python": 3, "qualiti": [], "question": [], "read": 4, "readi": [], "recognit": [2, 5], "refer": 2, "report": [], "request": [], "respons": [], "return": [], "right": [], "savedmodel": 5, "scope": [], "share": [], "should": [], "stage": 5, "standard": [], "start": 2, "structur": 4, "style": [], "support": [1, 2, 6], "synthet": [], "task": 7, "temporari": [], "test": [], "text": [2, 5], "train": 2, "transform": 6, "two": 5, "unit": [], "us": 5, "util": 7, "v0": 0, "verif": [], "via": 3, "visual": 7, "vocab": 1, "warn": [], "what": [], "word": 4, "your": 2, "zoo": [2, 5]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"Artefact": [[2, "artefact"]], "Available Datasets": [[1, "available-datasets"]], "Block": [[2, "block"]], "Build & train your predictor": [[3, "build-train-your-predictor"]], "Changelog": [[0, null]], "Composing transformations": [[6, "composing-transformations"]], "Data Loading": [[1, "data-loading"]], "Detection models": [[5, "detection-models"]], "Detection predictors": [[5, "detection-predictors"]], "DocTR Vocabs": [[1, "id1"]], "DocTR: Document Text Recognition": [[3, null]], "Document": [[2, "document"]], "Document structure": [[2, "document-structure"]], "End-to-End OCR": [[5, "end-to-end-ocr"]], "File reading": [[2, "file-reading"]], "Getting Started": [[3, "getting-started"]], "Installation": [[4, null]], "Line": [[2, "line"]], "Main Features": [[3, "main-features"]], "Model compression": [[5, "model-compression"]], "Model export": [[5, "model-export"]], "Model zoo": [[3, "model-zoo"]], "Notes": [[3, null]], "Package Reference": [[3, null]], "Page": [[2, "page"]], "Pre-processing for detection": [[5, "pre-processing-for-detection"]], "Pre-processing for recognition": [[5, "pre-processing-for-recognition"]], "Prerequisites": [[4, "prerequisites"]], "Recognition models": [[5, "recognition-models"]], "Recognition predictors": [[5, "recognition-predictors"]], "Supported Vocabs": [[1, "supported-vocabs"]], "Supported datasets": [[3, "supported-datasets"]], "Supported transformations": [[6, "supported-transformations"]], "Task evaluation": [[7, "task-evaluation"]], "Text Detection": [[5, "text-detection"]], "Text Recognition": [[5, "text-recognition"]], "Text detection models": [[3, "text-detection-models"]], "Text recognition model zoo": [[5, "id2"]], "Text recognition models": [[3, "text-recognition-models"]], "Two-stage approaches": [[5, "two-stage-approaches"]], "Using SavedModel": [[5, "using-savedmodel"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[7, "visualization"]], "Word": [[2, "word"]], "doctr.datasets": [[1, null]], "doctr.documents": [[2, null]], "doctr.models": [[5, null]], "doctr.transforms": [[6, null]], "doctr.utils": [[7, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 
(2021-05-28)": [[0, "v0-2-1-2021-05-28"]]}, "docnames": ["changelog", "datasets", "documents", "index", "installing", "models", "transforms", "utils"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "datasets.rst", "documents.rst", "index.rst", "installing.rst", "models.rst", "transforms.rst", "utils.rst"], "indexentries": {"artefact (class in doctr.documents)": [[2, "doctr.documents.Artefact", false]], "as_images() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.as_images", false]], "block (class in doctr.documents)": [[2, "doctr.documents.Block", false]], "colorinversion (class in doctr.transforms)": [[6, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[6, "doctr.transforms.Compose", false]], "convert_to_fp16() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_fp16", false]], "convert_to_tflite() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_tflite", false]], "cord (class in doctr.datasets)": [[1, "doctr.datasets.CORD", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_vgg16_bn", false]], "dataloader (class in doctr.datasets.loader)": [[1, "doctr.datasets.loader.DataLoader", false]], "db_resnet50() (in module doctr.models.detection)": [[5, "doctr.models.detection.db_resnet50", false]], "detection_predictor() (in module doctr.models.detection)": [[5, "doctr.models.detection.detection_predictor", false]], "document (class in doctr.documents)": [[2, "doctr.documents.Document", false]], "documentfile (class in doctr.documents)": [[2, "doctr.documents.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[1, "doctr.datasets.encode_sequences", false]], "from_images() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_images", false]], "from_pdf() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_pdf", false]], "from_url() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[1, "doctr.datasets.FUNSD", false]], "get_artefacts() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_artefacts", false]], "get_words() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_words", false]], "lambdatransformation (class in doctr.transforms)": [[6, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.documents)": [[2, "doctr.documents.Line", false]], "linknet16() (in module doctr.models.detection)": [[5, "doctr.models.detection.linknet16", false]], "localizationconfusion (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.LocalizationConfusion", false]], "master() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.master", false]], "normalize (class in doctr.transforms)": [[6, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models.zoo)": [[5, "doctr.models.zoo.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[1, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in 
doctr.transforms)": [[6, "doctr.transforms.OneOf", false]], "page (class in doctr.documents)": [[2, "doctr.documents.Page", false]], "pdf (class in doctr.documents)": [[2, "doctr.documents.PDF", false]], "quantize_model() (in module doctr.models.export)": [[5, "doctr.models.export.quantize_model", false]], "randomapply (class in doctr.transforms)": [[6, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[6, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[6, "doctr.transforms.RandomContrast", false]], "randomgamma (class in doctr.transforms)": [[6, "doctr.transforms.RandomGamma", false]], "randomhue (class in doctr.transforms)": [[6, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[6, "doctr.transforms.RandomJpegQuality", false]], "randomsaturation (class in doctr.transforms)": [[6, "doctr.transforms.RandomSaturation", false]], "read_html() (in module doctr.documents)": [[2, "doctr.documents.read_html", false]], "read_img() (in module doctr.documents)": [[2, "doctr.documents.read_img", false]], "read_pdf() (in module doctr.documents)": [[2, "doctr.documents.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.recognition_predictor", false]], "resize (class in doctr.transforms)": [[6, "doctr.transforms.Resize", false]], "sar_resnet31() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_resnet31", false]], "sar_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_vgg16_bn", false]], "show() (doctr.documents.document method)": [[2, "doctr.documents.Document.show", false]], "show() (doctr.documents.page method)": [[2, "doctr.documents.Page.show", false]], "sroie (class in doctr.datasets)": [[1, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[7, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[7, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[7, "doctr.utils.metrics.TextMatch.summary", false]], "textmatch (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[6, "doctr.transforms.ToGray", false]], "visiondataset (class in doctr.datasets.datasets)": [[1, "doctr.datasets.datasets.VisionDataset", false]], "visualize_page() (in module doctr.utils.visualization)": [[7, "doctr.utils.visualization.visualize_page", false]], "word (class in doctr.documents)": [[2, "doctr.documents.Word", false]]}, "objects": {"doctr.datasets": [[1, 0, 1, "", "CORD"], [1, 0, 1, "", "FUNSD"], [1, 0, 1, "", "OCRDataset"], [1, 0, 1, "", "SROIE"], [1, 1, 1, "", "encode_sequences"]], "doctr.datasets.datasets": [[1, 0, 1, "", "VisionDataset"]], "doctr.datasets.loader": [[1, 0, 1, "", "DataLoader"]], "doctr.documents": [[2, 0, 1, "", "Artefact"], [2, 0, 1, "", "Block"], [2, 0, 1, "", "Document"], [2, 0, 1, "", "DocumentFile"], [2, 0, 1, "", "Line"], [2, 0, 1, "", "PDF"], [2, 0, 1, "", "Page"], [2, 0, 1, "", "Word"], [2, 1, 1, "", "read_html"], [2, 1, 1, "", "read_img"], [2, 1, 1, "", "read_pdf"]], "doctr.documents.Document": [[2, 2, 1, "", "show"]], "doctr.documents.DocumentFile": [[2, 2, 1, "", "from_images"], [2, 2, 1, "", "from_pdf"], [2, 2, 1, "", "from_url"]], "doctr.documents.PDF": [[2, 2, 1, "", "as_images"], [2, 2, 1, "", "get_artefacts"], [2, 2, 
1, "", "get_words"]], "doctr.documents.Page": [[2, 2, 1, "", "show"]], "doctr.models.detection": [[5, 1, 1, "", "db_resnet50"], [5, 1, 1, "", "detection_predictor"], [5, 1, 1, "", "linknet16"]], "doctr.models.export": [[5, 1, 1, "", "convert_to_fp16"], [5, 1, 1, "", "convert_to_tflite"], [5, 1, 1, "", "quantize_model"]], "doctr.models.recognition": [[5, 1, 1, "", "crnn_vgg16_bn"], [5, 1, 1, "", "master"], [5, 1, 1, "", "recognition_predictor"], [5, 1, 1, "", "sar_resnet31"], [5, 1, 1, "", "sar_vgg16_bn"]], "doctr.models.zoo": [[5, 1, 1, "", "ocr_predictor"]], "doctr.transforms": [[6, 0, 1, "", "ColorInversion"], [6, 0, 1, "", "Compose"], [6, 0, 1, "", "LambdaTransformation"], [6, 0, 1, "", "Normalize"], [6, 0, 1, "", "OneOf"], [6, 0, 1, "", "RandomApply"], [6, 0, 1, "", "RandomBrightness"], [6, 0, 1, "", "RandomContrast"], [6, 0, 1, "", "RandomGamma"], [6, 0, 1, "", "RandomHue"], [6, 0, 1, "", "RandomJpegQuality"], [6, 0, 1, "", "RandomSaturation"], [6, 0, 1, "", "Resize"], [6, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[7, 0, 1, "", "LocalizationConfusion"], [7, 0, 1, "", "OCRMetric"], [7, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.LocalizationConfusion": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.OCRMetric": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.TextMatch": [[7, 2, 1, "", "summary"]], "doctr.utils.visualization": [[7, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 7], "0": [1, 3, 5, 6, 7], "00": 5, "01": 5, "0123456789": 1, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "02": 5, "02562": 5, "03": 3, "035": [], "0361328125": [], "04": [], "05": 3, "06": [], "06640625": [], "07": [], "08": 5, "09": [], "0966796875": [], "1": [1, 3, 5, 6, 7], "10": [1, 5, 7], "100": [5, 6, 7], "1000": 5, "101": [], "1024": [5, 7], "104": [], "106": [], "108": [], "1095": [], "11": 3, "110": 7, "1107": [], "114": [], "115": [], "1156": [], "116": [], "118": [], "11800h": [], "11th": [], "12": 5, "120": [], "123": [], "126": [], "1268": [], "128": 5, "13": 5, "130": [], "13068": [], "131": [], "1337891": [], "1357421875": [], "1396484375": [], "14": 5, "1420": [], "14470v1": [], "149": [], "15": 5, "150": 7, "154": 1, "1552": [], "16": 5, "160": 5, "1630859375": [], "1684": [], "16x16": [], "17": [], "1778": [], "1782": [], "18": 3, "185546875": [], "19": 5, "1900": [], "1910": 5, "19342": [], "19370": [], "195": [], "19598": [], "199": 5, "1999": [], "1m": 5, "2": [3, 5, 6], "20": 5, "200": 7, "2000": [], "2003": [], "2012": [], "2013": [], "2015": [], "2019": 3, "2021": 3, "207901": [], "21": 5, "2103": [], "2186": [], "21888": [], "22": [], "224": [5, 6], "225": 6, "22672": [], "229": 6, "23": [], "233": [], "236": [], "24": [], "246": [], "249": [], "25": 5, "2504": [], "255": [5, 6, 7], "256": 5, "257": [], "26": [], "26032": [], "264": [], "27": 5, "2700": [], "2710": [], "2749": [], "28": 3, "287": [], "29": 5, "296": [], "299": [], "2d": [], "3": [2, 3, 4, 5, 6, 7], "30": 5, "300": [], "3000": [], "301": [], "30595": 5, "30ghz": [], "31": 5, "32": [1, 5, 6], "3232421875": [], "33": [], "33402": [], "33608": [], "34": [], "340": [], "3456": [], "3515625": [], "36": [], "360": [], "37": [], "38": [], "39": 5, "4": [], "40": [], "406": 6, "41": [], "42": [], "43": 5, "44": [], 
"45": [], "456": 6, "46": 5, "47": 5, "472": [], "48": 5, "485": 6, "49": 5, "49377": [], "5": [1, 6, 7], "50": 5, "51": [], "51171875": [], "512": [], "52": [1, 5], "529": [], "53": 5, "533": [], "54": [], "540": [], "5478515625": [], "55": [], "56": [], "57": [], "58": [], "580": [], "5810546875": [], "583": [], "59": 5, "595": [], "597": [], "5k": [], "5m": 5, "6": [4, 5, 6], "60": 6, "600": [5, 7], "61": 5, "611": [], "62": 5, "625": [], "626": [], "629": [], "63": 5, "630": [], "64": [5, 6], "640": [], "641": [], "647": [], "65": 5, "66": 5, "660": [], "664": [], "666": [], "67": 5, "672": [], "68": 5, "689": [], "69": 5, "693": [], "694": [], "695": [], "6m": [], "7": 5, "70": [5, 7], "700": [], "701": [], "702": [], "707470": [], "71": [], "7100000": [], "713": [], "7141797": [], "7149": [], "72": [], "72dpi": [], "73": [], "73257": [], "733": [], "74": 5, "745": [], "75": 5, "753": [], "7581382": [], "76": [], "77": 5, "772": [], "772875": [], "78": 5, "780": [], "781": [], "783": [], "785": [], "789": [], "79": 5, "793533": [], "796": [], "798": [], "7m": [], "8": [5, 6], "80": [], "800": [5, 7], "81": 5, "817": [], "82": 5, "8275l": 5, "83": 5, "830": [], "84": [], "849": [], "85": 5, "8564453125": [], "857": [], "85875": [], "86": 5, "860": [], "8603515625": [], "862": [], "863": [], "87": 5, "8707": [], "875": [], "88": [], "89": 5, "8m": 5, "9": [], "90": 5, "90k": [], "90kdict32px": [], "91": 5, "913": [], "914085328578949": [], "917": [], "92": 5, "921": [], "93": [], "94": [], "95": 7, "9578408598899841": [], "96": 1, "97": [], "98": [], "99": [], "9949972033500671": [], "A": [1, 2, 3, 5], "And": 5, "As": [], "Be": [], "Being": [], "By": [], "For": [4, 5], "If": [2, 4, 5], "In": [1, 5], "It": 6, "Its": 5, "No": [], "Of": 1, "Or": [], "The": [1, 2, 5, 7], "Then": 5, "To": [], "_": [1, 5], "__call__": [], "_build": [], "_i": 7, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "abdef": [], "abl": [], "about": 5, "abov": 5, "abstract": 1, "abstractdataset": [], "abus": [], "accent": [], "accept": [], "access": [1, 2, 3], "account": [], "accur": [], "accuraci": 7, "achiev": [], "act": [], "action": [], "activ": [], "ad": 6, "adapt": [], "add": [6, 7], "add_hook": [], "add_label": 7, "addit": [], "addition": 5, "address": 2, "adjust": 6, "advanc": [], "advantag": [], "advis": [], "aesthet": [], "affect": [], "after": [], "ag": [], "again": [], "aggreg": [1, 7], "aggress": [], "align": 2, "all": [1, 2, 3, 5, 6, 7], "allow": [], "along": 5, "alreadi": [], "also": [], "alwai": [], "an": [1, 2, 3, 5, 7], "analysi": [2, 5], "ancient_greek": [], "angl": 2, "ani": [1, 2, 3, 5, 6, 7], "annot": 2, "anot": [], "anoth": [1, 4, 5], "answer": [], "anyascii": [], "anyon": 3, "anyth": [], "api": [], "apolog": [], "apologi": [], "app": [], "appear": [], "appli": [1, 6], "applic": 5, "appoint": [], "appreci": [], "appropri": [], "ar": [1, 2, 4, 5, 6, 7], "arab": [], "arabic_diacrit": [], "arabic_lett": [], "arabic_punctu": [], "arbitrarili": [], "arch": 5, "architectur": [3, 5], "archiv": [], "area": [], "argument": [1, 2], "around": 5, "arrai": [2, 7], "art": 3, "artefact": 7, "artefact_typ": 2, "artifici": [], "arxiv": 5, "as_imag": 2, "asarrai": 7, "ascii_lett": 1, "aspect": [3, 6], "assess": 7, "assign": 7, "associ": 2, "assum": [], "assume_straight_pag": [], "astyp": [5, 7], "attack": [], "attend": [3, 5], "attent": [], "autoclass": [], "autom": 3, "automat": [], "autoregress": [], "avail": [3, 5, 6], "averag": [5, 6], "avoid": [], "aw": [3, 5], "awar": [], 
"azur": [], "b": 7, "b_j": 7, "back": [], "backbon": 5, "backend": 5, "background": [], "bangla": [], "bar": [], "bar_cod": [], "base": 5, "baselin": 5, "batch": [1, 5, 6], "batch_siz": 1, "bblanchon": [], "bbox": [], "becaus": [], "been": [5, 7], "befor": 1, "begin": 7, "behavior": [], "being": [5, 7], "belong": [], "benchmark": [], "best": [], "beta": 3, "better": [], "between": [6, 7], "bgr": 2, "bilinear": [5, 6], "bin_thresh": [], "binar": [3, 5], "binari": 2, "bit": [], "block": [5, 7], "block_1_1": [], "blur": [], "bmvc": [], "bn": [], "bodi": [], "bool": [1, 2, 5, 6, 7], "boolean": [], "both": [3, 5, 6], "bottom": [], "bound": [1, 2, 6, 7], "box": [1, 2, 7], "box_thresh": [], "brew": 4, "bright": 6, "browser": [], "build": [], "built": [], "byte": [2, 5], "c": [], "c5": 5, "c_j": [], "cach": [], "cache_sampl": [], "cairo": 4, "call": [], "callabl": [1, 6], "can": [1, 4, 5], "capabl": 5, "case": [1, 7], "cf": 5, "cfg": [], "challeng": [], "challenge2_test_task12_imag": [], "challenge2_test_task1_gt": [], "challenge2_training_task12_imag": [], "challenge2_training_task1_gt": [], "chang": [], "changelog": 3, "channel": [2, 5, 6], "channel_prior": [], "channelshuffl": [], "charact": [1, 2, 3, 5, 7], "charactergener": [], "characterist": [], "charg": 5, "charset": [], "chart": 2, "check": [], "checkpoint": [], "chip": [], "ci": [], "clarifi": [], "clariti": [], "class": [1, 2, 6, 7], "class_nam": [], "classif": [], "classmethod": 2, "clear": [], "clone": 4, "close": [], "co": [], "code": [2, 3], "codecov": [], "colab": [], "collate_fn": [], "collect": 2, "color": 6, "colorinvers": 6, "column": 2, "com": [2, 4], "combin": 5, "command": [], "comment": [], "commit": [], "common": [6, 7], "commun": [], "compar": 3, "comparison": 7, "competit": 1, "compil": [], "complaint": [], "complementari": 7, "complet": [], "compon": 5, "compos": [1, 3, 5], "comprehens": [], "comput": [5, 7], "conf_threshold": [], "confid": 2, "config": [], "configur": [], "confus": 7, "consecut": [5, 6], "consequ": [], "consid": [1, 2, 7], "consist": [], "consolid": [1, 3], "constant": 6, "construct": [], "contact": [], "contain": [], "content": [1, 2], "context": [], "contib": [], "continu": [], "contrast": 6, "contrast_factor": 6, "contrib": [], "contribut": [], "contributor": [], "conv_sequ": 5, "convers": 2, "convert": [2, 5, 6], "convert_page_to_numpi": 2, "convert_to_fp16": 5, "convert_to_tflit": 5, "convolut": 3, "coordin": 2, "cord": [1, 3, 5], "core": 7, "corner": [], "correct": 6, "correspond": [4, 5], "could": [], "counterpart": 7, "cover": [], "coverag": [], "cpu": [3, 5], "creat": [], "crnn": [3, 5], "crnn_mobilenet_v3_larg": [], "crnn_mobilenet_v3_smal": [], "crnn_resnet31": 5, "crnn_vgg16_bn": 5, "crop": 5, "crop_orient": [], "crop_orientation_predictor": [], "crop_param": [], "cuda": [], "currenc": 1, "current": [], "custom": [], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": [], "czczup": [], "czech": [], "d": [], "daili": 3, "danish": [], "data": [2, 3, 5, 6, 7], "dataload": 1, "dataset": 5, "dataset_info": [], "date": [], "db": [], "db_crnn_resnet": 5, "db_crnn_vgg": 5, "db_mobilenet_v3_larg": [], "db_resnet34": [], "db_resnet50": 5, "db_sar_resnet": 5, "db_sar_vgg": 5, "dbnet": [3, 5], "deal": [], "decis": [], "decod": 2, "decode_img_as_tensor": [], "dedic": [], "deem": [], "deep": 5, "def": [], "default": [2, 5], "defer": 1, "defin": 7, "deform": 5, "degre": [], "degress": 2, "delet": [], "delimit": [], "delta": 6, "demo": [], "demonstr": [], 
"depend": [3, 4], "deploi": [], "deploy": [], "derogatori": [], "describ": 5, "descript": [], "design": 6, "desir": [], "det_arch": 5, "det_b": [], "det_model": [], "det_param": [], "det_predictor": [], "detail": [], "detect": [], "detect_languag": [], "detect_orient": [], "detection_predictor": 5, "detection_task": [], "detectiondataset": [], "detectionmetr": [], "detectionpredictor": 5, "detector": [], "deterior": [], "determin": [], "dev": [], "develop": [], "developp": 4, "deviat": 6, "devic": [], "dict": [2, 7], "dictionari": [2, 7], "differ": [], "differenti": [3, 5], "digit": 1, "dimens": [2, 5, 7], "dimension": 6, "direct": [], "directli": 5, "directori": [], "disabl": [], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 5, "discuss": [], "disk": [], "disparag": [], "displai": [2, 7], "display_artefact": 7, "distanc": [], "distribut": 6, "div": [], "divers": [], "divid": [], "do": 4, "doc": [2, 5], "docartefact": [], "docstr": [], "doctr": 4, "doctr_cache_dir": [], "doctr_multiprocessing_dis": [], "document": [1, 5, 7], "documentbuild": [], "documentfil": 2, "doesn": [], "don": [], "done": 6, "download": 1, "downsiz": [], "draw": 6, "drop": 1, "drop_last": 1, "dtype": 5, "dual": [], "dummi": [], "dummy_img": [], "dummy_input": [], "dure": [], "dutch": [], "dynam": [], "dynamic_seq_length": [], "e": [2, 4], "each": [1, 2, 3, 5, 6, 7], "eas": [], "easi": [3, 7], "easier": 5, "easili": [2, 5, 7], "econom": [], "edit": [], "educ": [], "effect": [], "effici": [1, 5], "either": 5, "element": [1, 2, 5], "els": [], "email": [], "empathi": [], "en": [], "enabl": 2, "enclos": 2, "encod": [1, 2, 5], "encode_sequ": 1, "encount": [], "encrypt": [], "end": [1, 3, 7], "english": [], "enough": 5, "ensur": [], "entir": 2, "entri": [], "environ": [], "eo": 1, "equiv": [], "error": [], "estim": [], "etc": 2, "ethnic": [], "evalu": [1, 3, 5], "event": [], "everyon": [], "everyth": [], "exact": 7, "exactmatch": [], "exampl": [1, 2, 5, 6, 7], "exchang": [], "exclud": 5, "execut": [], "exist": [], "expand": [], "expect": [2, 5, 6], "experi": 5, "explan": 5, "explicit": [], "exploit": 5, "export": [2, 3, 7], "export_as_straight_box": [], "export_as_xml": [], "export_model_to_onnx": [], "express": 6, "extens": 2, "extern": [], "extra": 4, "extract": [1, 3], "extract_arch": 1, "extractor": 5, "f_": 7, "f_a": 7, "factor": 6, "fair": [], "fairli": [], "fals": [1, 5, 6, 7], "faq": [], "fascan": [], "fast": 1, "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": [], "favorit": [], "featur": [5, 7], "feed": 5, "feedback": [], "feel": [], "felix92": [], "few": 4, "figsiz": 7, "figur": 7, "file": [1, 3], "file_hash": 1, "file_nam": 1, "final": [], "find": 4, "fine": 3, "finnish": [], "first": [], "firsthand": [], "fit": [], "fitz": 2, "flag": [], "flexibl": 7, "flip": [], "float": [2, 6, 7], "float32": 5, "fn": 6, "focu": [], "focus": [], "folder": [1, 5], "follow": [1, 4, 5, 6, 7], "font": [], "font_famili": [], "foral": 7, "forc": [], "forg": [], "form": [1, 3], "format": [2, 5], "forpost": [1, 3], "forum": [], "fp": 5, "fp16": 5, "frac": 7, "frame": 5, "framework": 1, "free": [], "french": [1, 5], "friendli": 3, "from": [1, 2, 3, 5, 6, 7], "from_hub": [], "from_imag": 2, "from_pdf": 2, "from_url": 2, "full": [1, 5, 7], "fulli": [], "function": [5, 6, 7], "funsd": [1, 3, 5], "further": [], "futur": [], "g": 2, "g_": 7, "g_x": 7, "gamma": 6, "gaussian": 6, "gaussianblur": [], "gaussiannois": [], "gdk": 4, "gen": [], "gender": [], "gener": [], 
"generic_cyrillic_lett": [], "geometri": 2, "geq": 7, "german": [], "get": 2, "get_artefact": 2, "get_word": 2, "gettextword": 2, "git": 3, "github": 4, "give": [], "given": [1, 2, 5, 7], "global": [], "go": [], "good": [], "googl": [], "googlevis": 3, "gpu": 3, "gracefulli": [], "graph": 2, "grayscal": 6, "ground": 7, "groung": [], "group": [], "gt": [], "gt_box": [], "gt_label": [], "gtk": 4, "guid": [], "guidanc": [], "gvision": 5, "h": 2, "h_": 7, "ha": [1, 7], "half": 5, "handl": 1, "handwrit": [], "handwritten": [], "harass": [], "hardwar": [], "harm": [], "hat": 7, "have": [1, 5, 7], "head": [], "healthi": [], "hebrew": [], "height": 2, "hello": 7, "help": [], "here": [1, 4, 6], "hf": [], "hf_hub_download": [], "high": 2, "higher": 4, "hindi": [], "hindi_digit": [], "hocr": [], "hook": [], "horizont": 2, "hous": [], "how": [], "howev": [], "hsv": 6, "html": [], "http": [2, 4, 5], "hub": [], "hue": 6, "huggingfac": [], "hw": [], "i": [1, 2, 5, 6, 7], "i7": [], "ic03": [], "ic13": [], "icdar": 3, "icdar2019": 1, "id": 5, "ident": [], "identifi": [3, 5], "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [], "iiit5k": [], "iiithw": [], "imag": [1, 2, 5, 6, 7], "imagenet": [], "imageri": [], "images_90k_norm": [], "img": [1, 6], "img_cont": [], "img_fold": 1, "img_path": [], "img_transform": [], "imgur5k": [], "imgur5k_annot": [], "imlist": [], "impact": [], "implement": [1, 2, 5, 6, 7], "import": [1, 2, 5, 6, 7], "improv": [], "inappropri": [], "incid": [], "includ": [4, 5], "inclus": [], "increas": 6, "independ": [], "index": 2, "indic": 7, "individu": [], "infer": [3, 6], "inform": [1, 3, 5], "inherit": [1, 5], "input": [2, 5, 6], "input_crop": [], "input_pag": [5, 7], "input_shap": 5, "input_t": 5, "input_tensor": 5, "inspir": 6, "instal": 3, "instanc": 5, "instanti": 5, "instead": [1, 2], "insult": [], "int": [1, 2, 5, 6, 7], "int64": [], "integ": 7, "integr": 3, "intel": [], "interact": [2, 7], "interfac": [], "interoper": [], "interpol": [5, 6], "interpret": [1, 2], "intersect": 7, "invert": 6, "investig": [], "invis": [], "invoic": 5, "involv": 5, "io": [], "iou": 7, "iou_thresh": 7, "iou_threshold": [], "irregular": 5, "isn": 1, "issu": [], "italian": [], "iter": 1, "its": [1, 2, 5, 7], "itself": [], "j": 7, "job": [], "join": [], "jpeg": 6, "jpegqual": 6, "jpg": [1, 2], "json": [], "json_output": [], "jump": [], "just": 5, "kei": [], "kera": 5, "kernel": [], "kernel_s": 5, "kernel_shap": [], "keywoard": [], "keyword": [1, 2], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [], "know": [], "kwarg": [1, 2, 5, 7], "l": 7, "l_j": 7, "label": [1, 7], "label_fil": 1, "label_fold": [], "label_path": [], "labels_path": [], "ladder": [], "lambda": 6, "lambdatransform": 6, "lang": [], "languag": [2, 3], "larg": [], "largest": 7, "last": [1, 4, 5], "latenc": [], "later": [], "latest": 4, "latin": 1, "layer": [], "layout": [], "lead": [], "leader": [], "learn": 5, "least": 4, "left": 7, "legacy_french": [], "length": 1, "less": [], "let": 5, "letter": [], "level": [5, 7], "levenshtein": [], "leverag": [], "lf": [], "libffi": 4, "librari": 4, "light": 3, "lightweight": [], "like": [], "limits_": 7, "line": [3, 7], "line_1_1": [], "link": [], "linknet": [3, 5], "linknet16": 5, "linknet_resnet18": [], "linknet_resnet34": [], "linknet_resnet50": [], "linux": 4, "list": [1, 2, 6], "ll": 7, "load": [3, 5], "load_state_dict": [], "load_weight": [], "loader": 1, "loc_pr": [], "local": [1, 3, 5, 7], "localis": [], "localizationconfus": 7, "locat": [], "login": [], "login_to_hub": 
[], "logo": 2, "love": [], "lower": [6, 7], "m": [5, 7], "m1": [], "macbook": [], "machin": [], "maco": 4, "made": 3, "magc_resnet31": [], "mai": [], "mail": [], "main": [], "maintain": 3, "mainten": [], "make": [5, 7], "mani": [], "manipul": [], "map": 1, "map_loc": [], "mask_shap": 7, "master": [3, 5], "match": [3, 7], "mathcal": 7, "matplotlib": 7, "max": 7, "max_angl": [], "max_area": [], "max_char": [], "max_delta": 6, "max_dist": [], "max_gain": 6, "max_gamma": 6, "max_qual": 6, "max_ratio": [], "maximum": 1, "maxval": [5, 6], "mbox": 7, "mean": [6, 7], "meaniou": 7, "meant": 2, "measur": 5, "media": [], "median": [], "meet": [], "member": [], "memori": [], "mention": [], "merg": [], "messag": [], "meta": [], "metadata": [], "metal": [], "method": 6, "metric": [5, 7], "middl": [], "might": 5, "min": [], "min_area": [], "min_char": [], "min_gain": 6, "min_gamma": 6, "min_qual": 6, "min_ratio": [], "min_val": 6, "minde": 4, "minim": [], "minimalist": [], "minimum": 7, "minval": 6, "miss": [], "mistak": [], "mix": 3, "mixed_float16": [], "mixed_precis": [], "mjsynth": [], "mnt": [], "mobilenet": [], "mobilenet_v3_larg": [], "mobilenet_v3_large_r": [], "mobilenet_v3_smal": [], "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": [], "mobilenetv3": [], "modal": [], "mode": 4, "model": [1, 7], "model_nam": [], "model_path": [], "moder": [], "modif": [], "modifi": [], "modul": [2, 5, 6, 7], "more": [], "most": 5, "mozilla": [], "multi": 3, "multilingu": [], "multipl": [1, 2, 6], "multipli": 6, "multiprocess": [], "my": [], "my_awesome_model": [], "my_hook": [], "n": [1, 5, 7], "na": [], "name": [1, 5], "nation": [], "natur": 3, "ndarrai": [1, 2, 7], "necessari": [], "need": [4, 7], "neg": 6, "nest": [], "nestedobject": [], "network": [3, 5], "neural": [3, 5], "new": [], "newer": [], "next": 1, "nois": [], "noisi": [1, 3], "non": [2, 3, 6, 7], "none": [1, 2, 7], "normal": [5, 6], "norwegian": [], "note": 0, "now": 3, "np": [5, 7], "num_output_channel": [], "num_sampl": [], "number": [1, 6, 7], "numpi": [2, 5, 7], "o": 4, "obb": [], "obj_detect": [], "object": 1, "objectness_scor": [], "oblig": [], "obtain": [], "occupi": [], "ocr": [1, 3, 7], "ocr_carea": [], "ocr_db_crnn": 7, "ocr_lin": [], "ocr_pag": [], "ocr_par": [], "ocr_predictor": 5, "ocrdataset": 1, "ocrmetr": 7, "ocrpredictor": 5, "ocrx_word": [], "offens": [], "offici": [], "offlin": [], "offset": 6, "onc": 5, "one": [1, 5, 6], "oneof": 6, "ones": 1, "onli": [6, 7], "onlin": [], "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": [], "opacity_rang": [], "open": [], "opinion": [], "optic": [3, 5], "optim": 3, "option": 1, "order": [1, 2, 5], "org": 5, "organ": 2, "orient": 2, "orientationpredictor": [], "other": [], "otherwis": 7, "our": 5, "out": [5, 6, 7], "outpout": [], "output": [2, 5, 6], "output_s": [2, 6], "outsid": [], "over": [4, 7], "overal": [], "overlai": 2, "overview": [], "overwrit": 1, "overwritten": [], "own": 3, "p": 6, "packag": 7, "pad": [1, 5, 6], "page": [4, 5, 7], "page1": 2, "page2": 2, "page_1": [], "page_idx": 2, "page_orientation_predictor": [], "page_param": [], "pair": 7, "pango": 4, "paper": 5, "par_1_1": [], "paragraph": [], "paragraph_break": [], "param": [5, 6], "paramet": [1, 2, 3, 5, 6, 7], "pars": [1, 3], "parseq": [], "part": 6, "parti": [], "partial": [], "particip": [], "pass": [1, 5], "password": [], "patch": [], "path": [1, 2, 5], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "pattern": [], "pdf": [2, 5], "pdfpage": [], 
"peopl": [], "per": [5, 6], "perform": [2, 3, 5, 6, 7], "period": [], "permiss": [], "permut": [], "persian_lett": [], "person": [], "phase": [], "photo": [], "physic": 2, "pick": 6, "pictur": 2, "pip": 4, "pipelin": [], "pixbuf": 4, "pixel": [2, 6], "platinum": 5, "pleas": [], "plot": 7, "plt": 7, "plug": [], "plugin": [], "png": 2, "point": [], "polici": [], "polish": [], "polit": [], "polygon": 1, "pool": [], "portugues": [], "posit": 7, "possibl": 7, "post": 5, "postprocessor": [], "potenti": 5, "power": 3, "ppageno": [], "pre": [], "precis": [5, 7], "pred": [], "pred_box": [], "pred_label": [], "predefin": 1, "predict": [2, 7], "predictor": [], "prefer": 1, "preinstal": [], "preprocessor": 5, "prerequisit": 3, "present": [], "preserv": 6, "preserve_aspect_ratio": 6, "pretrain": [3, 5, 7], "pretrained_backbon": [], "print": [], "prior": [], "privaci": [], "privat": 5, "probabl": 6, "problem": [], "procedur": 6, "process": [2, 3], "processor": 5, "produc": 5, "product": [], "profession": [], "project": [], "promptli": [], "proper": [], "properli": 1, "properti": 5, "provid": [3, 5], "public": 3, "publicli": [], "publish": [], "pull": [], "punctuat": 1, "pure": [], "purpos": [], "push_to_hf_hub": [], "py": [], "pypdfium2": [], "pyplot": 7, "python": 3, "python3": [], "pytorch": [3, 4], "q": [], "qr": 2, "qr_code": [], "qualiti": 6, "quantiz": 5, "quantize_model": 5, "question": [], "quickli": 3, "quicktour": [], "r": [], "race": [], "ramdisk": [], "rand": [5, 7], "random": [5, 6, 7], "randomappli": 6, "randombright": 6, "randomcontrast": 6, "randomcrop": [], "randomgamma": 6, "randomhorizontalflip": [], "randomhu": 6, "randomjpegqu": 6, "randomli": 6, "randomres": [], "randomrot": [], "randomsatur": 6, "randomshadow": [], "rang": 6, "rassi": [], "ratio": 6, "raw": [2, 7], "re": [], "read": [3, 5], "read_html": 2, "read_img": 2, "read_img_as_numpi": [], "read_img_as_tensor": [], "read_pdf": 2, "readi": [], "real": [5, 6], "reason": [], "rebuild": [], "rebuilt": [], "recal": [5, 7], "receipt": [1, 3, 5], "reco_arch": 5, "reco_b": [], "reco_model": [], "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": 7, "recognition_predictor": 5, "recognition_task": [], "recognitiondataset": [], "recognitionpredictor": 5, "rectangular": [], "recurr": 3, "reduc": 6, "refer": 4, "regardless": [], "region": [], "regroup": 7, "regular": [], "reject": [], "rel": 2, "relat": [], "releas": [0, 4], "relev": [], "religion": [], "relu": 5, "remov": [], "render": [], "repo": [], "repo_id": [], "report": [], "repositori": [], "repres": [2, 5], "represent": 5, "request": [], "requir": [4, 6], "research": 3, "residu": [], "resiz": [5, 6], "resnet": 5, "resnet18": [], "resnet31": [], "resnet34": [], "resnet50": [], "resolv": 2, "resolve_block": [], "resolve_lin": [], "resourc": [], "respect": [], "rest": [6, 7], "restrict": [], "result": [2, 5], "return": [1, 2, 5, 7], "reusabl": 5, "review": [], "rgb": [2, 6], "rgb_mode": [], "rgb_output": 2, "right": [5, 7], "robust": 3, "root": 1, "rotat": [1, 2], "rotated_bbox": [1, 7], "run": 4, "same": [2, 7], "sampl": 1, "sample_transform": 1, "sar": [3, 5], "sar_resnet31": 5, "sar_vgg16_bn": 5, "satur": 6, "save": [1, 5], "saved_model": 5, "scale": 7, "scale_rang": [], "scan": [1, 3], "scene": [3, 5], "scheme": 5, "score": 7, "scratch": 3, "script": [], "seamless": 3, "seamlessli": [], "search": [], "searchabl": [], "sec": [], "second": 5, "section": [], "secur": [], "see": [], "seemlessli": 3, "seen": 5, "segment": 5, "self": [], "semant": 5, "send": [], "sens": 
7, "sensit": [], "separ": 5, "sequenc": [1, 2, 5, 7], "sequenti": [5, 6], "seri": [], "serial": 5, "serialized_model": 5, "seriou": [], "set": [1, 5, 7], "set_global_polici": [], "sever": [2, 6], "sex": [], "sexual": [], "sha256": [], "shade": [], "shape": [2, 5, 6, 7], "share": [], "shift": 6, "shm": [], "should": [1, 2, 7], "show": [2, 3, 5, 7], "showcas": [], "shuffl": 1, "side": 7, "signatur": 2, "signific": 1, "simpl": 5, "simpler": [], "sinc": 1, "singl": [], "single_img_doc": [], "size": [1, 2, 5, 6], "skew": [], "slack": [], "slightli": [], "small": 3, "smallest": 2, "snapshot_download": [], "snippet": [], "so": [1, 4], "social": [], "socio": [], "some": [], "someth": [], "somewher": [], "sort": [], "sourc": [1, 2, 5, 6, 7], "space": [], "span": [], "spanish": [], "spatial": 2, "special": 3, "specif": [1, 5, 7], "specifi": 2, "speed": [3, 5], "sphinx": [], "sroie": [1, 3], "stabl": 4, "stackoverflow": [], "stage": 3, "standard": 6, "start": 1, "state": 3, "static": 7, "statist": 5, "statu": [], "std": 6, "step": [], "still": [], "str": [1, 2, 5, 6, 7], "straight": 1, "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 2, "street": [], "strict": [], "strictli": 7, "string": [1, 2, 5, 7], "strive": [], "strong": 5, "structur": [3, 5], "subset": [1, 5], "suggest": [], "sum": 7, "summari": 7, "support": 5, "sustain": [], "svhn": [], "svt": [], "swedish": [], "symbol": [], "symmetr": 6, "symmetric_pad": 6, "synthet": [], "synthtext": [], "system": [], "t": 1, "tabl": [], "take": [], "target": [1, 2, 5, 6], "target_s": 1, "task": [1, 3, 5], "task2": [], "team": [], "techminde": [], "templat": 2, "tensor": [1, 5, 6], "tensorflow": [3, 4, 5, 6], "tensorspec": [], "term": [], "test": [], "test_set": [], "text": [2, 7], "text_output": [], "textmatch": 7, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [3, 5], "textstylebrush": [], "textual": [1, 2, 3], "tf": [5, 6], "tf_model": 5, "tflite": 5, "than": [4, 7], "thank": [], "thei": [], "them": [1, 4], "thi": [4, 5, 7], "thing": [], "third": [], "those": [2, 4, 5], "threaten": [], "threshold": [], "through": [1, 6], "tilman": [], "time": [1, 5, 7], "tini": [], "titl": 2, "tm": [], "tmp": [], "togeth": [2, 5], "tograi": 6, "tool": [], "top": 7, "topic": [], "torch": [], "torchvis": 6, "total": [], "toward": [], "train": [1, 5, 6], "train_it": 1, "train_load": 1, "train_pytorch": [], "train_set": 1, "train_tensorflow": [], "trainabl": 5, "tranform": 6, "transcrib": [], "transfer": [], "transfo": 6, "transform": [1, 3], "translat": [], "troll": [], "true": [1, 2, 5, 6, 7], "truth": 7, "tune": 3, "tupl": [2, 5, 6, 7], "turn": [], "two": 2, "txt": [], "type": [2, 5], "typic": [], "u": [], "ucsd": [], "udac": [], "uint8": [2, 5, 7], "ukrainian": [], "unaccept": [], "underli": 1, "underneath": 2, "understand": [1, 3], "unidecod": 7, "uniform": [5, 6], "uniformli": [], "uninterrupt": 2, "union": 7, "unittest": [], "unlock": [], "unoffici": [], "unprofession": [], "unsolicit": [], "unsupervis": [], "unwelcom": [], "up": 5, "updat": 7, "upgrad": [], "upper": 6, "uppercas": [], "url": [1, 2], "us": [1, 4, 7], "usabl": 5, "usag": 5, "use_polygon": [], "useabl": [], "user": [2, 3, 4], "utf": [], "util": [3, 5], "v0": 3, "v1": [], "v3": [], "valid": [], "valu": [2, 6], "valuabl": 3, "variabl": [], "varieti": [], "veri": [], "verifi": 1, "version": 5, "vgg": 5, "vgg16": 5, "vgg16_bn_r": [], "via": 3, "vietnames": [], "view": [], "viewpoint": [], "violat": [], "visibl": [], "vision": [], "visiondataset": 
1, "visiontransform": [], "visual": 3, "visualize_pag": 7, "vit_": [], "vit_b": [], "vitstr": [], "vitstr_bas": [], "vitstr_smal": [], "viz": [], "vocab": [3, 5], "vocabulari": [], "w": [2, 7], "w3": [], "wa": [], "wai": [1, 3, 5], "want": [], "warm": 5, "warmup": [], "wasn": [], "we": [2, 3, 5, 6], "weasyprint": [], "web": 2, "websit": [], "welcom": 3, "well": [], "were": 2, "what": [], "when": [], "whenev": [], "where": [2, 7], "whether": [1, 2, 7], "which": 5, "whichev": 4, "while": 6, "why": [], "width": 2, "wiki": [], "wildreceipt": [], "window": [4, 7], "wish": [], "within": [], "without": 5, "wonder": [], "word": [3, 5, 7], "word_1_1": [], "word_1_2": [], "word_1_3": [], "wordgener": [], "words_onli": 7, "work": [], "worker": 1, "workflow": [], "worklow": [], "world": 7, "worth": [], "wrap": [], "wrapper": [1, 6], "write": [], "written": 2, "www": 2, "x": [2, 6, 7], "x12larg": 5, "x_ascend": [], "x_descend": [], "x_i": 7, "x_size": [], "x_wconf": [], "xeon": 5, "xhtml": [], "xmax": 2, "xmin": 2, "xml": [], "xml_bytes_str": [], "xml_element": [], "xml_output": [], "xmln": [], "y": 7, "y_i": 7, "y_j": 7, "yet": [], "ymax": 2, "ymin": 2, "yolov8": [], "you": [4, 5], "your": [1, 2, 5, 7], "yoursit": 2, "zero": [5, 6], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 1, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": [], "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": [], "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": [], "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": [], "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": [], "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": [], 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": [], "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "\u067e\u0686\u06a2\u06a4\u06af": [], "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "doctr.datasets", "doctr.documents", "DocTR: Document Text Recognition", "Installation", "doctr.models", "doctr.transforms", "doctr.utils"], "titleterms": {"": [], "0": 0, "01": [], "02": [], "03": 0, "04": [], "05": 0, "07": [], "08": [], "09": [], "1": 0, "10": [], "11": 0, "12": [], "18": 0, "2": 0, "2021": 0, "2022": [], "2023": [], "2024": [], "22": [], "27": [], "28": 0, "29": [], "3": [], "31": [], "4": [], "5": [], "6": [], "7": [], "8": [], "9": [], "advanc": [], "approach": 5, "architectur": [], "arg": [], "artefact": 2, "artefactdetect": [], "attribut": [], "avail": 1, "aw": [], "ban": [], "block": 2, "bug": [], "build": 3, "changelog": 0, "choos": [], "classif": [], "code": [], "codebas": [], "commit": [], "commun": [], "compos": 6, "compress": 5, "conda": [], "conduct": [], "connect": [], "content": [], "continu": [], "contrib": [], "contribut": [], "contributor": [], "convent": [], "correct": [], "coven": [], "custom": [], "data": 1, "dataload": [], "dataset": [1, 3], "detect": [3, 5], "develop": [], "do": [], "doctr": [1, 2, 3, 5, 6, 7], "document": [2, 3], "end": 5, "enforc": [], "evalu": 7, "export": 5, "factori": [], "featur": 3, "feedback": [], "file": 2, "from": [], "gener": [], "get": 3, "git": 4, "guidelin": [], "half": 
[], "hub": [], "huggingfac": [], "i": [], "implement": [], "infer": [], "instal": 4, "integr": [], "io": [], "lambda": [], "let": [], "line": 2, "linux": [], "load": 1, "loader": [], "main": 3, "mode": [], "model": [3, 5], "modifi": [], "modul": [], "name": [], "note": 3, "notebook": [], "object": [], "ocr": 5, "onli": [], "onnx": [], "optim": [], "option": [], "orient": [], "our": [], "output": [], "own": [], "packag": [3, 4], "page": 2, "perman": [], "pipelin": [], "pledg": [], "post": [], "pre": 5, "precis": [], "predictor": [3, 5], "prepar": [], "prerequisit": 4, "pretrain": [], "process": 5, "push": [], "python": 4, "qualiti": [], "question": [], "read": 2, "readi": [], "recognit": [3, 5], "refer": 3, "report": [], "request": [], "respons": [], "return": [], "right": [], "savedmodel": 5, "scope": [], "share": [], "should": [], "stage": 5, "standard": [], "start": 3, "structur": 2, "style": [], "support": [1, 3, 6], "synthet": [], "task": 7, "temporari": [], "test": [], "text": [3, 5], "train": 3, "transform": 6, "two": 5, "unit": [], "us": 5, "util": 7, "v0": 0, "verif": [], "via": 4, "visual": 7, "vocab": 1, "warn": [], "what": [], "word": 2, "your": 3, "zoo": [3, 5]}}) \ No newline at end of file diff --git a/v0.3.1/transforms.html b/v0.3.1/transforms.html index 2c48ed615d..d42da50481 100644 --- a/v0.3.1/transforms.html +++ b/v0.3.1/transforms.html @@ -237,7 +237,7 @@

                          Package Reference

                    • Composing transformations

                      @@ -670,7 +655,6 @@

Composing transformations
• RandomHue
                    • RandomGamma
                    • RandomJpegQuality
- • RandomRotate
                    • Composing transformations
                        @@ -690,7 +674,7 @@

Composing transformations +
diff --git a/v0.3.1/using_doctr/using_model_export.html index d467663403..75c81caa7c 100644 --- a/v0.3.1/using_doctr/using_model_export.html +++ b/v0.3.1/using_doctr/using_model_export.html @@ -316,7 +316,7 @@

                        Half-precision
diff --git a/v0.4.0/_modules/doctr/datasets/classification/tensorflow.html b/v0.4.0/_modules/doctr/datasets/classification/tensorflow.html deleted file mode 100644 index 40da1ffe4c..0000000000 --- a/v0.4.0/_modules/doctr/datasets/classification/tensorflow.html +++ /dev/null @@ -1,359 +0,0 @@
-doctr.datasets.classification.tensorflow - docTR documentation

                        Source code for doctr.datasets.classification.tensorflow

                        -# Copyright (C) 2021, Mindee.
                        -
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        -
                        -import tensorflow as tf
                        -from .base import _CharacterGenerator
                        -
                        -
                        -__all__ = ['CharacterGenerator']
                        -
                        -
                        -
-[docs]
-class CharacterGenerator(_CharacterGenerator):
-    """Implements a character image generation dataset
-
-    Example::
-        >>> from doctr.datasets import CharacterGenerator
-        >>> ds = CharacterGenerator(vocab='abdef')
-        >>> img, target = ds[0]
-
-    Args:
-        vocab: vocabulary to take the character from
-        num_samples: number of samples that will be generated iterating over the dataset
-        cache_samples: whether generated images should be cached firsthand
-        sample_transforms: composable transformations that will be applied to each image
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-
-    @staticmethod
-    def collate_fn(samples):
-
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-
-        return images, tf.convert_to_tensor(targets)
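For context, the collate_fn removed with this file only stacks the per-sample character crops along a new batch axis and tensorises the integer labels. A minimal usage sketch of that behaviour, assuming the constructor arguments documented in the docstring above and leaving the crop size to the dataset defaults:

from doctr.datasets import CharacterGenerator

ds = CharacterGenerator(vocab='abdef', num_samples=8)      # arguments as documented above
samples = [ds[i] for i in range(4)]                        # each item is an (image, label) pair
images, targets = CharacterGenerator.collate_fn(samples)
# images: tf.Tensor stacked along axis 0, shape (4, H, W, C); targets: tf.Tensor of the 4 labels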
\ No newline at end of file
diff --git a/v0.4.0/_modules/doctr/datasets/cord.html index 5679c787e7..3b89955bd8 100644 --- a/v0.4.0/_modules/doctr/datasets/cord.html +++ b/v0.4.0/_modules/doctr/datasets/cord.html @@ -236,7 +236,7 @@

                        Package Reference

                        • doctr.datasets
- • doctr.io
+ • doctr.documents
• doctr.models
• doctr.transforms
• doctr.utils
@@ -327,18 +327,17 @@

                          Source code for doctr.datasets.cord

                                   super().__init__(url, None, sha256, True, **kwargs)
                           
                                   # # List images
                          -        tmp_root = os.path.join(self.root, 'image')
                          +        self.root = os.path.join(self._root, 'image')
                                   self.data: List[Tuple[str, Dict[str, Any]]] = []
                          -        np_dtype = np.float16 if self.fp16 else np.float32
                                   self.train = train
                                   self.sample_transforms = sample_transforms
                          -        for img_path in os.listdir(tmp_root):
                          +        for img_path in os.listdir(self.root):
                                       # File existence check
                          -            if not os.path.exists(os.path.join(tmp_root, img_path)):
                          -                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
                          +            if not os.path.exists(os.path.join(self.root, img_path)):
                          +                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
                                       stem = Path(img_path).stem
                                       _targets = []
                          -            with open(os.path.join(self.root, 'json', f"{stem}.json"), 'rb') as f:
                          +            with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f:
                                           label = json.load(f)
                                           for line in label["valid_line"]:
                                               for word in line["words"]:
                          @@ -351,7 +350,7 @@ 

                          Source code for doctr.datasets.cord

                                                               [x[1], y[1]],
                                                               [x[2], y[2]],
                                                               [x[3], y[3]],
                          -                                ], dtype=np_dtype)))
                          +                                ], dtype=np.float32)))
                                                       else:
                                                           # Reduce 8 coords to 4
                                                           box = [min(x), min(y), max(x), max(y)]
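For clarity, the straight-box fallback above simply takes the axis-aligned envelope of the four polygon corners. A tiny worked example with invented coordinates:

x = [10, 48, 47, 11]   # invented x coordinates of the 4 polygon corners
y = [20, 22, 60, 58]   # invented y coordinates
box = [min(x), min(y), max(x), max(y)]
# box == [10, 20, 48, 60], i.e. [xmin, ymin, xmax, ymax]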
                          @@ -363,7 +362,6 @@ 

                          Source code for doctr.datasets.cord

                                           img_path,
                                           dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets)
                                       ))
                          -        self.root = tmp_root
                           
                               def extra_repr(self) -> str:
                                   return f"train={self.train}"
                          @@ -400,7 +398,7 @@

                          Source code for doctr.datasets.cord

                                 
                               
                             
                          -
                          +
                          diff --git a/v0.4.0/_modules/doctr/datasets/datasets/tensorflow.html b/v0.4.0/_modules/doctr/datasets/datasets/tensorflow.html index 8a191ecfc7..fddca20034 100644 --- a/v0.4.0/_modules/doctr/datasets/datasets/tensorflow.html +++ b/v0.4.0/_modules/doctr/datasets/datasets/tensorflow.html @@ -236,7 +236,7 @@

                          Package Reference

                          • doctr.datasets
- • doctr.io
+ • doctr.documents
• doctr.models
• doctr.transforms
• doctr.utils
@@ -284,7 +284,6 @@

                            Source code for doctr.datasets.datasets.tensorflow

 from typing import List, Any, Tuple

 import tensorflow as tf

-from doctr.io import read_img_as_tensor

 from .base import _AbstractDataset, _VisionDataset
@@ -293,14 +292,11 @@

                            Source code for doctr.datasets.datasets.tensorflow

 class AbstractDataset(_AbstractDataset):

-    @staticmethod
-    def _get_img_shape(img: Any) -> Tuple[int, int]:
-        return img.shape[:2]
-
     def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
         img_name, target = self.data[index]
         # Read image
-        img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float16 if self.fp16 else tf.float32)
+        img = tf.io.read_file(os.path.join(self.root, img_name))
+        img = tf.image.decode_jpeg(img, channels=3)

         return img, target
@@ -350,7 +346,7 @@
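On the '+' side the sample is decoded straight to a uint8 tensor, so any cast to float happens later in the pipeline, while the '-' side delegated both steps to read_img_as_tensor. A standalone sketch of the '+' read path, with invented paths and an optional cast added only for illustration:

import os

import tensorflow as tf

root, img_name = "/data/images", "sample.jpg"            # invented paths
raw = tf.io.read_file(os.path.join(root, img_name))
img = tf.image.decode_jpeg(raw, channels=3)              # uint8 tensor of shape (H, W, 3)
img = tf.image.convert_image_dtype(img, tf.float32)      # optional cast a downstream transform might apply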

                            Source code for doctr.datasets.datasets.tensorflow

                            +
                            diff --git a/v0.4.0/_modules/doctr/datasets/funsd.html b/v0.4.0/_modules/doctr/datasets/funsd.html index 6ff6059aef..2f5494dc2a 100644 --- a/v0.4.0/_modules/doctr/datasets/funsd.html +++ b/v0.4.0/_modules/doctr/datasets/funsd.html @@ -236,7 +236,7 @@

                            Package Reference

                            • doctr.datasets
- • doctr.io
+ • doctr.documents
• doctr.models
• doctr.transforms
• doctr.utils
@@ -329,14 +329,14 @@

                              Source code for doctr.datasets.funsd

                                       subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data')
                               
                                       # # List images
                              -        tmp_root = os.path.join(self.root, subfolder, 'images')
                              +        self.root = os.path.join(self._root, subfolder, 'images')
                                       self.data: List[Tuple[str, Dict[str, Any]]] = []
                              -        for img_path in os.listdir(tmp_root):
                              +        for img_path in os.listdir(self.root):
                                           # File existence check
                              -            if not os.path.exists(os.path.join(tmp_root, img_path)):
                              -                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
                              +            if not os.path.exists(os.path.join(self.root, img_path)):
                              +                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
                                           stem = Path(img_path).stem
                              -            with open(os.path.join(self.root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f:
                              +            with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f:
                                               data = json.load(f)
                               
                                           _targets = [(word['text'], word['box']) for block in data['form']
                              @@ -352,8 +352,6 @@ 

                              Source code for doctr.datasets.funsd

                               
                                           self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=int), labels=text_targets)))
                               
                              -        self.root = tmp_root
                              -
                                   def extra_repr(self) -> str:
                                       return f"train={self.train}"
                              @@ -389,7 +387,7 @@

                              Source code for doctr.datasets.funsd

                                     
                                   
                                 
                              -
                              +
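Note (illustrative, not part of the diff): both sides of the FUNSD hunk assume an archive layout with `images/` and `annotations/` subfolders, pairing each image with a JSON file of the same stem. A minimal sketch of that pairing, with a hypothetical helper name:

    import json
    import os
    from pathlib import Path

    def iter_funsd_pairs(root: str, train: bool = True):
        # <root>/dataset/{training_data,testing_data}/{images,annotations}
        subfolder = os.path.join(root, 'dataset', 'training_data' if train else 'testing_data')
        img_dir = os.path.join(subfolder, 'images')
        for img_path in os.listdir(img_dir):
            stem = Path(img_path).stem
            with open(os.path.join(subfolder, 'annotations', f"{stem}.json"), 'rb') as f:
                yield img_path, json.load(f)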
                            diff --git a/v0.4.0/_modules/doctr/datasets/loader.html b/v0.4.0/_modules/doctr/datasets/loader.html index 0547a9778b..ba5bc217e0 100644 --- a/v0.4.0/_modules/doctr/datasets/loader.html +++ b/v0.4.0/_modules/doctr/datasets/loader.html @@ -236,7 +236,7 @@

                            Package Reference

                            • doctr.datasets
                            • -
                            • doctr.io
                            • +
                            • doctr.documents
                            • doctr.models
                            • doctr.transforms
                            • doctr.utils
                            • @@ -283,9 +283,9 @@

                              Source code for doctr.datasets.loader

                               import math
                               import tensorflow as tf
                               import numpy as np
                              -from typing import Optional, Callable
                              +from typing import Optional
                               
                              -from doctr.utils.multithreading import multithread_exec
                              +from .multithreading import multithread_exec
                               
                               __all__ = ["DataLoader"]
                               
                              @@ -334,23 +334,16 @@ 

                              Source code for doctr.datasets.loader

                                       batch_size: int = 1,
                                       drop_last: bool = False,
                                       workers: Optional[int] = None,
                              -        collate_fn: Optional[Callable] = None,
                                   ) -> None:
                                       self.dataset = dataset
                                       self.shuffle = shuffle
                                       self.batch_size = batch_size
                                       nb = len(self.dataset) / batch_size
                                       self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
                              -        if collate_fn is None:
                              -            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
                              -        else:
                              -            self.collate_fn = collate_fn
                              +        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
                                       self.workers = workers
                                       self.reset()
                               
                              -    def __len__(self) -> int:
                              -        return self.num_batches
                              -
                                   def reset(self) -> None:
                                       # Updates indices after each epoch
                                       self._num_yielded = 0
                              @@ -409,7 +402,7 @@ 

                              Source code for doctr.datasets.loader

                                     
                                   
                                 
                              -
                              +
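Note (illustrative, not part of the diff): both versions of the DataLoader compute the number of batches the same way, rounding down when `drop_last` is set and up otherwise. A quick standalone check of that rule:

    import math

    def num_batches(num_samples: int, batch_size: int, drop_last: bool) -> int:
        nb = num_samples / batch_size
        return math.floor(nb) if drop_last else math.ceil(nb)

    assert num_batches(10, 3, drop_last=True) == 3   # last incomplete batch dropped
    assert num_batches(10, 3, drop_last=False) == 4  # last incomplete batch kept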
                              diff --git a/v0.4.0/_modules/doctr/datasets/ocr.html b/v0.4.0/_modules/doctr/datasets/ocr.html index cdd44c1cef..2c4fb1b838 100644 --- a/v0.4.0/_modules/doctr/datasets/ocr.html +++ b/v0.4.0/_modules/doctr/datasets/ocr.html @@ -226,27 +226,21 @@ @@ -293,6 +287,7 @@

                              Source code for doctr.datasets.ocr

                               from typing import List, Dict, Any, Tuple, Optional, Callable
                               
                               from .datasets import AbstractDataset
                              +from doctr.utils.geometry import fit_rbbox
                               
                               
                               __all__ = ['OCRDataset']
                              @@ -307,6 +302,7 @@ 

                              Source code for doctr.datasets.ocr

                                       img_folder: local path to image folder (all jpg at the root)
                                       label_file: local path to the label file
                                       sample_transforms: composable transformations that will be applied to each image
                              +        rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones)
                                       **kwargs: keyword arguments from `VisionDataset`.
                                   """
                               
                              @@ -315,33 +311,46 @@ 

                              Source code for doctr.datasets.ocr

                                       img_folder: str,
                                       label_file: str,
                                       sample_transforms: Optional[Callable[[Any], Any]] = None,
                              +        rotated_bbox: bool = False,
                                       **kwargs: Any,
                                   ) -> None:
                              -        super().__init__(img_folder, **kwargs)
                              +
                                       self.sample_transforms = sample_transforms
                              +        self.root = img_folder
                               
                                       # List images
                                       self.data: List[Tuple[str, Dict[str, Any]]] = []
                              -        np_dtype = np.float16 if self.fp16 else np.float32
                                       with open(label_file, 'rb') as f:
                                           data = json.load(f)
                               
                              -        for img_name, annotations in data.items():
                              +        for file_dic in data:
                                           # Get image path
                              -            img_name = Path(img_name)
                              +            img_name = Path(os.path.basename(file_dic["raw-archive-filepath"])).stem + '.jpg'
                                           # File existence check
                                           if not os.path.exists(os.path.join(self.root, img_name)):
                                               raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
                               
                                           # handle empty images
                              -            if len(annotations["typed_words"]) == 0:
                              -                self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[])))
                              +            if (len(file_dic["coordinates"]) == 0 or
                              +               (len(file_dic["coordinates"]) == 1 and file_dic["coordinates"][0] == "N/A")):
                              +                self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[])))
                                               continue
                              -            # Unpack
                              -            box_targets = [tuple(map(float, obj['geometry'])) for obj in annotations['typed_words']]
                              -            text_targets = [obj['value'] for obj in annotations['typed_words']]
                              +            is_valid: List[bool] = []
                              +            box_targets: List[List[float]] = []
                              +            for box in file_dic["coordinates"]:
                              +                if rotated_bbox:
                              +                    x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np.float32))
                              +                    box = [x, y, w, h, alpha]
                              +                    is_valid.append(w > 0 and h > 0)
                              +                else:
                              +                    xs, ys = zip(*box)
                              +                    box = [min(xs), min(ys), max(xs), max(ys)]
                              +                    is_valid.append(box[0] < box[2] and box[1] < box[3])
                              +                if is_valid[-1]:
                              +                    box_targets.append(box)
                               
                              -            self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=text_targets)))
+            text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid]
+            self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
                              @@ -375,7 +384,7 @@

                              Source code for doctr.datasets.ocr

                                     
                                   
                                 
                              -
                            +
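Note (illustrative, not part of the diff): the straight-bbox branch added above collapses each polygon into `[xmin, ymin, xmax, ymax]` and keeps only non-degenerate boxes; the rotated branch relies on `doctr.utils.geometry.fit_rbbox` and is not reproduced here. A standalone sketch with a hypothetical helper name:

    from typing import List, Sequence, Tuple

    def to_straight_box(points: Sequence[Tuple[float, float]]) -> List[float]:
        # Collapse a polygon given as (x, y) points into an axis-aligned box
        xs, ys = zip(*points)
        return [min(xs), min(ys), max(xs), max(ys)]

    box = to_straight_box([(10, 20), (50, 22), (48, 60), (12, 58)])
    assert box == [10, 20, 50, 60]
    assert box[0] < box[2] and box[1] < box[3]  # validity check used in the loop above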
                        diff --git a/v0.4.0/_modules/doctr/datasets/sroie.html b/v0.4.0/_modules/doctr/datasets/sroie.html index 5a1c9eaeb7..0425870abb 100644 --- a/v0.4.0/_modules/doctr/datasets/sroie.html +++ b/v0.4.0/_modules/doctr/datasets/sroie.html @@ -236,7 +236,7 @@

                        Package Reference

                        • doctr.datasets
                        • -
                        • doctr.io
                        • +
                        • doctr.documents
                        • doctr.models
                        • doctr.transforms
                        • doctr.utils
                        • @@ -331,16 +331,15 @@

                          Source code for doctr.datasets.sroie

                                       raise NotImplementedError
                           
                                   # # List images
                          -        tmp_root = os.path.join(self.root, 'images')
                          +        self.root = os.path.join(self._root, 'images')
                                   self.data: List[Tuple[str, Dict[str, Any]]] = []
                          -        np_dtype = np.float16 if self.fp16 else np.float32
                          -        for img_path in os.listdir(tmp_root):
                          +        for img_path in os.listdir(self.root):
                                       # File existence check
                          -            if not os.path.exists(os.path.join(tmp_root, img_path)):
                          -                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
                          +            if not os.path.exists(os.path.join(self.root, img_path)):
                          +                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
                                       stem = Path(img_path).stem
                                       _targets = []
                          -            with open(os.path.join(self.root, 'annotations', f"{stem}.txt"), encoding='latin') as f:
                          +            with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f:
                                           for row in csv.reader(f, delimiter=','):
                                               # Safeguard for blank lines
                                               if len(row) > 0:
                          @@ -355,8 +354,7 @@ 

                          Source code for doctr.datasets.sroie

                           
                                       text_targets, box_targets = zip(*_targets)
                           
                          -            self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=text_targets)))
                          -        self.root = tmp_root
                          +            self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
                           
                               def extra_repr(self) -> str:
                                   return f"train={self.train}"
                          @@ -393,7 +391,7 @@

                          Source code for doctr.datasets.sroie

                                 
                               
                             
                          -
                          + diff --git a/v0.4.0/_modules/doctr/datasets/utils.html b/v0.4.0/_modules/doctr/datasets/utils.html index eea8ba01f4..499d3fff84 100644 --- a/v0.4.0/_modules/doctr/datasets/utils.html +++ b/v0.4.0/_modules/doctr/datasets/utils.html @@ -236,7 +236,7 @@

                          Package Reference

                          • doctr.datasets
                          • -
                          • doctr.io
                          • +
                          • doctr.documents
                          • doctr.models
                          • doctr.transforms
                          • doctr.utils
                          • @@ -283,12 +283,11 @@

                            Source code for doctr.datasets.utils

                             import string
                             import unicodedata
                             import numpy as np
                            -from functools import partial
                             from typing import List, Optional, Any
                             
                             from .vocabs import VOCABS
                             
                            -__all__ = ['translate', 'encode_string', 'decode_sequence', 'encode_sequences']
                            +__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
                             
                             
                             def translate(
                            @@ -325,7 +324,7 @@ 

                            Source code for doctr.datasets.utils

                                 return translated
                             
                             
                            -def encode_string(
                            +def encode_sequence(
                                 input_string: str,
                                 vocab: str,
                             ) -> List[int]:
                            @@ -352,13 +351,12 @@ 

                            Source code for doctr.datasets.utils

                                     mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
                             
                                 Returns:
                            -        A string, decoded from input_array
                            -    """
                            +        A string, decoded from input_array"""
                             
                                 if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
                                     raise AssertionError("Input must be an array of int, with max less than mapping size")
                            -
                            -    return ''.join(map(mapping.__getitem__, input_array))
                            +    decoded = ''.join(mapping[idx] for idx in input_array)
                            +    return decoded
                             
                             
                             
                            @@ -370,7 +368,6 @@

                            Source code for doctr.datasets.utils

                                 eos: int = -1,
                                 sos: Optional[int] = None,
                                 pad: Optional[int] = None,
                            -    dynamic_seq_length: bool = False,
                                 **kwargs: Any,
                             ) -> np.ndarray:
                                 """Encode character sequences using a given vocab as mapping
                            @@ -382,7 +379,6 @@ 

                            Source code for doctr.datasets.utils

                                     eos: encoding of End Of String
                                     sos: optional encoding of Start Of String
                                     pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
                            -        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size
                             
                                 Returns:
                                     the padded encoded data as a tensor
                            @@ -391,32 +387,29 @@ 

                            Source code for doctr.datasets.utils

                                 if 0 <= eos < len(vocab):
                                     raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
                             
                            -    if not isinstance(target_size, int) or dynamic_seq_length:
                            -        # Maximum string length + EOS
                            -        max_length = max(len(w) for w in sequences) + 1
                            -        if isinstance(sos, int):
                            -            max_length += 1
                            -        if isinstance(pad, int):
                            -            max_length += 1
                            -        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)
                            +    if not isinstance(target_size, int):
                            +        target_size = max(len(w) for w in sequences)
                            +        if sos:
                            +            target_size += 1
                            +        if pad:
                            +            target_size += 1
                             
                                 # Pad all sequences
                            -    if isinstance(pad, int):  # pad with padding symbol
                            +    if pad:  # pad with padding symbol
                                     if 0 <= pad < len(vocab):
                                         raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
                                     # In that case, add EOS at the end of the word before padding
                            -        default_symbol = pad
                            +        encoded_data = np.full([len(sequences), target_size], pad, dtype=np.int32)
                                 else:  # pad with eos symbol
                            -        default_symbol = eos
                            -    encoded_data = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)
                            +        encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32)
                             
                            -    # Encode the strings
                            -    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
                            -        if isinstance(pad, int):  # add eos at the end of the sequence
                            -            seq.append(eos)
                            -        encoded_data[idx, :min(len(seq), target_size)] = seq[:min(len(seq), target_size)]
                            +    for idx, seq in enumerate(sequences):
                            +        encoded_seq = encode_sequence(seq, vocab)
                            +        if pad:  # add eos at the end of the sequence
                            +            encoded_seq.append(eos)
                            +        encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)]
                             
                            -    if isinstance(sos, int):  # place sos symbol at the beginning of each sequence
                            +    if sos:  # place eos symbol at the beginning of each sequence
                                     if 0 <= sos < len(vocab):
                                         raise ValueError("argument 'sos' needs to be outside of vocab possible indices")
                                     encoded_data = np.roll(encoded_data, 1)
                            @@ -456,7 +449,7 @@ 

                            Source code for doctr.datasets.utils

                                   
                                 
                               
                            -
                            +
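Note (illustrative, not part of the diff): a rough standalone sketch of the padding scheme in the `encode_sequences` hunk above, where `eos` and `pad` must lie outside the vocab indices and, when padding, an EOS is appended before the pad symbol fills the rest. The vocab, helper name, and values are made up for illustration:

    import numpy as np

    vocab = "abc"
    eos, pad = len(vocab), len(vocab) + 1  # indices outside the vocab, as required above

    def encode(word: str) -> list:
        return [vocab.index(c) for c in word]

    sequences = ["ab", "c"]
    target_size = max(len(w) for w in sequences) + 1  # room for the trailing EOS
    batch = np.full((len(sequences), target_size), pad, dtype=np.int32)
    for i, word in enumerate(sequences):
        seq = encode(word) + [eos]
        batch[i, :len(seq)] = seq
    # batch -> [[0, 1, 3], [2, 3, 4]]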
                            diff --git a/v0.4.0/_modules/doctr/io/elements.html b/v0.4.0/_modules/doctr/io/elements.html index 0a7f9b8ff2..73dbca5877 100644 --- a/v0.4.0/_modules/doctr/io/elements.html +++ b/v0.4.0/_modules/doctr/io/elements.html @@ -228,20 +228,32 @@ @@ -642,7 +996,7 @@

                            Source code for doctr.io.elements

                                   
                                 
                               
                            -
                            +
                            diff --git a/v0.4.0/_modules/doctr/io/html.html b/v0.4.0/_modules/doctr/io/html.html index 1dca6c97e4..d5495fcd8a 100644 --- a/v0.4.0/_modules/doctr/io/html.html +++ b/v0.4.0/_modules/doctr/io/html.html @@ -226,20 +226,38 @@

                            Source code for doctr.io.html

                            -# Copyright (C) 2021, Mindee.
                            +# Copyright (C) 2021-2024, Mindee.
                             
                            -# This program is licensed under the Apache License version 2.
                            -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                            +# This program is licensed under the Apache License 2.0.
                            +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                             
                            -from weasyprint import HTML
                             from typing import Any
                             
                            -__all__ = ['read_html']
                            +__all__ = ["read_html"]
                             
                             
                             
-[docs]
+[docs]
def read_html(url: str, **kwargs: Any) -> bytes:
    """Read a PDF file and convert it into an image in numpy format

-    Example::
-        >>> from doctr.documents import read_html
-        >>> doc = read_html("https://www.yoursite.com")
+    >>> from doctr.io import read_html
+    >>> doc = read_html("https://www.yoursite.com")

    Args:
+    ----
        url: URL of the target web page
+        **kwargs: keyword arguments from `weasyprint.HTML`
+
    Returns:
+    -------
        decoded PDF file as a bytes stream
    """
+    from weasyprint import HTML

    return HTML(url, **kwargs).write_pdf()
                            @@ -335,7 +356,7 @@

                            Source code for doctr.io.html

                                   
                                 
                               
                            -
                            +
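Note (illustrative, not part of the diff): per the updated docstring, `read_html` renders a web page into a PDF byte stream via weasyprint (imported lazily on the right-hand side). Minimal usage, assuming weasyprint is installed and the URL is reachable; the older left-hand side exposed it under `doctr.documents`:

    from doctr.io import read_html

    pdf_bytes = read_html("https://www.yoursite.com")
    print(type(pdf_bytes), len(pdf_bytes))  # bytes holding the rendered PDF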
                            diff --git a/v0.4.0/_modules/doctr/io/image/base.html b/v0.4.0/_modules/doctr/io/image/base.html index defcac7f86..1ba249a68a 100644 --- a/v0.4.0/_modules/doctr/io/image/base.html +++ b/v0.4.0/_modules/doctr/io/image/base.html @@ -226,20 +226,38 @@

                            Source code for doctr.io.image.base

                            -# Copyright (C) 2021, Mindee.
                            +# Copyright (C) 2021-2024, Mindee.
                             
                            -# This program is licensed under the Apache License version 2.
                            -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                            +# This program is licensed under the Apache License 2.0.
                            +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                             
                             from pathlib import Path
                             from typing import Optional, Tuple
                            -import numpy as np
                            +
                             import cv2
                            +import numpy as np
                            +
                             from doctr.utils.common_types import AbstractFile
                             
                            -__all__ = ['read_img_as_numpy']
                            +__all__ = ["read_img_as_numpy"]
                             
                             
                             
-[docs]
+[docs]
def read_img_as_numpy(
    file: AbstractFile,
    output_size: Optional[Tuple[int, int]] = None,
@@ -298,25 +318,26 @@

                            Source code for doctr.io.image.base

                             ) -> np.ndarray:
                                 """Read an image file into numpy format
                             
                            -    Example::
                            -        >>> from doctr.documents import read_img
                            -        >>> page = read_img("path/to/your/doc.jpg")
                            +    >>> from doctr.io import read_img_as_numpy
                            +    >>> page = read_img_as_numpy("path/to/your/doc.jpg")
                             
                                 Args:
                            +    ----
                                     file: the path to the image file
                                     output_size: the expected output size of each page in format H x W
                                     rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
                            +
                                 Returns:
                            +    -------
                                     the page decoded as numpy ndarray of shape H x W x 3
                                 """
                            -
                                 if isinstance(file, (str, Path)):
                                     if not Path(file).is_file():
                                         raise FileNotFoundError(f"unable to access {file}")
                                     img = cv2.imread(str(file), cv2.IMREAD_COLOR)
                                 elif isinstance(file, bytes):
                            -        file = np.frombuffer(file, np.uint8)
                            -        img = cv2.imdecode(file, cv2.IMREAD_COLOR)
                            +        _file: np.ndarray = np.frombuffer(file, np.uint8)
                            +        img = cv2.imdecode(_file, cv2.IMREAD_COLOR)
                                 else:
                                     raise TypeError("unsupported object type for argument 'file'")
                             
                            @@ -363,7 +384,7 @@ 

                            Source code for doctr.io.image.base

                                   
                                 
                               
                            -
                            +
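Note (illustrative, not part of the diff): usage following the updated docstring; `output_size` is given as H x W and the channel order is BGR unless `rgb_output=True`. The older docs referred to this helper as `doctr.documents.read_img`:

    from doctr.io import read_img_as_numpy

    page = read_img_as_numpy("path/to/your/doc.jpg", output_size=(1024, 726), rgb_output=True)
    print(page.shape, page.dtype)  # H x W x 3 uint8 array, per the docstring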
                            diff --git a/v0.4.0/_modules/doctr/io/image/tensorflow.html b/v0.4.0/_modules/doctr/io/image/tensorflow.html index 64db7b45b4..f9faeeab1c 100644 --- a/v0.4.0/_modules/doctr/io/image/tensorflow.html +++ b/v0.4.0/_modules/doctr/io/image/tensorflow.html @@ -226,20 +226,38 @@

                            Source code for doctr.io.image.tensorflow

                            -# Copyright (C) 2021, Mindee.
                            +# Copyright (C) 2021-2024, Mindee.
                            +
                            +# This program is licensed under the Apache License 2.0.
                            +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                             
                            -# This program is licensed under the Apache License version 2.
                            -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                            +from typing import Tuple
                             
                             import numpy as np
                            -from PIL import Image
                             import tensorflow as tf
                            -
                            -if tf.__version__ >= '2.6.0':
                            -    from tensorflow.keras.utils import img_to_array
                            -else:
                            -    from tensorflow.keras.preprocessing.image import img_to_array
                            +from PIL import Image
                            +from tensorflow.keras.utils import img_to_array
                             
                             from doctr.utils.common_types import AbstractPath
                             
                            -__all__ = ['tensor_from_pil', 'read_img_as_tensor', 'decode_img_as_tensor', 'tensor_from_numpy']
                            +__all__ = ["tensor_from_pil", "read_img_as_tensor", "decode_img_as_tensor", "tensor_from_numpy", "get_img_shape"]
                             
                             
                            -def tensor_from_pil(pil_img: Image, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
                            +def tensor_from_pil(pil_img: Image.Image, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
                                 """Convert a PIL Image to a TensorFlow tensor
                             
                                 Args:
                            +    ----
                                     pil_img: a PIL image
                                     dtype: the output tensor data type
                             
                                 Returns:
                            +    -------
                                     decoded image as tensor
                                 """
                            -
                                 npy_img = img_to_array(pil_img)
                             
                                 return tensor_from_numpy(npy_img, dtype)
                             
                             
                             
-[docs]
+[docs]
def read_img_as_tensor(img_path: AbstractPath, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
    """Read an image file as a TensorFlow tensor

    Args:
+    ----
        img_path: location of the image file
        dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    Returns:
+    -------
        decoded image as a tensor
    """
-
    if dtype not in (tf.uint8, tf.float16, tf.float32):
        raise ValueError("insupported value for dtype")
@@ -338,18 +356,19 @@

                            Source code for doctr.io.image.tensorflow

                             
                             
                             
-[docs]
+[docs]
def decode_img_as_tensor(img_content: bytes, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
    """Read a byte stream as a TensorFlow tensor

    Args:
+    ----
        img_content: bytes of a decoded image
        dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    Returns:
+    -------
        decoded image as a tensor
    """
-
    if dtype not in (tf.uint8, tf.float16, tf.float32):
        raise ValueError("insupported value for dtype")
@@ -367,13 +386,14 @@

                            Source code for doctr.io.image.tensorflow

                                 """Read an image file as a TensorFlow tensor
                             
                                 Args:
                            -        img: image encoded as a numpy array of shape (H, W, C) in np.uint8
                            +    ----
                            +        npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
                                     dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
                             
                                 Returns:
                            +    -------
                                     same image as a tensor of shape (H, W, C)
                                 """
                            -
                                 if dtype not in (tf.uint8, tf.float16, tf.float32):
                                     raise ValueError("insupported value for dtype")
                             
                            @@ -384,6 +404,11 @@ 

                            Source code for doctr.io.image.tensorflow

                                     img = tf.clip_by_value(img, 0, 1)
                             
                                 return img
                            +
                            +
                            +def get_img_shape(img: tf.Tensor) -> Tuple[int, int]:
                            +    """Get the shape of an image"""
                            +    return img.shape[:2]
                             
                            @@ -416,7 +441,7 @@

                            Source code for doctr.io.image.tensorflow

                                   
                                 
                               
                            -
                            +
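Note (illustrative, not part of the diff): all the tensor helpers in this module accept only `tf.uint8`, `tf.float16` and `tf.float32`, and float outputs are rescaled to [0, 1]. A small sketch using `tensor_from_numpy`, assuming the import path shown for this module:

    import numpy as np
    import tensorflow as tf
    from doctr.io.image.tensorflow import tensor_from_numpy

    npy_img = np.random.randint(0, 255, size=(32, 64, 3), dtype=np.uint8)
    img = tensor_from_numpy(npy_img, dtype=tf.float32)  # float dtypes get divided by 255
    print(img.shape, img.dtype, float(tf.reduce_max(img)))  # values now within [0, 1]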
                            diff --git a/v0.4.0/_modules/doctr/io/pdf.html b/v0.4.0/_modules/doctr/io/pdf.html index 2d383b9e85..91baf96f7b 100644 --- a/v0.4.0/_modules/doctr/io/pdf.html +++ b/v0.4.0/_modules/doctr/io/pdf.html @@ -226,20 +226,38 @@

                            Source code for doctr.io.pdf

                            -# Copyright (C) 2021, Mindee.
                            +# Copyright (C) 2021-2024, Mindee.
                             
                            -# This program is licensed under the Apache License version 2.
                            -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                            +# This program is licensed under the Apache License 2.0.
                            +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                            +
                            +from typing import Any, List, Optional
                             
                             import numpy as np
                            -import cv2
                            -from pathlib import Path
                            -import fitz
                            -from typing import List, Tuple, Optional, Any, Dict
                            +import pypdfium2 as pdfium
                             
                            -from doctr.utils.common_types import AbstractFile, Bbox
                            +from doctr.utils.common_types import AbstractFile
                             
                            -__all__ = ['read_pdf', 'PDF']
                            +__all__ = ["read_pdf"]
                             
                             
                             
                            -[docs] -def read_pdf(file: AbstractFile, **kwargs: Any) -> fitz.Document: +[docs] +def read_pdf( + file: AbstractFile, + scale: float = 2, + rgb_mode: bool = True, + password: Optional[str] = None, + **kwargs: Any, +) -> List[np.ndarray]: """Read a PDF file and convert it into an image in numpy format - Example:: - >>> from doctr.documents import read_pdf - >>> doc = read_pdf("path/to/your/doc.pdf") + >>> from doctr.io import read_pdf + >>> doc = read_pdf("path/to/your/doc.pdf") Args: + ---- file: the path to the PDF file - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - - if isinstance(file, (str, Path)) and not Path(file).is_file(): - raise FileNotFoundError(f"unable to access {file}") - - fitz_args: Dict[str, AbstractFile] = {} - - if isinstance(file, (str, Path)): - fitz_args['filename'] = file - elif isinstance(file, bytes): - fitz_args['stream'] = file - else: - raise TypeError("unsupported object type for argument 'file'") - - # Read pages with fitz and convert them to numpy ndarrays - return fitz.open(**fitz_args, filetype="pdf", **kwargs)
                            - - - -def convert_page_to_numpy( - page: fitz.fitz.Page, - output_size: Optional[Tuple[int, int]] = None, - bgr_output: bool = False, - default_scales: Tuple[float, float] = (2, 2), -) -> np.ndarray: - """Convert a fitz page to a numpy-formatted image - - Args: - page: the page of a file read with PyMuPDF - output_size: the expected output size of each page in format H x W. Default goes to 840 x 595 for A4 pdf, - if you want to increase the resolution while preserving the original A4 aspect ratio can pass (1024, 726) - rgb_output: whether the output ndarray channel order should be RGB instead of BGR. - default_scales: spatial scaling to be applied when output_size is not specified where (1, 1) - corresponds to 72 dpi rendering. + scale: rendering scale (1 corresponds to 72dpi) + rgb_mode: if True, the output will be RGB, otherwise BGR + password: a password to unlock the document, if encrypted + **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render` Returns: - the rendered image in numpy format + ------- + the list of pages decoded as numpy ndarray of shape H x W x C """ - - # If no output size is specified, keep the origin one - if output_size is not None: - scales = (output_size[1] / page.MediaBox[2], output_size[0] / page.MediaBox[3]) - else: - # Default 72 DPI (scales of (1, 1)) is unnecessarily low - scales = default_scales - - transform_matrix = fitz.Matrix(*scales) - - # Generate the pixel map using the transformation matrix - pixmap = page.getPixmap(matrix=transform_matrix) - # Decode it into a numpy - img = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.height, pixmap.width, 3) - - # Switch the channel order - if bgr_output: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - return img - - -
                            -[docs] -class PDF: - """PDF document template - - Args: - doc: input PDF document - """ - def __init__(self, doc: fitz.Document) -> None: - self.doc = doc - -
                            -[docs] - def as_images(self, **kwargs) -> List[np.ndarray]: - """Convert all document pages to images - - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images() - - Args: - kwargs: keyword arguments of `convert_page_to_numpy` - Returns: - the list of pages decoded as numpy ndarray of shape H x W x 3 - """ - return [convert_page_to_numpy(page, **kwargs) for page in self.doc]
                            - - - def get_page_words(self, idx, **kwargs) -> List[Tuple[Bbox, str]]: - """Get the annotations for all words of a given page""" - - # xmin, ymin, xmax, ymax, value, block_idx, line_idx, word_idx - return [(info[:4], info[4]) for info in self.doc[idx].getTextWords(**kwargs)] - -
                            -[docs] - def get_words(self, **kwargs) -> List[List[Tuple[Bbox, str]]]: - """Get the annotations for all words in the document - - Example:: - >>> from doctr.documents import DocumentFile - >>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words() - - Args: - kwargs: keyword arguments of `fitz.Page.getTextWords` - Returns: - the list of pages annotations, represented as a list of tuple (bounding box, value) - """ - return [self.get_page_words(idx, **kwargs) for idx in range(len(self.doc))]
                            - - - def get_page_artefacts(self, idx) -> List[Tuple[float, float, float, float]]: - return [tuple(self.doc[idx].getImageBbox(artefact)) # type: ignore[misc] - for artefact in self.doc[idx].get_images(full=True)] - -
                            -[docs] - def get_artefacts(self) -> List[List[Tuple[float, float, float, float]]]: - """Get the artefacts for the entire document - - Example:: - >>> from doctr.documents import DocumentFile - >>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts() - - Returns: - the list of pages artefacts, represented as a list of bounding boxes - """ - - return [self.get_page_artefacts(idx) for idx in range(len(self.doc))]
                            -
+    # Rasterise pages to numpy ndarrays with pypdfium2
+    pdf = pdfium.PdfDocument(file, password=password)
+    try:
+        return [page.render(scale=scale, rev_byteorder=rgb_mode, **kwargs).to_numpy() for page in pdf]
+    finally:
+        pdf.close()
                            @@ -467,7 +373,7 @@

                            Source code for doctr.io.pdf

                                   
                                 
                               
                            - + diff --git a/v0.4.0/_modules/doctr/io/reader.html b/v0.4.0/_modules/doctr/io/reader.html index ac14a8ce45..49cdc7d152 100644 --- a/v0.4.0/_modules/doctr/io/reader.html +++ b/v0.4.0/_modules/doctr/io/reader.html @@ -226,20 +226,38 @@

                            Source code for doctr.io.reader

                            -# Copyright (C) 2021, Mindee.
                            +# Copyright (C) 2021-2024, Mindee.
                             
                            -# This program is licensed under the Apache License version 2.
                            -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                            +# This program is licensed under the Apache License 2.0.
                            +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                             
                            -import numpy as np
                             from pathlib import Path
                            -from typing import List, Union, Sequence
                            +from typing import List, Sequence, Union
                            +
                            +import numpy as np
                            +
                            +from doctr.file_utils import requires_package
                             from doctr.utils.common_types import AbstractFile
                            -from .pdf import read_pdf, PDF
                            +
                             from .html import read_html
                             from .image import read_img_as_numpy
                            +from .pdf import read_pdf
                             
                            -__all__ = ['DocumentFile']
                            +__all__ = ["DocumentFile"]
                             
                             
                             
-[docs]
+[docs]
class DocumentFile:
    """Read a document from multiple extensions"""
                            -[docs] +[docs] @classmethod - def from_pdf(cls, file: AbstractFile, **kwargs) -> PDF: + def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]: """Read a PDF file - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") + >>> from doctr.io import DocumentFile + >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf") Args: + ---- file: the path to the PDF file or a binary stream + **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render` + Returns: - a PDF document + ------- + the list of pages decoded as numpy ndarray of shape H x W x 3 """ - - doc = read_pdf(file, **kwargs) - - return PDF(doc)
                            + return read_pdf(file, **kwargs)
                            -[docs] +[docs] @classmethod - def from_url(cls, url: str, **kwargs) -> PDF: + def from_url(cls, url: str, **kwargs) -> List[np.ndarray]: """Interpret a web page as a PDF document - Example:: - >>> from doctr.documents import DocumentFile - >>> doc = DocumentFile.from_url("https://www.yoursite.com") + >>> from doctr.io import DocumentFile + >>> doc = DocumentFile.from_url("https://www.yoursite.com") Args: + ---- url: the URL of the target web page + **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render` + Returns: - a PDF document + ------- + the list of pages decoded as numpy ndarray of shape H x W x 3 """ + requires_package( + "weasyprint", + "`.from_url` requires weasyprint installed.\n" + + "Installation instructions: https://doc.courtbouillon.org/weasyprint/stable/first_steps.html#installation", + ) pdf_stream = read_html(url) return cls.from_pdf(pdf_stream, **kwargs)
                            -[docs] +[docs] @classmethod def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]: """Read an image file (or a collection of image files) and convert it into an image in numpy format - Example:: - >>> from doctr.documents import DocumentFile - >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) + >>> from doctr.io import DocumentFile + >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"]) Args: + ---- files: the path to the image file or a binary stream, or a collection of those + **kwargs: additional parameters to :meth:`doctr.io.image.read_img_as_numpy` + Returns: + ------- the list of pages decoded as numpy ndarray of shape H x W x 3 """ if isinstance(files, (str, Path, bytes)): @@ -389,7 +422,7 @@

                            Source code for doctr.io.reader

                                   
                                 
                               
                            -
                            +
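Note (illustrative, not part of the diff): with the pypdfium2-based reader above, `DocumentFile.from_pdf` and `DocumentFile.from_images` both return pages directly as numpy arrays of shape H x W x 3, instead of a `PDF` wrapper object. Minimal usage per the updated docstrings; the older left-hand side exposed the class under `doctr.documents`:

    from doctr.io import DocumentFile

    pages = DocumentFile.from_pdf("path/to/your/doc.pdf", scale=2)  # scale=1 corresponds to 72 dpi
    more = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
    print(len(pages), pages[0].shape, len(more))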
diff --git a/v0.4.0/_modules/doctr/models/backbones/mobilenet/tensorflow.html b/v0.4.0/_modules/doctr/models/backbones/mobilenet/tensorflow.html deleted file mode 100644 index e2bfbfbe7a..0000000000 --- a/v0.4.0/_modules/doctr/models/backbones/mobilenet/tensorflow.html +++ /dev/null @@ -1,686 +0,0 @@

                            Source code for doctr.models.backbones.mobilenet.tensorflow

                            -# Copyright (C) 2021, Mindee.
                            -
                            -# This program is licensed under the Apache License version 2.
                            -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                            -
                            -# Greatly inspired by https://github.com/pytorch/vision/blob/master/torchvision/models/mobilenetv3.py
                            -
                            -import tensorflow as tf
                            -from tensorflow.keras import layers
                            -from tensorflow.keras.models import Sequential
                            -from typing import Optional, Tuple, Any, Dict, List, Union
                            -from ...utils import conv_sequence, load_pretrained_params
                            -from ....datasets import VOCABS
                            -
                            -
                            -__all__ = ["MobileNetV3", "mobilenet_v3_small", "mobilenet_v3_small_r", "mobilenet_v3_large",
                            -           "mobilenet_v3_large_r"]
                            -
                            -
                            -default_cfgs: Dict[str, Dict[str, Any]] = {
                            -    'mobilenet_v3_large': {
                            -        'mean': (0.694, 0.695, 0.693),
                            -        'std': (0.299, 0.296, 0.301),
                            -        'input_shape': (32, 32, 3),
                            -        'vocab': VOCABS['legacy_french'],
                            -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/mobilenet_v3_large-d27d66f2.zip'
                            -    },
                            -    'mobilenet_v3_large_r': {
                            -        'mean': (0.694, 0.695, 0.693),
                            -        'std': (0.299, 0.296, 0.301),
                            -        'input_shape': (32, 32, 3),
                            -        'vocab': VOCABS['french'],
                            -        'url': None,
                            -    },
                            -    'mobilenet_v3_small': {
                            -        'mean': (0.694, 0.695, 0.693),
                            -        'std': (0.299, 0.296, 0.301),
                            -        'input_shape': (32, 32, 3),
                            -        'vocab': VOCABS['legacy_french'],
                            -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/mobilenet_v3_small-d624c4de.zip'
                            -    },
                            -    'mobilenet_v3_small_r': {
                            -        'mean': (0.694, 0.695, 0.693),
                            -        'std': (0.299, 0.296, 0.301),
                            -        'input_shape': (32, 32, 3),
                            -        'vocab': VOCABS['french'],
                            -        'url': None,
                            -    }
                            -}
                            -
                            -
                            -def hard_swish(x: tf.Tensor) -> tf.Tensor:
                            -    return x * tf.nn.relu6(x + 3.) / 6.0
                            -
                            -
                            -def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int:
                            -    if min_value is None:
                            -        min_value = divisor
                            -    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
                            -    # Make sure that round down does not go down by more than 10%.
                            -    if new_v < 0.9 * v:
                            -        new_v += divisor
                            -    return new_v
                            -
                            -
                            -class SqueezeExcitation(Sequential):
                            -    """Squeeze and Excitation.
                            -    """
                            -    def __init__(self, chan: int, squeeze_factor: int = 4) -> None:
                            -        super().__init__(
                            -            [
                            -                layers.GlobalAveragePooling2D(),
                            -                layers.Dense(chan // squeeze_factor, activation='relu'),
                            -                layers.Dense(chan, activation='hard_sigmoid'),
                            -                layers.Reshape((1, 1, chan))
                            -            ]
                            -        )
                            -
                            -    def call(self, inputs: tf.Tensor, **kwargs: Any) -> tf.Tensor:
                            -        x = super().call(inputs, **kwargs)
                            -        x = tf.math.multiply(inputs, x)
                            -        return x
                            -
                            -
                            -class InvertedResidualConfig:
                            -    def __init__(
                            -        self,
                            -        input_channels: int,
                            -        kernel: int,
                            -        expanded_channels: int,
                            -        out_channels: int,
                            -        use_se: bool,
                            -        activation: str,
                            -        stride: Union[int, Tuple[int, int]],
                            -        width_mult: float = 1,
                            -    ) -> None:
                            -        self.input_channels = self.adjust_channels(input_channels, width_mult)
                            -        self.kernel = kernel
                            -        self.expanded_channels = self.adjust_channels(expanded_channels, width_mult)
                            -        self.out_channels = self.adjust_channels(out_channels, width_mult)
                            -        self.use_se = use_se
                            -        self.use_hs = activation == "HS"
                            -        self.stride = stride
                            -
                            -    @staticmethod
                            -    def adjust_channels(channels: int, width_mult: float):
                            -        return _make_divisible(channels * width_mult, 8)
                            -
                            -
                            -class InvertedResidual(layers.Layer):
                            -    """InvertedResidual for mobilenet
                            -
                            -    Args:
                            -        conf: configuration object for inverted residual
                            -    """
                            -    def __init__(
                            -        self,
                            -        conf: InvertedResidualConfig,
                            -        **kwargs: Any,
                            -    ) -> None:
                            -        _kwargs = {'input_shape': kwargs.pop('input_shape')} if isinstance(kwargs.get('input_shape'), tuple) else {}
                            -        super().__init__(**kwargs)
                            -
                            -        act_fn = hard_swish if conf.use_hs else tf.nn.relu
                            -
                            -        _is_s1 = (isinstance(conf.stride, tuple) and conf.stride == (1, 1)) or conf.stride == 1
                            -        self.use_res_connect = _is_s1 and conf.input_channels == conf.out_channels
                            -
                            -        _layers = []
                            -        # expand
                            -        if conf.expanded_channels != conf.input_channels:
                            -            _layers.extend(conv_sequence(conf.expanded_channels, act_fn, kernel_size=1, bn=True, **_kwargs))
                            -
                            -        # depth-wise
                            -        _layers.extend(conv_sequence(
                            -            conf.expanded_channels, act_fn, kernel_size=conf.kernel, strides=conf.stride, bn=True,
                            -            groups=conf.expanded_channels,
                            -        ))
                            -
                            -        if conf.use_se:
                            -            _layers.append(SqueezeExcitation(conf.expanded_channels))
                            -
                            -        # project
                            -        _layers.extend(conv_sequence(
                            -            conf.out_channels, None, kernel_size=1, bn=True,
                            -        ))
                            -
                            -        self.block = Sequential(_layers)
                            -
                            -    def call(
                            -        self,
                            -        inputs: tf.Tensor,
                            -        **kwargs: Any,
                            -    ) -> tf.Tensor:
                            -
                            -        out = self.block(inputs, **kwargs)
                            -        if self.use_res_connect:
                            -            out = tf.add(out, inputs)
                            -
                            -        return out
                            -
                            -
                            -class MobileNetV3(Sequential):
                            -    """Implements MobileNetV3, inspired from both:
-    `<https://github.com/xiaochus/MobileNetV3/tree/master/model>`_
-    and `<https://pytorch.org/vision/stable/_modules/torchvision/models/mobilenetv3.html>`_.
                            -    """
                            -
                            -    def __init__(
                            -        self,
                            -        layout: List[InvertedResidualConfig],
                            -        input_shape: Tuple[int, int, int],
                            -        include_top: bool = False,
                            -        head_chans: int = 1024,
                            -        num_classes: int = 1000,
                            -    ) -> None:
                            -
                            -        _layers = [
                            -            Sequential(conv_sequence(layout[0].input_channels, hard_swish, True, kernel_size=3, strides=2,
                            -                       input_shape=input_shape), name="stem")
                            -        ]
                            -
                            -        for idx, conf in enumerate(layout):
                            -            _layers.append(
                            -                InvertedResidual(conf, name=f"inverted_{idx}"),
                            -            )
                            -
                            -        _layers.append(
                            -            Sequential(
                            -                conv_sequence(6 * layout[-1].out_channels, hard_swish, True, kernel_size=1),
                            -                name="final_block"
                            -            )
                            -        )
                            -
                            -        if include_top:
                            -            _layers.extend([
                            -                layers.GlobalAveragePooling2D(),
                            -                layers.Dense(head_chans, activation=hard_swish),
                            -                layers.Dropout(0.2),
                            -                layers.Dense(num_classes),
                            -            ])
                            -
                            -        super().__init__(_layers)
                            -
                            -
                            -def _mobilenet_v3(
                            -    arch: str,
                            -    pretrained: bool,
                            -    input_shape: Optional[Tuple[int, int, int]] = None,
                            -    **kwargs: Any
                            -) -> MobileNetV3:
                            -    input_shape = input_shape or default_cfgs[arch]['input_shape']
                            -
                            -    # cf. Table 1 & 2 of the paper
                            -    if arch.startswith("mobilenet_v3_small"):
                            -        inverted_residual_setting = [
                            -            InvertedResidualConfig(16, 3, 16, 16, True, "RE", 2),  # C1
                            -            InvertedResidualConfig(16, 3, 72, 24, False, "RE", (2, 1) if arch.endswith("_r") else 2),  # C2
                            -            InvertedResidualConfig(24, 3, 88, 24, False, "RE", 1),
                            -            InvertedResidualConfig(24, 5, 96, 40, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C3
                            -            InvertedResidualConfig(40, 5, 240, 40, True, "HS", 1),
                            -            InvertedResidualConfig(40, 5, 240, 40, True, "HS", 1),
                            -            InvertedResidualConfig(40, 5, 120, 48, True, "HS", 1),
                            -            InvertedResidualConfig(48, 5, 144, 48, True, "HS", 1),
                            -            InvertedResidualConfig(48, 5, 288, 96, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C4
                            -            InvertedResidualConfig(96, 5, 576, 96, True, "HS", 1),
                            -            InvertedResidualConfig(96, 5, 576, 96, True, "HS", 1),
                            -        ]
                            -        head_chans = 1024
                            -    else:
                            -        inverted_residual_setting = [
                            -            InvertedResidualConfig(16, 3, 16, 16, False, "RE", 1),
                            -            InvertedResidualConfig(16, 3, 64, 24, False, "RE", 2),  # C1
                            -            InvertedResidualConfig(24, 3, 72, 24, False, "RE", 1),
                            -            InvertedResidualConfig(24, 5, 72, 40, True, "RE", (2, 1) if arch.endswith("_r") else 2),  # C2
                            -            InvertedResidualConfig(40, 5, 120, 40, True, "RE", 1),
                            -            InvertedResidualConfig(40, 5, 120, 40, True, "RE", 1),
                            -            InvertedResidualConfig(40, 3, 240, 80, False, "HS", (2, 1) if arch.endswith("_r") else 2),  # C3
                            -            InvertedResidualConfig(80, 3, 200, 80, False, "HS", 1),
                            -            InvertedResidualConfig(80, 3, 184, 80, False, "HS", 1),
                            -            InvertedResidualConfig(80, 3, 184, 80, False, "HS", 1),
                            -            InvertedResidualConfig(80, 3, 480, 112, True, "HS", 1),
                            -            InvertedResidualConfig(112, 3, 672, 112, True, "HS", 1),
                            -            InvertedResidualConfig(112, 5, 672, 160, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C4
                            -            InvertedResidualConfig(160, 5, 960, 160, True, "HS", 1),
                            -            InvertedResidualConfig(160, 5, 960, 160, True, "HS", 1),
                            -        ]
                            -        head_chans = 1280
                            -
                            -    kwargs['num_classes'] = kwargs.get('num_classes', len(default_cfgs[arch]['vocab']))
                            -
                            -    # Build the model
                            -    model = MobileNetV3(
                            -        inverted_residual_setting,
                            -        input_shape,
                            -        head_chans=head_chans,
                            -        **kwargs,
                            -    )
                            -    # Load pretrained parameters
                            -    if pretrained:
                            -        load_pretrained_params(model, default_cfgs[arch]['url'])
                            -
                            -    return model
                            -
                            -
                            -
-[docs]
-def mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Small architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_small
-        >>> model = mobilenet_v3_small(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-
-    return _mobilenet_v3('mobilenet_v3_small', pretrained, **kwargs)
                            - - - -
-[docs]
-def mobilenet_v3_small_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Small architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_, with rectangular pooling.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_small_r
-        >>> model = mobilenet_v3_small_r(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-
-    return _mobilenet_v3('mobilenet_v3_small_r', pretrained, **kwargs)
                            - - - -
-[docs]
-def mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Large architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_large
-        >>> model = mobilenet_v3_large(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-    return _mobilenet_v3('mobilenet_v3_large', pretrained, **kwargs)
                            - - - -
-[docs]
-def mobilenet_v3_large_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Large architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_, with rectangular pooling.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_large_r
-        >>> model = mobilenet_v3_large_r(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-    return _mobilenet_v3('mobilenet_v3_large_r', pretrained, **kwargs)
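The only difference between the plain and `_r` layouts above is that three of the stride-2 stages (C2 to C4) switch to a (2, 1) stride, so the feature map height is reduced 32-fold while the width is only reduced 4-fold, which suits wide text-recognition crops. A rough sanity check, with the expected shapes inferred from the configurations above rather than quoted from a run:

    >>> import tensorflow as tf
    >>> from doctr.models import mobilenet_v3_small, mobilenet_v3_small_r
    >>> x = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    >>> square = mobilenet_v3_small(pretrained=False, input_shape=(512, 512, 3))(x)
    >>> square.shape  # expected (1, 16, 16, 576): /32 in both dims, 6 * 96 output channels
    >>> rect = mobilenet_v3_small_r(pretrained=False, input_shape=(512, 512, 3))(x)
    >>> rect.shape    # expected (1, 16, 128, 576): /32 in height but only /4 in width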
                            - -
-
\ No newline at end of file
diff --git a/v0.4.0/_modules/doctr/models/backbones/resnet/tensorflow.html b/v0.4.0/_modules/doctr/models/backbones/resnet/tensorflow.html
deleted file mode 100644
index f6acf21656..0000000000
--- a/v0.4.0/_modules/doctr/models/backbones/resnet/tensorflow.html
+++ /dev/null
@@ -1,519 +0,0 @@
-doctr.models.backbones.resnet.tensorflow - docTR documentation

                            Source code for doctr.models.backbones.resnet.tensorflow

                            -# Copyright (C) 2021, Mindee.
                            -
                            -# This program is licensed under the Apache License version 2.
                            -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                            -
                            -import tensorflow as tf
                            -from tensorflow.keras import layers
                            -from tensorflow.keras.models import Sequential
                            -from typing import Tuple, Dict, Optional, Any, List
                            -from ...utils import conv_sequence, load_pretrained_params
                            -
                            -__all__ = ['ResNet', 'resnet31', 'ResnetStage']
                            -
                            -
                            -default_cfgs: Dict[str, Dict[str, Any]] = {
                            -    'resnet31': {'num_blocks': (1, 2, 5, 3), 'output_channels': (256, 256, 512, 512),
                            -                 'conv_seq': (True, True, True, True), 'pooling': ((2, 2), (2, 1), None, None),
                            -                 'url': None},
                            -}
                            -
                            -
                            -class ResnetBlock(layers.Layer):
                            -
                            -    """Implements a resnet31 block with shortcut
                            -
                            -    Args:
-        output_channels: number of channels to use in Conv2D
-        conv_shortcut: whether to apply a 1x1 convolution and batch norm on the shortcut branch
                            -        strides: strides to use in the first convolution of the block
                            -    """
                            -    def __init__(
                            -        self,
                            -        output_channels: int,
                            -        conv_shortcut: bool,
                            -        strides: int = 1,
                            -        **kwargs
                            -    ) -> None:
                            -
                            -        super().__init__(**kwargs)
                            -        if conv_shortcut:
                            -            self.shortcut = Sequential(
                            -                [
                            -                    layers.Conv2D(
                            -                        filters=output_channels,
                            -                        strides=strides,
                            -                        padding='same',
                            -                        kernel_size=1,
                            -                        use_bias=False,
                            -                        kernel_initializer='he_normal'
                            -                    ),
                            -                    layers.BatchNormalization()
                            -                ]
                            -            )
                            -        else:
                            -            self.shortcut = layers.Lambda(lambda x: x)
                            -        self.conv_block = Sequential(
                            -            self.conv_resnetblock(output_channels, 3, strides)
                            -        )
                            -        self.act = layers.Activation('relu')
                            -
                            -    @staticmethod
                            -    def conv_resnetblock(
                            -        output_channels: int,
                            -        kernel_size: int,
                            -        strides: int = 1,
                            -    ) -> List[layers.Layer]:
                            -        return [
                            -            *conv_sequence(output_channels, activation='relu', bn=True, strides=strides, kernel_size=kernel_size),
                            -            layers.Conv2D(output_channels, kernel_size, padding='same', use_bias=False, kernel_initializer='he_normal'),
                            -            layers.BatchNormalization(),
                            -        ]
                            -
                            -    def call(
                            -        self,
                            -        inputs: tf.Tensor
                            -    ) -> tf.Tensor:
                            -        clone = self.shortcut(inputs)
                            -        conv_out = self.conv_block(inputs)
                            -        out = self.act(clone + conv_out)
                            -
                            -        return out
                            -
                            -
                            -class ResnetStage(Sequential):
                            -
                            -    """Implements a resnet31 stage
                            -
                            -    Args:
                            -        num_blocks: number of blocks inside the stage
                            -        output_channels: number of channels to use in Conv2D
                            -        downsample: if true, performs a /2 downsampling at the first block of the stage
                            -    """
                            -    def __init__(
                            -        self,
                            -        num_blocks: int,
                            -        output_channels: int,
                            -        downsample: bool = False,
                            -    ) -> None:
                            -
                            -        super().__init__()
                            -        final_blocks = [
                            -            ResnetBlock(output_channels, conv_shortcut=False) for _ in range(1, num_blocks)
                            -        ]
                            -        if downsample is True:
                            -            self.add(ResnetBlock(output_channels, conv_shortcut=True, strides=2))
                            -        else:
                            -            self.add(ResnetBlock(output_channels, conv_shortcut=True))
                            -        for final_block in final_blocks:
                            -            self.add(final_block)
                            -
                            -
                            -class ResNet(Sequential):
                            -
                            -    """Resnet class with two convolutions and a maxpooling before the first stage
                            -
                            -    Args:
-        num_blocks: number of resnet blocks in each stage
-        output_channels: number of channels in each stage
-        conv_seq: whether to add a conv_sequence after each stage
                            -        pooling: pooling to add after each stage (if None, no pooling)
                            -        input_shape: shape of inputs
                            -        include_top: whether the classifier head should be instantiated
                            -    """
                            -
                            -    def __init__(
                            -        self,
                            -        num_blocks: Tuple[int, int, int, int],
                            -        output_channels: Tuple[int, int, int, int],
                            -        conv_seq: Tuple[bool, bool, bool, bool],
                            -        pooling: Tuple[
                            -            Optional[Tuple[int, int]],
                            -            Optional[Tuple[int, int]],
                            -            Optional[Tuple[int, int]],
                            -            Optional[Tuple[int, int]]
                            -        ],
                            -        input_shape: Tuple[int, int, int] = (640, 640, 3),
                            -        include_top: bool = False,
                            -    ) -> None:
                            -
                            -        _layers = [
                            -            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
                            -            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
                            -            layers.MaxPool2D(pool_size=2, strides=2, padding='valid'),
                            -        ]
                            -        for n_blocks, out_channels, conv, pool in zip(num_blocks, output_channels, conv_seq, pooling):
                            -            _layers.append(ResnetStage(n_blocks, out_channels))
                            -            if conv:
                            -                _layers.extend(conv_sequence(out_channels, activation='relu', bn=True, kernel_size=3))
                            -            if pool:
                            -                _layers.append(layers.MaxPool2D(pool_size=pool, strides=pool, padding='valid'))
                            -        super().__init__(_layers)
                            -
                            -
                            -def _resnet(arch: str, pretrained: bool, **kwargs: Any) -> ResNet:
                            -
                            -    # Build the model
                            -    model = ResNet(
                            -        default_cfgs[arch]['num_blocks'],
                            -        default_cfgs[arch]['output_channels'],
                            -        default_cfgs[arch]['conv_seq'],
                            -        default_cfgs[arch]['pooling'],
                            -        **kwargs
                            -    )
                            -    # Load pretrained parameters
                            -    if pretrained:
                            -        load_pretrained_params(model, default_cfgs[arch]['url'])
                            -
                            -    return model
                            -
                            -
                            -
-[docs]
-def resnet31(pretrained: bool = False, **kwargs: Any) -> ResNet:
-    """Resnet31 architecture with rectangular pooling windows as described in
-    `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition",
-    <https://arxiv.org/pdf/1811.00751.pdf>`_. Downsizing: (H, W) --> (H/8, W/4)
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import resnet31
-        >>> model = resnet31(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        A resnet31 model
-    """
-
-    return _resnet('resnet31', pretrained, **kwargs)
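The (H/8, W/4) downsizing quoted above follows directly from the configuration: the stem MaxPool2D halves both dimensions, the first stage pools with (2, 2), the second with (2, 1), and the last two stages have pooling=None. For a 32x128 recognition crop this gives, as a rough check (expected shape inferred from the configuration rather than quoted from a run):

    >>> import tensorflow as tf
    >>> from doctr.models import resnet31
    >>> model = resnet31(pretrained=False, input_shape=(32, 128, 3))
    >>> out = model(tf.zeros((1, 32, 128, 3)))
    >>> out.shape  # expected (1, 4, 32, 512): 32 / 8 = 4, 128 / 4 = 32, 512 channels in the last stage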
                            - -
-
\ No newline at end of file
diff --git a/v0.4.0/_modules/doctr/models/backbones/vgg/tensorflow.html b/v0.4.0/_modules/doctr/models/backbones/vgg/tensorflow.html
deleted file mode 100644
index 77770ca928..0000000000
--- a/v0.4.0/_modules/doctr/models/backbones/vgg/tensorflow.html
+++ /dev/null
@@ -1,412 +0,0 @@
-doctr.models.backbones.vgg.tensorflow - docTR documentation

                            Source code for doctr.models.backbones.vgg.tensorflow

                            -# Copyright (C) 2021, Mindee.
                            -
                            -# This program is licensed under the Apache License version 2.
                            -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                            -
                            -from tensorflow.keras import layers
                            -from tensorflow.keras.models import Sequential
                            -from typing import Tuple, Dict, Any
                            -
                            -from ...utils import conv_sequence, load_pretrained_params
                            -
                            -
                            -__all__ = ['VGG', 'vgg16_bn']
                            -
                            -
                            -default_cfgs: Dict[str, Dict[str, Any]] = {
                            -    'vgg16_bn': {'num_blocks': (2, 2, 3, 3, 3), 'planes': (64, 128, 256, 512, 512),
                            -                 'rect_pools': (False, False, True, True, True),
                            -                 'url': None},
                            -}
                            -
                            -
                            -class VGG(Sequential):
                            -    """Implements the VGG architecture from `"Very Deep Convolutional Networks for Large-Scale Image Recognition"
                            -    <https://arxiv.org/pdf/1409.1556.pdf>`_.
                            -
                            -    Args:
-        num_blocks: number of convolutional blocks in each stage
-        planes: number of output channels in each stage
-        rect_pools: whether square pooling kernels should be replaced with rectangular ones
-        input_shape: shape of the input tensor
                            -        include_top: whether the classifier head should be instantiated
                            -    """
                            -    def __init__(
                            -        self,
                            -        num_blocks: Tuple[int, int, int, int, int],
                            -        planes: Tuple[int, int, int, int, int],
                            -        rect_pools: Tuple[bool, bool, bool, bool, bool],
                            -        input_shape: Tuple[int, int, int] = (512, 512, 3),
                            -        include_top: bool = False,
                            -    ) -> None:
                            -
                            -        _layers = []
                            -        # Specify input_shape only for the first layer
                            -        kwargs = {"input_shape": input_shape}
                            -        for nb_blocks, out_chan, rect_pool in zip(num_blocks, planes, rect_pools):
                            -            for _ in range(nb_blocks):
                            -                _layers.extend(conv_sequence(out_chan, 'relu', True, kernel_size=3, **kwargs))  # type: ignore[arg-type]
                            -                kwargs = {}
                            -            _layers.append(layers.MaxPooling2D((2, 1 if rect_pool else 2)))
                            -        super().__init__(_layers)
                            -
                            -
                            -def _vgg(arch: str, pretrained: bool, **kwargs: Any) -> VGG:
                            -
                            -    # Build the model
                            -    model = VGG(default_cfgs[arch]['num_blocks'], default_cfgs[arch]['planes'],
                            -                default_cfgs[arch]['rect_pools'], **kwargs)
                            -    # Load pretrained parameters
                            -    if pretrained:
                            -        load_pretrained_params(model, default_cfgs[arch]['url'])
                            -
                            -    return model
                            -
                            -
                            -
-[docs]
-def vgg16_bn(pretrained: bool = False, **kwargs: Any) -> VGG:
-    """VGG-16 architecture as described in `"Very Deep Convolutional Networks for Large-Scale Image Recognition"
-    <https://arxiv.org/pdf/1409.1556.pdf>`_, modified by adding batch normalization.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import vgg16_bn
-        >>> model = vgg16_bn(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-
-    Returns:
-        VGG feature extractor
-    """
-
-    return _vgg('vgg16_bn', pretrained, **kwargs)
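With rect_pools = (False, False, True, True, True) in the configuration above, the first two stages pool with (2, 2) and the last three with (2, 1), so the height is divided by 32 while the width is only divided by 4, the same rectangular downsizing the recognition models rely on. A rough check (expected shape inferred from the configuration rather than quoted from a run):

    >>> import tensorflow as tf
    >>> from doctr.models import vgg16_bn
    >>> model = vgg16_bn(pretrained=False, input_shape=(32, 128, 3))
    >>> out = model(tf.zeros((1, 32, 128, 3)))
    >>> out.shape  # expected (1, 1, 32, 512): 32 / 32 = 1, 128 / 4 = 32, 512 output channels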
                            - -
-
\ No newline at end of file
diff --git a/v0.4.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.4.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
index f850c994bc..e181ef6a1f 100644
--- a/v0.4.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
+++ b/v0.4.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
@@ -304,8 +304,8 @@

                            Source code for doctr.models.classification.magc_resnet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 
 import tensorflow as tf
-from keras import activations, layers
-from keras.models import Sequential
+from tensorflow.keras import activations, layers
+from tensorflow.keras.models import Sequential
 
 from doctr.datasets import VOCABS
diff --git a/v0.4.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.4.0/_modules/doctr/models/classification/mobilenet/tensorflow.html
index 02fc8802d6..c9545166e7 100644
--- a/v0.4.0/_modules/doctr/models/classification/mobilenet/tensorflow.html
+++ b/v0.4.0/_modules/doctr/models/classification/mobilenet/tensorflow.html
@@ -304,8 +304,8 @@

                            Source code for doctr.models.classification.mobilenet.tensorflow

 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import tensorflow as tf
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential
 
 from ....datasets import VOCABS
 from ...utils import conv_sequence, load_pretrained_params
diff --git a/v0.4.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.4.0/_modules/doctr/models/classification/resnet/tensorflow.html
index f4bcd65452..620d4f0635 100644
--- a/v0.4.0/_modules/doctr/models/classification/resnet/tensorflow.html
+++ b/v0.4.0/_modules/doctr/models/classification/resnet/tensorflow.html
@@ -302,9 +302,9 @@

                            Source code for doctr.models.classification.resnet.tensorflow

 from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import tensorflow as tf
-from keras import layers
-from keras.applications import ResNet50
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.applications import ResNet50
+from tensorflow.keras.models import Sequential
 
 from doctr.datasets import VOCABS
diff --git a/v0.4.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.4.0/_modules/doctr/models/classification/textnet/tensorflow.html
index 8f38b3470e..407e480818 100644
--- a/v0.4.0/_modules/doctr/models/classification/textnet/tensorflow.html
+++ b/v0.4.0/_modules/doctr/models/classification/textnet/tensorflow.html
@@ -302,7 +302,7 @@

                            Source code for doctr.models.classification.textnet.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
 
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 
 from doctr.datasets import VOCABS
diff --git a/v0.4.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.4.0/_modules/doctr/models/classification/vgg/tensorflow.html
index d6142a8376..66ee6dcdd8 100644
--- a/v0.4.0/_modules/doctr/models/classification/vgg/tensorflow.html
+++ b/v0.4.0/_modules/doctr/models/classification/vgg/tensorflow.html
@@ -301,8 +301,8 @@

                            Source code for doctr.models.classification.vgg.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
 
-from keras import layers
-from keras.models import Sequential
+from tensorflow.keras import layers
+from tensorflow.keras.models import Sequential
 
 from doctr.datasets import VOCABS
diff --git a/v0.4.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.4.0/_modules/doctr/models/classification/vit/tensorflow.html
index 81ef3d9dcf..7059d1f1d8 100644
--- a/v0.4.0/_modules/doctr/models/classification/vit/tensorflow.html
+++ b/v0.4.0/_modules/doctr/models/classification/vit/tensorflow.html
@@ -302,7 +302,7 @@

                            Source code for doctr.models.classification.vit.tensorflow

 from typing import Any, Dict, Optional, Tuple
 
 import tensorflow as tf
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 
 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import EncoderBlock
diff --git a/v0.4.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.4.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
index be49814da5..dc65e2ed03 100644
--- a/v0.4.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
+++ b/v0.4.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html
@@ -226,27 +226,21 @@
@@ -289,38 +283,29 @@

                            Source code for doctr.models.detection.differentiable_binarization.tensorflo # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization from copy import deepcopy -import numpy as np import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers -from tensorflow.keras.applications import ResNet50 from typing import List, Tuple, Optional, Any, Dict -from ...backbones import mobilenet_v3_large from doctr.utils.repr import NestedObject from doctr.models.utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence from .base import DBPostProcessor, _DBNet -__all__ = ['DBNet', 'db_resnet50', 'db_mobilenet_v3_large'] +__all__ = ['DBNet', 'db_resnet50'] default_cfgs: Dict[str, Dict[str, Any]] = { 'db_resnet50': { 'mean': (0.798, 0.785, 0.772), 'std': (0.264, 0.2749, 0.287), - 'backbone': ResNet50, + 'backbone': 'ResNet50', 'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + 'fpn_channels': 128, 'input_shape': (1024, 1024, 3), + 'rotated_bbox': False, 'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip', }, - 'db_mobilenet_v3_large': { - 'mean': (0.798, 0.785, 0.772), - 'std': (0.264, 0.2749, 0.287), - 'backbone': mobilenet_v3_large, - 'fpn_layers': ["inverted_2", "inverted_5", "inverted_11", "final_block"], - 'input_shape': (1024, 1024, 3), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.3.1/db_mobilenet_v3_large-8c16d5bf.zip', - }, } @@ -394,8 +379,6 @@

                            Source code for doctr.models.detection.differentiable_binarization.tensorflo Args: feature extractor: the backbone serving as feature extractor fpn_channels: number of channels each extracted feature maps is mapped to - rotated_bbox: whether the segmentation map can include rotated bounding boxes - cfg: the configuration dict of the model """ _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor'] @@ -403,7 +386,7 @@

                            Source code for doctr.models.detection.differentiable_binarization.tensorflo def __init__( self, feature_extractor: IntermediateLayerGetter, - fpn_channels: int = 128, # to be set to 256 to represent the author's initial idea + fpn_channels: int = 128, rotated_bbox: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: @@ -444,7 +427,7 @@

                            Source code for doctr.models.detection.differentiable_binarization.tensorflo self, out_map: tf.Tensor, thresh_map: tf.Tensor, - target: List[np.ndarray] + target: List[Dict[str, Any]] ) -> tf.Tensor: """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes and a list of masks for each image. From there it computes the loss with the model output @@ -462,9 +445,9 @@

                            Source code for doctr.models.detection.differentiable_binarization.tensorflo thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1])) seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) + seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - thresh_target = tf.convert_to_tensor(thresh_target, dtype=out_map.dtype) + thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32) thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool) # Compute balanced BCE loss for proba_map @@ -500,7 +483,7 @@

                            Source code for doctr.models.detection.differentiable_binarization.tensorflo def call( self, x: tf.Tensor, - target: Optional[List[np.ndarray]] = None, + target: Optional[List[Dict[str, Any]]] = None, return_model_output: bool = False, return_boxes: bool = False, **kwargs: Any, @@ -529,64 +512,30 @@

                            Source code for doctr.models.detection.differentiable_binarization.tensorflo return out -def _db_resnet( - arch: str, - pretrained: bool, - pretrained_backbone: bool = False, - input_shape: Tuple[int, int, int] = None, - **kwargs: Any -) -> DBNet: - - pretrained_backbone = pretrained_backbone and not pretrained +def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet: # Patch the config _cfg = deepcopy(default_cfgs[arch]) _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels']) + _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) # Feature extractor - feat_extractor = IntermediateLayerGetter( - _cfg['backbone']( - include_top=False, - weights='imagenet' if pretrained_backbone else None, - input_shape=_cfg['input_shape'], - pooling=None, - ), - _cfg['fpn_layers'], + resnet = tf.keras.applications.__dict__[_cfg['backbone']]( + include_top=False, + weights=None, + input_shape=_cfg['input_shape'], + pooling=None, ) - # Build the model - model = DBNet(feat_extractor, cfg=_cfg, **kwargs) - # Load pretrained parameters - if pretrained: - load_pretrained_params(model, _cfg['url']) - - return model - - -def _db_mobilenet( - arch: str, - pretrained: bool, - pretrained_backbone: bool = True, - input_shape: Tuple[int, int, int] = None, - **kwargs: Any -) -> DBNet: - - pretrained_backbone = pretrained_backbone and not pretrained - - # Patch the config - _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - - # Feature extractor feat_extractor = IntermediateLayerGetter( - _cfg['backbone']( - input_shape=_cfg['input_shape'], - include_top=False, - pretrained=pretrained_backbone, - ), + resnet, _cfg['fpn_layers'], ) + kwargs['fpn_channels'] = _cfg['fpn_channels'] + kwargs['rotated_bbox'] = _cfg['rotated_bbox'] + # Build the model model = DBNet(feat_extractor, cfg=_cfg, **kwargs) # Load pretrained parameters @@ -618,30 +567,6 @@

                            Source code for doctr.models.detection.differentiable_binarization.tensorflo return _db_resnet('db_resnet50', pretrained, **kwargs)

                            - - -
-[docs]
-def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet:
-    """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
-    <https://arxiv.org/pdf/1911.08947.pdf>`_, using a mobilenet v3 large backbone.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import db_mobilenet_v3_large
-        >>> model = db_mobilenet_v3_large(pretrained=True)
-        >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on our text detection dataset
-
-    Returns:
-        text detection architecture
-    """
-
-    return _db_mobilenet('db_mobilenet_v3_large', pretrained, **kwargs)
                            -
                            @@ -674,7 +599,7 @@

                            Source code for doctr.models.detection.differentiable_binarization.tensorflo - + diff --git a/v0.4.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.4.0/_modules/doctr/models/detection/fast/tensorflow.html index dc7d8f50f2..af51a9abeb 100644 --- a/v0.4.0/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/detection/fast/tensorflow.html @@ -305,7 +305,7 @@

                            Source code for doctr.models.detection.fast.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, Sequential, layers
+from tensorflow.keras import Model, Sequential, layers
 
 from doctr.file_utils import CLASS_NAME
 from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params
diff --git a/v0.4.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.4.0/_modules/doctr/models/detection/linknet/tensorflow.html
index 31bb1117eb..9f836ce462 100644
--- a/v0.4.0/_modules/doctr/models/detection/linknet/tensorflow.html
+++ b/v0.4.0/_modules/doctr/models/detection/linknet/tensorflow.html
@@ -226,27 +226,21 @@
@@ -289,7 +283,6 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization from copy import deepcopy -import numpy as np import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers, Sequential @@ -307,7 +300,9 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            'linknet16': { 'mean': (0.798, 0.785, 0.772), 'std': (0.264, 0.2749, 0.287), + 'num_classes': 1, 'input_shape': (1024, 1024, 3), + 'rotated_bbox': False, 'url': None, }, } @@ -419,7 +414,7 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            def compute_loss( self, out_map: tf.Tensor, - target: List[np.ndarray], + target: List[Dict[str, Any]], focal_loss: bool = False, alpha: float = .5, gamma: float = 2., @@ -440,7 +435,7 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            A loss tensor """ seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) + seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) @@ -468,7 +463,7 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            else: # Compute BCE loss with highlighted edges loss = tf.math.multiply( - 1 + (edge_factor - 1) * tf.cast(edge_mask, out_map.dtype), + 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), bce ) loss = tf.reduce_mean(loss) @@ -478,7 +473,7 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            def call( self, x: tf.Tensor, - target: Optional[List[np.ndarray]] = None, + target: Optional[List[Dict[str, Any]]] = None, return_model_output: bool = False, return_boxes: bool = False, focal_loss: bool = True, @@ -511,8 +506,12 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            # Patch the config _cfg = deepcopy(default_cfgs[arch]) _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) + _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + kwargs['num_classes'] = _cfg['num_classes'] kwargs['input_shape'] = _cfg['input_shape'] + kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model model = LinkNet(cfg=_cfg, **kwargs) # Load pretrained parameters @@ -576,7 +575,7 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            +
                            diff --git a/v0.4.0/_modules/doctr/models/detection/zoo.html b/v0.4.0/_modules/doctr/models/detection/zoo.html index aa72312531..23a2f451e3 100644 --- a/v0.4.0/_modules/doctr/models/detection/zoo.html +++ b/v0.4.0/_modules/doctr/models/detection/zoo.html @@ -226,27 +226,21 @@ @@ -289,7 +283,7 @@

                            Source code for doctr.models.detection.zoo

                             from typing import Any
                             
                             from doctr.file_utils import is_tf_available, is_torch_available
                            -from .predictor import DetectionPredictor
                            +from .core import DetectionPredictor
                             from ..preprocessor import PreProcessor
                             from .. import detection
                             
                            @@ -298,9 +292,9 @@ 

                            Source code for doctr.models.detection.zoo

                             
                             
                             if is_tf_available():
                            -    ARCHS = ['db_resnet50', 'db_mobilenet_v3_large', 'linknet16']
                            +    ARCHS = ['db_resnet50', 'linknet16']
                             elif is_torch_available():
                            -    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3_large', 'linknet16']
                            +    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
                             
                             
                             def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
                            @@ -314,7 +308,7 @@ 

                            Source code for doctr.models.detection.zoo

                                 kwargs['std'] = kwargs.get('std', _model.cfg['std'])
                                 kwargs['batch_size'] = kwargs.get('batch_size', 1)
                                 predictor = DetectionPredictor(
                            -        PreProcessor(_model.cfg['input_shape'][:-1] if is_tf_available() else _model.cfg['input_shape'][1:], **kwargs),
                            +        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
                                     _model
                                 )
                                 return predictor
                            @@ -328,12 +322,12 @@ 

                            Source code for doctr.models.detection.zoo

                                 Example::
                                     >>> import numpy as np
                                     >>> from doctr.models import detection_predictor
                            -        >>> model = detection_predictor(arch='db_resnet50', pretrained=True)
                            +        >>> model = detection_predictor(pretrained=True)
                                     >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
                                     >>> out = model([input_page])
                             
                                 Args:
                            -        arch: name of the architecture to use (e.g. 'db_resnet50')
                            +        arch: name of the architecture to use ('db_resnet50')
                                     pretrained: If True, returns a model pre-trained on our text detection dataset
                             
                                 Returns:
                            @@ -374,7 +368,7 @@ 

                            Source code for doctr.models.detection.zoo

                                   
                                 
                               
                            -
                            +
                            diff --git a/v0.4.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.4.0/_modules/doctr/models/recognition/crnn/tensorflow.html index f44bf003d0..7b8529c26d 100644 --- a/v0.4.0/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -226,27 +226,21 @@ @@ -292,38 +286,30 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

                            from tensorflow.keras.models import Sequential, Model from typing import Tuple, Dict, Any, Optional, List -from ...backbones import vgg16_bn, resnet31, mobilenet_v3_small_r, mobilenet_v3_large_r +from ... import backbones from ...utils import load_pretrained_params from ..core import RecognitionModel, RecognitionPostProcessor -from ....datasets import VOCABS -__all__ = ['CRNN', 'crnn_vgg16_bn', 'CTCPostProcessor', 'crnn_mobilenet_v3_small', - 'crnn_mobilenet_v3_large'] +__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor'] default_cfgs: Dict[str, Dict[str, Any]] = { 'crnn_vgg16_bn': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'backbone': vgg16_bn, 'rnn_units': 128, - 'input_shape': (32, 128, 3), - 'vocab': VOCABS['legacy_french'], - 'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/crnn_vgg16_bn-76b7f2c6.zip', - }, - 'crnn_mobilenet_v3_small': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'backbone': mobilenet_v3_small_r, 'rnn_units': 128, + 'mean': (.5, .5, .5), + 'std': (1., 1., 1.), + 'backbone': 'vgg16_bn', 'rnn_units': 128, 'input_shape': (32, 128, 3), - 'vocab': VOCABS['french'], - 'url': 'https://github.com/mindee/doctr/releases/download/v0.3.1/crnn_mobilenet_v3_small-7f36edec.zip', + 'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-' + 'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'), + 'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip', }, - 'crnn_mobilenet_v3_large': { + 'crnn_resnet31': { 'mean': (0.694, 0.695, 0.693), 'std': (0.299, 0.296, 0.301), - 'backbone': mobilenet_v3_large_r, 'rnn_units': 128, + 'backbone': 'resnet31', 'rnn_units': 128, 'input_shape': (32, 128, 3), - 'vocab': VOCABS['french'], - 'url': None, + 'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-' + 'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'), + 'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip', }, } @@ -431,7 +417,7 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

                            """ gt, seq_len = self.compute_target(target) batch_len = model_output.shape[0] - input_length = tf.fill((batch_len,), model_output.shape[1]) + input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -468,15 +454,7 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

                            return out -def _crnn( - arch: str, - pretrained: bool, - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any -) -> CRNN: - - pretrained_backbone = pretrained_backbone and not pretrained +def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN: # Patch the config _cfg = deepcopy(default_cfgs[arch]) @@ -485,10 +463,9 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

                            _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) # Feature extractor - feat_extractor = _cfg['backbone']( + feat_extractor = backbones.__dict__[_cfg['backbone']]( input_shape=_cfg['input_shape'], include_top=False, - pretrained=pretrained_backbone, ) kwargs['vocab'] = _cfg['vocab'] @@ -527,40 +504,14 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

-[docs]
-def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN:
-    """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import crnn_mobilenet_v3_small
-        >>> model = crnn_mobilenet_v3_small(pretrained=True)
-        >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
-
-    Returns:
-        text recognition architecture
-    """
-
-    return _crnn('crnn_mobilenet_v3_small', pretrained, **kwargs)
                            - - - -
                            -[docs] -def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based +def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. Example:: >>> import tensorflow as tf - >>> from doctr.models import crnn_mobilenet_v3_large - >>> model = crnn_mobilenet_v3_large(pretrained=True) + >>> from doctr.models import crnn_resnet31 + >>> model = crnn_resnet31(pretrained=True) >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) >>> out = model(input_tensor) @@ -571,8 +522,7 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

         text recognition architecture
     """
-    return _crnn('crnn_mobilenet_v3_large', pretrained, **kwargs)
-
+    return _crnn('crnn_resnet31', pretrained, **kwargs)
                            @@ -605,7 +555,7 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

                            +
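The hunks above replace direct backbone imports with a lookup by name (`backbones.__dict__[_cfg['backbone']]`) driven by the string stored in `default_cfgs`. A minimal, self-contained sketch of that registry pattern; the registry dict and stand-in constructor below are illustrative, not doctr's actual API:

    from typing import Any, Callable, Dict

    def fake_vgg16_bn(input_shape: Any, include_top: bool = False) -> Dict[str, Any]:
        # Stand-in for a real feature-extractor constructor
        return {"input_shape": input_shape, "include_top": include_top}

    # The config stores the backbone as a string; the constructor is resolved at build time
    BACKBONES: Dict[str, Callable[..., Any]] = {"vgg16_bn": fake_vgg16_bn}
    cfg = {"backbone": "vgg16_bn", "input_shape": (32, 128, 3)}
    feat_extractor = BACKBONES[cfg["backbone"]](input_shape=cfg["input_shape"], include_top=False)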
                            diff --git a/v0.4.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.4.0/_modules/doctr/models/recognition/master/tensorflow.html index dc6ec3701a..6d9bff4577 100644 --- a/v0.4.0/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/recognition/master/tensorflow.html @@ -226,27 +226,21 @@ @@ -286,12 +280,12 @@

                            Source code for doctr.models.recognition.master.tensorflow

 # This program is licensed under the Apache License version 2.
 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.

-import math
 import tensorflow as tf
 from tensorflow.keras import layers, Sequential, Model
 from typing import Tuple, List, Dict, Any, Optional
 from copy import deepcopy

+from ..core import RecognitionPostProcessor
 from ...backbones.resnet import ResnetStage
 from ...utils import conv_sequence, load_pretrained_params
 from ..transformer import Decoder, positional_encoding, create_look_ahead_mask, create_padding_mask
@@ -304,11 +298,11 @@

                            Source code for doctr.models.recognition.master.tensorflow

 default_cfgs: Dict[str, Dict[str, Any]] = {
     'master': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'input_shape': (32, 128, 3),
-        'vocab': VOCABS['legacy_french'],
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/master-bade6eae.zip',
+        'mean': (.5, .5, .5),
+        'std': (1., 1., 1.),
+        'input_shape': (48, 160, 3),
+        'vocab': VOCABS['french'],
+        'url': None,
     },
 }
@@ -328,9 +322,8 @@

                            Source code for doctr.models.recognition.master.tensorflow

     def __init__(
         self,
         inplanes: int,
-        headers: int = 8,
+        headers: int = 1,
         att_scale: bool = False,
-        ratio: float = 0.0625,  # bottleneck ratio of 1/16 as described in paper
         **kwargs
     ) -> None:
         super().__init__(**kwargs)
@@ -338,7 +331,6 @@

                            Source code for doctr.models.recognition.master.tensorflow

         self.headers = headers  # h
         self.inplanes = inplanes  # C
         self.att_scale = att_scale

-        self.planes = int(inplanes * ratio)
         self.single_header_inplanes = int(inplanes / headers)  # C / h
@@ -351,7 +343,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

         self.transform = tf.keras.Sequential(
             [
                 tf.keras.layers.Conv2D(
-                    filters=self.planes,
+                    filters=self.inplanes,
                     kernel_size=1,
                     kernel_initializer=tf.initializers.he_normal()
                 ),
@@ -366,6 +358,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

             name='transform'
         )

+    @tf.function
     def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor:
         b, h, w, c = (tf.shape(inputs)[i] for i in range(4))
@@ -388,7 +381,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

         context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1))
         # scale variance
         if self.att_scale and self.headers > 1:
-            context_mask = context_mask / math.sqrt(self.single_header_inplanes)
+            context_mask = context_mask / tf.sqrt(self.single_header_inplanes)
         # B*h, 1, H*W, 1
         context_mask = tf.keras.activations.softmax(context_mask, axis=2)
@@ -422,8 +415,8 @@

                            Source code for doctr.models.recognition.master.tensorflow

     def __init__(
         self,
-        headers: int = 8,
-        input_shape: Tuple[int, int, int] = (32, 128, 3),
+        headers: int = 1,
+        input_shape: Tuple[int, int, int] = (48, 160, 3),
     ) -> None:
         _layers = [
             # conv_1x
@@ -472,13 +465,12 @@

                            Source code for doctr.models.recognition.master.tensorflow

         self,
         vocab: str,
         d_model: int = 512,
-        headers: int = 8,  # number of multi-aspect context
+        headers: int = 1,
         dff: int = 2048,
-        num_heads: int = 8,  # number of heads in the transformer decoder
+        num_heads: int = 8,
         num_layers: int = 3,
         max_length: int = 50,
-        dropout: float = 0.2,
-        input_shape: Tuple[int, int, int] = (32, 128, 3),
+        input_shape: Tuple[int, int, int] = (48, 160, 3),
         cfg: Optional[Dict[str, Any]] = None,
     ) -> None:
         super().__init__()
@@ -488,7 +480,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

         self.cfg = cfg
         self.vocab_size = len(vocab)

-        self.feat_extractor = MAGCResnet(headers=headers, input_shape=input_shape)
+        self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape)
         self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model)  # 3 more classes: EOS/PAD/SOS
         self.decoder = Decoder(
@@ -498,13 +490,13 @@

                            Source code for doctr.models.recognition.master.tensorflow

             dff=dff,
             vocab_size=self.vocab_size,
             maximum_position_encoding=max_length,
-            dropout=dropout,
         )
         self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model)
         self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())

         self.postprocessor = MASTERPostProcessor(vocab=self.vocab)

+    @tf.function
     def make_mask(self, target: tf.Tensor) -> tf.Tensor:
         look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1])
         target_padding_mask = create_padding_mask(target, self.vocab_size + 2)  # Pad symbol
@@ -541,7 +533,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

         mask_values = tf.zeros_like(cce)
         mask_2d = tf.sequence_mask(seq_len, input_len - 1)  # delete the last mask timestep as well
         masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype))
+        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))

         return tf.expand_dims(ce_loss, axis=1)
@@ -566,7 +558,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            """ # Encode - feature = self.feat_extractor(x, **kwargs) + feature = self.feature_extractor(x, **kwargs) b, h, w, c = (tf.shape(feature)[i] for i in range(4)) feature = tf.reshape(feature, shape=(b, h * w, c)) encoded = feature + self.feature_pe[:, :h * w, :] @@ -620,7 +612,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

         start_vector = tf.fill(dims=(b, 1), value=start_symbol)
         ys = tf.concat([start_vector, ys], axis=-1)
-        logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=encoded.dtype)  # 3 symbols
+        logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32)  # 3 symbols

         # max_len = len + 2 (sos + eos)
         for i in range(self.max_length - 1):
             ys_mask = self.make_mask(ys)
@@ -640,7 +632,6 @@

                            Source code for doctr.models.recognition.master.tensorflow

 class MASTERPostProcessor(_MASTERPostProcessor):
     """Post processor for MASTER architectures
-
     Args:
         vocab: string containing the ordered sequence of supported characters
         ignore_case: if True, ignore case of letters
@@ -691,17 +682,14 @@

                            Source code for doctr.models.recognition.master.tensorflow

[docs]
 def master(pretrained: bool = False, **kwargs: Any) -> MASTER:
     """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
-
     Example::
         >>> import tensorflow as tf
         >>> from doctr.models import master
         >>> model = master(pretrained=False)
         >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
         >>> out = model(input_tensor)
-
     Args:
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
-
     Returns:
         text recognition architecture
     """
@@ -740,7 +728,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            +
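Several of the MASTER hunks above touch the masked sequence cross-entropy: per-timestep losses are zeroed beyond each sequence's true length with `tf.sequence_mask` and `tf.where`, then summed and divided by that length. A standalone sketch of the same pattern; shapes, dtypes and the random inputs below are illustrative only:

    import tensorflow as tf

    cce = tf.random.uniform((2, 5))                      # per-timestep loss (N, T)
    seq_len = tf.constant([3, 5])                        # true lengths (N,)
    mask_2d = tf.sequence_mask(seq_len, maxlen=5)        # (N, T) boolean mask
    masked = tf.where(mask_2d, cce, tf.zeros_like(cce))  # zero out padded timesteps
    ce_loss = tf.reduce_sum(masked, axis=1) / tf.cast(seq_len, tf.float32)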
                            diff --git a/v0.4.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.4.0/_modules/doctr/models/recognition/parseq/tensorflow.html index 1bbbf829b1..93a3b2ea81 100644 --- a/v0.4.0/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -305,7 +305,7 @@

                            Source code for doctr.models.recognition.parseq.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers

 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
@@ -462,7 +462,6 @@

                            Source code for doctr.models.recognition.parseq.tensorflow

         self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)

-    @tf.function
     def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor:
         # Generates permutations of the target sequence.
         # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
@@ -509,7 +508,6 @@

                            Source code for doctr.models.recognition.parseq.tensorflow

         )
         return combined

-    @tf.function
     def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         # Generate source and target mask for the decoder attention.
         sz = permutation.shape[0]
@@ -529,7 +527,6 @@

                            Source code for doctr.models.recognition.parseq.tensorflow

         target_mask = mask[1:, :-1]
         return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool)

-    @tf.function
     def decode(
         self,
         target: tf.Tensor,
diff --git a/v0.4.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.4.0/_modules/doctr/models/recognition/sar/tensorflow.html
index 63ec5c6017..3a9989ef30 100644
--- a/v0.4.0/_modules/doctr/models/recognition/sar/tensorflow.html
+++ b/v0.4.0/_modules/doctr/models/recognition/sar/tensorflow.html
@@ -226,27 +226,21 @@
@@ -291,22 +285,31 @@

                            Source code for doctr.models.recognition.sar.tensorflow

 from tensorflow.keras import Sequential, layers, Model
 from typing import Tuple, Dict, List, Any, Optional

-from ...backbones import vgg16_bn, resnet31
+from ... import backbones
 from ...utils import load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 from doctr.utils.repr import NestedObject
-from ....datasets import VOCABS

-__all__ = ['SAR', 'SARPostProcessor', 'sar_resnet31']
+__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']

 default_cfgs: Dict[str, Dict[str, Any]] = {
+    'sar_vgg16_bn': {
+        'mean': (.5, .5, .5),
+        'std': (1., 1., 1.),
+        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
+        'input_shape': (32, 128, 3),
+        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
+                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
+        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
+    },
     'sar_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': resnet31, 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
+        'mean': (.5, .5, .5),
+        'std': (1., 1., 1.),
+        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
         'input_shape': (32, 128, 3),
-        'vocab': VOCABS['legacy_french'],
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/sar_resnet31-9ee49970.zip',
+        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
+                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
+        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
     },
 }
@@ -387,7 +390,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         super().__init__()
         self.vocab_size = vocab_size
         self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_layers)]
+            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
         )
         self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
         self.attention_module = AttentionModule(attention_units)
@@ -408,7 +411,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         # initialize states (each of shape (N, rnn_units))
         states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=features.dtype
+            inputs=None, batch_size=features.shape[0], dtype=tf.float32
         )
         # run first step of lstm
         # holistic: shape (N, rnn_units)
@@ -523,7 +526,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         mask_values = tf.zeros_like(cce)
         mask_2d = tf.sequence_mask(seq_len, input_len)
         masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype))
+        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))

         return tf.expand_dims(ce_loss, axis=1)

     def call(
@@ -588,15 +591,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         return list(zip(word_values, probs.numpy().tolist()))

-def _sar(
-    arch: str,
-    pretrained: bool,
-    pretrained_backbone: bool = True,
-    input_shape: Tuple[int, int, int] = None,
-    **kwargs: Any
-) -> SAR:
-
-    pretrained_backbone = pretrained_backbone and not pretrained
+def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:

     # Patch the config
     _cfg = deepcopy(default_cfgs[arch])
@@ -609,9 +604,8 @@

                            Source code for doctr.models.recognition.sar.tensorflow

     _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])

     # Feature extractor
-    feat_extractor = default_cfgs[arch]['backbone'](
+    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
         input_shape=_cfg['input_shape'],
-        pretrained=pretrained_backbone,
         include_top=False,
     )
@@ -631,6 +625,30 @@

                            Source code for doctr.models.recognition.sar.tensorflow

     return model

+
+[docs]
+def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR:
+    """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong
+    Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
+
+    Example::
+        >>> import tensorflow as tf
+        >>> from doctr.models import sar_vgg16_bn
+        >>> model = sar_vgg16_bn(pretrained=False)
+        >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
+        >>> out = model(input_tensor)
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+
+    Returns:
+        text recognition architecture
+    """
+
+    return _sar('sar_vgg16_bn', pretrained, **kwargs)
+
+
[docs]
 def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR:
@@ -685,7 +703,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

                            +
                            diff --git a/v0.4.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.4.0/_modules/doctr/models/recognition/vitstr/tensorflow.html index 23730f6227..aecde3662a 100644 --- a/v0.4.0/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.4.0/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -302,7 +302,7 @@

                            Source code for doctr.models.recognition.vitstr.tensorflow

 from typing import Any, Dict, List, Optional, Tuple

 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers

 from doctr.datasets import VOCABS
diff --git a/v0.4.0/_modules/doctr/models/recognition/zoo.html b/v0.4.0/_modules/doctr/models/recognition/zoo.html
index b07332edbb..0f1bff8861 100644
--- a/v0.4.0/_modules/doctr/models/recognition/zoo.html
+++ b/v0.4.0/_modules/doctr/models/recognition/zoo.html
@@ -226,27 +226,21 @@
@@ -288,16 +282,19 @@

                            Source code for doctr.models.recognition.zoo

                            from typing import Any
                             
                            -from doctr import is_tf_available
                            -from .predictor import RecognitionPredictor
                            -from doctr.models.preprocessor import PreProcessor
                            +from doctr.file_utils import is_tf_available, is_torch_available
                            +from .core import RecognitionPredictor
                            +from ..preprocessor import PreProcessor
                             from .. import recognition
                             
                             
                             __all__ = ["recognition_predictor"]
                             
                             
                            -ARCHS = ['crnn_vgg16_bn', 'crnn_mobilenet_v3_small', 'crnn_mobilenet_v3_large', 'sar_resnet31', 'master']
                            +if is_tf_available():
                            +    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
                            +elif is_torch_available():
                            +    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
                             
                             
                             def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
                            @@ -309,9 +306,8 @@ 

                            Source code for doctr.models.recognition.zoo

                            kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
                                 kwargs['std'] = kwargs.get('std', _model.cfg['std'])
                                 kwargs['batch_size'] = kwargs.get('batch_size', 32)
                            -    input_shape = _model.cfg['input_shape'][:2] if is_tf_available() else _model.cfg['input_shape'][-2:]
                                 predictor = RecognitionPredictor(
                            -        PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs),
                            +        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
                                     _model
                                 )
                             
                            @@ -331,7 +327,7 @@ 

                            Source code for doctr.models.recognition.zoo

                                    >>> out = model([input_page])
                             
                                 Args:
                            -        arch: name of the architecture to use (e.g. 'crnn_vgg16_bn')
                            +        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
                                     pretrained: If True, returns a model pre-trained on our text recognition dataset
                             
                                 Returns:
                            @@ -372,7 +368,7 @@ 

                            Source code for doctr.models.recognition.zoo

                               
                            -
                            +
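The zoo hunk above composes a recognition predictor from a PreProcessor (built from the model's `input_shape`, with `preserve_aspect_ratio=True`) and the model itself. A reduced sketch of that composition; the class names and behaviour below are simplified stand-ins, not the library's implementation:

    from typing import Any, List, Tuple

    class ToyPreProcessor:
        """Stand-in: records the target size instead of resizing/normalizing batches."""
        def __init__(self, output_size: Tuple[int, int], preserve_aspect_ratio: bool = True) -> None:
            self.output_size = output_size
            self.preserve_aspect_ratio = preserve_aspect_ratio

        def __call__(self, crops: List[Any]) -> List[Any]:
            return crops

    class ToyRecognitionPredictor:
        def __init__(self, pre_processor: ToyPreProcessor, model: Any) -> None:
            self.pre_processor = pre_processor
            self.model = model

        def __call__(self, crops: List[Any]) -> Any:
            return self.model(self.pre_processor(crops))

    predictor = ToyRecognitionPredictor(ToyPreProcessor((32, 128)), model=lambda batch: [len(batch)])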
                            diff --git a/v0.4.0/_modules/doctr/models/zoo.html b/v0.4.0/_modules/doctr/models/zoo.html index 373731f035..bfa5a6fdf4 100644 --- a/v0.4.0/_modules/doctr/models/zoo.html +++ b/v0.4.0/_modules/doctr/models/zoo.html @@ -226,27 +226,16 @@ @@ -287,7 +276,7 @@

                            Source code for doctr.models.zoo

                             # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                             
                             from typing import Any
                            -from .predictor import OCRPredictor
                            +from .core import OCRPredictor
                             from .detection.zoo import detection_predictor
                             from .recognition.zoo import recognition_predictor
                             
                            @@ -307,7 +296,7 @@ 

                            Source code for doctr.models.zoo

                             
                             
                             
-[docs]
+[docs]
 def ocr_predictor(
     det_arch: str = 'db_resnet50',
     reco_arch: str = 'crnn_vgg16_bn',
@@ -319,7 +308,7 @@

                            Source code for doctr.models.zoo

                                 Example::
                                     >>> import numpy as np
                                     >>> from doctr.models import ocr_predictor
                            -        >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
                            +        >>> model = ocr_predictor(pretrained=True)
                                     >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
                                     >>> out = model([input_page])
                             
                            @@ -365,7 +354,7 @@ 

                            Source code for doctr.models.zoo

                                   
                                 
                               
                            -
                            +
                            diff --git a/v0.4.0/_modules/doctr/transforms/modules/base.html b/v0.4.0/_modules/doctr/transforms/modules/base.html index 0cc12e51db..e7b5ea10d9 100644 --- a/v0.4.0/_modules/doctr/transforms/modules/base.html +++ b/v0.4.0/_modules/doctr/transforms/modules/base.html @@ -226,27 +226,21 @@ @@ -287,15 +281,13 @@

                            Source code for doctr.transforms.modules.base

 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.

 import random
-import math
-from typing import List, Any, Callable, Dict, Tuple
-import numpy as np
+from typing import List, Any, Callable

 from doctr.utils.repr import NestedObject
 from .. import functional as F

-__all__ = ['ColorInversion', 'OneOf', 'RandomApply', 'RandomRotate', 'RandomCrop']
+__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
                            @@ -379,64 +371,6 @@

                            Source code for doctr.transforms.modules.base

                            return self.transform(img) return img
                            - - -
                            -[docs] -class RandomRotate(NestedObject): - """Randomly rotate a tensor image and its boxes - - Args: - max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in - [-max_angle, max_angle] - expand: whether the image should be padded before the rotation - """ - def __init__(self, max_angle: float = 5., expand: bool = False) -> None: - self.max_angle = max_angle - self.expand = expand - - def extra_repr(self) -> str: - return f"max_angle={self.max_angle}, expand={self.expand}" - - def __call__(self, img: Any, target: Dict[str, np.ndarray]) -> Tuple[Any, Dict[str, np.ndarray]]: - angle = random.uniform(-self.max_angle, self.max_angle) - r_img, r_boxes = F.rotate(img, target["boxes"], angle, self.expand) - return r_img, dict(boxes=r_boxes)
                            - - - -
                            -[docs] -class RandomCrop(NestedObject): - """Randomly crop a tensor image and its boxes - - Args: - scale: tuple of floats, relative (min_area, max_area) of the crop - ratio: tuple of float, relative (min_ratio, max_ratio) where ratio = h/w - """ - def __init__(self, scale: Tuple[float, float] = (0.08, 1.), ratio: Tuple[float, float] = (0.75, 1.33)) -> None: - self.scale = scale - self.ratio = ratio - - def extra_repr(self) -> str: - return f"scale={self.scale}, ratio={self.ratio}" - - def __call__(self, img: Any, target: Dict[str, np.ndarray]) -> Tuple[Any, Dict[str, np.ndarray]]: - h, w = img.shape[:2] - scale = random.uniform(self.scale[0], self.scale[1]) - ratio = random.uniform(self.ratio[0], self.ratio[1]) - crop_h = math.sqrt(scale * ratio) - crop_w = math.sqrt(scale / ratio) - start_x, start_y = random.uniform(0, 1 - crop_w), random.uniform(0, 1 - crop_h) - crop_box = ( - max(0, int(round(start_x * w))), - max(0, int(round(start_y * h))), - min(int(round((start_x + crop_w) * w)), w - 1), - min(int(round((start_y + crop_h) * h)), h - 1) - ) - croped_img, crop_boxes = F.crop_detection(img, target["boxes"], crop_box) - return croped_img, dict(boxes=crop_boxes)
                            -
                            @@ -469,7 +403,7 @@

                            Source code for doctr.transforms.modules.base

                            -
                            +
                            diff --git a/v0.4.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.4.0/_modules/doctr/transforms/modules/tensorflow.html index 6c092476b5..51b31b4fc4 100644 --- a/v0.4.0/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.4.0/_modules/doctr/transforms/modules/tensorflow.html @@ -236,7 +236,7 @@

                            Package Reference

                            • doctr.datasets
                            • -
                            • doctr.io
                            • +
                            • doctr.documents
                            • doctr.models
                            • doctr.transforms
                            • doctr.utils
                            • @@ -355,7 +355,6 @@

                              Source code for doctr.transforms.modules.tensorflow

         return _repr

     def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        input_dtype = img.dtype
         img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio)
         if self.preserve_aspect_ratio:
             # pad width
@@ -366,7 +365,7 @@

                              Source code for doctr.transforms.modules.tensorflow

             else:
                 offset = (int((self.output_size[0] - img.shape[0]) / 2), 0)
             img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size)
-        return tf.cast(img, dtype=input_dtype)
+        return img
                              @@ -386,15 +385,15 @@

                              Source code for doctr.transforms.modules.tensorflow

         std: standard deviation per channel
     """
     def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None:
-        self.mean = tf.constant(mean)
-        self.std = tf.constant(std)
+        self.mean = tf.constant(mean, dtype=tf.float32)
+        self.std = tf.constant(std, dtype=tf.float32)

     def extra_repr(self) -> str:
         return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}"

     def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        img -= tf.cast(self.mean, dtype=img.dtype)
-        img /= tf.cast(self.std, dtype=img.dtype)
+        img -= self.mean
+        img /= self.std
         return img
                              @@ -640,7 +639,7 @@

                              Source code for doctr.transforms.modules.tensorflow

                              +
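The Resize/Normalize hunks above drop the dtype bookkeeping and keep the normalization constants as float32. A quick sketch of what the resulting Normalize transform does to a float32 image; the mean/std values and image shape below are illustrative:

    import tensorflow as tf

    mean = tf.constant((0.5, 0.5, 0.5), dtype=tf.float32)
    std = tf.constant((1.0, 1.0, 1.0), dtype=tf.float32)
    img = tf.random.uniform((32, 128, 3), dtype=tf.float32)
    normalized = (img - mean) / std  # broadcast over the channel dimension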
                              diff --git a/v0.4.0/_modules/doctr/utils/visualization.html b/v0.4.0/_modules/doctr/utils/visualization.html index e7ca1589c7..21743f6182 100644 --- a/v0.4.0/_modules/doctr/utils/visualization.html +++ b/v0.4.0/_modules/doctr/utils/visualization.html @@ -226,27 +226,21 @@ @@ -291,130 +285,65 @@

                              Source code for doctr.utils.visualization

                               import matplotlib.patches as patches
                               import mplcursors
                               from PIL import ImageFont, ImageDraw, Image
                              -from copy import deepcopy
                               import numpy as np
                               import cv2
                              -from unidecode import unidecode
                              -from typing import Tuple, List, Dict, Any, Union, Optional
                              +from typing import Tuple, List, Dict, Any, Union
                               
                               from .common_types import BoundingBox, RotatedBbox
                              -from .fonts import get_font
                               
                              -__all__ = ['visualize_page', 'synthesize_page', 'draw_boxes']
                              +__all__ = ['visualize_page', 'synthetize_page']
                               
                               
                              -def rect_patch(
                              -    geometry: BoundingBox,
                              +def create_rect_patch(
                              +    geometry: Union[BoundingBox, RotatedBbox],
                              +    label: str,
                                   page_dimensions: Tuple[int, int],
                              -    label: Optional[str] = None,
                              -    color: Tuple[float, float, float] = (0, 0, 0),
                              +    color: Tuple[int, int, int],
                                   alpha: float = 0.3,
                                   linewidth: int = 2,
                                   fill: bool = True,
                              -) -> patches.Rectangle:
                              -    """Create a matplotlib rectangular patch for the element
                              +) -> patches.Patch:
                              +    """Create a matplotlib patch (rectangle) bounding the element
                               
                                   Args:
                                       geometry: bounding box of the element
                              -        page_dimensions: dimensions of the Page
                                       label: label to display when hovered
                              +        page_dimensions: dimensions of the Page
                                       color: color to draw box
                                       alpha: opacity parameter to fill the boxes, 0 = transparent
                                       linewidth: line width
                              -        fill: whether the patch should be filled
                               
                                   Returns:
                                       a rectangular Patch
                                   """
                              -
                              -    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
                              -        raise ValueError("invalid geometry format")
                              -
                              -    # Unpack
                                   height, width = page_dimensions
                              -    (xmin, ymin), (xmax, ymax) = geometry
                              -    # Switch to absolute coords
                              -    xmin, w = xmin * width, (xmax - xmin) * width
                              -    ymin, h = ymin * height, (ymax - ymin) * height
                              -
                              -    return patches.Rectangle(
                              -        (xmin, ymin),
                              -        w,
                              -        h,
                              -        fill=fill,
                              -        linewidth=linewidth,
                              -        edgecolor=(*color, alpha),
                              -        facecolor=(*color, alpha),
                              -        label=label,
                              -    )
                              -
                              -
                              -def polygon_patch(
                              -    geometry: RotatedBbox,
                              -    page_dimensions: Tuple[int, int],
                              -    label: Optional[str] = None,
                              -    color: Tuple[float, float, float] = (0, 0, 0),
                              -    alpha: float = 0.3,
                              -    linewidth: int = 2,
                              -    fill: bool = True,
                              -) -> patches.Polygon:
                              -    """Create a matplotlib polygon patch for the element
                              -
                              -    Args:
                              -        geometry: bounding box of the element
                              -        page_dimensions: dimensions of the Page
                              -        label: label to display when hovered
                              -        color: color to draw box
                              -        alpha: opacity parameter to fill the boxes, 0 = transparent
                              -        linewidth: line width
                              -        fill: whether the patch should be filled
                              -
                              -    Returns:
                              -        a polygon Patch
                              -    """
                              -
                              -    if len(geometry) != 5 or any(not isinstance(elt, float) for elt in geometry):
                              -        raise ValueError("invalid geometry format")
                              -
                              -    # Unpack
                              -    height, width = page_dimensions
                              -    x, y, w, h, a = geometry
                              -    # Switch to absolute coords
                              -    x, w = x * width, w * width
                              -    y, h = y * height, h * height
                              -    points = cv2.boxPoints(((x, y), (w, h), a))
                              -
                              -    return patches.Polygon(
                              -        points,
                              -        fill=fill,
                              -        linewidth=linewidth,
                              -        edgecolor=(*color, alpha),
                              -        facecolor=(*color, alpha),
                              -        label=label,
                              -    )
                              -
                              -
                              -def create_obj_patch(
                              -    geometry: Union[BoundingBox, RotatedBbox],
                              -    page_dimensions: Tuple[int, int],
                              -    **kwargs: Any,
                              -) -> patches.Patch:
                              -    """Create a matplotlib patch for the element
                              -
                              -    Args:
                              -        geometry: bounding box (straight or rotated) of the element
                              -        page_dimensions: dimensions of the page
                              -
                              -    Returns:
                              -        a matplotlib Patch
                              -    """
                              -    if isinstance(geometry, tuple):
                              -        if len(geometry) == 2:
                              -            return rect_patch(geometry, page_dimensions, **kwargs)  # type: ignore[arg-type]
                              -        elif len(geometry) == 5:
                              -            return polygon_patch(geometry, page_dimensions, **kwargs)  # type: ignore[arg-type]
                              -
                              -    raise ValueError("invalid geometry format")
                              +    if len(geometry) == 5:
                              +        x, y, w, h, a = geometry  # type: ignore[misc]
                              +        x, w = x * width, w * width
                              +        y, h = y * height, h * height
                              +        points = cv2.boxPoints(((x, y), (w, h), a))
                              +        return patches.Polygon(
                              +            points,
                              +            fill=fill,
                              +            linewidth=linewidth,
                              +            edgecolor=(*color, alpha),
                              +            facecolor=(*color, alpha),
                              +            label=label
                              +        )
                              +    else:
                              +        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
                              +        xmin, xmax = xmin * width, xmax * width
                              +        ymin, ymax = ymin * height, ymax * height
                              +        return patches.Rectangle(
                              +            (xmin, ymin),
                              +            xmax - xmin,
                              +            ymax - ymin,
                              +            fill=fill,
                              +            linewidth=linewidth,
                              +            edgecolor=(*color, alpha),
                              +            facecolor=(*color, alpha),
                              +            label=label
                              +        )
                               
                               
                               
                              @@ -465,8 +394,7 @@

                              Source code for doctr.utils.visualization

                               
                                   for block in page['blocks']:
                                       if not words_only:
                              -            rect = create_obj_patch(block['geometry'], page['dimensions'],
                              -                                    label='block', color=(0, 1, 0), linewidth=1, **kwargs)
                              +            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
                                           # add patch on figure
                                           ax.add_patch(rect)
                                           if interactive:
                              @@ -475,16 +403,14 @@ 

                              Source code for doctr.utils.visualization

                               
                                       for line in block['lines']:
                                           if not words_only:
                              -                rect = create_obj_patch(line['geometry'], page['dimensions'],
                              -                                        label='line', color=(1, 0, 0), linewidth=1, **kwargs)
                              +                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
                                               ax.add_patch(rect)
                                               if interactive:
                                                   artists.append(rect)
                               
                                           for word in line['words']:
                              -                rect = create_obj_patch(word['geometry'], page['dimensions'],
                              -                                        label=f"{word['value']} (confidence: {word['confidence']:.2%})",
                              -                                        color=(0, 0, 1), **kwargs)
                              +                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
                              +                                         page['dimensions'], (0, 0, 1), **kwargs)
                                               ax.add_patch(rect)
                                               if interactive:
                                                   artists.append(rect)
                              @@ -509,11 +435,11 @@ 

                              Source code for doctr.utils.visualization

                               
                                       if display_artefacts:
                                           for artefact in block['artefacts']:
                              -                rect = create_obj_patch(
                              +                rect = create_rect_patch(
                                                   artefact['geometry'],
                              +                    'artefact',
                                                   page['dimensions'],
                              -                    label='artefact',
                              -                    color=(0.5, 0.5, 0.5),
                              +                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
                                                   linewidth=1,
                                                   **kwargs
                                               )
                              @@ -530,13 +456,10 @@ 

                              Source code for doctr.utils.visualization

                               
                               
                               
                              -
-[docs]
-def synthesize_page(
+def synthetize_page(
     page: Dict[str, Any],
     draw_proba: bool = False,
     font_size: int = 13,
-    font_family: Optional[str] = None,
 ) -> np.ndarray:
     """Draw a the content of the element page (OCR response) on a blank page.
@@ -544,12 +467,10 @@

                              Source code for doctr.utils.visualization

                                       page: exported Page object to represent
                                       draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
                                       font_size: size of the font, default font = 13
                              -        font_family: family of the font
                               
                                   Return:
                              -        the synthesized page
                              +        A np array (drawn page)
                                   """
                              -
                                   # Draw template
                                   h, w = page["dimensions"]
                                   response = 255 * np.ones((h, w, 3), dtype=np.int32)
                              @@ -564,15 +485,16 @@ 

                              Source code for doctr.utils.visualization

                                               ymin, ymax = int(h * ymin), int(h * ymax)
                               
                                               # White drawing context adapted to font size, 0.75 factor to convert pts --> pix
                              -                font = get_font(font_family, int(0.75 * (ymax - ymin)))
                              -                img = Image.new('RGB', (xmax - xmin, ymax - ymin), color=(255, 255, 255))
                              +                h_box, w_box = ymax - ymin, xmax - xmin
                              +                h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75))
                              +                img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255))
                                               d = ImageDraw.Draw(img)
                              +
                                               # Draw in black the value of the word
                              -                try:
                              -                    d.text((0, 0), word["value"], font=font, fill=(0, 0, 0))
                              -                except UnicodeEncodeError:
                              -                    # When character cannot be encoded, use its unidecode version
                              -                    d.text((0, 0), unidecode(word["value"]), font=font, fill=(0, 0, 0))
                              +                d.text((0, 0), word["value"], font=ImageFont.load_default(), fill=(0, 0, 0))
                              +
                              +                # Resize back to box size
                              +                img = img.resize((w_box, h_box), Image.NEAREST)
                               
                                               # Colorize if draw_proba
                                               if draw_proba:
                              @@ -586,39 +508,7 @@ 

                              Source code for doctr.utils.visualization

                                               # Write to response page
                                               response[ymin:ymax, xmin:xmax, :] = np.array(img)
                               
                              -    return response
-
-
-
-def draw_boxes(
-    boxes: np.ndarray,
-    image: np.ndarray,
-    color: Optional[Tuple] = None,
-    **kwargs
-) -> None:
-    """Draw an array of relative straight boxes on an image
-
-    Args:
-        boxes: array of relative boxes, of shape (*, 4)
-        image: np array, float32 or uint8
-    """
-    h, w = image.shape[:2]
-    # Convert boxes to absolute coords
-    _boxes = deepcopy(boxes)
-    _boxes[:, [0, 2]] *= w
-    _boxes[:, [1, 3]] *= h
-    _boxes = _boxes.astype(np.int32)
-    for box in _boxes.tolist():
-        xmin, ymin, xmax, ymax = box
-        image = cv2.rectangle(
-            image,
-            (xmin, ymin),
-            (xmax, ymax),
-            color=color if isinstance(color, tuple) else (0, 0, 255),
-            thickness=2
-        )
-    plt.imshow(image)
-    plt.plot(**kwargs)
+    return response
                              @@ -651,7 +541,7 @@

                              Source code for doctr.utils.visualization

                                     
                                   
                                 
                              -
                              +
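The visualization hunks above fold the rectangle/polygon helpers back into a single `create_rect_patch` that turns relative page coordinates into absolute matplotlib patches. A minimal sketch of the straight-box case; the helper name below is illustrative rather than the library's own:

    import matplotlib.patches as patches

    def to_rect(geometry, page_dimensions, color=(0, 0, 1), alpha=0.3):
        # geometry is ((xmin, ymin), (xmax, ymax)) relative to the page size
        height, width = page_dimensions
        (xmin, ymin), (xmax, ymax) = geometry
        xmin, xmax = xmin * width, xmax * width
        ymin, ymax = ymin * height, ymax * height
        return patches.Rectangle(
            (xmin, ymin), xmax - xmin, ymax - ymin,
            fill=True, edgecolor=(*color, alpha), facecolor=(*color, alpha),
        )

    patch = to_rect(((0.1, 0.2), (0.4, 0.3)), (600, 800))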
                              diff --git a/v0.4.0/_modules/index.html b/v0.4.0/_modules/index.html index 7b9d4cfa7d..c887b618c2 100644 --- a/v0.4.0/_modules/index.html +++ b/v0.4.0/_modules/index.html @@ -226,27 +226,21 @@ @@ -281,25 +275,19 @@

                              All modules for which code is available

                              -
                              -
                              +
                              diff --git a/v0.4.0/_sources/changelog.rst.txt b/v0.4.0/_sources/changelog.rst.txt index 95358cf771..430097d6c8 100644 --- a/v0.4.0/_sources/changelog.rst.txt +++ b/v0.4.0/_sources/changelog.rst.txt @@ -1,14 +1,6 @@ Changelog ========= -v0.3.1 (2021-08-27) -------------------- -Release note: `v0.3.1 `_ - -v0.3.0 (2021-07-02) -------------------- -Release note: `v0.3.0 `_ - v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.4.0/_sources/datasets.rst.txt b/v0.4.0/_sources/datasets.rst.txt index 71c2d69e88..354122f1e5 100644 --- a/v0.4.0/_sources/datasets.rst.txt +++ b/v0.4.0/_sources/datasets.rst.txt @@ -11,13 +11,17 @@ can be a significant save of time. Available Datasets ------------------ +The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. + +.. autoclass:: doctr.datasets.datasets.VisionDataset + + Here are all datasets that are available through DocTR: .. autoclass:: FUNSD .. autoclass:: SROIE .. autoclass:: CORD .. autoclass:: OCRDataset -.. autoclass:: CharacterGenerator Data Loading @@ -55,25 +59,10 @@ of vocabs. - 5 - £€¥¢฿ * - latin - - 94 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ - * - english - - 100 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ - * - legacy_french - - 123 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ + - 96 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° * - french - - 126 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ - * - portuguese - - 131 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿ - * - spanish - - 116 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿ - * - german - - 108 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ + - 154 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ .. autofunction:: encode_sequences diff --git a/v0.4.0/_sources/index.rst.txt b/v0.4.0/_sources/index.rst.txt index a9dfeab2d8..fc3ff89fdf 100644 --- a/v0.4.0/_sources/index.rst.txt +++ b/v0.4.0/_sources/index.rst.txt @@ -1,7 +1,7 @@ DocTR: Document Text Recognition ================================ -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta) .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -12,6 +12,9 @@ DocTR provides an easy and powerful way to extract valuable information from you * |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. +Welcome to the documentation of `DocTR `_! 
+ + Main Features ------------- @@ -20,32 +23,39 @@ Main Features * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor * |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, minimal dependencies -* |:tools:| Actively maintained by Mindee -* |:factory:| Easy integration (available templates for browser demo & API deployment) +* |:bird:| Light package, small dependencies +* |:tools:| Daily maintained +* |:factory:| Easy integration +Getting Started +--------------- + .. toctree:: :maxdepth: 2 - :caption: Getting started - :hidden: installing +Build & train your predictor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) +* Fine-tune or train from scratch any detection or recognition model to specialize on your data + + Model zoo ^^^^^^^^^ Text detection models """"""""""""""""""""" - * DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ - * LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ + * `DBNet `_ (Differentiable Binarization) + * `LinkNet `_ Text recognition models """"""""""""""""""""""" - * SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ - * CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ - * MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ + * `SAR `_ (Show, Attend and Read) + * `CRNN `_ (Convolutional Recurrent Neural Network) + * `MASTER `_ (Multi-Aspect Non-local Network for Scene Text Recognition) Supported datasets @@ -57,28 +67,17 @@ Supported datasets .. toctree:: :maxdepth: 2 - :caption: Using DocTR - :hidden: + :caption: Notes - using_models - using_model_export + changelog .. toctree:: :maxdepth: 2 :caption: Package Reference - :hidden: datasets - io + documents models transforms utils - - -.. toctree:: - :maxdepth: 2 - :caption: Notes - :hidden: - - changelog diff --git a/v0.4.0/_sources/installing.rst.txt b/v0.4.0/_sources/installing.rst.txt index bb5a7a527f..5c8779dc1c 100644 --- a/v0.4.0/_sources/installing.rst.txt +++ b/v0.4.0/_sources/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.6 or higher. +This library requires Python 3.6 or higher. Prerequisites @@ -11,12 +11,12 @@ Prerequisites Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: -* `TensorFlow 2 `_ -* `PyTorch `_ +* TensorFlow: `installation page `_. +* PyTorch: `installation page `_. If you are running another OS than Linux, you will need a few extra dependencies. -For MacOS users, you can install them using `Homebrew `_ as follows: +For MacOS users, you can install them as follows: .. code:: shell @@ -28,23 +28,13 @@ For Windows users, those dependencies are included in GTK. You can find the late Via Python Package ================== -Install the last stable release of the package using `pip `_: +Install the last stable release of the package using pip: .. 
code:: bash pip install python-doctr -We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows: - -.. code:: bash - - # for TensorFlow - pip install python-doctr[tf] - # for PyTorch - pip install python-doctr[torch] - - Via Git ======= @@ -54,13 +44,3 @@ Install the library in developper mode: git clone https://github.com/mindee/doctr.git pip install -e doctr/. - -Again, for framework-specific builds: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - # for TensorFlow - pip install -e doctr/.[tf] - # for PyTorch - pip install -e doctr/.[torch] diff --git a/v0.4.0/_sources/io.rst.txt b/v0.4.0/_sources/io.rst.txt deleted file mode 100644 index d23e11bdb9..0000000000 --- a/v0.4.0/_sources/io.rst.txt +++ /dev/null @@ -1,92 +0,0 @@ -doctr.io -======== - - -.. currentmodule:: doctr.io - -The io module enables users to easily access content from documents and export analysis -results to structured formats. - -.. _document_structure: - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img_as_numpy - -.. autofunction:: read_img_as_tensor - -.. autofunction:: decode_img_as_tensor - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.4.0/_sources/models.rst.txt b/v0.4.0/_sources/models.rst.txt index 77ec8c16e8..9830c6c153 100644 --- a/v0.4.0/_sources/models.rst.txt +++ b/v0.4.0/_sources/models.rst.txt @@ -1,54 +1,215 @@ doctr.models ============ +The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. +Either performed at once or separately, to each task corresponds a type of deep learning architecture. + .. currentmodule:: doctr.models +For a given task, DocTR provides a Predictor, which is composed of 2 components: -doctr.models.backbones ----------------------- +* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. +* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. -.. 
autofunction:: doctr.models.backbones.vgg16_bn -.. autofunction:: doctr.models.backbones.resnet31 +Text Detection +-------------- +Localizing text elements in images -.. autofunction:: doctr.models.backbones.mobilenet_v3_small ++---------------------------------------------------+----------------------------+----------------------------+---------+ +| | FUNSD | CORD | | ++==================+=================+==============+============+===============+============+===============+=========+ +| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ +| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -.. autofunction:: doctr.models.backbones.mobilenet_v3_large +All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. -.. autofunction:: doctr.models.backbones.mobilenet_v3_small_r +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* -.. autofunction:: doctr.models.backbones.mobilenet_v3_large_r +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. +Pre-processing for detection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for detection is the following: -doctr.models.detection ----------------------- +1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. +2. batch images together +3. normalize the batch using the training data statistics -.. autofunction:: doctr.models.detection.linknet16 + +Detection models +^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: .. autofunction:: doctr.models.detection.db_resnet50 +.. autofunction:: doctr.models.detection.linknet16 -.. autofunction:: doctr.models.detection.db_mobilenet_v3_large +Detection predictors +^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. .. autofunction:: doctr.models.detection.detection_predictor -doctr.models.recognition ------------------------- +Text Recognition +---------------- +Identifying strings in images + +.. list-table:: Text recognition model zoo + :widths: 20 20 15 10 10 10 + :header-rows: 1 + + * - Architecture + - Input shape + - # params + - FUNSD + - CORD + - FPS + * - crnn_vgg16_bn + - (32, 128, 3) + - 15.8M + - 86.02 + - 91.3 + - 12.8 + * - sar_vgg16_bn + - (32, 128, 3) + - 21.5M + - 86.2 + - 91.7 + - 3.3 + * - sar_resnet31 + - (32, 128, 3) + - 53.1M + - **86.3** + - **92.1** + - 2.7 + +All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). 
+Explanations about the metrics being used are available in :ref:`metrics`. + +All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Pre-processing for recognition +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for recognition is the following: + +1. resize each input image to the target size (bilinear interpolation by default) without deformation. +2. pad the image to the target size (with zeros by default) +3. batch images together +4. normalize the batch using the training data statistics + +Recognition models +^^^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: + .. autofunction:: doctr.models.recognition.crnn_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_small -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_large +Recognition predictors +^^^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage. -.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.recognition_predictor -.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.recognition_predictor +End-to-End OCR +-------------- +Predictors that localize and identify text elements in images ++-----------------------------+--------------------------------------+--------------------------------------+ +| | FUNSD | CORD | ++=============================+============+===============+=========+============+===============+=========+ +| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision doc. 
text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ + +All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. + +All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Results on private ocr datasets + ++------------------------------------+----------------------------+----------------------------+----------------------------+ +| | Receipts | Invoices | IDs | ++====================================+============+===============+============+===============+============+===============+ +| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ + + +Two-stage approaches +^^^^^^^^^^^^^^^^^^^^ +Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. + +.. autofunction:: doctr.models.zoo.ocr_predictor + + +Model export +------------ +Utility functions to make the most of document analysis models. + +.. currentmodule:: doctr.models.export + +Model compression +^^^^^^^^^^^^^^^^^ + +.. autofunction:: convert_to_tflite + +.. autofunction:: convert_to_fp16 + +.. autofunction:: quantize_model + +Using SavedModel +^^^^^^^^^^^^^^^^ + +Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to +`SavedModel `_ format as follows: + + + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> _ = model(input_t, training=False) + >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') + +And loaded just as easily: -doctr.models.zoo ----------------- -.. 
autofunction:: doctr.models.ocr_predictor + >>> import tensorflow as tf + >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.4.0/_sources/transforms.rst.txt b/v0.4.0/_sources/transforms.rst.txt index 91e9f0c0f6..0230fe75f5 100644 --- a/v0.4.0/_sources/transforms.rst.txt +++ b/v0.4.0/_sources/transforms.rst.txt @@ -21,8 +21,6 @@ Here are all transformations that are available through DocTR: .. autoclass:: RandomHue .. autoclass:: RandomGamma .. autoclass:: RandomJpegQuality -.. autoclass:: RandomRotate -.. autoclass:: RandomCrop Composing transformations diff --git a/v0.4.0/_sources/using_doctr/using_model_export.rst.txt b/v0.4.0/_sources/using_doctr/using_model_export.rst.txt index 48f570f699..c62c36169b 100644 --- a/v0.4.0/_sources/using_doctr/using_model_export.rst.txt +++ b/v0.4.0/_sources/using_doctr/using_model_export.rst.txt @@ -31,7 +31,7 @@ Advantages: .. code:: python3 import tensorflow as tf - from keras import mixed_precision + from tensorflow.keras import mixed_precision mixed_precision.set_global_policy('mixed_float16') predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True) diff --git a/v0.4.0/_sources/using_model_export.rst.txt b/v0.4.0/_sources/using_model_export.rst.txt deleted file mode 100644 index ff2bdfd3e7..0000000000 --- a/v0.4.0/_sources/using_model_export.rst.txt +++ /dev/null @@ -1,71 +0,0 @@ -Preparing your model for inference -================================== - -A well-trained model is a good achievement but you might want to tune a few things to make it production-ready! - -.. currentmodule:: doctr.models.export - - -Model compression ------------------ - -This section is meant to help you perform inference with compressed versions of your model. - - -TensorFlow Lite -^^^^^^^^^^^^^^^ - -TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. 
You can easily convert any Keras model into a serialized TFLite version as follows: - - >>> import tensorflow as tf - >>> from tensorflow.keras import Sequential - >>> from doctr.models import conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - >>> serialized_model = converter.convert() - -Half-precision -^^^^^^^^^^^^^^ - -If you want to convert it to half-precision using your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> converter.target_spec.supported_types = [tf.float16] - >>> serialized_model = converter.convert() - - -Post-training quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Finally if you wish to quantize the model with your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> # Float fallback for operators that do not have an integer implementation - >>> def representative_dataset(): - >>> for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)] - >>> converter.representative_dataset = representative_dataset - >>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - >>> converter.inference_input_type = tf.int8 - >>> converter.inference_output_type = tf.int8 - >>> serialized_model = converter.convert() - - -Using SavedModel ----------------- - -Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.4.0/_sources/using_models.rst.txt b/v0.4.0/_sources/using_models.rst.txt deleted file mode 100644 index 17b2be0d4d..0000000000 --- a/v0.4.0/_sources/using_models.rst.txt +++ /dev/null @@ -1,290 +0,0 @@ -Choosing the right model -======================== - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, DocTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the deep learning model. -* Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable. - - -Text Detection --------------- - -The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in DocTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don't). 
- -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `linknet16 `_ -* `db_resnet50 `_ -* `db_mobilenet_v3_large `_ - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -+------------------------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+=================================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 79.35 | 84.03 | 81.14 | 66.85 | | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ - -`detection_predictor `_ wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> predictor = detection_predictor('db_resnet50') - >>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -Text Recognition ----------------- - -The task consists of transcribing the character sequence in a given image. - - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `crnn_vgg16_bn `_ -* `crnn_mobilenet_v3_small `_ -* `crnn_mobilenet_v3_large `_ -* `sar_resnet31 `_ -* `master `_ - - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -.. list-table:: Text recognition model zoo - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 87.15 - - 92.92 - - 12.8 - * - crnn_mobilenet_v3_small - - (32, 128, 3) - - 2.1M - - - - - - - * - crnn_mobilenet_v3_large - - (32, 128, 3) - - 4.5M - - - - - - - * - sar_resnet31 - - (32, 128, 3) - - 56.2M - - **87.70** - - **93.41** - - 2.7 - * - master - - (32, 128, 3) - - 67.7M - - 87.62 - - 93.27 - - - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metric being used (exact match) are available in :ref:`metrics`. 
- -While most of our recognition models were trained on our french vocab (cf. :ref:`vocabs`), you can easily access the vocab of any model as follows: - - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> print(predictor.model.cfg['vocab']) - - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -`recognition_predictor `_ wraps your recognition model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -End-to-End OCR --------------- - -The task consists of both localizing and transcribing textual elements in a given image. - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -You can use any combination of detection and recognition models supporte by DocTR. - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - -+----------------------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+========================================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 71.00 | 76.02 | 0.85 | 83.87 | 81.34 | 1.6 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + master | 71.03 | 76.06 | | 84.49 | 81.94 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | 71.25 | 76.29 | 0.27 | 84.50 | **81.96** | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_mobilenet_v3_large + crnn_vgg16_bn | 67.73 | 71.73 | | 71.65 | 59.03 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). 
-Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - -Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information. - - -+----------------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | US Tax Forms | -+==============================================+============+===============+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | 78.70 | 81.12 | 65.80 | 70.70 | 50.25 | 51.78 | 79.08 | 92.83 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + master (ours) | **79.00** | **81.42** | 65.57 | 69.86 | 51.34 | 52.90 | 78.86 | 92.57 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + sar_resnet31 (ours) | 78.94 | 81.37 | 65.89 | **70.79** | **51.78** | **53.35** | 79.04 | 92.78 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 | 74.93 | 63.04 | 68.41 | 39.36 | 41.75 | 72.14 | 89.97 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. Everything is wrapped up with `ocr_predictor `_. 
- - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) - >>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([input_page]) - - -What should I do with the output? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ocr_predictor returns a `Document` object with a nested structure (with `Page`, `Block`, `Line`, `Word`, `Artefact`). -To get a better understanding of our document model, check our :ref:`document_structure` section - -Here is a typical `Document` layout:: - - Document( - (pages): [Page( - dimensions=(340, 600) - (blocks): [Block( - (lines): [Line( - (words): [ - Word(value='No.', confidence=0.91), - Word(value='RECEIPT', confidence=0.99), - Word(value='DATE', confidence=0.96), - ] - )] - (artefacts): [] - )] - )] - ) - -You can also export them as a nested dict, more appropriate for JSON format:: - - json_output = result.export() - -For reference, here is the JSON export for the same `Document` as above:: - - { - 'pages': [ - { - 'page_idx': 0, - 'dimensions': (340, 600), - 'orientation': {'value': None, 'confidence': None}, - 'language': {'value': None, 'confidence': None}, - 'blocks': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'lines': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'words': [ - { - 'value': 'No.', - 'confidence': 0.914085328578949, - 'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)) - }, - { - 'value': 'RECEIPT', - 'confidence': 0.9949972033500671, - 'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)) - }, - { - 'value': 'DATE', - 'confidence': 0.9578408598899841, - 'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)) - } - ] - } - ], - 'artefacts': [] - } - ] - } - ] - } \ No newline at end of file diff --git a/v0.4.0/_sources/utils.rst.txt b/v0.4.0/_sources/utils.rst.txt index c15f3a786e..69c1abe0eb 100644 --- a/v0.4.0/_sources/utils.rst.txt +++ b/v0.4.0/_sources/utils.rst.txt @@ -14,8 +14,6 @@ Easy-to-use functions to make sense of your model's predictions. .. autofunction:: visualize_page -.. autofunction:: synthesize_page - .. _metrics: @@ -27,15 +25,12 @@ Implementations of task-specific metrics to easily assess your model performance .. autoclass:: TextMatch - .. automethod:: update .. automethod:: summary .. autoclass:: LocalizationConfusion - .. automethod:: update .. automethod:: summary .. autoclass:: OCRMetric - .. automethod:: update .. automethod:: summary diff --git a/v0.4.0/_static/documentation_options.js b/v0.4.0/_static/documentation_options.js index 8a08253fe6..a7b5cbe04a 100644 --- a/v0.4.0/_static/documentation_options.js +++ b/v0.4.0/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.4.0a0-git', + VERSION: '0.3.0a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.4.0/changelog.html b/v0.4.0/changelog.html index 53153f4c8e..6ed2620fb7 100644 --- a/v0.4.0/changelog.html +++ b/v0.4.0/changelog.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + Changelog - docTR documentation @@ -227,27 +227,21 @@ @@ -289,14 +283,6 @@
                              diff --git a/v0.4.0/datasets.html b/v0.4.0/datasets.html index 687d43367d..640791680a 100644 --- a/v0.4.0/datasets.html +++ b/v0.4.0/datasets.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.datasets - docTR documentation @@ -227,27 +227,21 @@ @@ -293,6 +287,12 @@

                              doctr.datasets

                              Available Datasets

                              +

                              The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

                              +
                              +
                              +class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
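For a rough sense of what this base class buys you, any of the bundled datasets below can be used the same way (a minimal sketch with FUNSD; keyword arguments may differ slightly across doctr releases):

>>> from doctr.datasets import FUNSD
>>> # FUNSD builds on VisionDataset: the archive is downloaded, hash-checked and extracted on first use
>>> train_set = FUNSD(train=True, download=True)
>>> img, target = train_set[0]  # one sample: an image and its word-level annotations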
                              +
                              +

                              Here are all datasets that are available through DocTR:

                              @@ -368,7 +368,7 @@

                              doctr.datasets
                              -class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, **kwargs: Any)[source]
                              +class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

                              Implements an OCR dataset

                              Parameters:
                              @@ -376,43 +376,20 @@

                              doctr.datasets -
                              -class doctr.datasets.CharacterGenerator(*args, **kwargs)[source]
                              -

                              Implements a character image generation dataset

                              -
                              -
                              Example::
                              >>> from doctr.datasets import CharacterGenerator
                              ->>> ds = CharacterGenerator(vocab='abdef')
                              ->>> img, target = ds[0]
                              -
                              -
                              -
                              -
                              -
                              -
                              Parameters:
                              -
                                -
                              • vocab – vocabulary to take the character from

                              • -
                              • num_samples – number of samples that will be generated iterating over the dataset

                              • -
                              • cache_samples – whether generated images should be cached firsthand

                              • -
                              • sample_transforms – composable transformations that will be applied to each image

                              • -
                              -
                              -
                              -

                              -

                    • Data Loading

                      Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

                      -class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None, collate_fn: Callable | None = None)[source]
                      +class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]

                      Implements a dataset wrapper for fast data loading

                      Example::
                      >>> from doctr.datasets import FUNSD, DataLoader
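The example above is truncated by the diff; a hedged completion of the same snippet follows (batch size and shuffle flag are illustrative values):

>>> from doctr.datasets import FUNSD, DataLoader
>>> train_set = FUNSD(train=True, download=True)
>>> # wrap the dataset to get shuffled, batched samples; 32 is an illustrative batch size
>>> train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
>>> images, targets = next(iter(train_loader))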
                      @@ -474,39 +451,19 @@ 

                      Data Loading

                      - - - - - - - - - - + + - - - - - - - - - - - - - - + +
                      Text recognition model zoo

                      crnn_vgg16_bn

                      (32, 128, 3)

                      15.8M

                      87.17

                      92.93

                      86.02

                      91.3

                      12.8

                      master

                      (32, 128, 3)

                      87.61

                      93.28

                      sar_vgg16_bn

                      sar_vgg16_bn

                      (32, 128, 3)

                      21.5M

                      86.2

                      91.7

                      3.3

                      sar_resnet31

                      sar_resnet31

                      (32, 128, 3)

                      53.1M

                      87.67

                      93.41

                      86.3

                      92.1

                      2.7

                      db_resnet50 + crnn_vgg16_bn

                      71.25

                      76.02

                      70.08

                      74.77

                      0.85

                      83.99

                      81.42

                      82.19

                      79.67

                      1.6

                      db_resnet50 + master

                      71.26

                      76.03

                      84.61

                      82.02

                      db_resnet50 + sar_vgg16_bn

                      db_resnet50 + sar_vgg16_bn

                      N/A

                      N/A

                      0.49

                      db_resnet50 + sar_resnet31

                      71.48

                      76.26

                      db_resnet50 + sar_resnet31

                      N/A

                      N/A

                      0.27

                      84.66

                      82.07

                      N/A

                      N/A

                      0.83

                      Gvision text detection

                      Gvision text detection

                      59.50

                      62.50

                      Gvision doc. text detection

                      Gvision doc. text detection

                      64.00

                      53.30

                      AWS textract

                      AWS textract

                      78.10

                      83.00

                      Invoices

                      IDs

                      US Tax Forms

                      db_resnet50 + crnn_vgg16_bn (ours)

                      78.56

                      80.94

                      65.79

                      70.10

                      49.35

                      50.84

                      78.99

                      92.73

                      db_resnet50 + master (ours)

                      78.91

                      81.31

                      65.57

                      69.86

                      50.65

                      52.17

                      78.86

                      92.57

                      db_resnet50 + sar_resnet31 (ours)

                      78.84

                      81.23

                      65.90

                      70.21

                      51.17

                      52.72

                      79.17

                      92.68

                      78.90

                      81.01

                      65.68

                      69.86

                      49.48

                      50.46

                      Gvision doc. text detection

                      68.91

                      AWS textract

                      75.77

                      latin

                      94

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~

                      english

                      100

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿

                      legacy_french

                      123

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

                      96

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

                      french

                      126

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ

                      portuguese

                      131

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿

                      spanish

                      116

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿

                      german

                      108

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ

                      154

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

                      -doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, dynamic_seq_length: bool = False, **kwargs: Any) ndarray[source]
                      +doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]

                      Encode character sequences using a given vocab as mapping

                      Parameters:
                      @@ -517,7 +474,6 @@
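For orientation, a hedged sketch of a typical call (VOCABS is the vocabulary mapping shipped with doctr.datasets; treating the vocab length as the padding index is an assumption of this sketch):

>>> from doctr.datasets import encode_sequences, VOCABS
>>> vocab = VOCABS["french"]
>>> # map each string to a fixed-length row of character indices; pad=len(vocab) is an assumed padding index
>>> encoded = encode_sequences(["hello", "world"], vocab=vocab, target_size=32, pad=len(vocab))
>>> print(encoded.shape)  # (2, 32)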

                      Data LoadingReturns: @@ -534,23 +490,23 @@

                      Data Loading - +
                      Next
                      -
                      doctr.io
                      +
                      doctr.documents
                      - +
                      Previous
                      -
                      Preparing your model for inference
                      +
                      Changelog
                      @@ -586,11 +542,11 @@

                      Data Loadingdoctr.datasets
                      • Available Datasets
                      • Data Loading
                          @@ -612,7 +568,7 @@

                          Data Loading + diff --git a/v0.4.0/genindex.html b/v0.4.0/genindex.html index 68b71f61c7..10d0739337 100644 --- a/v0.4.0/genindex.html +++ b/v0.4.0/genindex.html @@ -225,27 +225,21 @@ @@ -282,17 +276,17 @@


                            - + diff --git a/v0.4.0/index.html b/v0.4.0/index.html index 33506fb7f6..b7be51df96 100644 --- a/v0.4.0/index.html +++ b/v0.4.0/index.html @@ -227,27 +227,21 @@ @@ -289,13 +283,14 @@

                            DocTR: Document Text Recognition

                            -

                            State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch

                            +

                            State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta)

                            https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png

                            DocTR provides an easy and powerful way to extract valuable information from your documents:

• 🧾 for automation: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

• 👩‍🔬 for research: quickly compare your own architectures' speed & performance with state-of-the-art models on public datasets.
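In practice, parsing a document boils down to a few lines (a minimal sketch; the file path is a placeholder, and DocumentFile lives in doctr.documents in this release but in doctr.io in later ones):

>>> from doctr.documents import DocumentFile
>>> from doctr.models import ocr_predictor
>>> # end-to-end predictor combining pretrained detection and recognition models
>>> model = ocr_predictor(pretrained=True)
>>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()  # placeholder path
>>> result = model(doc)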

                            +

                            Welcome to the documentation of DocTR!

                            Main Features

                            @@ -396,7 +448,9 @@

                            Supported datasets
                            • DocTR: Document Text Recognition
                                -
                              • Main Features
                                  +
                                • Main Features
                                • +
                                • Getting Started
                                    +
                                  • Build & train your predictor
                                  • Model zoo
                                    • Text detection models
                                    • Text recognition models
                                    • @@ -418,7 +472,7 @@

                                      Supported datasets + diff --git a/v0.4.0/installing.html b/v0.4.0/installing.html index 4211681ad8..8068adc0ba 100644 --- a/v0.4.0/installing.html +++ b/v0.4.0/installing.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + Installation - docTR documentation @@ -227,27 +227,21 @@ @@ -289,16 +283,16 @@

                                      Installation

                                      -

                                      This library requires Python 3.6 or higher.

                                      +

                                      This library requires Python 3.6 or higher.

                                      Prerequisites

                                      Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

If you are running an OS other than Linux, you will need a few extra dependencies.

                                      -

                                      For MacOS users, you can install them using Homebrew as follows:

                                      +

                                      For MacOS users, you can install them as follows:

                                      brew install cairo pango gdk-pixbuf libffi
                                       
                                      @@ -306,17 +300,10 @@

                                      Prerequisites

                                      Via Python Package

                                      -

                                      Install the last stable release of the package using pip:

                                      +

                                      Install the last stable release of the package using pip:

                                      pip install python-doctr
                                       
                                      -

We strive to keep framework-specific dependencies to a minimum, but some necessary features are developed by third parties for specific frameworks. To avoid missing dependencies for a specific framework, you can install specific builds as follows:

                                      -
                                      # for TensorFlow
                                      -pip install python-doctr[tf]
                                      -# for PyTorch
                                      -pip install python-doctr[torch]
                                      -
                                      -

                                      Via Git

                                      @@ -325,14 +312,6 @@

Via Git: pip install -e doctr/.

                                      Again, for framework-specific builds:

                                      -
                                      git clone https://github.com/mindee/doctr.git
                                      -# for TensorFlow
                                      -pip install -e doctr/.[tf]
                                      -# for PyTorch
                                      -pip install -e doctr/.[torch]
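Whichever route you pick, a quick import check confirms the package is visible to your interpreter (a minimal sketch; it assumes the package exposes __version__, and the printed value depends on the build you installed):

>>> import doctr
>>> # assumes doctr exposes __version__; prints the installed release string
>>> print(doctr.__version__)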
                                      -
                                      -

                                      @@ -341,12 +320,12 @@

                                      Via Git
diff --git a/v0.4.0/io.html b/v0.4.0/io.html deleted file mode 100644 index 8d587e2f1c..0000000000 --- a/v0.4.0/io.html +++ /dev/null @@ -1,815 +0,0 @@ - doctr.io - docTR documentation
                                      -
                                      -
                                      -

                                      doctr.io

                                      -

The io module enables users to easily access content from documents and export analysis results to structured formats.

                                      -
                                      -

                                      Document structure

                                      -

                                      Structural organization of the documents.

                                      -
                                      -

                                      Word

                                      -

                                      A Word is an uninterrupted sequence of characters.

                                      -
                                      -
                                      -class doctr.io.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float])[source]
                                      -

                                      Implements a word element

                                      -
                                      -
                                      Parameters:
                                      -
                                        -
                                      • value – the text string of the word

                                      • -
                                      • confidence – the confidence associated with the text prediction

                                      • -
• geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page's size

                                      • -
                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -

                                      Line

                                      -

                                      A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines).

                                      -
                                      -
                                      -class doctr.io.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
                                      -

                                      Implements a line element as a collection of words

                                      -
                                      -
                                      Parameters:
                                      -
                                        -
                                      • words – list of word elements

                                      • -
• geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing all words in it.

                                      • -
                                      -
                                      -
                                      -
                                      - -
                                      -
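Putting the two elements above together, a loose sketch of how a Word and a Line relate (values and boxes are made up; coordinates are relative, as described above):

>>> from doctr.io import Word, Line
>>> # values, confidences and geometries below are made up for illustration
>>> w1 = Word(value="Hello", confidence=0.99, geometry=((0.10, 0.10), (0.30, 0.15)))
>>> w2 = Word(value="world", confidence=0.98, geometry=((0.35, 0.10), (0.55, 0.15)))
>>> # geometry is omitted, so it resolves to the smallest box enclosing both words
>>> line = Line(words=[w1, w2])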
                                      -

                                      Artefact

                                      -

                                      An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

                                      -
                                      -
                                      -class doctr.io.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
                                      -

                                      Implements a non-textual element

                                      -
                                      -
                                      Parameters:
                                      -
                                        -
                                      • artefact_type – the type of artefact

                                      • -
                                      • confidence – the confidence of the type prediction

                                      • -
• geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page's size.

                                      • -
                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -

                                      Block

                                      -

                                      A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

                                      -
                                      -
                                      -class doctr.io.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | Tuple[float, float, float, float, float] | None = None)[source]
                                      -

                                      Implements a block element as a collection of lines and artefacts

                                      -
                                      -
                                      Parameters:
                                      -
                                        -
                                      • lines – list of line elements

                                      • -
                                      • artefacts – list of artefacts

                                      • -
• geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing all lines and artefacts in it.

                                      • -
                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -

                                      Page

                                      -

                                      A Page is a collection of Blocks that were on the same physical page.

                                      -
                                      -
                                      -class doctr.io.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
                                      -

                                      Implements a page element as a collection of blocks

                                      -
                                      -
                                      Parameters:
                                      -
                                        -
                                      • blocks – list of block elements

                                      • -
                                      • page_idx – the index of the page in the input raw document

                                      • -
                                      • dimensions – the page size in pixels in format (width, height)

                                      • -
• orientation – a dictionary with the value of the rotation angle in degrees and confidence of the prediction

                                      • -
                                      • language – a dictionary with the language value and confidence of the prediction

                                      • -
                                      -
                                      -
                                      -
                                      -
                                      -show(page: ndarray, interactive: bool = True, **kwargs) None[source]
                                      -

                                      Overlay the result on a given image

                                      -
                                      -
                                      Parameters:
                                      -
                                        -
                                      • page – image encoded as a numpy array in uint8

                                      • -
                                      • interactive – whether the display should be interactive

                                      • -
                                      -
                                      -
                                      -
                                      - -
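In practice, this method is usually called on a predictor output rather than in isolation. The snippet below is a sketch (not taken from the library's docstrings) assuming a pretrained end-to-end predictor is available; the file path is purely illustrative:
>>> from doctr.io import DocumentFile
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor(pretrained=True)
>>> pages = DocumentFile.from_images("path/to/your/page1.png")
>>> result = model(pages)
>>> result.pages[0].show(pages[0])  # overlays the predictions on the first page image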
                                      - -
                                      -
                                      -

                                      Document

                                      -

                                      A Document is a collection of Pages.

                                      -
                                      -
                                      -class doctr.io.Document(pages: List[Page])[source]
                                      -

                                      Implements a document element as a collection of pages

                                      -
                                      -
                                      Parameters:
                                      -

                                      pages – list of page elements

                                      -
                                      -
                                      -
                                      -
                                      -show(pages: List[ndarray], **kwargs) None[source]
                                      -

Overlay the result on the given images

                                      -
                                      -
                                      Parameters:
                                      -

                                      pages – list of images encoded as numpy arrays in uint8

                                      -
                                      -
                                      -
                                      - -
                                      - -
                                      -
                                      -
                                      -

                                      File reading

                                      -

                                      High-performance file reading and conversion to processable structured data.

                                      -
                                      -
                                      -doctr.io.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
                                      -

                                      Read a PDF file and convert it into an image in numpy format

                                      -
                                      -
                                      Example::
>>> from doctr.io import read_pdf
                                      ->>> doc = read_pdf("path/to/your/doc.pdf")
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      file – the path to the PDF file

                                      -
                                      -
                                      Returns:
                                      -

                                      the list of pages decoded as numpy ndarray of shape H x W x 3

                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -doctr.io.read_img_as_numpy(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
                                      -

                                      Read an image file into numpy format

                                      -
                                      -
                                      Example::
>>> from doctr.io import read_img_as_numpy
->>> page = read_img_as_numpy("path/to/your/doc.jpg")
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -
                                        -
                                      • file – the path to the image file

                                      • -
                                      • output_size – the expected output size of each page in format H x W

                                      • -
                                      • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

                                      • -
                                      -
                                      -
                                      Returns:
                                      -

                                      the page decoded as numpy ndarray of shape H x W x 3

                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -doctr.io.read_img_as_tensor(img_path: str | Path, dtype: DType = tf.float32) Tensor[source]
                                      -

                                      Read an image file as a TensorFlow tensor

                                      -
                                      -
                                      Parameters:
                                      -
                                        -
                                      • img_path – location of the image file

                                      • -
                                      • dtype – the desired data type of the output tensor. If it is float-related, values will be divided by 255.

                                      • -
                                      -
                                      -
                                      Returns:
                                      -

                                      decoded image as a tensor

                                      -
                                      -
                                      -
                                      - -
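For reference, a minimal usage sketch based on the signature above (the file path is purely illustrative):
>>> import tensorflow as tf
>>> from doctr.io import read_img_as_tensor
>>> tensor = read_img_as_tensor("path/to/your/img.jpg", dtype=tf.float32)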
                                      -
                                      -doctr.io.decode_img_as_tensor(img_content: bytes, dtype: DType = tf.float32) Tensor[source]
                                      -

                                      Read a byte stream as a TensorFlow tensor

                                      -
                                      -
                                      Parameters:
                                      -
                                        -
                                      • img_content – bytes of a decoded image

                                      • -
                                      • dtype – the desired data type of the output tensor. If it is float-related, values will be divided by 255.

                                      • -
                                      -
                                      -
                                      Returns:
                                      -

                                      decoded image as a tensor

                                      -
                                      -
                                      -
                                      - -
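Likewise, a minimal sketch for the byte-stream variant (the file path is purely illustrative):
>>> from doctr.io import decode_img_as_tensor
>>> with open("path/to/your/img.jpg", "rb") as f:
...     tensor = decode_img_as_tensor(f.read())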
                                      -
                                      -doctr.io.read_html(url: str, **kwargs: Any) bytes[source]
                                      -

Read a web page and convert it into a PDF file as a bytes stream

                                      -
                                      -
                                      Example::
>>> from doctr.io import read_html
                                      ->>> doc = read_html("https://www.yoursite.com")
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      url – URL of the target web page

                                      -
                                      -
                                      Returns:
                                      -

                                      decoded PDF file as a bytes stream

                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -class doctr.io.DocumentFile[source]
                                      -

                                      Read a document from multiple extensions

                                      -
                                      -
                                      -classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
                                      -

                                      Read a PDF file

                                      -
                                      -
                                      Example::
>>> from doctr.io import DocumentFile
                                      ->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      file – the path to the PDF file or a binary stream

                                      -
                                      -
                                      Returns:
                                      -

                                      a PDF document

                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -classmethod from_url(url: str, **kwargs) PDF[source]
                                      -

                                      Interpret a web page as a PDF document

                                      -
                                      -
                                      Example::
>>> from doctr.io import DocumentFile
                                      ->>> doc = DocumentFile.from_url("https://www.yoursite.com")
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      url – the URL of the target web page

                                      -
                                      -
                                      Returns:
                                      -

                                      a PDF document

                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
                                      -

                                      Read an image file (or a collection of image files) and convert it into an image in numpy format

                                      -
                                      -
                                      Example::
>>> from doctr.io import DocumentFile
                                      ->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      files – the path to the image file or a binary stream, or a collection of those

                                      -
                                      -
                                      Returns:
                                      -

                                      the list of pages decoded as numpy ndarray of shape H x W x 3

                                      -
                                      -
                                      -
                                      - -
                                      - -
                                      -
                                      -class doctr.io.PDF(doc: Document)[source]
                                      -

                                      PDF document template

                                      -
                                      -
                                      Parameters:
                                      -

                                      doc – input PDF document

                                      -
                                      -
                                      -
                                      -
                                      -as_images(**kwargs) List[ndarray][source]
                                      -

                                      Convert all document pages to images

                                      -
                                      -
                                      Example::
>>> from doctr.io import DocumentFile
                                      ->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      kwargs – keyword arguments of convert_page_to_numpy

                                      -
                                      -
                                      Returns:
                                      -

                                      the list of pages decoded as numpy ndarray of shape H x W x 3

                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
                                      -

                                      Get the annotations for all words in the document

                                      -
                                      -
                                      Example::
>>> from doctr.io import DocumentFile
                                      ->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      kwargs – keyword arguments of fitz.Page.getTextWords

                                      -
                                      -
                                      Returns:
                                      -

                                      the list of pages annotations, represented as a list of tuple (bounding box, value)

                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -get_artefacts() List[List[Tuple[float, float, float, float]]][source]
                                      -

                                      Get the artefacts for the entire document

                                      -
                                      -
                                      Example::
>>> from doctr.io import DocumentFile
                                      ->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Returns:
                                      -

                                      the list of pages artefacts, represented as a list of bounding boxes

                                      -
                                      -
                                      -
                                      - -
                                      - -
                                      -
                                      - -
                                      -
                                      - -
                                      - -
                                      -
\ No newline at end of file
diff --git a/v0.4.0/models.html b/v0.4.0/models.html
index 0c490e56d0..270664068f 100644
--- a/v0.4.0/models.html
+++ b/v0.4.0/models.html

                                      doctr.models

                                      -
                                      -

                                      doctr.models.backbones

                                      -
                                      -
                                      -doctr.models.backbones.vgg16_bn(pretrained: bool = False, **kwargs: Any) VGG[source]
                                      -

                                      VGG-16 architecture as described in “Very Deep Convolutional Networks for Large-Scale Image Recognition”, modified by adding batch normalization.

                                      -
                                      -
                                      Example::
                                      >>> import tensorflow as tf
                                      ->>> from doctr.models import vgg16_bn
                                      ->>> model = vgg16_bn(pretrained=False)
                                      ->>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
                                      ->>> out = model(input_tensor)
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      pretrained (bool) – If True, returns a model pre-trained on ImageNet

                                      -
                                      -
                                      Returns:
                                      -

                                      VGG feature extractor

                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -doctr.models.backbones.resnet31(pretrained: bool = False, **kwargs: Any) ResNet[source]
                                      -

Resnet31 architecture with rectangular pooling windows as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”. Downsizing: (H, W) –> (H/8, W/4)

                                      -
                                      -
                                      Example::
                                      >>> import tensorflow as tf
                                      ->>> from doctr.models import resnet31
                                      ->>> model = resnet31(pretrained=False)
                                      ->>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
                                      ->>> out = model(input_tensor)
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      pretrained – boolean, True if model is pretrained

                                      -
                                      -
                                      Returns:
                                      -

                                      A resnet31 model

                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -doctr.models.backbones.mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
                                      -

MobileNetV3-Small architecture as described in “Searching for MobileNetV3”.

                                      -
                                      -
                                      Example::
                                      >>> import tensorflow as tf
>>> from doctr.models import mobilenet_v3_small
->>> model = mobilenet_v3_small(pretrained=False)
                                      ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
                                      ->>> out = model(input_tensor)
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      pretrained – boolean, True if model is pretrained

                                      -
                                      -
                                      Returns:
                                      -

                                      a keras.Model

                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -doctr.models.backbones.mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
                                      -

MobileNetV3-Large architecture as described in “Searching for MobileNetV3”.

                                      -
                                      -
                                      Example::
                                      >>> import tensorflow as tf
>>> from doctr.models import mobilenet_v3_large
->>> model = mobilenet_v3_large(pretrained=False)
                                      ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
                                      ->>> out = model(input_tensor)
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      pretrained – boolean, True if model is pretrained

                                      -
                                      -
                                      Returns:
                                      -

                                      a keras.Model

                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -doctr.models.backbones.mobilenet_v3_small_r(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
                                      -

MobileNetV3-Small architecture as described in “Searching for MobileNetV3”, with rectangular pooling.

                                      -
                                      -
                                      Example::
                                      >>> import tensorflow as tf
                                      ->>> from doctr.models import mobilenet_v3_small_r
                                      ->>> model = mobilenet_v3_small_r(pretrained=False)
                                      ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
                                      ->>> out = model(input_tensor)
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      pretrained – boolean, True if model is pretrained

                                      -
                                      -
                                      Returns:
                                      -

                                      a keras.Model

                                      -
                                      -
                                      -
                                      - -
                                      -
                                      -doctr.models.backbones.mobilenet_v3_large_r(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
                                      -

MobileNetV3-Large architecture as described in “Searching for MobileNetV3”, with rectangular pooling.

                                      -
                                      -
                                      Example::
                                      >>> import tensorflow as tf
                                      ->>> from doctr.models import mobilenet_v3_large_r
                                      ->>> model = mobilenet_v3_large_r(pretrained=False)
                                      ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
                                      ->>> out = model(input_tensor)
                                      -
                                      +

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

                                      +

                                      For a given task, DocTR provides a Predictor, which is composed of 2 components:

                                      +
                                        +
                                      • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

                                      • +
• Model: a deep learning model implemented with the TensorFlow backend, along with its task-specific post-processor that makes outputs structured and reusable.

                                      • +
                                      +
                                      +
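To illustrate this two-component design, the sketch below instantiates a detection predictor (documented further down) and feeds it a raw numpy image; pre-processing, model inference and post-processing all happen inside the single call:
>>> import numpy as np
>>> from doctr.models import detection_predictor
>>> predictor = detection_predictor(arch='db_resnet50', pretrained=True)
>>> page = (255 * np.random.rand(1024, 1024, 3)).astype(np.uint8)
>>> out = predictor([page])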

                                      Text Detection

                                      +

                                      Localizing text elements in images

                                      +
Architecture    Input shape        # params    FUNSD Recall    FUNSD Precision    CORD Recall    CORD Precision    FPS
db_resnet50     (1024, 1024, 3)    25.2 M      82.14           87.64              92.49          89.66             2.1

                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      pretrained – boolean, True if model is pretrained

                                      -
                                      -
                                      Returns:
                                      -

                                      a keras.Model

                                      -
                                      -
                                      -
                                      - +

                                      All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). +Explanations about the metrics being used are available in Task evaluation.

                                      +

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

                                      +

FPS (Frames per second) is computed this way: we instantiate the model and feed it 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up; then we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). We used an AWS c5.12xlarge instance (Xeon Platinum 8275L CPU) to perform the experiments.
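A rough sketch of this protocol is given below; it reuses a single random tensor for brevity (the actual benchmark draws fresh random tensors) and the model choice is only an example:
>>> import time
>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> sample = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> for _ in range(100):  # warm-up
...     _ = model(sample)
>>> start = time.perf_counter()
>>> for _ in range(1000):  # 1000 batches of 1 frame
...     _ = model(sample)
>>> fps = 1000 / (time.perf_counter() - start)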

                                      +
                                      +

                                      Pre-processing for detection

                                      +

                                      In DocTR, the pre-processing scheme for detection is the following:

                                      +
                                        +
1. resize each input image to the target size (bilinear interpolation by default) with potential deformation

2. batch images together

3. normalize the batch using the training data statistics
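A rough equivalent of these three steps in plain TensorFlow is sketched below; the target size and the normalization statistics are placeholders, not the values actually used by DocTR:
>>> import numpy as np
>>> import tensorflow as tf
>>> pages = [np.random.randint(0, 255, (900, 700, 3), dtype=np.uint8) for _ in range(2)]
>>> resized = [tf.image.resize(p, (1024, 1024), method="bilinear") for p in pages]  # 1. resize (may deform)
>>> batch = tf.stack(resized) / 255.0  # 2. batch images together
>>> mean, std = tf.constant([0.5, 0.5, 0.5]), tf.constant([0.5, 0.5, 0.5])
>>> batch = (batch - mean) / std  # 3. normalize with training statistics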
                                      -
                                      -

                                      doctr.models.detection

                                      -
                                      -
                                      -doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
                                      -

                                      LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

                                      -
                                      -
                                      Example::
                                      >>> import tensorflow as tf
                                      ->>> from doctr.models import linknet16
                                      ->>> model = linknet16(pretrained=True)
                                      ->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
                                      ->>> out = model(input_tensor)
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

                                      -
                                      -
                                      Returns:
                                      -

                                      text detection architecture

                                      -
                                      -
                                      -
                                      - +
                                      +

                                      Detection models

                                      +

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

                                      doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
                                      @@ -492,13 +366,13 @@

                                      doctr.models.detection

                                      -
                                      -doctr.models.detection.db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) DBNet[source]
                                      -

                                      DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a mobilenet v3 large backbone.

                                      +
                                      +doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
                                      +

                                      LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

                                      Example::
                                      >>> import tensorflow as tf
                                      ->>> from doctr.models import db_mobilenet_v3_large
                                      ->>> model = db_mobilenet_v3_large(pretrained=True)
                                      +>>> from doctr.models import linknet16
                                      +>>> model = linknet16(pretrained=True)
                                       >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
                                       >>> out = model(input_tensor)
                                       
                                      @@ -515,6 +389,10 @@

                                      doctr.models.detection

                                      +
                                      +
                                      +

                                      Detection predictors

                                      +

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and get structured information in return.

                                      doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]
                                      @@ -522,7 +400,7 @@

                                      doctr.models.detection
                                      Example::

                                      -
                                      -

                                      doctr.models.recognition

                                      +
                                      +
                                      +

                                      Text Recognition

                                      +

                                      Identifying strings in images

                                      +
Text recognition model zoo

Architecture     Input shape     # params    FUNSD    CORD    FPS
crnn_vgg16_bn    (32, 128, 3)    15.8M       86.02    91.3    12.8
sar_vgg16_bn     (32, 128, 3)    21.5M       86.2     91.7    3.3
sar_resnet31     (32, 128, 3)    53.1M       86.3     92.1    2.7

                                      +
                                      +

                                      All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). +Explanations about the metrics being used are available in Task evaluation.

                                      +

All these recognition models are trained with our French vocab (cf. Supported Vocabs).

                                      +

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model capabilities

                                      +

FPS (Frames per second) is computed this way: we instantiate the model and feed it 100 random tensors of shape [1, 32, 128, 3] as a warm-up; then we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). We used an AWS c5.12xlarge instance (Xeon Platinum 8275L CPU) to perform the experiments.

                                      +
                                      +

                                      Pre-processing for recognition

                                      +

                                      In DocTR, the pre-processing scheme for recognition is the following:

                                      +
                                        +
1. resize each input image to the target size (bilinear interpolation by default) without deformation

2. pad the image to the target size (with zeros by default)

3. batch images together

4. normalize the batch using the training data statistics
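The same idea for recognition crops, sketched in plain TensorFlow; the (32, 128) target size comes from the table above, while the normalization statistics are placeholders:
>>> import tensorflow as tf
>>> crop = tf.random.uniform(shape=[24, 100, 3], maxval=1, dtype=tf.float32)
>>> resized = tf.image.resize(crop, (32, 128), preserve_aspect_ratio=True)  # 1. resize without deformation
>>> padded = tf.image.pad_to_bounding_box(resized, 0, 0, 32, 128)  # 2. zero-pad to the target size
>>> batch = tf.expand_dims(padded, 0)  # 3. batch images together
>>> batch = (batch - 0.5) / 0.5  # 4. normalize with training statistics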
                                      +
                                      +
                                      +

                                      Recognition models

                                      +

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

                                      doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
                                      @@ -571,40 +515,15 @@

                                      doctr.models.recognition -
                                      -doctr.models.recognition.crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) CRNN[source]
                                      -

                                      CRNN with a MobileNet V3 Small backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

                                      -
                                      -
                                      Example::
                                      >>> import tensorflow as tf
                                      ->>> from doctr.models import crnn_mobilenet_v3_small
                                      ->>> model = crnn_mobilenet_v3_small(pretrained=True)
                                      ->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
                                      ->>> out = model(input_tensor)
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -
                                      Parameters:
                                      -

                                      pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

                                      -
                                      -
                                      Returns:
                                      -

                                      text recognition architecture

                                      -
                                      -
                                      -

                                      - -
                                      -
                                      -doctr.models.recognition.crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) CRNN[source]
                                      -

                                      CRNN with a MobileNet V3 Large backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

                                      +
                                      +doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) SAR[source]
                                      +

SAR with a VGG16 feature extractor as described in “Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition”.

                                      Example::
                                      >>> import tensorflow as tf
                                      ->>> from doctr.models import crnn_mobilenet_v3_large
                                      ->>> model = crnn_mobilenet_v3_large(pretrained=True)
                                      ->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
                                      +>>> from doctr.models import sar_vgg16_bn
                                      +>>> model = sar_vgg16_bn(pretrained=False)
                                      +>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
                                       >>> out = model(input_tensor)
                                       
                                      @@ -646,17 +565,15 @@

                                      doctr.models.recognition
                                      doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) MASTER[source]
                                      -

MASTER as described in “MASTER: Multi-Aspect Non-local Network for Scene Text Recognition” (https://arxiv.org/pdf/1910.02562.pdf).

                                      -
                                      -
                                      Example::
                                      >>> import tensorflow as tf
                                      +

MASTER as described in “MASTER: Multi-Aspect Non-local Network for Scene Text Recognition” (https://arxiv.org/pdf/1910.02562.pdf). Example:

                                      +
                                      >>> import tensorflow as tf
                                       >>> from doctr.models import master
                                       >>> model = master(pretrained=False)
                                       >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
                                       >>> out = model(input_tensor)
                                       
                                      -
                                      -
                                      Parameters:

                                      pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

                                      @@ -667,6 +584,10 @@

                                      doctr.models.recognition +

                                      Recognition predictors

                                      +

                                      Combining the right components around a given architecture for easier usage.

                                      doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) RecognitionPredictor[source]
                                      @@ -684,7 +605,7 @@

                                      doctr.models.recognition
                                      Parameters:
                                        -
                                      • arch – name of the architecture to use (e.g. ‘crnn_vgg16_bn’)

                                      • +
                                      • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

                                      • pretrained – If True, returns a model pre-trained on our text recognition dataset

                                      @@ -695,16 +616,141 @@

                                      doctr.models.recognition -

                                      doctr.models.zoo

                                      +

                                      +
                                      +

                                      End-to-End OCR

                                      +

                                      Predictors that localize and identify text elements in images

                                      +
Architecture                   FUNSD Recall    FUNSD Precision    FUNSD FPS    CORD Recall    CORD Precision    CORD FPS
db_resnet50 + crnn_vgg16_bn    70.08           74.77              0.85         82.19          79.67             1.6
db_resnet50 + sar_vgg16_bn     N/A             N/A                0.49         N/A            N/A               1.0
db_resnet50 + sar_resnet31     N/A             N/A                0.27         N/A            N/A               0.83
Gvision text detection         59.50           62.50              –            75.30          70.00             –
Gvision doc. text detection    64.00           53.30              –            68.90          61.10             –
AWS textract                   78.10           83.00              –            87.50          66.00             –

                                      +
                                      +

                                      All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). +Explanations about the metrics being used are available in Task evaluation.

                                      +

All the recognition models used in these predictors are trained with our French vocab (cf. Supported Vocabs).

                                      +

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

                                      +

FPS (Frames per second) is computed this way: we instantiate the predictor, warm up the model, and then measure the average speed of the end-to-end predictor on the datasets with a batch size of 1. We used an AWS c5.12xlarge instance (Xeon Platinum 8275L CPU) to perform the experiments.

                                      +

Results on private OCR datasets

                                      +
Architecture                          Receipts Recall    Receipts Precision    Invoices Recall    Invoices Precision    IDs Recall    IDs Precision
db_resnet50 + crnn_vgg16_bn (ours)    78.90              81.01                 65.68              69.86                 49.48         50.46
Gvision doc. text detection           68.91              59.89                 63.20              52.85                 43.70         29.21
AWS textract                          75.77              77.70                 70.47              69.13                 46.39         43.32

                                      +
                                      +
                                      +

                                      Two-stage approaches

                                      +

Those architectures involve one stage of text detection and one stage of text recognition. The text detection output is used to produce cropped images that are then passed to the text recognition block, as sketched below.
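The sketch below makes the two stages explicit using the detection and recognition predictors documented earlier; since the exact format of the detection output is not detailed here, the cropping step is only indicated in a comment (ocr_predictor below wires both stages together for you):
>>> import numpy as np
>>> from doctr.models import detection_predictor, recognition_predictor
>>> det_model = detection_predictor('db_resnet50', pretrained=True)
>>> reco_model = recognition_predictor('crnn_vgg16_bn', pretrained=True)
>>> page = (255 * np.random.rand(1024, 1024, 3)).astype(np.uint8)
>>> localizations = det_model([page])  # stage 1: localize text elements
>>> # stage 2: crop the localized regions out of the page and pass them to reco_model,
>>> # which returns the corresponding character strings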

                                      -
                                      -doctr.models.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) OCRPredictor[source]
                                      +
                                      +doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) OCRPredictor[source]

                                      End-to-end OCR architecture using one model for localization, and another for text recognition.

                                      Example::
                                      >>> import numpy as np
                                       >>> from doctr.models import ocr_predictor
                                      ->>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
                                      +>>> model = ocr_predictor(pretrained=True)
                                       >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
                                       >>> out = model([input_page])
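
If needed, the resulting Document object can then be serialized to a plain dictionary (a brief usage note, reusing the predictor call above):

>>> json_output = out.export()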
                                       
                                      @@ -724,6 +770,113 @@

                                      doctr.models.zoo +

                                      Model export

                                      +

                                      Utility functions to make the most of document analysis models.

                                      +
                                      +

                                      Model compression

                                      +
                                      +
                                      +doctr.models.export.convert_to_tflite(tf_model: Model) bytes[source]
                                      +

                                      Converts a model to TFLite format

                                      +
                                      +
                                      Example::
                                      >>> from tensorflow.keras import Sequential
                                      +>>> from doctr.models import convert_to_tflite, conv_sequence
                                      +>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
                                      +>>> serialized_model = convert_to_tflite(model)
                                      +
                                      +
                                      +
                                      +
                                      +
                                      +
                                      Parameters:
                                      +

                                      tf_model – a keras model

                                      +
                                      +
                                      Returns:
                                      +

the serialized TFLite model

                                      +
                                      +
                                      Return type:
                                      +

                                      bytes

                                      +
                                      +
                                      +
                                      + +
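Under the hood, this kind of conversion typically goes through TensorFlow's TFLite converter. A minimal sketch of the idea (not necessarily DocTR's exact implementation):

>>> import tensorflow as tf
>>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model)
>>> serialized_model = converter.convert()  # bytes in the TFLite flatbuffer format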
                                      +
                                      +doctr.models.export.convert_to_fp16(tf_model: Model) bytes[source]
                                      +

                                      Converts a model to half precision

                                      +
                                      +
                                      Example::
                                      >>> from tensorflow.keras import Sequential
                                      +>>> from doctr.models import convert_to_fp16, conv_sequence
                                      +>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
                                      +>>> serialized_model = convert_to_fp16(model)
                                      +
                                      +
                                      +
                                      +
                                      +
                                      +
                                      Parameters:
                                      +

                                      tf_model – a keras model

                                      +
                                      +
                                      Returns:
                                      +

                                      the serialized FP16 model

                                      +
                                      +
                                      Return type:
                                      +

                                      bytes

                                      +
                                      +
                                      +
                                      + +
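For reference, half-precision export with the TFLite converter is usually achieved by enabling the default optimizations and restricting the supported types to float16. A sketch under that assumption (not necessarily the exact implementation):

>>> import tensorflow as tf
>>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model)
>>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
>>> converter.target_spec.supported_types = [tf.float16]
>>> serialized_model = converter.convert()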
                                      +
                                      +doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) bytes[source]
                                      +

Quantize a TensorFlow model

                                      +
                                      +
                                      Example::
                                      >>> from tensorflow.keras import Sequential
                                      +>>> from doctr.models import quantize_model, conv_sequence
                                      +>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
                                      +>>> serialized_model = quantize_model(model, (224, 224, 3))
                                      +
                                      +
                                      +
                                      +
                                      +
                                      +
                                      Parameters:
                                      +
                                        +
                                      • tf_model – a keras model

                                      • +
+• input_shape – shape of the expected input tensor (excluding the batch dimension), in channels-last order

                                      • +
                                      +
                                      +
                                      Returns:
                                      +

                                      the serialized quantized model

                                      +
                                      +
                                      Return type:
                                      +

                                      bytes

                                      +
                                      +
                                      +
                                      + +
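Full integer quantization additionally requires a representative dataset so the converter can calibrate activation ranges. A minimal sketch (the random calibration data below is a placeholder, not what is used in practice):

>>> import tensorflow as tf
>>> input_shape = (224, 224, 3)
>>> def representative_dataset():
...     for _ in range(100):
...         yield [tf.random.uniform([1, *input_shape], maxval=1, dtype=tf.float32)]
>>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model)
>>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
>>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
>>> converter.inference_input_type = tf.int8
>>> converter.inference_output_type = tf.int8
>>> converter.representative_dataset = representative_dataset
>>> serialized_model = converter.convert()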
                                      +
                                      +

                                      Using SavedModel

                                      +

                                      Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to +SavedModel format as follows:

                                      +
                                      >>> import tensorflow as tf
                                      +>>> from doctr.models import db_resnet50
                                      +>>> model = db_resnet50(pretrained=True)
                                      +>>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
                                      +>>> _ = model(input_t, training=False)
                                      +>>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
                                      +
                                      +
                                      +

                                      And loaded just as easily:

                                      +
                                      >>> import tensorflow as tf
                                      +>>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
                                      +
                                      +
                                      +
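As a quick sanity check (assuming the same input signature that was traced before saving), the reloaded object can be called like the original model:

>>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_t, training=False)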

                                                doctr.models.zoo + diff --git a/v0.4.0/objects.inv b/v0.4.0/objects.inv index 8dd2b2bc27..a22d2ce821 100644 Binary files a/v0.4.0/objects.inv and b/v0.4.0/objects.inv differ diff --git a/v0.4.0/search.html b/v0.4.0/search.html index f3c5a3139a..fea94ac955 100644 --- a/v0.4.0/search.html +++ b/v0.4.0/search.html @@ -227,27 +227,21 @@ @@ -324,7 +318,7 @@ - + diff --git a/v0.4.0/searchindex.js b/v0.4.0/searchindex.js index 8e217203cf..231483d7a6 100644 --- a/v0.4.0/searchindex.js +++ b/v0.4.0/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"Artefact": [[4, "artefact"]], "Available Datasets": [[1, "available-datasets"]], "Available architectures": [[8, "available-architectures"], [8, "id1"], [8, "id3"]], "Block": [[4, "block"]], "Changelog": [[0, null]], "Choosing the right model": [[8, null]], "Composing transformations": [[6, "composing-transformations"]], "Data Loading": [[1, "data-loading"]], "Detection predictors": [[8, "detection-predictors"]], "DocTR Vocabs": [[1, "id1"]], "DocTR: Document Text Recognition": [[2, null]], "Document": [[4, "document"]], "Document structure": [[4, "document-structure"]], "End-to-End OCR": [[8, "end-to-end-ocr"]], "File reading": [[4, "file-reading"]], "Half-precision": [[7, "half-precision"]], "Installation": [[3, null]], "Line": [[4, "line"]], "Main Features": [[2, "main-features"]], "Model compression": [[7, "model-compression"]], "Model zoo": [[2, "model-zoo"]], "Page": [[4, "page"]], "Post-training quantization": [[7, "post-training-quantization"]], "Preparing your model for inference": [[7, null]], "Prerequisites": [[3, "prerequisites"]], "Recognition predictors": [[8, "recognition-predictors"]], "Supported Vocabs": [[1, "supported-vocabs"]], "Supported datasets": [[2, "supported-datasets"]], "Supported transformations": [[6, "supported-transformations"]], "Task evaluation": [[9, "task-evaluation"]], "TensorFlow Lite": [[7, "tensorflow-lite"]], "Text Detection": [[8, "text-detection"]], "Text Recognition": [[8, "text-recognition"]], "Text detection models": [[2, "text-detection-models"]], "Text recognition model zoo": [[8, "id5"]], "Text recognition models": [[2, "text-recognition-models"]], "Two-stage approaches": [[8, "two-stage-approaches"]], "Using SavedModel": [[7, "using-savedmodel"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[8, "what-should-i-do-with-the-output"]], "Word": [[4, "word"]], "doctr.datasets": [[1, null]], "doctr.io": [[4, null]], "doctr.models": [[5, null]], "doctr.models.backbones": [[5, "doctr-models-backbones"]], "doctr.models.detection": [[5, "doctr-models-detection"]], "doctr.models.recognition": [[5, "doctr-models-recognition"]], "doctr.models.zoo": [[5, "doctr-models-zoo"]], "doctr.transforms": [[6, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]]}, "docnames": ["changelog", "datasets", "index", "installing", "io", "models", "transforms", "using_model_export", "using_models", "utils"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, 
"sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "datasets.rst", "index.rst", "installing.rst", "io.rst", "models.rst", "transforms.rst", "using_model_export.rst", "using_models.rst", "utils.rst"], "indexentries": {"artefact (class in doctr.io)": [[4, "doctr.io.Artefact", false]], "as_images() (doctr.io.pdf method)": [[4, "doctr.io.PDF.as_images", false]], "block (class in doctr.io)": [[4, "doctr.io.Block", false]], "charactergenerator (class in doctr.datasets)": [[1, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[6, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[6, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[1, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_vgg16_bn", false]], "dataloader (class in doctr.datasets.loader)": [[1, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[5, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[5, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[4, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[5, "doctr.models.detection.detection_predictor", false]], "document (class in doctr.io)": [[4, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[4, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[1, "doctr.datasets.encode_sequences", false]], "from_images() (doctr.io.documentfile class method)": [[4, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[4, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[4, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[1, "doctr.datasets.FUNSD", false]], "get_artefacts() (doctr.io.pdf method)": [[4, "doctr.io.PDF.get_artefacts", false]], "get_words() (doctr.io.pdf method)": [[4, "doctr.io.PDF.get_words", false]], "lambdatransformation (class in doctr.transforms)": [[6, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[4, "doctr.io.Line", false]], "linknet16() (in module doctr.models.detection)": [[5, "doctr.models.detection.linknet16", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "master() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.master", false]], "mobilenet_v3_large() (in module doctr.models.backbones)": [[5, "doctr.models.backbones.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.backbones)": [[5, "doctr.models.backbones.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.backbones)": [[5, "doctr.models.backbones.mobilenet_v3_small", false]], "mobilenet_v3_small_r() (in module doctr.models.backbones)": [[5, 
"doctr.models.backbones.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[6, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[5, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[1, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[6, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[4, "doctr.io.Page", false]], "pdf (class in doctr.io)": [[4, "doctr.io.PDF", false]], "randomapply (class in doctr.transforms)": [[6, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[6, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[6, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[6, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[6, "doctr.transforms.RandomGamma", false]], "randomhue (class in doctr.transforms)": [[6, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[6, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[6, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[6, "doctr.transforms.RandomSaturation", false]], "read_html() (in module doctr.io)": [[4, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[4, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[4, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[4, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.recognition_predictor", false]], "resize (class in doctr.transforms)": [[6, "doctr.transforms.Resize", false]], "resnet31() (in module doctr.models.backbones)": [[5, "doctr.models.backbones.resnet31", false]], "sar_resnet31() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[4, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[4, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[1, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "textmatch (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[6, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn() (in module doctr.models.backbones)": [[5, "doctr.models.backbones.vgg16_bn", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, 
"doctr.utils.visualization.visualize_page", false]], "word (class in doctr.io)": [[4, "doctr.io.Word", false]]}, "objects": {"doctr.datasets": [[1, 0, 1, "", "CORD"], [1, 0, 1, "", "CharacterGenerator"], [1, 0, 1, "", "FUNSD"], [1, 0, 1, "", "OCRDataset"], [1, 0, 1, "", "SROIE"], [1, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[1, 0, 1, "", "DataLoader"]], "doctr.io": [[4, 0, 1, "", "Artefact"], [4, 0, 1, "", "Block"], [4, 0, 1, "", "Document"], [4, 0, 1, "", "DocumentFile"], [4, 0, 1, "", "Line"], [4, 0, 1, "", "PDF"], [4, 0, 1, "", "Page"], [4, 0, 1, "", "Word"], [4, 1, 1, "", "decode_img_as_tensor"], [4, 1, 1, "", "read_html"], [4, 1, 1, "", "read_img_as_numpy"], [4, 1, 1, "", "read_img_as_tensor"], [4, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[4, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[4, 2, 1, "", "from_images"], [4, 2, 1, "", "from_pdf"], [4, 2, 1, "", "from_url"]], "doctr.io.PDF": [[4, 2, 1, "", "as_images"], [4, 2, 1, "", "get_artefacts"], [4, 2, 1, "", "get_words"]], "doctr.io.Page": [[4, 2, 1, "", "show"]], "doctr.models": [[5, 1, 1, "", "ocr_predictor"]], "doctr.models.backbones": [[5, 1, 1, "", "mobilenet_v3_large"], [5, 1, 1, "", "mobilenet_v3_large_r"], [5, 1, 1, "", "mobilenet_v3_small"], [5, 1, 1, "", "mobilenet_v3_small_r"], [5, 1, 1, "", "resnet31"], [5, 1, 1, "", "vgg16_bn"]], "doctr.models.detection": [[5, 1, 1, "", "db_mobilenet_v3_large"], [5, 1, 1, "", "db_resnet50"], [5, 1, 1, "", "detection_predictor"], [5, 1, 1, "", "linknet16"]], "doctr.models.recognition": [[5, 1, 1, "", "crnn_mobilenet_v3_large"], [5, 1, 1, "", "crnn_mobilenet_v3_small"], [5, 1, 1, "", "crnn_vgg16_bn"], [5, 1, 1, "", "master"], [5, 1, 1, "", "recognition_predictor"], [5, 1, 1, "", "sar_resnet31"]], "doctr.transforms": [[6, 0, 1, "", "ColorInversion"], [6, 0, 1, "", "Compose"], [6, 0, 1, "", "LambdaTransformation"], [6, 0, 1, "", "Normalize"], [6, 0, 1, "", "OneOf"], [6, 0, 1, "", "RandomApply"], [6, 0, 1, "", "RandomBrightness"], [6, 0, 1, "", "RandomContrast"], [6, 0, 1, "", "RandomCrop"], [6, 0, 1, "", "RandomGamma"], [6, 0, 1, "", "RandomHue"], [6, 0, 1, "", "RandomJpegQuality"], [6, 0, 1, "", "RandomRotate"], [6, 0, 1, "", "RandomSaturation"], [6, 0, 1, "", "Resize"], [6, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [4, 9], "0": [1, 6, 8, 9], "00": 8, "01": [], "0123456789": 1, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "02": 8, "02562": 5, "03": 8, "035": [], "0361328125": 8, "04": 8, "05": [], "06": 8, "06640625": 8, "07": [], "08": [6, 8], "09": [], "0966796875": 8, "1": [1, 5, 6, 7, 8, 9], "10": [1, 8, 9], "100": [1, 6, 7, 8, 9], "1000": 8, "101": [], "1024": [5, 7, 8, 9], "104": [], "106": [], "108": 1, "1095": [], "11": 8, "110": 9, "1107": [], "114": [], "115": [], "1156": [], "116": 
1, "118": [], "11800h": [], "11th": [], "12": 8, "120": [], "123": 1, "126": 1, "1268": [], "128": [5, 8], "13": [8, 9], "130": [], "13068": [], "131": 1, "1337891": [], "1357421875": 8, "1396484375": 8, "14": 8, "1420": [], "14470v1": [], "149": [], "15": 8, "150": [8, 9], "154": [], "1552": [], "16": 5, "160": 5, "1630859375": 8, "1684": [], "16x16": [], "17": [], "1778": [], "1782": [], "18": [], "185546875": 8, "19": [], "1900": [], "1910": 5, "19342": [], "19370": [], "195": [], "19598": [], "199": 8, "1999": [], "1m": 8, "2": [2, 3, 6, 7, 8], "20": 8, "200": 9, "2000": [], "2003": [], "2012": [], "2013": [], "2015": [], "2019": 2, "2021": [], "207901": [], "21": 8, "2103": [], "2186": [], "21888": [], "22": [], "224": [5, 6, 7], "225": 6, "22672": [], "229": 6, "23": [], "233": [], "236": [], "24": [], "246": [], "249": [], "25": 8, "2504": [], "255": [4, 5, 6, 8, 9], "256": 5, "257": [], "26": [], "26032": [], "264": [], "27": 8, "2700": [], "2710": [], "2749": [], "28": [], "287": [], "29": 8, "296": [], "299": [], "2d": 8, "2m": 8, "3": [2, 3, 4, 5, 6, 7, 8, 9], "30": 8, "300": [], "3000": [], "301": [], "30595": 8, "30ghz": [], "31": [5, 8], "32": [1, 5, 6, 7, 8], "3232421875": 8, "33": 6, "33402": [], "33608": [], "34": 8, "340": 8, "3456": [], "35": 8, "3515625": 8, "36": 8, "360": [], "37": 8, "38": [], "39": 8, "4": [5, 8], "40": [], "406": 6, "41": 8, "42": 8, "43": 8, "44": [], "45": [], "456": 6, "46": 8, "47": 8, "472": [], "48": 5, "485": 6, "49": 8, "49377": [], "5": [1, 6, 9], "50": [5, 8], "51": 8, "51171875": 8, "512": 5, "52": [1, 8], "529": [], "53": 8, "533": [], "54": [], "540": [], "5478515625": 8, "55": [], "56": 8, "57": 8, "58": [], "580": [], "5810546875": 8, "583": [], "59": 8, "595": [], "597": [], "5k": [], "5m": 8, "6": [3, 6, 8], "60": 6, "600": [5, 8, 9], "61": 8, "611": [], "62": 8, "625": [], "626": [], "629": [], "63": 8, "630": [], "64": [5, 6, 8], "640": [], "641": [], "647": [], "65": 8, "66": 8, "660": [], "664": [], "666": [], "67": 8, "672": [], "68": 8, "689": [], "69": 8, "693": [], "694": [], "695": [], "6m": [], "7": 8, "70": [8, 9], "700": [], "701": [], "702": [], "707470": [], "71": 8, "7100000": [], "713": [], "7141797": [], "7149": [], "72": 8, "72dpi": [], "73": 8, "73257": [], "733": [], "74": 8, "745": [], "75": [6, 8], "753": [], "7581382": [], "76": 8, "77": 8, "772": [], "772875": [], "78": 8, "780": [], "781": [], "783": [], "785": [], "789": [], "79": 8, "793533": [], "796": [], "798": [], "7m": 8, "8": [5, 6, 8], "80": 8, "800": [5, 8, 9], "81": 8, "817": [], "82": 8, "8275l": 8, "83": 8, "830": [], "84": 8, "849": [], "85": 8, "8564453125": 8, "857": [], "85875": [], "86": 8, "860": [], "8603515625": 8, "862": [], "863": [], "87": 8, "8707": [], "875": [], "88": [], "89": 8, "8m": 8, "9": [], "90": 8, "90k": [], "90kdict32px": [], "91": 8, "913": [], "914085328578949": 8, "917": [], "92": 8, "921": [], "93": 8, "94": [1, 8], "95": 9, "9578408598899841": 8, "96": 8, "97": 8, "98": 8, "99": 8, "9949972033500671": 8, "A": [1, 2, 4, 5, 7], "And": 7, "As": [], "Be": [], "Being": [], "By": [], "For": [3, 8], "If": [3, 4, 5, 7], "In": 1, "It": 6, "Its": [2, 5], "No": 8, "Of": 1, "Or": [], "The": [4, 8, 9], "Then": [], "To": [3, 8], "_": [1, 5, 7], "__call__": [], "_build": [], "_i": 9, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "abdef": 1, "abl": 8, "about": 8, "abov": 8, "abstract": [], "abstractdataset": [], "abus": [], "accent": [], "accept": [], "access": [1, 2, 4, 8], "account": 7, "accur": 
[], "accuraci": 9, "achiev": 7, "act": [], "action": [], "activ": 2, "ad": [5, 6], "adapt": [], "add": [6, 9], "add_hook": [], "add_label": 9, "addit": [], "addition": [7, 8], "address": 4, "adjust": 6, "advanc": [], "advantag": [], "advis": [], "aesthet": [], "affect": [], "after": 8, "ag": [], "again": 3, "aggreg": [1, 9], "aggress": [], "align": 4, "all": [1, 4, 6, 8, 9], "allow": [], "along": 8, "alreadi": [], "also": 8, "alwai": [], "an": [1, 2, 4, 5, 7, 9], "analysi": 4, "ancient_greek": [], "angl": [4, 6], "ani": [1, 4, 5, 6, 7, 8, 9], "annot": 4, "anot": [], "anoth": [1, 3, 5], "answer": [], "anyascii": [], "anyon": 2, "anyth": [], "api": 2, "apolog": [], "apologi": [], "app": [], "appear": [], "appli": [1, 6], "applic": [2, 5], "appoint": [], "appreci": [], "appropri": 8, "ar": [1, 3, 4, 6, 8, 9], "arab": [], "arabic_diacrit": [], "arabic_lett": [], "arabic_punctu": [], "arbitrarili": [], "arch": 5, "architectur": [2, 5], "archiv": [], "area": [], "arg": 1, "argument": [1, 4], "around": [], "arrai": [4, 9], "art": 2, "artefact": [8, 9], "artefact_typ": 4, "artifici": [], "arxiv": 5, "as_imag": 4, "asarrai": 9, "ascii_lett": 1, "aspect": [2, 6], "assess": 9, "assign": 9, "associ": 4, "assum": [], "assume_straight_pag": [], "astyp": [5, 7, 8, 9], "attack": [], "attend": [2, 5], "attent": [], "autoclass": [], "autom": 2, "automat": [], "autoregress": [], "avail": [2, 6], "averag": [6, 8], "avoid": 3, "aw": [2, 8], "awar": [], "azur": [], "b": 9, "b_j": 9, "back": [], "backbon": [], "backend": 8, "background": [], "bangla": [], "bar": [], "bar_cod": [], "base": [2, 5], "baselin": [2, 5], "bash": [], "batch": [1, 5, 6, 8], "batch_siz": 1, "bblanchon": [], "bbox": [], "becaus": [], "been": [8, 9], "befor": [1, 6], "begin": 9, "behavior": [], "being": [8, 9], "belong": 8, "below": 8, "benchmark": 8, "best": [], "beta": [], "better": 8, "between": [6, 9], "bgr": 4, "bilinear": 6, "bin_thresh": [], "binar": [2, 5], "binari": [4, 8], "bit": [], "blank": 9, "block": [8, 9], "block_1_1": [], "blue": 9, "blur": [], "bmvc": [], "bn": [], "bodi": [], "bool": [1, 4, 5, 6, 9], "boolean": 5, "both": [2, 6, 8], "bottom": 8, "bound": [1, 4, 6, 8, 9], "box": [1, 4, 6, 8, 9], "box_thresh": [], "brew": 3, "bright": 6, "browser": 2, "build": 3, "built": [], "byte": 4, "c": [], "c5": 8, "c_j": [], "cach": 1, "cache_sampl": 1, "cairo": 3, "call": [], "callabl": [1, 6], "can": [1, 3, 7, 8], "capabl": 8, "case": [1, 8, 9], "cf": 8, "cfg": 8, "challeng": [], "challenge2_test_task12_imag": [], "challenge2_test_task1_gt": [], "challenge2_training_task12_imag": [], "challenge2_training_task1_gt": [], "chang": [], "changelog": [], "channel": [4, 6], "channel_prior": [], "channelshuffl": [], "charact": [1, 2, 4, 8, 9], "charactergener": 1, "characterist": [], "charg": 8, "charset": [], "chart": 4, "check": 8, "checkpoint": [], "chip": [], "ci": [], "clarifi": [], "clariti": [], "class": [1, 4, 6, 9], "class_nam": [], "classif": [], "classmethod": 4, "clear": [], "clone": 3, "close": [], "co": [], "code": [2, 4], "codecov": [], "colab": [], "collate_fn": 1, "collect": 4, "color": [6, 9], "colorinvers": 6, "column": 4, "com": [3, 4], "combin": 8, "command": [], "comment": [], "commit": [], "common": [6, 9], "commun": [], "compar": 2, "comparison": [8, 9], "competit": 1, "compil": 8, "complaint": [], "complementari": 9, "complet": [], "compon": 8, "compos": [1, 8], "comprehens": 8, "comput": [8, 9], "conf_threshold": [], "confid": [4, 8, 9], "config": [], "configur": [], "confus": 9, "consecut": [6, 8], "consequ": 
[], "consid": [1, 4, 8, 9], "consist": 8, "consolid": [1, 2], "constant": 6, "constraint": 7, "construct": [], "contact": [], "contain": 8, "content": [1, 4, 9], "context": [], "contib": [], "continu": [], "contrast": 6, "contrast_factor": 6, "contrib": [], "contribut": [], "contributor": [], "conv_sequ": 7, "convers": 4, "convert": [4, 6, 7], "convert_page_to_numpi": 4, "convert_to_fp16": [], "convert_to_tflit": [], "convolut": 5, "coordin": [4, 8], "cord": [1, 2, 8], "core": 9, "corner": 8, "correct": 6, "correspond": [3, 8], "could": [], "counterpart": 9, "cover": [], "coverag": [], "cpu": [2, 8], "creat": [], "crnn": [2, 5], "crnn_mobilenet_v3_larg": [5, 8], "crnn_mobilenet_v3_smal": [5, 8], "crnn_resnet31": [], "crnn_vgg16_bn": [5, 8], "crop": [6, 8], "crop_orient": [], "crop_orientation_predictor": [], "crop_param": [], "cuda": [], "currenc": 1, "current": 8, "custom": [], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": [], "czczup": [], "czech": [], "d": 1, "daili": [], "danish": [], "data": [4, 6, 9], "dataload": 1, "dataset": [5, 8], "dataset_info": [], "date": 8, "db": [], "db_crnn_resnet": 5, "db_crnn_vgg": 5, "db_mobilenet_v3_larg": [5, 8], "db_resnet34": [], "db_resnet50": [5, 7, 8], "db_sar_resnet": 5, "db_sar_vgg": 5, "dbnet": [2, 5], "deal": [], "decis": [], "decod": 4, "decode_img_as_tensor": 4, "dedic": [], "deem": [], "deep": [5, 8], "def": 7, "default": [4, 7, 9], "defer": 1, "defin": 9, "deform": [], "degre": 6, "degress": 4, "delet": [], "delimit": 8, "delta": 6, "demo": 2, "demonstr": [], "depend": [2, 3], "deploi": [], "deploy": 2, "derogatori": [], "describ": 5, "descript": [], "design": 6, "desir": 4, "det_arch": 5, "det_b": [], "det_model": [], "det_param": [], "det_predictor": [], "detail": 8, "detect": [], "detect_languag": [], "detect_orient": [], "detection_predictor": [5, 8], "detection_task": [], "detectiondataset": [], "detectionmetr": [], "detectionpredictor": 5, "detector": [], "deterior": [], "determin": [], "dev": [], "develop": 3, "developp": 3, "deviat": 6, "devic": [], "dict": [4, 8, 9], "dictionari": [4, 9], "differ": [], "differenti": [2, 5], "digit": 1, "dimens": [4, 8, 9], "dimension": 6, "direct": [], "directli": 8, "directori": [], "disabl": [], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 8, "discuss": [], "disk": [], "disparag": [], "displai": [4, 9], "display_artefact": 9, "distanc": [], "distribut": 6, "div": [], "divers": [], "divid": 4, "do": [3, 7], "doc": [4, 8], "docartefact": [], "docstr": [], "doctr": [3, 7, 8], "doctr_cache_dir": [], "doctr_multiprocessing_dis": [], "document": [1, 8, 9], "documentbuild": [], "documentfil": 4, "doesn": [], "don": 8, "done": 6, "download": 1, "downsiz": 5, "draw": [6, 9], "draw_proba": 9, "drop": 1, "drop_last": 1, "dtype": [4, 5, 7], "dual": [], "dummi": [], "dummy_img": 8, "dummy_input": [], "dure": [], "dutch": [], "dynam": 1, "dynamic_seq_length": 1, "e": [3, 4, 5], "each": [1, 2, 4, 6, 8, 9], "eas": [], "easi": [2, 9], "easier": [], "easili": [4, 7, 8, 9], "econom": [], "edit": [], "educ": [], "effect": [], "effici": [1, 2, 5], "either": 8, "element": [1, 4, 8, 9], "els": [], "email": [], "empathi": [], "en": [], "enabl": [1, 4], "enclos": 4, "encod": [1, 2, 4, 5], "encode_sequ": 1, "encount": [], "encrypt": [], "end": [1, 2, 5, 9], "english": 1, "enough": 8, "ensur": [], "entir": 4, "entri": [], "environ": [], "eo": 1, "equiv": [], "error": [], "estim": [], "etc": 4, "ethnic": [], "evalu": [1, 8], "event": [], "everyon": 
[], "everyth": 8, "exact": [8, 9], "exactmatch": [], "exampl": [1, 4, 5, 6, 9], "exchang": [], "exclud": [], "execut": [], "exist": [], "expand": 6, "expect": [4, 6], "experi": [], "explan": 8, "explicit": [], "exploit": [2, 5], "export": [4, 7, 8, 9], "export_as_straight_box": [], "export_as_xml": [], "export_model_to_onnx": [], "express": 6, "extens": 4, "extern": [], "extra": 3, "extract": [1, 2], "extract_arch": [], "extractor": 5, "f_": 9, "f_a": 9, "factor": 6, "fair": [], "fairli": [], "fallback": 7, "fals": [1, 5, 6, 7, 9], "famili": 9, "faq": [], "fascan": [], "fast": 1, "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": [], "favorit": 8, "featur": [3, 5, 9], "feed": [], "feedback": [], "feel": [], "felix92": [], "few": [3, 7], "figsiz": 9, "figur": 9, "file": 1, "file_hash": [], "file_nam": [], "final": 7, "find": 3, "fine": [], "finnish": [], "first": [], "firsthand": 1, "fit": [], "fitz": 4, "flag": 8, "flexibl": 9, "flip": [], "float": [4, 6, 7, 9], "float16": 7, "float32": [4, 5, 7], "fn": 6, "focu": [], "focus": [], "folder": [1, 7], "follow": [1, 3, 6, 7, 8, 9], "font": 9, "font_famili": 9, "font_siz": 9, "foral": 9, "forc": [], "forg": [], "form": [1, 2, 8], "format": [4, 7, 8], "forpost": [1, 2], "forum": [], "fp": 8, "fp16": [], "frac": 9, "frame": 8, "framework": [1, 3, 8], "free": [], "french": [1, 8], "friendli": 2, "from": [1, 2, 4, 5, 6, 7, 8, 9], "from_hub": [], "from_imag": 4, "from_keras_model": 7, "from_pdf": 4, "from_url": 4, "full": [1, 8, 9], "fulli": [], "function": [6, 9], "funsd": [1, 2, 8], "further": [], "futur": [], "g": [4, 5], "g_": 9, "g_x": 9, "gamma": 6, "gaussian": 6, "gaussianblur": [], "gaussiannois": [], "gdk": 3, "gen": [], "gender": [], "gener": 1, "generic_cyrillic_lett": [], "geometri": [4, 8], "geq": 9, "german": 1, "get": [4, 8], "get_artefact": 4, "get_word": 4, "gettextword": 4, "git": [], "github": 3, "give": [], "given": [1, 4, 8, 9], "global": [], "go": [], "good": 7, "googl": [], "googlevis": 2, "gpu": 2, "gracefulli": [], "graph": 4, "grayscal": 6, "ground": 9, "groung": 9, "group": [], "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": [], "guidanc": [], "gvision": 8, "h": [4, 5, 6], "h_": 9, "ha": [1, 9], "half": [], "handl": 1, "handwrit": [], "handwritten": [], "harass": [], "hardwar": [], "harm": [], "hat": 9, "have": [1, 7, 8, 9], "head": [], "healthi": [], "hebrew": [], "height": 4, "hello": 9, "help": 7, "here": [1, 3, 6, 8], "hf": [], "hf_hub_download": [], "high": 4, "higher": 3, "hindi": [], "hindi_digit": [], "hocr": [], "homebrew": 3, "hook": [], "horizont": 4, "hous": [], "how": [], "howev": [], "hsv": 6, "html": [], "http": [3, 4, 5], "hub": [], "hue": 6, "huggingfac": [], "hw": [], "i": [1, 4, 5, 6, 7, 9], "i7": [], "ic03": [], "ic13": [], "icdar": 2, "icdar2019": 1, "id": 8, "ident": [], "identifi": 2, "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [], "iiit5k": [], "iiithw": [], "imag": [1, 2, 4, 5, 6, 8, 9], "imagenet": 5, "imageri": [], "images_90k_norm": [], "img": [1, 6], "img_cont": 4, "img_fold": 1, "img_path": 4, "img_transform": [], "imgur5k": [], "imgur5k_annot": [], "imlist": [], "impact": [], "implement": [1, 4, 6, 7, 8, 9], "import": [1, 4, 5, 6, 7, 8, 9], "improv": [], "inappropri": [], "incid": [], "includ": 3, "inclus": [], "increas": 6, "independ": [], "index": 4, "indic": 9, "individu": [], "infer": [2, 6], "inference_input_typ": 7, "inference_output_typ": 7, "inform": [1, 2, 8], "inherit": 7, "input": [4, 6, 8], "input_crop": [], 
"input_pag": [5, 8, 9], "input_shap": 7, "input_t": 7, "input_tensor": 5, "inspir": 6, "instal": [], "instanc": 8, "instanti": [], "instead": [1, 4], "insult": [], "int": [1, 4, 6, 9], "int64": [], "int8": 7, "integ": [7, 9], "integr": 2, "intel": [], "interact": [4, 9], "interfac": [], "interoper": [], "interpol": 6, "interpret": [1, 4], "intersect": 9, "invert": 6, "investig": [], "invis": [], "invoic": 8, "involv": 8, "io": [], "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [2, 5], "isn": 1, "issu": [], "italian": [], "iter": 1, "its": [1, 4, 6, 8, 9], "itself": [], "j": 9, "job": [], "join": [], "jpeg": 6, "jpegqual": 6, "jpg": [1, 4], "json": 8, "json_output": 8, "jump": [], "just": 7, "kei": [], "kera": [5, 7], "kernel": [], "kernel_s": 7, "kernel_shap": [], "keywoard": [], "keyword": [1, 4], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [], "know": [], "kwarg": [1, 4, 5, 9], "l": 9, "l_j": 9, "label": [1, 9], "label_fil": 1, "label_fold": [], "label_path": [], "labels_path": [], "ladder": [], "lambda": 6, "lambdatransform": 6, "lang": [], "languag": [2, 4, 8], "larg": 5, "largest": 9, "last": [1, 3], "latenc": [], "later": [], "latest": 3, "latin": 1, "layer": [], "layout": 8, "lead": [], "leader": [], "learn": 8, "least": 3, "left": [8, 9], "legacy_french": 1, "length": 1, "less": [], "let": [], "letter": [], "level": [8, 9], "levenshtein": [], "leverag": [], "lf": [], "libffi": 3, "librari": 3, "light": 2, "lightweight": [], "like": [], "limits_": 9, "line": [2, 8, 9], "line_1_1": [], "link": [], "linknet": [2, 5], "linknet16": [5, 8], "linknet_resnet18": [], "linknet_resnet34": [], "linknet_resnet50": [], "linux": 3, "list": [1, 4, 6, 9], "ll": 9, "load": [2, 7], "load_state_dict": [], "load_weight": [], "loader": 1, "loc_pr": [], "local": [1, 2, 5, 8, 9], "localis": [], "localizationconfus": 9, "locat": 4, "login": [], "login_to_hub": [], "logo": 4, "look": 8, "love": [], "lower": [6, 9], "m": [8, 9], "m1": [], "macbook": [], "machin": [], "maco": 3, "made": 2, "magc_resnet31": [], "mai": 8, "mail": [], "main": [], "maintain": 2, "mainten": [], "make": [7, 8, 9], "mani": 8, "manipul": [], "map": 1, "map_loc": [], "mask_shap": 9, "master": [2, 5, 8], "match": [8, 9], "mathcal": 9, "matplotlib": 9, "max": 9, "max_angl": 6, "max_area": 6, "max_char": [], "max_delta": 6, "max_dist": [], "max_gain": 6, "max_gamma": 6, "max_qual": 6, "max_ratio": 6, "maximum": [1, 6], "maxval": [5, 6, 7], "mbox": 9, "mean": [6, 9], "meaniou": 9, "meant": [4, 7], "measur": 8, "media": [], "median": [], "meet": [], "member": [], "memori": [], "mention": [], "merg": [], "messag": [], "meta": [], "metadata": [], "metal": [], "method": 6, "metric": [8, 9], "middl": [], "might": [7, 8], "min": [], "min_area": 6, "min_char": [], "min_gain": 6, "min_gamma": 6, "min_qual": 6, "min_ratio": 6, "min_val": 6, "minde": [2, 3], "minim": 2, "minimalist": [], "minimum": [3, 9], "minval": 6, "miss": 3, "mistak": [], "mix": [], "mixed_float16": [], "mixed_precis": [], "mjsynth": [], "mnt": [], "mobilenet": 5, "mobilenet_v3_larg": 5, "mobilenet_v3_large_r": 5, "mobilenet_v3_smal": 5, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 5, "mobilenetv3": 5, "mobilenetv3_larg": 5, "mobilenetv3_smal": 5, "modal": [], "mode": 3, "model": [1, 9], "model_nam": [], "model_path": [], "moder": [], "modif": [], "modifi": 5, "modul": [4, 6, 8, 9], "moment": 8, "more": 8, "most": 8, "mozilla": [], "multi": 2, "multilingu": [], "multipl": [1, 4, 6], 
"multipli": 6, "multiprocess": [], "my": [], "my_awesome_model": [], "my_hook": [], "n": [1, 9], "na": [], "name": [1, 5], "nation": [], "natur": 2, "ndarrai": [1, 4, 9], "necessari": 3, "need": [3, 9], "neg": 6, "nest": 8, "nestedobject": [], "network": [2, 5], "neural": [2, 5], "new": 9, "newer": [], "next": 1, "nois": [], "noisi": [1, 2], "non": [2, 4, 6, 9], "none": [1, 4, 8, 9], "normal": [5, 6], "norwegian": [], "note": 0, "now": [], "np": [5, 7, 8, 9], "num_output_channel": [], "num_sampl": 1, "number": [1, 6, 8, 9], "numpi": [4, 5, 8, 9], "o": 3, "obb": [], "obj_detect": [], "object": [1, 8, 9], "objectness_scor": [], "oblig": [], "obtain": 8, "occupi": [], "ocr": [1, 2, 5, 9], "ocr_carea": [], "ocr_db_crnn": 9, "ocr_lin": [], "ocr_pag": [], "ocr_par": [], "ocr_predictor": [5, 8], "ocrdataset": 1, "ocrmetr": 9, "ocrpredictor": 5, "ocrx_word": [], "offens": [], "offici": [], "offlin": [], "offset": 6, "onc": 8, "one": [1, 5, 6, 8], "oneof": 6, "ones": 1, "onli": [6, 9], "onlin": [], "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": [], "opacity_rang": [], "open": [], "oper": 7, "opinion": [], "opsset": 7, "optic": [2, 8], "optim": [2, 7], "option": 1, "order": [1, 4], "org": 5, "organ": 4, "orient": [4, 8], "orientationpredictor": [], "other": [], "otherwis": 9, "our": [5, 8], "out": [5, 6, 8, 9], "outpout": [], "output": [4, 6], "output_s": [4, 6], "outsid": [], "over": [1, 3, 8, 9], "overal": [], "overlai": 4, "overview": [], "overwrit": [], "overwritten": [], "own": 2, "p": [6, 9], "packag": [2, 7, 9], "pad": [1, 6], "page": [3, 8, 9], "page1": 4, "page2": 4, "page_1": [], "page_idx": [4, 8], "page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 5, "par_1_1": [], "paragraph": [], "paragraph_break": [], "param": [6, 8], "paramet": [1, 2, 4, 5, 6, 9], "pars": [1, 2], "parseq": [], "part": 6, "parti": 3, "partial": [], "particip": [], "pass": [1, 8], "password": [], "patch": [], "path": [1, 4, 7], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "pattern": [], "pdf": [4, 5], "pdfpage": [], "peopl": [], "per": [6, 8], "perform": [2, 4, 6, 7, 8, 9], "period": [], "permiss": [], "permut": [], "persian_lett": [], "person": [], "phase": 8, "photo": [], "physic": 4, "pick": 6, "pictur": 4, "pip": 3, "pipelin": [], "pixbuf": 3, "pixel": [4, 6, 8], "platinum": 8, "pleas": [], "plot": 9, "plt": 9, "plug": [], "plugin": [], "png": 4, "point": [], "polici": [], "polish": [], "polit": [], "polygon": [1, 8], "pool": 5, "portugues": 1, "posit": 9, "possibl": 9, "post": 8, "postprocessor": [], "potenti": [], "power": 2, "ppageno": [], "pre": 5, "precis": [8, 9], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 1, "predict": [4, 9], "predictor": [2, 5], "prefer": 1, "preinstal": [], "preprocessor": 8, "prerequisit": [], "present": [], "preserv": 6, "preserve_aspect_ratio": 6, "pretrain": [2, 5, 7, 8, 9], "pretrained_backbon": [], "print": 8, "prior": [], "privaci": [], "privat": 8, "probabl": 6, "problem": [], "procedur": 6, "process": [2, 4, 8], "processor": 8, "produc": 8, "product": 7, "profession": [], "project": [], "promptli": [], "proper": [], "properli": 1, "properti": 7, "provid": [2, 7, 8], "public": 2, "publicli": 8, "publish": [], "pull": [], "punctuat": 1, "pure": [], "purpos": [], "push_to_hf_hub": [], "py": [], "pypdfium2": [], "pyplot": 9, "python": [], "python3": [], "pytorch": [2, 3, 8], "q": [], "qr": 4, "qr_code": [], "qualiti": 6, "quantiz": [], "quantize_model": [], "question": [], "quickli": 2, "quicktour": [], "r": 
[], "race": [], "ramdisk": [], "rand": [5, 7, 8, 9], "random": [5, 6, 7, 8, 9], "randomappli": 6, "randombright": 6, "randomcontrast": 6, "randomcrop": 6, "randomgamma": 6, "randomhorizontalflip": [], "randomhu": 6, "randomjpegqu": 6, "randomli": 6, "randomres": [], "randomrot": 6, "randomsatur": 6, "randomshadow": [], "rang": [6, 7], "rassi": [], "ratio": 6, "raw": [4, 9], "re": [], "read": [2, 5], "read_html": 4, "read_img": 4, "read_img_as_numpi": 4, "read_img_as_tensor": 4, "read_pdf": 4, "readi": 7, "real": [2, 5, 6], "reason": [], "rebuild": [], "rebuilt": [], "recal": [8, 9], "receipt": [1, 2, 8], "reco_arch": 5, "reco_b": [], "reco_model": [], "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": 9, "recognition_predictor": [5, 8], "recognition_task": [], "recognitiondataset": [], "recognitionpredictor": 5, "rectangular": 5, "recurr": [], "red": 9, "reduc": [3, 6], "refer": [3, 8], "regardless": [], "region": [], "regroup": 9, "regular": [], "reject": [], "rel": [4, 6], "relat": 4, "releas": [0, 3], "relev": [], "religion": [], "relu": 7, "remov": [], "render": [], "repo": [], "repo_id": [], "report": [], "repositori": [], "repres": [4, 8, 9], "represent": [2, 5], "representative_dataset": 7, "request": [], "requir": [3, 6], "research": 2, "residu": [], "resiz": 6, "resnet": 5, "resnet18": [], "resnet31": 5, "resnet34": [], "resnet50": [], "resolv": 4, "resolve_block": [], "resolve_lin": [], "resourc": 7, "respect": [], "respons": 9, "rest": [6, 9], "restrict": [], "result": [4, 8], "return": [1, 4, 5, 8, 9], "reusabl": 8, "review": [], "rgb": [4, 6], "rgb_mode": [], "rgb_output": 4, "right": 9, "robust": 2, "root": 1, "rotat": [1, 4, 6], "rotated_bbox": [1, 9], "run": 3, "same": [4, 8, 9], "sampl": [1, 8], "sample_transform": 1, "sar": [2, 5], "sar_resnet31": [5, 8], "sar_vgg16_bn": [], "satur": 6, "save": [1, 7], "saved_model": 7, "scale": [5, 6, 9], "scale_rang": [], "scan": [1, 2], "scene": [2, 5], "scheme": [], "score": 9, "scratch": [], "script": [], "seamless": 2, "seamlessli": 8, "search": 5, "searchabl": [], "sec": [], "second": 8, "section": [7, 8], "secur": [], "see": [], "seemlessli": 2, "seen": 8, "segment": [2, 5, 8], "self": [], "semant": [2, 5], "send": [], "sens": 9, "sensit": 8, "separ": 8, "sequenc": [1, 2, 4, 5, 8, 9], "sequenti": [6, 7], "seri": [], "serial": 7, "serialized_model": 7, "seriou": [], "set": [1, 8, 9], "set_global_polici": [], "sever": [4, 6, 8], "sex": [], "sexual": [], "sha256": [], "shade": [], "shape": [4, 5, 6, 7, 8, 9], "share": 8, "shift": 6, "shm": [], "should": [1, 4, 6, 9], "show": [2, 4, 5, 9], "showcas": [], "shuffl": 1, "side": 9, "signatur": 4, "signific": 1, "simpl": [2, 5], "simpler": [], "sinc": [1, 8], "singl": [], "single_img_doc": [], "size": [1, 4, 6, 8, 9], "skew": [], "slack": [], "slightli": [], "small": 5, "smallest": 4, "snapshot_download": [], "snippet": [], "so": [1, 3], "social": [], "socio": [], "some": 3, "someth": [], "somewher": [], "sort": [], "sourc": [1, 4, 5, 6, 9], "space": [], "span": [], "spanish": 1, "spatial": 4, "special": [], "specif": [1, 3, 8, 9], "specifi": [1, 4], "speed": 2, "sphinx": [], "sroie": [1, 2], "stabl": 3, "stackoverflow": [], "stage": 2, "standard": 6, "start": 1, "state": [2, 9], "static": 9, "statist": [], "statu": [], "std": 6, "step": [], "still": [], "str": [1, 4, 5, 6, 9], "straight": [1, 8], "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 4, "street": [], "strict": [], "strictli": 9, "string": [1, 4, 9], "strive": 3, "strong": [2, 5], "structur": 
8, "subset": [1, 8], "suggest": [], "sum": 9, "summari": 9, "support": 8, "supported_op": 7, "supported_typ": 7, "sustain": [], "svhn": [], "svt": [], "swedish": [], "symbol": [], "symmetr": 6, "symmetric_pad": 6, "synthes": 9, "synthesize_pag": 9, "synthet": [], "synthtext": [], "system": [], "t": [1, 8], "tabl": [], "take": [1, 7, 8], "target": [1, 4, 6], "target_s": 1, "target_spec": 7, "task": [1, 2, 8], "task2": [], "tax": 8, "team": [], "techminde": [], "templat": [2, 4], "tensor": [1, 4, 6, 8], "tensorflow": [2, 3, 4, 5, 6, 8], "tensorspec": [], "term": [], "test": [], "test_set": [], "text": [4, 5, 9], "text_output": [], "textmatch": 9, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [2, 8], "textstylebrush": [], "textual": [1, 2, 4, 8], "tf": [3, 4, 5, 6, 7], "tf_model": 7, "tflite": 7, "tflite_builtins_int8": 7, "tfliteconvert": 7, "than": [3, 9], "thank": [], "thei": 8, "them": [1, 3, 8], "thi": [3, 7, 8, 9], "thing": [7, 8], "third": 3, "those": [3, 4, 8], "threaten": [], "threshold": [], "through": [1, 6], "tilman": [], "time": [1, 2, 5, 9], "tini": [], "titl": 4, "tm": [], "tmp": [], "togeth": 4, "tograi": 6, "tool": [], "top": [8, 9], "topic": [], "torch": 3, "torchvis": 6, "total": [], "toward": 3, "train": [1, 5, 6, 8], "train_it": 1, "train_load": 1, "train_pytorch": [], "train_set": 1, "train_tensorflow": [], "trainabl": [2, 5], "tranform": 6, "transcrib": 8, "transfer": [], "transfo": 6, "transform": 1, "translat": [], "troll": [], "true": [1, 4, 5, 6, 7, 8, 9], "truth": 9, "tune": 7, "tupl": [4, 6, 9], "turn": [], "two": 4, "txt": [], "type": [4, 8], "typic": 8, "u": 8, "ucsd": [], "udac": [], "uint8": [4, 5, 8, 9], "ukrainian": [], "unaccept": [], "underli": 1, "underneath": 4, "understand": [1, 2, 8], "unfortun": 8, "unidecod": 9, "uniform": [5, 6, 7], "uniformli": 6, "uninterrupt": [4, 8], "union": 9, "unittest": [], "unlock": [], "unoffici": [], "unprofession": [], "unsolicit": [], "unsupervis": [], "unwelcom": [], "up": 8, "updat": 9, "upgrad": [], "upper": [1, 6], "uppercas": [], "url": 4, "us": [1, 3, 5, 8, 9], "usabl": 8, "usag": [], "use_polygon": [], "useabl": 8, "user": [2, 3, 4], "utf": [], "util": 7, "v0": [], "v1": [], "v3": 5, "valid": [], "valu": [4, 6, 8], "valuabl": 2, "variabl": [], "varieti": [], "variou": 8, "veri": 5, "verifi": [], "version": 7, "vgg": 5, "vgg16": [], "vgg16_bn": 5, "vgg16_bn_r": [], "via": [], "vietnames": [], "view": [], "viewpoint": [], "violat": [], "visibl": [], "vision": [], "visiondataset": 1, "visiontransform": [], "visual": [], "visualize_pag": 9, "vit_": [], "vit_b": [], "vitstr": [], "vitstr_bas": [], "vitstr_smal": [], "viz": [], "vocab": 8, "vocabulari": 1, "w": [4, 5, 6, 9], "w3": [], "wa": [], "wai": [1, 2], "want": 7, "warm": [], "warmup": 8, "wasn": [], "we": [2, 3, 4, 6, 8], "weasyprint": [], "web": 4, "websit": [], "welcom": [], "well": 7, "were": [4, 8], "what": [], "when": [], "whenev": [], "where": [4, 6, 8, 9], "whether": [1, 4, 6, 9], "which": 8, "whichev": 3, "while": [6, 8], "why": [], "width": 4, "wiki": [], "wildreceipt": [], "window": [3, 5, 9], "wish": 7, "within": [], "without": [], "wonder": [], "word": [2, 8, 9], "word_1_1": [], "word_1_2": [], "word_1_3": [], "wordgener": [], "words_onli": 9, "work": [], "worker": 1, "workflow": [], "worklow": [], "world": 9, "worth": [], "wrap": 8, "wrapper": [1, 6], "write": [], "written": 4, "www": 4, "x": [4, 6, 9], "x12larg": 8, "x_ascend": [], "x_descend": [], "x_i": 9, "x_size": [], "x_wconf": [], "xeon": 8, "xhtml": 
[], "xmax": 4, "xmin": 4, "xml": [], "xml_bytes_str": [], "xml_element": [], "xml_output": [], "xmln": [], "y": 9, "y_i": 9, "y_j": 9, "yet": [], "yield": 7, "ymax": 4, "ymin": 4, "yolov8": [], "you": [3, 7, 8], "your": [1, 2, 4, 8, 9], "yoursit": 4, "zero": 6, "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 1, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 1, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 1, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 1, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": [], "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 1, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], 
"\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": [], "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "\u067e\u0686\u06a2\u06a4\u06af": [], "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "doctr.datasets", "DocTR: Document Text Recognition", "Installation", "doctr.io", "doctr.models", "doctr.transforms", "Preparing your model for inference", "Choosing the right model", "doctr.utils"], "titleterms": {"": [], "0": 0, "01": [], "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": [], "1": 0, "10": [], "11": 0, "12": [], "18": 0, "2": 0, "2021": 0, "2022": [], "2023": [], "2024": [], "22": [], "27": 0, "28": 0, "29": [], "3": 0, "31": [], "4": [], "5": [], "6": [], "7": [], "8": [], "9": [], "advanc": [], "approach": 8, "architectur": 8, "arg": [], "artefact": 4, "artefactdetect": [], "attribut": [], "avail": [1, 8], "aw": [], "backbon": 5, "ban": [], "block": 4, "bug": [], "build": [], "changelog": 0, "choos": 8, "classif": [], "code": [], "codebas": [], "commit": [], "commun": [], "compos": 6, "compress": 7, "conda": [], "conduct": [], "connect": [], "content": [], "continu": [], "contrib": [], "contribut": [], "contributor": [], "convent": [], "correct": [], "coven": [], "custom": [], "data": 1, "dataload": [], "dataset": [1, 2], "detect": [2, 5, 8], "develop": [], "do": 8, "doctr": [1, 2, 4, 5, 6, 9], "document": [2, 4], "end": 8, "enforc": [], "evalu": 9, "export": [], "factori": [], "featur": 2, "feedback": [], "file": 4, "from": [], "gener": [], "get": [], "git": 3, "guidelin": [], "half": 7, "hub": [], "huggingfac": [], "i": 8, "implement": [], "infer": 7, "instal": 3, "integr": [], "io": 4, "lambda": [], "let": [], "line": 4, "linux": [], "lite": 7, "load": 1, "loader": [], "main": 2, "mode": [], "model": [2, 5, 7, 8], "modifi": [], "modul": [], "name": [], "note": [], "notebook": [], "object": [], "ocr": 8, "onli": [], "onnx": [], "optim": [], "option": [], "orient": [], "our": [], "output": 8, "own": [], "packag": 3, "page": 4, "perman": [], "pipelin": [], "pledg": [], "post": 7, "pre": [], "precis": 7, "predictor": 8, "prepar": 7, "prerequisit": 3, "pretrain": [], "process": [], "push": [], "python": 3, "qualiti": [], "quantiz": 7, "question": [], "read": 4, "readi": [], "recognit": [2, 5, 8], "refer": [], "report": [], "request": [], "respons": [], "return": [], "right": 8, "savedmodel": 7, "scope": [], 
"share": [], "should": 8, "stage": 8, "standard": [], "start": [], "structur": 4, "style": [], "support": [1, 2, 6], "synthet": [], "task": 9, "temporari": [], "tensorflow": 7, "test": [], "text": [2, 8], "train": 7, "transform": 6, "two": 8, "unit": [], "us": 7, "util": 9, "v0": 0, "verif": [], "via": 3, "visual": 9, "vocab": 1, "warn": [], "what": 8, "word": 4, "your": 7, "zoo": [2, 5, 8]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"Artefact": [[2, "artefact"]], "Available Datasets": [[1, "available-datasets"]], "Block": [[2, "block"]], "Build & train your predictor": [[3, "build-train-your-predictor"]], "Changelog": [[0, null]], "Composing transformations": [[6, "composing-transformations"]], "Data Loading": [[1, "data-loading"]], "Detection models": [[5, "detection-models"]], "Detection predictors": [[5, "detection-predictors"]], "DocTR Vocabs": [[1, "id1"]], "DocTR: Document Text Recognition": [[3, null]], "Document": [[2, "document"]], "Document structure": [[2, "document-structure"]], "End-to-End OCR": [[5, "end-to-end-ocr"]], "File reading": [[2, "file-reading"]], "Getting Started": [[3, "getting-started"]], "Installation": [[4, null]], "Line": [[2, "line"]], "Main Features": [[3, "main-features"]], "Model compression": [[5, "model-compression"]], "Model export": [[5, "model-export"]], "Model zoo": [[3, "model-zoo"]], "Notes": [[3, null]], "Package Reference": [[3, null]], "Page": [[2, "page"]], "Pre-processing for detection": [[5, "pre-processing-for-detection"]], "Pre-processing for recognition": [[5, "pre-processing-for-recognition"]], "Prerequisites": [[4, "prerequisites"]], "Recognition models": [[5, "recognition-models"]], "Recognition predictors": [[5, "recognition-predictors"]], "Supported Vocabs": [[1, "supported-vocabs"]], "Supported datasets": [[3, "supported-datasets"]], "Supported transformations": [[6, "supported-transformations"]], "Task evaluation": [[7, "task-evaluation"]], "Text Detection": [[5, "text-detection"]], "Text Recognition": [[5, "text-recognition"]], "Text detection models": [[3, "text-detection-models"]], "Text recognition model zoo": [[5, "id2"]], "Text recognition models": [[3, "text-recognition-models"]], "Two-stage approaches": [[5, "two-stage-approaches"]], "Using SavedModel": [[5, "using-savedmodel"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[7, "visualization"]], "Word": [[2, "word"]], "doctr.datasets": [[1, null]], "doctr.documents": [[2, null]], "doctr.models": [[5, null]], "doctr.transforms": [[6, null]], "doctr.utils": [[7, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]]}, "docnames": ["changelog", "datasets", "documents", "index", "installing", "models", "transforms", "utils"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "datasets.rst", "documents.rst", "index.rst", "installing.rst", "models.rst", "transforms.rst", "utils.rst"], "indexentries": {"artefact (class in doctr.documents)": [[2, "doctr.documents.Artefact", false]], "as_images() (doctr.documents.pdf method)": 
[[2, "doctr.documents.PDF.as_images", false]], "block (class in doctr.documents)": [[2, "doctr.documents.Block", false]], "colorinversion (class in doctr.transforms)": [[6, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[6, "doctr.transforms.Compose", false]], "convert_to_fp16() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_fp16", false]], "convert_to_tflite() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_tflite", false]], "cord (class in doctr.datasets)": [[1, "doctr.datasets.CORD", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_vgg16_bn", false]], "dataloader (class in doctr.datasets.loader)": [[1, "doctr.datasets.loader.DataLoader", false]], "db_resnet50() (in module doctr.models.detection)": [[5, "doctr.models.detection.db_resnet50", false]], "detection_predictor() (in module doctr.models.detection)": [[5, "doctr.models.detection.detection_predictor", false]], "document (class in doctr.documents)": [[2, "doctr.documents.Document", false]], "documentfile (class in doctr.documents)": [[2, "doctr.documents.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[1, "doctr.datasets.encode_sequences", false]], "from_images() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_images", false]], "from_pdf() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_pdf", false]], "from_url() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[1, "doctr.datasets.FUNSD", false]], "get_artefacts() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_artefacts", false]], "get_words() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_words", false]], "lambdatransformation (class in doctr.transforms)": [[6, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.documents)": [[2, "doctr.documents.Line", false]], "linknet16() (in module doctr.models.detection)": [[5, "doctr.models.detection.linknet16", false]], "localizationconfusion (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.LocalizationConfusion", false]], "master() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.master", false]], "normalize (class in doctr.transforms)": [[6, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models.zoo)": [[5, "doctr.models.zoo.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[1, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[6, "doctr.transforms.OneOf", false]], "page (class in doctr.documents)": [[2, "doctr.documents.Page", false]], "pdf (class in doctr.documents)": [[2, "doctr.documents.PDF", false]], "quantize_model() (in module doctr.models.export)": [[5, "doctr.models.export.quantize_model", false]], "randomapply (class in doctr.transforms)": [[6, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[6, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[6, "doctr.transforms.RandomContrast", false]], "randomgamma (class in doctr.transforms)": [[6, "doctr.transforms.RandomGamma", false]], "randomhue (class in doctr.transforms)": [[6, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in 
doctr.transforms)": [[6, "doctr.transforms.RandomJpegQuality", false]], "randomsaturation (class in doctr.transforms)": [[6, "doctr.transforms.RandomSaturation", false]], "read_html() (in module doctr.documents)": [[2, "doctr.documents.read_html", false]], "read_img() (in module doctr.documents)": [[2, "doctr.documents.read_img", false]], "read_pdf() (in module doctr.documents)": [[2, "doctr.documents.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.recognition_predictor", false]], "resize (class in doctr.transforms)": [[6, "doctr.transforms.Resize", false]], "sar_resnet31() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_resnet31", false]], "sar_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_vgg16_bn", false]], "show() (doctr.documents.document method)": [[2, "doctr.documents.Document.show", false]], "show() (doctr.documents.page method)": [[2, "doctr.documents.Page.show", false]], "sroie (class in doctr.datasets)": [[1, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[7, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[7, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[7, "doctr.utils.metrics.TextMatch.summary", false]], "textmatch (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[6, "doctr.transforms.ToGray", false]], "visiondataset (class in doctr.datasets.datasets)": [[1, "doctr.datasets.datasets.VisionDataset", false]], "visualize_page() (in module doctr.utils.visualization)": [[7, "doctr.utils.visualization.visualize_page", false]], "word (class in doctr.documents)": [[2, "doctr.documents.Word", false]]}, "objects": {"doctr.datasets": [[1, 0, 1, "", "CORD"], [1, 0, 1, "", "FUNSD"], [1, 0, 1, "", "OCRDataset"], [1, 0, 1, "", "SROIE"], [1, 1, 1, "", "encode_sequences"]], "doctr.datasets.datasets": [[1, 0, 1, "", "VisionDataset"]], "doctr.datasets.loader": [[1, 0, 1, "", "DataLoader"]], "doctr.documents": [[2, 0, 1, "", "Artefact"], [2, 0, 1, "", "Block"], [2, 0, 1, "", "Document"], [2, 0, 1, "", "DocumentFile"], [2, 0, 1, "", "Line"], [2, 0, 1, "", "PDF"], [2, 0, 1, "", "Page"], [2, 0, 1, "", "Word"], [2, 1, 1, "", "read_html"], [2, 1, 1, "", "read_img"], [2, 1, 1, "", "read_pdf"]], "doctr.documents.Document": [[2, 2, 1, "", "show"]], "doctr.documents.DocumentFile": [[2, 2, 1, "", "from_images"], [2, 2, 1, "", "from_pdf"], [2, 2, 1, "", "from_url"]], "doctr.documents.PDF": [[2, 2, 1, "", "as_images"], [2, 2, 1, "", "get_artefacts"], [2, 2, 1, "", "get_words"]], "doctr.documents.Page": [[2, 2, 1, "", "show"]], "doctr.models.detection": [[5, 1, 1, "", "db_resnet50"], [5, 1, 1, "", "detection_predictor"], [5, 1, 1, "", "linknet16"]], "doctr.models.export": [[5, 1, 1, "", "convert_to_fp16"], [5, 1, 1, "", "convert_to_tflite"], [5, 1, 1, "", "quantize_model"]], "doctr.models.recognition": [[5, 1, 1, "", "crnn_vgg16_bn"], [5, 1, 1, "", "master"], [5, 1, 1, "", "recognition_predictor"], [5, 1, 1, "", "sar_resnet31"], [5, 1, 1, "", "sar_vgg16_bn"]], "doctr.models.zoo": [[5, 1, 1, "", "ocr_predictor"]], "doctr.transforms": [[6, 0, 1, "", "ColorInversion"], [6, 0, 1, "", "Compose"], [6, 0, 1, "", "LambdaTransformation"], [6, 0, 1, "", "Normalize"], [6, 0, 1, "", "OneOf"], [6, 0, 1, "", "RandomApply"], [6, 0, 1, "", 
"RandomBrightness"], [6, 0, 1, "", "RandomContrast"], [6, 0, 1, "", "RandomGamma"], [6, 0, 1, "", "RandomHue"], [6, 0, 1, "", "RandomJpegQuality"], [6, 0, 1, "", "RandomSaturation"], [6, 0, 1, "", "Resize"], [6, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[7, 0, 1, "", "LocalizationConfusion"], [7, 0, 1, "", "OCRMetric"], [7, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.LocalizationConfusion": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.OCRMetric": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.TextMatch": [[7, 2, 1, "", "summary"]], "doctr.utils.visualization": [[7, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 7], "0": [1, 3, 5, 6, 7], "00": 5, "01": 5, "0123456789": 1, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "02": 5, "02562": 5, "03": 3, "035": [], "0361328125": [], "04": [], "05": 3, "06": [], "06640625": [], "07": [], "08": 5, "09": [], "0966796875": [], "1": [1, 3, 5, 6, 7], "10": [1, 5, 7], "100": [5, 6, 7], "1000": 5, "101": [], "1024": [5, 7], "104": [], "106": [], "108": [], "1095": [], "11": 3, "110": 7, "1107": [], "114": [], "115": [], "1156": [], "116": [], "118": [], "11800h": [], "11th": [], "12": 5, "120": [], "123": [], "126": [], "1268": [], "128": 5, "13": 5, "130": [], "13068": [], "131": [], "1337891": [], "1357421875": [], "1396484375": [], "14": 5, "1420": [], "14470v1": [], "149": [], "15": 5, "150": 7, "154": 1, "1552": [], "16": 5, "160": 5, "1630859375": [], "1684": [], "16x16": [], "17": [], "1778": [], "1782": [], "18": 3, "185546875": [], "19": 5, "1900": [], "1910": 5, "19342": [], "19370": [], "195": [], "19598": [], "199": 5, "1999": [], "1m": 5, "2": [3, 5, 6], "20": 5, "200": 7, "2000": [], "2003": [], "2012": [], "2013": [], "2015": [], "2019": 3, "2021": 3, "207901": [], "21": 5, "2103": [], "2186": [], "21888": [], "22": [], "224": [5, 6], "225": 6, "22672": [], "229": 6, "23": [], "233": [], "236": [], "24": [], "246": [], "249": [], "25": 5, "2504": [], "255": [5, 6, 7], "256": 5, "257": [], "26": [], "26032": [], "264": [], "27": 5, "2700": [], "2710": [], "2749": [], "28": 3, "287": [], "29": 5, "296": [], "299": [], "2d": [], "3": [2, 3, 4, 5, 6, 7], "30": 5, "300": [], "3000": [], "301": [], "30595": 5, "30ghz": [], "31": 5, "32": [1, 5, 6], "3232421875": [], "33": [], "33402": [], "33608": [], "34": [], "340": [], "3456": [], "3515625": [], "36": [], "360": [], "37": [], "38": [], "39": 5, "4": [], "40": [], "406": 6, "41": [], "42": [], "43": 5, "44": [], "45": [], "456": 6, "46": 5, "47": 5, "472": [], "48": 5, "485": 6, "49": 5, "49377": [], "5": [1, 6, 7], "50": 5, "51": [], "51171875": [], "512": [], "52": [1, 5], "529": [], "53": 5, "533": [], "54": [], "540": [], "5478515625": [], "55": [], "56": [], "57": [], "58": [], "580": [], "5810546875": [], "583": [], "59": 5, "595": [], "597": [], "5k": [], "5m": 5, "6": [4, 5, 6], "60": 6, "600": [5, 7], "61": 5, "611": [], "62": 5, "625": [], "626": [], "629": [], "63": 5, "630": [], "64": [5, 6], "640": [], "641": [], "647": [], "65": 5, "66": 5, "660": [], "664": [], "666": [], "67": 5, "672": [], "68": 5, "689": [], "69": 5, "693": [], "694": [], "695": [], "6m": [], "7": 5, "70": [5, 7], "700": [], "701": [], "702": [], "707470": [], "71": [], "7100000": [], "713": [], 
"7141797": [], "7149": [], "72": [], "72dpi": [], "73": [], "73257": [], "733": [], "74": 5, "745": [], "75": 5, "753": [], "7581382": [], "76": [], "77": 5, "772": [], "772875": [], "78": 5, "780": [], "781": [], "783": [], "785": [], "789": [], "79": 5, "793533": [], "796": [], "798": [], "7m": [], "8": [5, 6], "80": [], "800": [5, 7], "81": 5, "817": [], "82": 5, "8275l": 5, "83": 5, "830": [], "84": [], "849": [], "85": 5, "8564453125": [], "857": [], "85875": [], "86": 5, "860": [], "8603515625": [], "862": [], "863": [], "87": 5, "8707": [], "875": [], "88": [], "89": 5, "8m": 5, "9": [], "90": 5, "90k": [], "90kdict32px": [], "91": 5, "913": [], "914085328578949": [], "917": [], "92": 5, "921": [], "93": [], "94": [], "95": 7, "9578408598899841": [], "96": 1, "97": [], "98": [], "99": [], "9949972033500671": [], "A": [1, 2, 3, 5], "And": 5, "As": [], "Be": [], "Being": [], "By": [], "For": [4, 5], "If": [2, 4, 5], "In": [1, 5], "It": 6, "Its": 5, "No": [], "Of": 1, "Or": [], "The": [1, 2, 5, 7], "Then": 5, "To": [], "_": [1, 5], "__call__": [], "_build": [], "_i": 7, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "abdef": [], "abl": [], "about": 5, "abov": 5, "abstract": 1, "abstractdataset": [], "abus": [], "accent": [], "accept": [], "access": [1, 2, 3], "account": [], "accur": [], "accuraci": 7, "achiev": [], "act": [], "action": [], "activ": [], "ad": 6, "adapt": [], "add": [6, 7], "add_hook": [], "add_label": 7, "addit": [], "addition": 5, "address": 2, "adjust": 6, "advanc": [], "advantag": [], "advis": [], "aesthet": [], "affect": [], "after": [], "ag": [], "again": [], "aggreg": [1, 7], "aggress": [], "align": 2, "all": [1, 2, 3, 5, 6, 7], "allow": [], "along": 5, "alreadi": [], "also": [], "alwai": [], "an": [1, 2, 3, 5, 7], "analysi": [2, 5], "ancient_greek": [], "angl": 2, "ani": [1, 2, 3, 5, 6, 7], "annot": 2, "anot": [], "anoth": [1, 4, 5], "answer": [], "anyascii": [], "anyon": 3, "anyth": [], "api": [], "apolog": [], "apologi": [], "app": [], "appear": [], "appli": [1, 6], "applic": 5, "appoint": [], "appreci": [], "appropri": [], "ar": [1, 2, 4, 5, 6, 7], "arab": [], "arabic_diacrit": [], "arabic_lett": [], "arabic_punctu": [], "arbitrarili": [], "arch": 5, "architectur": [3, 5], "archiv": [], "area": [], "argument": [1, 2], "around": 5, "arrai": [2, 7], "art": 3, "artefact": 7, "artefact_typ": 2, "artifici": [], "arxiv": 5, "as_imag": 2, "asarrai": 7, "ascii_lett": 1, "aspect": [3, 6], "assess": 7, "assign": 7, "associ": 2, "assum": [], "assume_straight_pag": [], "astyp": [5, 7], "attack": [], "attend": [3, 5], "attent": [], "autoclass": [], "autom": 3, "automat": [], "autoregress": [], "avail": [3, 5, 6], "averag": [5, 6], "avoid": [], "aw": [3, 5], "awar": [], "azur": [], "b": 7, "b_j": 7, "back": [], "backbon": 5, "backend": 5, "background": [], "bangla": [], "bar": [], "bar_cod": [], "base": 5, "baselin": 5, "batch": [1, 5, 6], "batch_siz": 1, "bblanchon": [], "bbox": [], "becaus": [], "been": [5, 7], "befor": 1, "begin": 7, "behavior": [], "being": [5, 7], "belong": [], "benchmark": [], "best": [], "beta": 3, "better": [], "between": [6, 7], "bgr": 2, "bilinear": [5, 6], "bin_thresh": [], "binar": [3, 5], "binari": 2, "bit": [], "block": [5, 7], "block_1_1": [], "blur": [], "bmvc": [], "bn": [], "bodi": [], "bool": [1, 2, 5, 6, 7], "boolean": [], "both": [3, 5, 6], "bottom": [], "bound": [1, 2, 6, 7], "box": [1, 2, 7], "box_thresh": [], "brew": 4, "bright": 6, "browser": [], "build": [], "built": [], "byte": [2, 5], "c": [], 
"c5": 5, "c_j": [], "cach": [], "cache_sampl": [], "cairo": 4, "call": [], "callabl": [1, 6], "can": [1, 4, 5], "capabl": 5, "case": [1, 7], "cf": 5, "cfg": [], "challeng": [], "challenge2_test_task12_imag": [], "challenge2_test_task1_gt": [], "challenge2_training_task12_imag": [], "challenge2_training_task1_gt": [], "chang": [], "changelog": 3, "channel": [2, 5, 6], "channel_prior": [], "channelshuffl": [], "charact": [1, 2, 3, 5, 7], "charactergener": [], "characterist": [], "charg": 5, "charset": [], "chart": 2, "check": [], "checkpoint": [], "chip": [], "ci": [], "clarifi": [], "clariti": [], "class": [1, 2, 6, 7], "class_nam": [], "classif": [], "classmethod": 2, "clear": [], "clone": 4, "close": [], "co": [], "code": [2, 3], "codecov": [], "colab": [], "collate_fn": [], "collect": 2, "color": 6, "colorinvers": 6, "column": 2, "com": [2, 4], "combin": 5, "command": [], "comment": [], "commit": [], "common": [6, 7], "commun": [], "compar": 3, "comparison": 7, "competit": 1, "compil": [], "complaint": [], "complementari": 7, "complet": [], "compon": 5, "compos": [1, 3, 5], "comprehens": [], "comput": [5, 7], "conf_threshold": [], "confid": 2, "config": [], "configur": [], "confus": 7, "consecut": [5, 6], "consequ": [], "consid": [1, 2, 7], "consist": [], "consolid": [1, 3], "constant": 6, "construct": [], "contact": [], "contain": [], "content": [1, 2], "context": [], "contib": [], "continu": [], "contrast": 6, "contrast_factor": 6, "contrib": [], "contribut": [], "contributor": [], "conv_sequ": 5, "convers": 2, "convert": [2, 5, 6], "convert_page_to_numpi": 2, "convert_to_fp16": 5, "convert_to_tflit": 5, "convolut": 3, "coordin": 2, "cord": [1, 3, 5], "core": 7, "corner": [], "correct": 6, "correspond": [4, 5], "could": [], "counterpart": 7, "cover": [], "coverag": [], "cpu": [3, 5], "creat": [], "crnn": [3, 5], "crnn_mobilenet_v3_larg": [], "crnn_mobilenet_v3_smal": [], "crnn_resnet31": 5, "crnn_vgg16_bn": 5, "crop": 5, "crop_orient": [], "crop_orientation_predictor": [], "crop_param": [], "cuda": [], "currenc": 1, "current": [], "custom": [], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": [], "czczup": [], "czech": [], "d": [], "daili": 3, "danish": [], "data": [2, 3, 5, 6, 7], "dataload": 1, "dataset": 5, "dataset_info": [], "date": [], "db": [], "db_crnn_resnet": 5, "db_crnn_vgg": 5, "db_mobilenet_v3_larg": [], "db_resnet34": [], "db_resnet50": 5, "db_sar_resnet": 5, "db_sar_vgg": 5, "dbnet": [3, 5], "deal": [], "decis": [], "decod": 2, "decode_img_as_tensor": [], "dedic": [], "deem": [], "deep": 5, "def": [], "default": [2, 5], "defer": 1, "defin": 7, "deform": 5, "degre": [], "degress": 2, "delet": [], "delimit": [], "delta": 6, "demo": [], "demonstr": [], "depend": [3, 4], "deploi": [], "deploy": [], "derogatori": [], "describ": 5, "descript": [], "design": 6, "desir": [], "det_arch": 5, "det_b": [], "det_model": [], "det_param": [], "det_predictor": [], "detail": [], "detect": [], "detect_languag": [], "detect_orient": [], "detection_predictor": 5, "detection_task": [], "detectiondataset": [], "detectionmetr": [], "detectionpredictor": 5, "detector": [], "deterior": [], "determin": [], "dev": [], "develop": [], "developp": 4, "deviat": 6, "devic": [], "dict": [2, 7], "dictionari": [2, 7], "differ": [], "differenti": [3, 5], "digit": 1, "dimens": [2, 5, 7], "dimension": 6, "direct": [], "directli": 5, "directori": [], "disabl": [], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 5, "discuss": [], "disk": [], 
"disparag": [], "displai": [2, 7], "display_artefact": 7, "distanc": [], "distribut": 6, "div": [], "divers": [], "divid": [], "do": 4, "doc": [2, 5], "docartefact": [], "docstr": [], "doctr": 4, "doctr_cache_dir": [], "doctr_multiprocessing_dis": [], "document": [1, 5, 7], "documentbuild": [], "documentfil": 2, "doesn": [], "don": [], "done": 6, "download": 1, "downsiz": [], "draw": 6, "drop": 1, "drop_last": 1, "dtype": 5, "dual": [], "dummi": [], "dummy_img": [], "dummy_input": [], "dure": [], "dutch": [], "dynam": [], "dynamic_seq_length": [], "e": [2, 4], "each": [1, 2, 3, 5, 6, 7], "eas": [], "easi": [3, 7], "easier": 5, "easili": [2, 5, 7], "econom": [], "edit": [], "educ": [], "effect": [], "effici": [1, 5], "either": 5, "element": [1, 2, 5], "els": [], "email": [], "empathi": [], "en": [], "enabl": 2, "enclos": 2, "encod": [1, 2, 5], "encode_sequ": 1, "encount": [], "encrypt": [], "end": [1, 3, 7], "english": [], "enough": 5, "ensur": [], "entir": 2, "entri": [], "environ": [], "eo": 1, "equiv": [], "error": [], "estim": [], "etc": 2, "ethnic": [], "evalu": [1, 3, 5], "event": [], "everyon": [], "everyth": [], "exact": 7, "exactmatch": [], "exampl": [1, 2, 5, 6, 7], "exchang": [], "exclud": 5, "execut": [], "exist": [], "expand": [], "expect": [2, 5, 6], "experi": 5, "explan": 5, "explicit": [], "exploit": 5, "export": [2, 3, 7], "export_as_straight_box": [], "export_as_xml": [], "export_model_to_onnx": [], "express": 6, "extens": 2, "extern": [], "extra": 4, "extract": [1, 3], "extract_arch": 1, "extractor": 5, "f_": 7, "f_a": 7, "factor": 6, "fair": [], "fairli": [], "fals": [1, 5, 6, 7], "faq": [], "fascan": [], "fast": 1, "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": [], "favorit": [], "featur": [5, 7], "feed": 5, "feedback": [], "feel": [], "felix92": [], "few": 4, "figsiz": 7, "figur": 7, "file": [1, 3], "file_hash": 1, "file_nam": 1, "final": [], "find": 4, "fine": 3, "finnish": [], "first": [], "firsthand": [], "fit": [], "fitz": 2, "flag": [], "flexibl": 7, "flip": [], "float": [2, 6, 7], "float32": 5, "fn": 6, "focu": [], "focus": [], "folder": [1, 5], "follow": [1, 4, 5, 6, 7], "font": [], "font_famili": [], "foral": 7, "forc": [], "forg": [], "form": [1, 3], "format": [2, 5], "forpost": [1, 3], "forum": [], "fp": 5, "fp16": 5, "frac": 7, "frame": 5, "framework": 1, "free": [], "french": [1, 5], "friendli": 3, "from": [1, 2, 3, 5, 6, 7], "from_hub": [], "from_imag": 2, "from_pdf": 2, "from_url": 2, "full": [1, 5, 7], "fulli": [], "function": [5, 6, 7], "funsd": [1, 3, 5], "further": [], "futur": [], "g": 2, "g_": 7, "g_x": 7, "gamma": 6, "gaussian": 6, "gaussianblur": [], "gaussiannois": [], "gdk": 4, "gen": [], "gender": [], "gener": [], "generic_cyrillic_lett": [], "geometri": 2, "geq": 7, "german": [], "get": 2, "get_artefact": 2, "get_word": 2, "gettextword": 2, "git": 3, "github": 4, "give": [], "given": [1, 2, 5, 7], "global": [], "go": [], "good": [], "googl": [], "googlevis": 3, "gpu": 3, "gracefulli": [], "graph": 2, "grayscal": 6, "ground": 7, "groung": [], "group": [], "gt": [], "gt_box": [], "gt_label": [], "gtk": 4, "guid": [], "guidanc": [], "gvision": 5, "h": 2, "h_": 7, "ha": [1, 7], "half": 5, "handl": 1, "handwrit": [], "handwritten": [], "harass": [], "hardwar": [], "harm": [], "hat": 7, "have": [1, 5, 7], "head": [], "healthi": [], "hebrew": [], "height": 2, "hello": 7, "help": [], "here": [1, 4, 6], "hf": [], "hf_hub_download": [], "high": 2, "higher": 4, "hindi": [], "hindi_digit": [], "hocr": 
[], "hook": [], "horizont": 2, "hous": [], "how": [], "howev": [], "hsv": 6, "html": [], "http": [2, 4, 5], "hub": [], "hue": 6, "huggingfac": [], "hw": [], "i": [1, 2, 5, 6, 7], "i7": [], "ic03": [], "ic13": [], "icdar": 3, "icdar2019": 1, "id": 5, "ident": [], "identifi": [3, 5], "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [], "iiit5k": [], "iiithw": [], "imag": [1, 2, 5, 6, 7], "imagenet": [], "imageri": [], "images_90k_norm": [], "img": [1, 6], "img_cont": [], "img_fold": 1, "img_path": [], "img_transform": [], "imgur5k": [], "imgur5k_annot": [], "imlist": [], "impact": [], "implement": [1, 2, 5, 6, 7], "import": [1, 2, 5, 6, 7], "improv": [], "inappropri": [], "incid": [], "includ": [4, 5], "inclus": [], "increas": 6, "independ": [], "index": 2, "indic": 7, "individu": [], "infer": [3, 6], "inform": [1, 3, 5], "inherit": [1, 5], "input": [2, 5, 6], "input_crop": [], "input_pag": [5, 7], "input_shap": 5, "input_t": 5, "input_tensor": 5, "inspir": 6, "instal": 3, "instanc": 5, "instanti": 5, "instead": [1, 2], "insult": [], "int": [1, 2, 5, 6, 7], "int64": [], "integ": 7, "integr": 3, "intel": [], "interact": [2, 7], "interfac": [], "interoper": [], "interpol": [5, 6], "interpret": [1, 2], "intersect": 7, "invert": 6, "investig": [], "invis": [], "invoic": 5, "involv": 5, "io": [], "iou": 7, "iou_thresh": 7, "iou_threshold": [], "irregular": 5, "isn": 1, "issu": [], "italian": [], "iter": 1, "its": [1, 2, 5, 7], "itself": [], "j": 7, "job": [], "join": [], "jpeg": 6, "jpegqual": 6, "jpg": [1, 2], "json": [], "json_output": [], "jump": [], "just": 5, "kei": [], "kera": 5, "kernel": [], "kernel_s": 5, "kernel_shap": [], "keywoard": [], "keyword": [1, 2], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [], "know": [], "kwarg": [1, 2, 5, 7], "l": 7, "l_j": 7, "label": [1, 7], "label_fil": 1, "label_fold": [], "label_path": [], "labels_path": [], "ladder": [], "lambda": 6, "lambdatransform": 6, "lang": [], "languag": [2, 3], "larg": [], "largest": 7, "last": [1, 4, 5], "latenc": [], "later": [], "latest": 4, "latin": 1, "layer": [], "layout": [], "lead": [], "leader": [], "learn": 5, "least": 4, "left": 7, "legacy_french": [], "length": 1, "less": [], "let": 5, "letter": [], "level": [5, 7], "levenshtein": [], "leverag": [], "lf": [], "libffi": 4, "librari": 4, "light": 3, "lightweight": [], "like": [], "limits_": 7, "line": [3, 7], "line_1_1": [], "link": [], "linknet": [3, 5], "linknet16": 5, "linknet_resnet18": [], "linknet_resnet34": [], "linknet_resnet50": [], "linux": 4, "list": [1, 2, 6], "ll": 7, "load": [3, 5], "load_state_dict": [], "load_weight": [], "loader": 1, "loc_pr": [], "local": [1, 3, 5, 7], "localis": [], "localizationconfus": 7, "locat": [], "login": [], "login_to_hub": [], "logo": 2, "love": [], "lower": [6, 7], "m": [5, 7], "m1": [], "macbook": [], "machin": [], "maco": 4, "made": 3, "magc_resnet31": [], "mai": [], "mail": [], "main": [], "maintain": 3, "mainten": [], "make": [5, 7], "mani": [], "manipul": [], "map": 1, "map_loc": [], "mask_shap": 7, "master": [3, 5], "match": [3, 7], "mathcal": 7, "matplotlib": 7, "max": 7, "max_angl": [], "max_area": [], "max_char": [], "max_delta": 6, "max_dist": [], "max_gain": 6, "max_gamma": 6, "max_qual": 6, "max_ratio": [], "maximum": 1, "maxval": [5, 6], "mbox": 7, "mean": [6, 7], "meaniou": 7, "meant": 2, "measur": 5, "media": [], "median": [], "meet": [], "member": [], "memori": [], "mention": [], "merg": [], "messag": [], "meta": [], "metadata": [], "metal": [], "method": 6, "metric": [5, 7], 
"middl": [], "might": 5, "min": [], "min_area": [], "min_char": [], "min_gain": 6, "min_gamma": 6, "min_qual": 6, "min_ratio": [], "min_val": 6, "minde": 4, "minim": [], "minimalist": [], "minimum": 7, "minval": 6, "miss": [], "mistak": [], "mix": 3, "mixed_float16": [], "mixed_precis": [], "mjsynth": [], "mnt": [], "mobilenet": [], "mobilenet_v3_larg": [], "mobilenet_v3_large_r": [], "mobilenet_v3_smal": [], "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": [], "mobilenetv3": [], "modal": [], "mode": 4, "model": [1, 7], "model_nam": [], "model_path": [], "moder": [], "modif": [], "modifi": [], "modul": [2, 5, 6, 7], "more": [], "most": 5, "mozilla": [], "multi": 3, "multilingu": [], "multipl": [1, 2, 6], "multipli": 6, "multiprocess": [], "my": [], "my_awesome_model": [], "my_hook": [], "n": [1, 5, 7], "na": [], "name": [1, 5], "nation": [], "natur": 3, "ndarrai": [1, 2, 7], "necessari": [], "need": [4, 7], "neg": 6, "nest": [], "nestedobject": [], "network": [3, 5], "neural": [3, 5], "new": [], "newer": [], "next": 1, "nois": [], "noisi": [1, 3], "non": [2, 3, 6, 7], "none": [1, 2, 7], "normal": [5, 6], "norwegian": [], "note": 0, "now": 3, "np": [5, 7], "num_output_channel": [], "num_sampl": [], "number": [1, 6, 7], "numpi": [2, 5, 7], "o": 4, "obb": [], "obj_detect": [], "object": 1, "objectness_scor": [], "oblig": [], "obtain": [], "occupi": [], "ocr": [1, 3, 7], "ocr_carea": [], "ocr_db_crnn": 7, "ocr_lin": [], "ocr_pag": [], "ocr_par": [], "ocr_predictor": 5, "ocrdataset": 1, "ocrmetr": 7, "ocrpredictor": 5, "ocrx_word": [], "offens": [], "offici": [], "offlin": [], "offset": 6, "onc": 5, "one": [1, 5, 6], "oneof": 6, "ones": 1, "onli": [6, 7], "onlin": [], "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": [], "opacity_rang": [], "open": [], "opinion": [], "optic": [3, 5], "optim": 3, "option": 1, "order": [1, 2, 5], "org": 5, "organ": 2, "orient": 2, "orientationpredictor": [], "other": [], "otherwis": 7, "our": 5, "out": [5, 6, 7], "outpout": [], "output": [2, 5, 6], "output_s": [2, 6], "outsid": [], "over": [4, 7], "overal": [], "overlai": 2, "overview": [], "overwrit": 1, "overwritten": [], "own": 3, "p": 6, "packag": 7, "pad": [1, 5, 6], "page": [4, 5, 7], "page1": 2, "page2": 2, "page_1": [], "page_idx": 2, "page_orientation_predictor": [], "page_param": [], "pair": 7, "pango": 4, "paper": 5, "par_1_1": [], "paragraph": [], "paragraph_break": [], "param": [5, 6], "paramet": [1, 2, 3, 5, 6, 7], "pars": [1, 3], "parseq": [], "part": 6, "parti": [], "partial": [], "particip": [], "pass": [1, 5], "password": [], "patch": [], "path": [1, 2, 5], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "pattern": [], "pdf": [2, 5], "pdfpage": [], "peopl": [], "per": [5, 6], "perform": [2, 3, 5, 6, 7], "period": [], "permiss": [], "permut": [], "persian_lett": [], "person": [], "phase": [], "photo": [], "physic": 2, "pick": 6, "pictur": 2, "pip": 4, "pipelin": [], "pixbuf": 4, "pixel": [2, 6], "platinum": 5, "pleas": [], "plot": 7, "plt": 7, "plug": [], "plugin": [], "png": 2, "point": [], "polici": [], "polish": [], "polit": [], "polygon": 1, "pool": [], "portugues": [], "posit": 7, "possibl": 7, "post": 5, "postprocessor": [], "potenti": 5, "power": 3, "ppageno": [], "pre": [], "precis": [5, 7], "pred": [], "pred_box": [], "pred_label": [], "predefin": 1, "predict": [2, 7], "predictor": [], "prefer": 1, "preinstal": [], "preprocessor": 5, "prerequisit": 3, "present": [], "preserv": 6, "preserve_aspect_ratio": 6, 
"pretrain": [3, 5, 7], "pretrained_backbon": [], "print": [], "prior": [], "privaci": [], "privat": 5, "probabl": 6, "problem": [], "procedur": 6, "process": [2, 3], "processor": 5, "produc": 5, "product": [], "profession": [], "project": [], "promptli": [], "proper": [], "properli": 1, "properti": 5, "provid": [3, 5], "public": 3, "publicli": [], "publish": [], "pull": [], "punctuat": 1, "pure": [], "purpos": [], "push_to_hf_hub": [], "py": [], "pypdfium2": [], "pyplot": 7, "python": 3, "python3": [], "pytorch": [3, 4], "q": [], "qr": 2, "qr_code": [], "qualiti": 6, "quantiz": 5, "quantize_model": 5, "question": [], "quickli": 3, "quicktour": [], "r": [], "race": [], "ramdisk": [], "rand": [5, 7], "random": [5, 6, 7], "randomappli": 6, "randombright": 6, "randomcontrast": 6, "randomcrop": [], "randomgamma": 6, "randomhorizontalflip": [], "randomhu": 6, "randomjpegqu": 6, "randomli": 6, "randomres": [], "randomrot": [], "randomsatur": 6, "randomshadow": [], "rang": 6, "rassi": [], "ratio": 6, "raw": [2, 7], "re": [], "read": [3, 5], "read_html": 2, "read_img": 2, "read_img_as_numpi": [], "read_img_as_tensor": [], "read_pdf": 2, "readi": [], "real": [5, 6], "reason": [], "rebuild": [], "rebuilt": [], "recal": [5, 7], "receipt": [1, 3, 5], "reco_arch": 5, "reco_b": [], "reco_model": [], "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": 7, "recognition_predictor": 5, "recognition_task": [], "recognitiondataset": [], "recognitionpredictor": 5, "rectangular": [], "recurr": 3, "reduc": 6, "refer": 4, "regardless": [], "region": [], "regroup": 7, "regular": [], "reject": [], "rel": 2, "relat": [], "releas": [0, 4], "relev": [], "religion": [], "relu": 5, "remov": [], "render": [], "repo": [], "repo_id": [], "report": [], "repositori": [], "repres": [2, 5], "represent": 5, "request": [], "requir": [4, 6], "research": 3, "residu": [], "resiz": [5, 6], "resnet": 5, "resnet18": [], "resnet31": [], "resnet34": [], "resnet50": [], "resolv": 2, "resolve_block": [], "resolve_lin": [], "resourc": [], "respect": [], "rest": [6, 7], "restrict": [], "result": [2, 5], "return": [1, 2, 5, 7], "reusabl": 5, "review": [], "rgb": [2, 6], "rgb_mode": [], "rgb_output": 2, "right": [5, 7], "robust": 3, "root": 1, "rotat": [1, 2], "rotated_bbox": [1, 7], "run": 4, "same": [2, 7], "sampl": 1, "sample_transform": 1, "sar": [3, 5], "sar_resnet31": 5, "sar_vgg16_bn": 5, "satur": 6, "save": [1, 5], "saved_model": 5, "scale": 7, "scale_rang": [], "scan": [1, 3], "scene": [3, 5], "scheme": 5, "score": 7, "scratch": 3, "script": [], "seamless": 3, "seamlessli": [], "search": [], "searchabl": [], "sec": [], "second": 5, "section": [], "secur": [], "see": [], "seemlessli": 3, "seen": 5, "segment": 5, "self": [], "semant": 5, "send": [], "sens": 7, "sensit": [], "separ": 5, "sequenc": [1, 2, 5, 7], "sequenti": [5, 6], "seri": [], "serial": 5, "serialized_model": 5, "seriou": [], "set": [1, 5, 7], "set_global_polici": [], "sever": [2, 6], "sex": [], "sexual": [], "sha256": [], "shade": [], "shape": [2, 5, 6, 7], "share": [], "shift": 6, "shm": [], "should": [1, 2, 7], "show": [2, 3, 5, 7], "showcas": [], "shuffl": 1, "side": 7, "signatur": 2, "signific": 1, "simpl": 5, "simpler": [], "sinc": 1, "singl": [], "single_img_doc": [], "size": [1, 2, 5, 6], "skew": [], "slack": [], "slightli": [], "small": 3, "smallest": 2, "snapshot_download": [], "snippet": [], "so": [1, 4], "social": [], "socio": [], "some": [], "someth": [], "somewher": [], "sort": [], "sourc": [1, 2, 5, 6, 7], "space": [], "span": [], "spanish": 
[], "spatial": 2, "special": 3, "specif": [1, 5, 7], "specifi": 2, "speed": [3, 5], "sphinx": [], "sroie": [1, 3], "stabl": 4, "stackoverflow": [], "stage": 3, "standard": 6, "start": 1, "state": 3, "static": 7, "statist": 5, "statu": [], "std": 6, "step": [], "still": [], "str": [1, 2, 5, 6, 7], "straight": 1, "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 2, "street": [], "strict": [], "strictli": 7, "string": [1, 2, 5, 7], "strive": [], "strong": 5, "structur": [3, 5], "subset": [1, 5], "suggest": [], "sum": 7, "summari": 7, "support": 5, "sustain": [], "svhn": [], "svt": [], "swedish": [], "symbol": [], "symmetr": 6, "symmetric_pad": 6, "synthet": [], "synthtext": [], "system": [], "t": 1, "tabl": [], "take": [], "target": [1, 2, 5, 6], "target_s": 1, "task": [1, 3, 5], "task2": [], "team": [], "techminde": [], "templat": 2, "tensor": [1, 5, 6], "tensorflow": [3, 4, 5, 6], "tensorspec": [], "term": [], "test": [], "test_set": [], "text": [2, 7], "text_output": [], "textmatch": 7, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [3, 5], "textstylebrush": [], "textual": [1, 2, 3], "tf": [5, 6], "tf_model": 5, "tflite": 5, "than": [4, 7], "thank": [], "thei": [], "them": [1, 4], "thi": [4, 5, 7], "thing": [], "third": [], "those": [2, 4, 5], "threaten": [], "threshold": [], "through": [1, 6], "tilman": [], "time": [1, 5, 7], "tini": [], "titl": 2, "tm": [], "tmp": [], "togeth": [2, 5], "tograi": 6, "tool": [], "top": 7, "topic": [], "torch": [], "torchvis": 6, "total": [], "toward": [], "train": [1, 5, 6], "train_it": 1, "train_load": 1, "train_pytorch": [], "train_set": 1, "train_tensorflow": [], "trainabl": 5, "tranform": 6, "transcrib": [], "transfer": [], "transfo": 6, "transform": [1, 3], "translat": [], "troll": [], "true": [1, 2, 5, 6, 7], "truth": 7, "tune": 3, "tupl": [2, 5, 6, 7], "turn": [], "two": 2, "txt": [], "type": [2, 5], "typic": [], "u": [], "ucsd": [], "udac": [], "uint8": [2, 5, 7], "ukrainian": [], "unaccept": [], "underli": 1, "underneath": 2, "understand": [1, 3], "unidecod": 7, "uniform": [5, 6], "uniformli": [], "uninterrupt": 2, "union": 7, "unittest": [], "unlock": [], "unoffici": [], "unprofession": [], "unsolicit": [], "unsupervis": [], "unwelcom": [], "up": 5, "updat": 7, "upgrad": [], "upper": 6, "uppercas": [], "url": [1, 2], "us": [1, 4, 7], "usabl": 5, "usag": 5, "use_polygon": [], "useabl": [], "user": [2, 3, 4], "utf": [], "util": [3, 5], "v0": 3, "v1": [], "v3": [], "valid": [], "valu": [2, 6], "valuabl": 3, "variabl": [], "varieti": [], "veri": [], "verifi": 1, "version": 5, "vgg": 5, "vgg16": 5, "vgg16_bn_r": [], "via": 3, "vietnames": [], "view": [], "viewpoint": [], "violat": [], "visibl": [], "vision": [], "visiondataset": 1, "visiontransform": [], "visual": 3, "visualize_pag": 7, "vit_": [], "vit_b": [], "vitstr": [], "vitstr_bas": [], "vitstr_smal": [], "viz": [], "vocab": [3, 5], "vocabulari": [], "w": [2, 7], "w3": [], "wa": [], "wai": [1, 3, 5], "want": [], "warm": 5, "warmup": [], "wasn": [], "we": [2, 3, 5, 6], "weasyprint": [], "web": 2, "websit": [], "welcom": 3, "well": [], "were": 2, "what": [], "when": [], "whenev": [], "where": [2, 7], "whether": [1, 2, 7], "which": 5, "whichev": 4, "while": 6, "why": [], "width": 2, "wiki": [], "wildreceipt": [], "window": [4, 7], "wish": [], "within": [], "without": 5, "wonder": [], "word": [3, 5, 7], "word_1_1": [], "word_1_2": [], "word_1_3": [], "wordgener": [], "words_onli": 7, "work": [], "worker": 1, "workflow": [], "worklow": [], 
"world": 7, "worth": [], "wrap": [], "wrapper": [1, 6], "write": [], "written": 2, "www": 2, "x": [2, 6, 7], "x12larg": 5, "x_ascend": [], "x_descend": [], "x_i": 7, "x_size": [], "x_wconf": [], "xeon": 5, "xhtml": [], "xmax": 2, "xmin": 2, "xml": [], "xml_bytes_str": [], "xml_element": [], "xml_output": [], "xmln": [], "y": 7, "y_i": 7, "y_j": 7, "yet": [], "ymax": 2, "ymin": 2, "yolov8": [], "you": [4, 5], "your": [1, 2, 5, 7], "yoursit": 2, "zero": [5, 6], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 1, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": [], "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": [], "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": [], "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": [], "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": [], "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], 
"\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": [], "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "\u067e\u0686\u06a2\u06a4\u06af": [], "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "doctr.datasets", "doctr.documents", "DocTR: Document Text Recognition", "Installation", "doctr.models", "doctr.transforms", "doctr.utils"], "titleterms": {"": [], "0": 0, "01": [], "02": [], "03": 0, "04": [], "05": 0, "07": [], "08": [], "09": [], "1": 0, "10": [], "11": 0, "12": [], "18": 0, "2": 0, "2021": 0, "2022": [], "2023": [], "2024": [], "22": [], "27": [], "28": 0, "29": [], "3": [], "31": [], "4": [], "5": [], "6": [], "7": [], "8": [], "9": [], "advanc": [], "approach": 5, "architectur": [], "arg": [], "artefact": 2, "artefactdetect": [], "attribut": [], "avail": 1, "aw": [], "ban": [], "block": 2, "bug": [], "build": 3, "changelog": 0, "choos": [], "classif": [], "code": [], "codebas": [], "commit": [], "commun": [], "compos": 6, "compress": 5, "conda": [], "conduct": [], "connect": [], "content": [], "continu": [], "contrib": [], "contribut": [], "contributor": [], "convent": [], "correct": [], "coven": [], "custom": [], "data": 1, "dataload": [], "dataset": [1, 3], "detect": [3, 5], "develop": [], "do": [], "doctr": [1, 2, 3, 5, 6, 7], "document": [2, 3], "end": 5, "enforc": [], "evalu": 7, "export": 5, "factori": [], "featur": 3, "feedback": [], "file": 2, "from": [], "gener": [], "get": 3, "git": 4, "guidelin": [], "half": [], "hub": [], "huggingfac": [], "i": [], "implement": [], "infer": [], "instal": 4, "integr": [], "io": [], "lambda": [], "let": [], "line": 2, "linux": [], "load": 1, "loader": [], "main": 3, "mode": [], "model": [3, 5], "modifi": [], "modul": [], "name": [], "note": 3, "notebook": [], "object": [], "ocr": 5, "onli": [], "onnx": [], "optim": [], "option": [], "orient": [], "our": [], "output": [], "own": [], "packag": [3, 4], "page": 2, "perman": [], "pipelin": [], "pledg": [], "post": [], "pre": 5, "precis": [], "predictor": [3, 5], "prepar": [], "prerequisit": 4, "pretrain": [], "process": 5, "push": [], "python": 4, "qualiti": [], "question": [], "read": 2, "readi": [], "recognit": [3, 5], "refer": 3, "report": [], "request": [], "respons": [], "return": [], 
"right": [], "savedmodel": 5, "scope": [], "share": [], "should": [], "stage": 5, "standard": [], "start": 3, "structur": 2, "style": [], "support": [1, 3, 6], "synthet": [], "task": 7, "temporari": [], "test": [], "text": [3, 5], "train": 3, "transform": 6, "two": 5, "unit": [], "us": 5, "util": 7, "v0": 0, "verif": [], "via": 4, "visual": 7, "vocab": 1, "warn": [], "what": [], "word": 2, "your": 3, "zoo": [3, 5]}}) \ No newline at end of file diff --git a/v0.4.0/transforms.html b/v0.4.0/transforms.html index 9a27f93a4e..d42da50481 100644 --- a/v0.4.0/transforms.html +++ b/v0.4.0/transforms.html @@ -227,27 +227,21 @@ @@ -523,35 +517,6 @@

                                                Supported transformations -
                                                -class doctr.transforms.RandomRotate(max_angle: float = 5.0, expand: bool = False)[source]
                                                -

                                                Randomly rotate a tensor image and its boxes

                                                -
                                                -
                                                Parameters:
                                                -
                                                  -
• max_angle – maximum angle for rotation, in degrees. Angles will be uniformly picked in [-max_angle, max_angle]

                                                • -
                                                • expand – whether the image should be padded before the rotation

                                                • -
                                                -
                                                -
                                                -
                                                - -
                                                -
                                                -class doctr.transforms.RandomCrop(scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33))[source]
                                                -

                                                Randomly crop a tensor image and its boxes

                                                -
                                                -
                                                Parameters:
                                                -
                                                  -
                                                • scale – tuple of floats, relative (min_area, max_area) of the crop

                                                • -
• ratio – tuple of floats, relative (min_ratio, max_ratio) where ratio = h/w (see the usage sketch below)

                                                • -
                                                -
                                                -
                                                -
                                                -
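A minimal usage sketch for the two transformations documented above (not part of the original page; the call signature taking both an image tensor and its relative boxes is an assumption based on the docstrings):
>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import RandomRotate, RandomCrop
>>> img = tf.random.uniform(shape=[64, 64, 3], maxval=1, dtype=tf.float32)
>>> boxes = np.array([[0.1, 0.1, 0.4, 0.3]], dtype=np.float32)  # relative (xmin, ymin, xmax, ymax)
>>> rot_img, rot_boxes = RandomRotate(max_angle=10., expand=True)(img, boxes)
>>> crop_img, crop_boxes = RandomCrop(scale=(0.5, 1.0), ratio=(0.75, 1.33))(img, boxes)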

                                      Composing transformations

diff --git a/v0.4.0/using_doctr/using_model_export.html b/v0.4.0/using_doctr/using_model_export.html index d467663403..75c81caa7c 100644 --- a/v0.4.0/using_doctr/using_model_export.html +++ b/v0.4.0/using_doctr/using_model_export.html @@ -316,7 +316,7 @@

                                      Half-precision
                                      import tensorflow as tf
                                      -from keras import mixed_precision
                                      +from tensorflow.keras import mixed_precision
                                       mixed_precision.set_global_policy('mixed_float16')
                                       predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
                                       
diff --git a/v0.4.0/using_model_export.html b/v0.4.0/using_model_export.html deleted file mode 100644 index a162025215..0000000000 --- a/v0.4.0/using_model_export.html +++ /dev/null @@ -1,435 +0,0 @@ - Preparing your model for inference - docTR documentation
                                      -

                                      Preparing your model for inference

                                      -

                                      A well-trained model is a good achievement but you might want to tune a few things to make it production-ready!

                                      -
                                      -

                                      Model compression

                                      -

                                      This section is meant to help you perform inference with compressed versions of your model.

                                      -
                                      -

                                      TensorFlow Lite

                                      -

                                      TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows:

                                      -
                                      >>> import tensorflow as tf
                                      ->>> from tensorflow.keras import Sequential
                                      ->>> from doctr.models import conv_sequence
                                      ->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> converter = tf.lite.TFLiteConverter.from_keras_model(model)
                                      ->>> serialized_model = converter.convert()
                                      -
                                      -
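Not part of the original page: a short sketch of writing the serialized model to disk and running it with the TFLite interpreter (standard tf.lite API; the 224x224x3 input shape matches the example above):
>>> with open('model.tflite', 'wb') as f:
>>>     f.write(serialized_model)
>>> interpreter = tf.lite.Interpreter(model_content=serialized_model)
>>> interpreter.allocate_tensors()
>>> input_details = interpreter.get_input_details()
>>> dummy = tf.random.uniform([1, 224, 224, 3], maxval=1, dtype=tf.float32).numpy()
>>> interpreter.set_tensor(input_details[0]['index'], dummy)
>>> interpreter.invoke()
>>> out = interpreter.get_tensor(interpreter.get_output_details()[0]['index'])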
                                      -
                                      -
                                      -

                                      Half-precision

                                      -

If you want to convert it to half-precision, use your TFLite converter as follows:

                                      -
                                      >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
                                      ->>> converter.target_spec.supported_types = [tf.float16]
                                      ->>> serialized_model = converter.convert()
                                      -
                                      -
                                      -
                                      -
                                      -

                                      Post-training quantization

                                      -

Finally, if you wish to quantize the model with your TFLite converter:

                                      -
>>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
->>> import numpy as np
->>> input_shape = (224, 224, 3)  # same input shape as the model defined above
->>> # Float fallback for operators that do not have an integer implementation
->>> def representative_dataset():
->>>     for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)]
                                      ->>> converter.representative_dataset = representative_dataset
                                      ->>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
                                      ->>> converter.inference_input_type = tf.int8
                                      ->>> converter.inference_output_type = tf.int8
                                      ->>> serialized_model = converter.convert()
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -

                                      Using SavedModel

                                      -

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to SavedModel format as follows:

                                      -
                                      >>> import tensorflow as tf
                                      ->>> from doctr.models import db_resnet50
                                      ->>> model = db_resnet50(pretrained=True)
                                      ->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
                                      ->>> _ = model(input_t, training=False)
                                      ->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
                                      -
                                      -
                                      -

                                      And loaded just as easily:

                                      -
                                      >>> import tensorflow as tf
                                      ->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
                                      -
                                      -
                                      -
                                      -
                                      - -
                                      -
                                      - -
                                      - -
                                      -
- \ No newline at end of file diff --git a/v0.4.0/using_models.html b/v0.4.0/using_models.html deleted file mode 100644 index d0fc5d15f9..0000000000 --- a/v0.4.0/using_models.html +++ /dev/null @@ -1,803 +0,0 @@ - Choosing the right model - docTR documentation
                                      -
                                      -
                                      - -
                                      - -
                                      -
                                      - -
                                      - -
                                      -
                                      - -
                                      -
                                      -
                                      - -
                                      - -
                                      - -
                                      -
                                      -
                                      -

                                      Choosing the right model

                                      -

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

                                      -

                                      For a given task, DocTR provides a Predictor, which is composed of 2 components:

                                      -
                                        -
                                      • PreProcessor: a module in charge of making inputs directly usable by the deep learning model.

                                      • -
                                      • Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable.

                                      • -
                                      -
                                      -

                                      Text Detection

                                      -

The task consists of localizing textual elements in a given image. While those text elements can represent many things, in DocTR we consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corners), to polygons, or binary segmentation (flagging which pixels belong to the element and which don't).

                                      -
                                      -

                                      Available architectures

                                      -

                                      The following architectures are currently supported:

                                      - -

                                      For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

                                      -
Architecture            Input shape       # params   FUNSD Recall   FUNSD Precision   CORD Recall   CORD Precision   FPS
db_resnet50             (1024, 1024, 3)   25.2 M     82.14          87.64             92.49         89.66            2.1
db_mobilenet_v3_large   (1024, 1024, 3)   4.2 M      79.35          84.03             81.14         66.85

                                      -
                                      -

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

                                      -

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model's capabilities

                                      -

                                      FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.x12large AWS instance (CPU Xeon Platinum 8275L).
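As an illustration, a throughput measurement along those lines could look like the sketch below (not the exact benchmark script; figures will depend on your hardware):

import time

import tensorflow as tf
from doctr.models import db_resnet50

model = db_resnet50(pretrained=True)
input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)

# warmup phase: 100 tensors with batch size 1
for _ in range(100):
    _ = model(input_t, training=False)

# average number of processed tensors per second over 1000 samples
start = time.time()
for _ in range(1000):
    _ = model(input_t, training=False)
print(f"{1000 / (time.time() - start):.2f} FPS")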

                                      -
                                      -
                                      -

                                      Detection predictors

                                      -

detection_predictor wraps your detection model so that it can be used seamlessly with your favorite deep learning framework.

                                      -
                                      >>> import numpy as np
                                      ->>> from doctr.models import detection_predictor
->>> predictor = detection_predictor('db_resnet50', pretrained=True)
->>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
->>> out = predictor([dummy_img])
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -

                                      Text Recognition

                                      -

                                      The task consists of transcribing the character sequence in a given image.

                                      -
                                      -

                                      Available architectures

                                      -

                                      The following architectures are currently supported:

                                      - -

                                      For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

                                      -
Text recognition model zoo

Architecture              Input shape    # params   FUNSD   CORD    FPS
crnn_vgg16_bn             (32, 128, 3)   15.8M      87.15   92.92   12.8
crnn_mobilenet_v3_small   (32, 128, 3)   2.1M
crnn_mobilenet_v3_large   (32, 128, 3)   4.5M
sar_resnet31              (32, 128, 3)   56.2M      87.70   93.41   2.7
master                    (32, 128, 3)   67.7M      87.62   93.27

                                      -
                                      -

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metric being used (exact match) are available in Task evaluation.

                                      -

While most of our recognition models were trained on our French vocab (cf. Supported Vocabs), you can easily access the vocab of any model as follows:

                                      -
                                      >>> from doctr.models import recognition_predictor
                                      ->>> predictor = recognition_predictor('crnn_vgg16_bn')
                                      ->>> print(predictor.model.cfg['vocab'])
                                      -
                                      -
                                      -

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model's capabilities

                                      -

                                      FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.x12large AWS instance (CPU Xeon Platinum 8275L).

                                      -
                                      -
                                      -

                                      Recognition predictors

                                      -

recognition_predictor wraps your recognition model so that it can be used seamlessly with your favorite deep learning framework.

                                      -
                                      >>> import numpy as np
                                      ->>> from doctr.models import recognition_predictor
->>> predictor = recognition_predictor('crnn_vgg16_bn', pretrained=True)
->>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8)
->>> out = predictor([dummy_img])
                                      -
                                      -
                                      -
                                      -
                                      -
                                      -

                                      End-to-End OCR

                                      -

                                      The task consists of both localizing and transcribing textual elements in a given image.

                                      -
                                      -

                                      Available architectures

                                      -

You can use any combination of detection and recognition models supported by DocTR.
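For instance, you could pair a lightweight detector with a lightweight recognizer (a hedged example, assuming both architectures are available in your installed version):
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor('db_mobilenet_v3_large', 'crnn_mobilenet_v3_small', pretrained=True)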

                                      -

                                      For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

                                      -
Architecture                            FUNSD Recall   FUNSD Precision   FUNSD FPS   CORD Recall   CORD Precision   CORD FPS
db_resnet50 + crnn_vgg16_bn             71.00          76.02             0.85        83.87         81.34            1.6
db_resnet50 + master                    71.03          76.06                         84.49         81.94
db_resnet50 + sar_resnet31              71.25          76.29             0.27        84.50         81.96            0.83
db_mobilenet_v3_large + crnn_vgg16_bn   67.73          71.73                         71.65         59.03
Gvision text detection                  59.50          62.50                         75.30         70.00
Gvision doc. text detection             64.00          53.30                         68.90         61.10
AWS textract                            78.10          83.00                         87.50         66.00

                                      -
                                      -

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

                                      -

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model's capabilities

                                      -

                                      FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a c5.x12large AWS instance (CPU Xeon Platinum 8275L).

                                      -

                                      Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information.

                                      -
Architecture                                   Receipts Recall   Receipts Precision   Invoices Recall   Invoices Precision   IDs Recall   IDs Precision   US Tax Forms Recall   US Tax Forms Precision
db_resnet50 + crnn_vgg16_bn (ours)             78.70             81.12                65.80             70.70                50.25        51.78           79.08                 92.83
db_resnet50 + master (ours)                    79.00             81.42                65.57             69.86                51.34        52.90           78.86                 92.57
db_resnet50 + sar_resnet31 (ours)              78.94             81.37                65.89             70.79                51.78        53.35           79.04                 92.78
db_mobilenet_v3_large + crnn_vgg16_bn (ours)   78.36             74.93                63.04             68.41                39.36        41.75           72.14                 89.97
Gvision doc. text detection                    68.91             59.89                63.20             52.85                43.70        29.21           69.79                 65.68
AWS textract                                   75.77             77.70                70.47             69.13                46.39        43.32           84.31                 98.11

                                      -
                                      -
                                      -
                                      -

                                      Two-stage approaches

                                      -

Those architectures involve one stage of text detection, and one stage of text recognition. The text detection output is used to produce cropped images that are passed to the text recognition block. Everything is wrapped up with ocr_predictor.

                                      -
                                      >>> import numpy as np
                                      ->>> from doctr.models import ocr_predictor
                                      ->>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
                                      ->>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
                                      ->>> out = model([input_page])
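In practice you will feed real pages rather than random arrays. A hedged sketch (the reader module is doctr.documents in 0.4.x and doctr.io in later releases):
>>> from doctr.documents import DocumentFile  # from doctr.io import DocumentFile in recent versions
>>> pages = DocumentFile.from_images('path/to/your/page.jpg')
>>> result = model(pages)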
                                      -
                                      -
                                      -
                                      -
                                      -

                                      What should I do with the output?

                                      -

The ocr_predictor returns a Document object with a nested structure (with Page, Block, Line, Word, Artefact). To get a better understanding of our document model, check our Document structure section.

                                      -

                                      Here is a typical Document layout:

                                      -
                                      Document(
                                      -  (pages): [Page(
                                      -    dimensions=(340, 600)
                                      -    (blocks): [Block(
                                      -      (lines): [Line(
                                      -        (words): [
                                      -          Word(value='No.', confidence=0.91),
                                      -          Word(value='RECEIPT', confidence=0.99),
                                      -          Word(value='DATE', confidence=0.96),
                                      -        ]
                                      -      )]
                                      -      (artefacts): []
                                      -    )]
                                      -  )]
                                      -)
                                      -
                                      -
                                      -

                                      You can also export them as a nested dict, more appropriate for JSON format:

                                      -
                                      json_output = result.export()
                                      -
                                      -
                                      -

                                      For reference, here is the JSON export for the same Document as above:

                                      -
                                      {
                                      -  'pages': [
                                      -      {
                                      -          'page_idx': 0,
                                      -          'dimensions': (340, 600),
                                      -          'orientation': {'value': None, 'confidence': None},
                                      -          'language': {'value': None, 'confidence': None},
                                      -          'blocks': [
                                      -              {
                                      -                  'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
                                      -                  'lines': [
                                      -                      {
                                      -                          'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
                                      -                          'words': [
                                      -                              {
                                      -                                  'value': 'No.',
                                      -                                  'confidence': 0.914085328578949,
                                      -                                  'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875))
                                      -                              },
                                      -                              {
                                      -                                  'value': 'RECEIPT',
                                      -                                  'confidence': 0.9949972033500671,
                                      -                                  'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375))
                                      -                              },
                                      -                              {
                                      -                                  'value': 'DATE',
                                      -                                  'confidence': 0.9578408598899841,
                                      -                                  'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625))
                                      -                              }
                                      -                          ]
                                      -                      }
                                      -                  ],
                                      -                  'artefacts': []
                                      -              }
                                      -          ]
                                      -      }
                                      -  ]
                                      -}
                                      -
                                      -
                                      -
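Below is a small sketch (assuming the export layout shown above) of how you might walk this dictionary to collect every recognized word together with its confidence:

def iter_words(export):
    # Walk pages -> blocks -> lines -> words of an exported Document dict
    for page in export['pages']:
        for block in page['blocks']:
            for line in block['lines']:
                for word in line['words']:
                    yield word['value'], word['confidence']

words = list(iter_words(json_output))
# e.g. [('No.', 0.914...), ('RECEIPT', 0.994...), ('DATE', 0.957...)]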
                                      -
                                      -
                                      - -
                                      -
                                      - -
                                      - -
                                      -
                                      - - - - - - - - \ No newline at end of file diff --git a/v0.4.0/utils.html b/v0.4.0/utils.html index c64028b1c8..1908ef4ff4 100644 --- a/v0.4.0/utils.html +++ b/v0.4.0/utils.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.utils - docTR documentation @@ -227,27 +227,21 @@ @@ -326,25 +320,6 @@

                                      Visualization -
                                      -doctr.utils.visualization.synthesize_page(page: Dict[str, Any], draw_proba: bool = False, font_size: int = 13, font_family: str | None = None) ndarray[source]
                                      -

Draw the content of the element page (OCR response) on a blank page.

                                      -
                                      -
                                      Parameters:
                                      -
                                        -
                                      • page – exported Page object to represent

                                      • -
                                      • draw_proba – if True, draw words in colors to represent confidence. Blue: p=1, red: p=0

                                      • -
                                      • font_size – size of the font, default font = 13

                                      • -
                                      • font_family – family of the font

                                      • -
                                      -
                                      -
                                      Returns:
                                      -

                                      the synthesized page
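A hedged usage sketch, assuming an exported page dictionary such as the one shown in the OCR section above:
>>> from doctr.utils.visualization import synthesize_page
>>> page_export = json_output['pages'][0]  # an exported Page dict
>>> img = synthesize_page(page_export, draw_proba=True)  # numpy array you can display or save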

                                      -
                                      -
                                      -
                                      -

                                      Task evaluation

                                      @@ -381,20 +356,6 @@

                                      Visualization -
                                      -update(gt: List[str], pred: List[str]) None[source]
                                      -

                                      Update the state of the metric with new predictions

                                      -
                                      -
                                      Parameters:
                                      -
                                        -
• gt – list of ground-truth character sequences

                                      • -
                                      • pred – list of predicted character sequences

                                      • -
                                      -
                                      -
                                      -
                                      -
                                      summary() Dict[str, float][source]
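A small usage sketch; the class name (TextMatch from doctr.utils.metrics) is an assumption, since only the method signatures appear above:
>>> from doctr.utils.metrics import TextMatch
>>> metric = TextMatch()
>>> metric.update(gt=['Hello', 'world'], pred=['Hello', 'world!'])
>>> print(metric.summary())  # exact-match accuracy score(s)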
                                      @@ -450,11 +411,6 @@

                                      Visualization

                                      iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

                                      -
                                      -
                                      -update(gts: ndarray, preds: ndarray) None[source]
                                      -
                                      -
                                      summary() Tuple[float | None, float | None, float | None][source]
                                      @@ -513,11 +469,6 @@

                                      Visualization

                                      iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

                                      -
                                      -
                                      -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: List[str], pred_labels: List[str]) None[source]
                                      -
                                      -
                                      summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]
                                      @@ -539,15 +490,7 @@

                                      Visualization - -
                                      -
                                      - Next -
                                      -
                                      Changelog
                                      -
                                      - -
+ diff --git a/v0.4.1/_modules/doctr/datasets/classification/tensorflow.html b/v0.4.1/_modules/doctr/datasets/classification/tensorflow.html deleted file mode 100644 index 829b6efb9d..0000000000 --- a/v0.4.1/_modules/doctr/datasets/classification/tensorflow.html +++ /dev/null @@ -1,366 +0,0 @@ - doctr.datasets.classification.tensorflow - docTR documentation
                                      -
                                      -
                                      - -
                                      - -
                                      -
                                      - -
                                      - -
                                      -
                                      - -
                                      -
                                      -
                                      -
                                      - -
                                      - -
                                      -
                                      -

                                      Source code for doctr.datasets.classification.tensorflow

                                      -# Copyright (C) 2021, Mindee.
                                      -
                                      -# This program is licensed under the Apache License version 2.
                                      -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                      -
                                      -import tensorflow as tf
                                      -
                                      -from .base import _CharacterGenerator
                                      -
                                      -__all__ = ['CharacterGenerator']
                                      -
                                      -
                                      -
                                      -[docs] -class CharacterGenerator(_CharacterGenerator): - """Implements a character image generation dataset - - Example:: - >>> from doctr.datasets import CharacterGenerator - >>> ds = CharacterGenerator(vocab='abdef') - >>> img, target = ds[0] - - Args: - vocab: vocabulary to take the character from - num_samples: number of samples that will be generated iterating over the dataset - cache_samples: whether generated images should be cached firsthand - sample_transforms: composable transformations that will be applied to each image - """ - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - @staticmethod - def collate_fn(samples): - - images, targets = zip(*samples) - images = tf.stack(images, axis=0) - - return images, tf.convert_to_tensor(targets)
                                      - -
                                      -
                                      -
                                      -
                                      - - -
                                      -
                                      -
                                      - -
                                      -
                                      - -
                                      -
                                      - -
                                      -
                                      - - - - - - - - - \ No newline at end of file diff --git a/v0.4.1/_modules/doctr/datasets/cord.html b/v0.4.1/_modules/doctr/datasets/cord.html index 10c806e5b7..3b89955bd8 100644 --- a/v0.4.1/_modules/doctr/datasets/cord.html +++ b/v0.4.1/_modules/doctr/datasets/cord.html @@ -226,28 +226,21 @@ @@ -287,16 +280,14 @@

                                      Source code for doctr.datasets.cord

                                       # This program is licensed under the Apache License version 2.
                                       # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                       
                                      -import json
                                       import os
                                      -from pathlib import Path
                                      -from typing import Any, Callable, Dict, List, Optional, Tuple
                                      -
                                      +import json
                                       import numpy as np
                                      -
                                      -from doctr.utils.geometry import fit_rbbox
                                      +from pathlib import Path
                                      +from typing import List, Dict, Any, Tuple, Optional, Callable
                                       
                                       from .datasets import VisionDataset
                                      +from doctr.utils.geometry import fit_rbbox
                                       
                                       __all__ = ['CORD']
                                       
                                      @@ -336,17 +327,17 @@ 

                                      Source code for doctr.datasets.cord

                                               super().__init__(url, None, sha256, True, **kwargs)
                                       
                                               # # List images
                                      -        tmp_root = os.path.join(self.root, 'image')
                                      +        self.root = os.path.join(self._root, 'image')
                                               self.data: List[Tuple[str, Dict[str, Any]]] = []
                                               self.train = train
                                               self.sample_transforms = sample_transforms
                                      -        for img_path in os.listdir(tmp_root):
                                      +        for img_path in os.listdir(self.root):
                                                   # File existence check
                                      -            if not os.path.exists(os.path.join(tmp_root, img_path)):
                                      -                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
                                      +            if not os.path.exists(os.path.join(self.root, img_path)):
                                      +                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
                                                   stem = Path(img_path).stem
                                                   _targets = []
                                      -            with open(os.path.join(self.root, 'json', f"{stem}.json"), 'rb') as f:
                                      +            with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f:
                                                       label = json.load(f)
                                                       for line in label["valid_line"]:
                                                           for word in line["words"]:
                                      @@ -371,7 +362,6 @@ 

                                      Source code for doctr.datasets.cord

                                                       img_path,
                                                       dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets)
                                                   ))
                                      -        self.root = tmp_root
                                       
                                           def extra_repr(self) -> str:
                                               return f"train={self.train}"
                                      @@ -408,7 +398,7 @@

                                      Source code for doctr.datasets.cord

                                             
                                           
                                         
                                      -
                                      +
                                      diff --git a/v0.4.1/_modules/doctr/datasets/datasets/tensorflow.html b/v0.4.1/_modules/doctr/datasets/datasets/tensorflow.html index 8a191ecfc7..fddca20034 100644 --- a/v0.4.1/_modules/doctr/datasets/datasets/tensorflow.html +++ b/v0.4.1/_modules/doctr/datasets/datasets/tensorflow.html @@ -236,7 +236,7 @@

                                      Package Reference

                                      • doctr.datasets
                                      • -
                                      • doctr.io
                                      • +
                                      • doctr.documents
                                      • doctr.models
                                      • doctr.transforms
                                      • doctr.utils
                                      • @@ -284,7 +284,6 @@

                                        Source code for doctr.datasets.datasets.tensorflow

                                        from typing import List, Any, Tuple import tensorflow as tf -from doctr.io import read_img_as_tensor from .base import _AbstractDataset, _VisionDataset @@ -293,14 +292,11 @@

                                        Source code for doctr.datasets.datasets.tensorflow

                                        class AbstractDataset(_AbstractDataset): - @staticmethod - def _get_img_shape(img: Any) -> Tuple[int, int]: - return img.shape[:2] - def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]: img_name, target = self.data[index] # Read image - img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float16 if self.fp16 else tf.float32) + img = tf.io.read_file(os.path.join(self.root, img_name)) + img = tf.image.decode_jpeg(img, channels=3) return img, target @@ -350,7 +346,7 @@

                                        Source code for doctr.datasets.datasets.tensorflow

                                        +
                                        diff --git a/v0.4.1/_modules/doctr/datasets/doc_artefacts.html b/v0.4.1/_modules/doctr/datasets/doc_artefacts.html index cf466cefaa..172122a216 100644 --- a/v0.4.1/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.4.1/_modules/doctr/datasets/doc_artefacts.html @@ -228,21 +228,32 @@ diff --git a/v0.4.1/_modules/doctr/datasets/sroie.html b/v0.4.1/_modules/doctr/datasets/sroie.html index 34b331d6c9..0425870abb 100644 --- a/v0.4.1/_modules/doctr/datasets/sroie.html +++ b/v0.4.1/_modules/doctr/datasets/sroie.html @@ -226,28 +226,21 @@ @@ -287,12 +280,11 @@

                                        Source code for doctr.datasets.sroie

                                         # This program is licensed under the Apache License version 2.
                                         # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                         
                                        -import csv
                                         import os
                                        -from pathlib import Path
                                        -from typing import Any, Callable, Dict, List, Optional, Tuple
                                        -
                                        +import csv
                                         import numpy as np
                                        +from pathlib import Path
                                        +from typing import List, Dict, Any, Tuple, Optional, Callable
                                         
                                         from .datasets import VisionDataset
                                         
                                        @@ -339,16 +331,15 @@ 

                                        Source code for doctr.datasets.sroie

                                                     raise NotImplementedError
                                         
                                                 # # List images
                                        -        tmp_root = os.path.join(self.root, 'images')
                                        +        self.root = os.path.join(self._root, 'images')
                                                 self.data: List[Tuple[str, Dict[str, Any]]] = []
                                        -        np_dtype = np.float32
                                        -        for img_path in os.listdir(tmp_root):
                                        +        for img_path in os.listdir(self.root):
                                                     # File existence check
                                        -            if not os.path.exists(os.path.join(tmp_root, img_path)):
                                        -                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
                                        +            if not os.path.exists(os.path.join(self.root, img_path)):
                                        +                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
                                                     stem = Path(img_path).stem
                                                     _targets = []
                                        -            with open(os.path.join(self.root, 'annotations', f"{stem}.txt"), encoding='latin') as f:
                                        +            with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f:
                                                         for row in csv.reader(f, delimiter=','):
                                                             # Safeguard for blank lines
                                                             if len(row) > 0:
                                        @@ -363,8 +354,7 @@ 

                                        Source code for doctr.datasets.sroie

                                         
                                                     text_targets, box_targets = zip(*_targets)
                                         
                                        -            self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=text_targets)))
                                        -        self.root = tmp_root
                                        +            self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
                                         
                                             def extra_repr(self) -> str:
                                                 return f"train={self.train}"
                                        @@ -401,7 +391,7 @@

                                        Source code for doctr.datasets.sroie

                                               
                                             
                                           
                                        -
                                        +
                                        diff --git a/v0.4.1/_modules/doctr/datasets/svt.html b/v0.4.1/_modules/doctr/datasets/svt.html index d28fc1bca4..ff75309df4 100644 --- a/v0.4.1/_modules/doctr/datasets/svt.html +++ b/v0.4.1/_modules/doctr/datasets/svt.html @@ -228,21 +228,32 @@ +
                                        diff --git a/v0.4.1/_modules/doctr/io/reader.html b/v0.4.1/_modules/doctr/io/reader.html index 2f82858584..49cdc7d152 100644 --- a/v0.4.1/_modules/doctr/io/reader.html +++ b/v0.4.1/_modules/doctr/io/reader.html @@ -228,21 +228,32 @@ @@ -677,7 +599,7 @@

                                        Source code for doctr.models.detection.differentiable_binarization.tensorflo

                                        -

                                      +
                                      diff --git a/v0.4.1/_modules/doctr/models/detection/fast/tensorflow.html b/v0.4.1/_modules/doctr/models/detection/fast/tensorflow.html index dc7d8f50f2..af51a9abeb 100644 --- a/v0.4.1/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.4.1/_modules/doctr/models/detection/fast/tensorflow.html @@ -305,7 +305,7 @@

                                      Source code for doctr.models.detection.fast.tensorflow

                                      import numpy as np import tensorflow as tf -from keras import Model, Sequential, layers +from tensorflow.keras import Model, Sequential, layers from doctr.file_utils import CLASS_NAME from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params diff --git a/v0.4.1/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.4.1/_modules/doctr/models/detection/linknet/tensorflow.html index f150c725b7..9f836ce462 100644 --- a/v0.4.1/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.4.1/_modules/doctr/models/detection/linknet/tensorflow.html @@ -226,28 +226,21 @@ @@ -290,17 +283,14 @@

                                      Source code for doctr.models.detection.linknet.tensorflow

                                      # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization from copy import deepcopy -from typing import Any, Dict, List, Optional, Tuple - -import numpy as np import tensorflow as tf from tensorflow import keras -from tensorflow.keras import Sequential, layers +from tensorflow.keras import layers, Sequential +from typing import Dict, Any, Tuple, Optional, List +from doctr.utils.repr import NestedObject from doctr.models.backbones import ResnetStage from doctr.models.utils import conv_sequence, load_pretrained_params -from doctr.utils.repr import NestedObject - from .base import LinkNetPostProcessor, _LinkNet __all__ = ['LinkNet', 'linknet16'] @@ -310,7 +300,9 @@

                                      Source code for doctr.models.detection.linknet.tensorflow

                                      'linknet16': { 'mean': (0.798, 0.785, 0.772), 'std': (0.264, 0.2749, 0.287), + 'num_classes': 1, 'input_shape': (1024, 1024, 3), + 'rotated_bbox': False, 'url': None, }, } @@ -422,7 +414,7 @@

                                      Source code for doctr.models.detection.linknet.tensorflow

                                      def compute_loss( self, out_map: tf.Tensor, - target: List[np.ndarray], + target: List[Dict[str, Any]], focal_loss: bool = False, alpha: float = .5, gamma: float = 2., @@ -443,7 +435,7 @@

                                      Source code for doctr.models.detection.linknet.tensorflow

                                      A loss tensor """ seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) + seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) @@ -471,7 +463,7 @@

                                      Source code for doctr.models.detection.linknet.tensorflow

                                      else: # Compute BCE loss with highlighted edges loss = tf.math.multiply( - 1 + (edge_factor - 1) * tf.cast(edge_mask, out_map.dtype), + 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), bce ) loss = tf.reduce_mean(loss) @@ -481,7 +473,7 @@

                                      Source code for doctr.models.detection.linknet.tensorflow

                                      def call( self, x: tf.Tensor, - target: Optional[List[np.ndarray]] = None, + target: Optional[List[Dict[str, Any]]] = None, return_model_output: bool = False, return_boxes: bool = False, focal_loss: bool = True, @@ -514,8 +506,12 @@

                                      Source code for doctr.models.detection.linknet.tensorflow

                                      # Patch the config _cfg = deepcopy(default_cfgs[arch]) _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) + _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + kwargs['num_classes'] = _cfg['num_classes'] kwargs['input_shape'] = _cfg['input_shape'] + kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model model = LinkNet(cfg=_cfg, **kwargs) # Load pretrained parameters @@ -579,7 +575,7 @@

                                      Source code for doctr.models.detection.linknet.tensorflow

                                      +
                                      diff --git a/v0.4.1/_modules/doctr/models/detection/zoo.html b/v0.4.1/_modules/doctr/models/detection/zoo.html index 984642c748..23a2f451e3 100644 --- a/v0.4.1/_modules/doctr/models/detection/zoo.html +++ b/v0.4.1/_modules/doctr/models/detection/zoo.html @@ -226,28 +226,21 @@ @@ -290,18 +283,18 @@

                                      Source code for doctr.models.detection.zoo

                                       from typing import Any
                                       
                                       from doctr.file_utils import is_tf_available, is_torch_available
                                      -
                                      -from .. import detection
                                      +from .core import DetectionPredictor
                                       from ..preprocessor import PreProcessor
                                      -from .predictor import DetectionPredictor
                                      +from .. import detection
                                      +
                                       
                                       __all__ = ["detection_predictor"]
                                       
                                       
                                       if is_tf_available():
                                      -    ARCHS = ['db_resnet50', 'db_mobilenet_v3_large', 'linknet16']
                                      +    ARCHS = ['db_resnet50', 'linknet16']
                                       elif is_torch_available():
                                      -    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3_large', 'linknet16']
                                      +    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
                                       
                                       
                                       def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
                                      @@ -315,7 +308,7 @@ 

                                      Source code for doctr.models.detection.zoo

                                           kwargs['std'] = kwargs.get('std', _model.cfg['std'])
                                           kwargs['batch_size'] = kwargs.get('batch_size', 1)
                                           predictor = DetectionPredictor(
                                      -        PreProcessor(_model.cfg['input_shape'][:-1] if is_tf_available() else _model.cfg['input_shape'][1:], **kwargs),
                                      +        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
                                               _model
                                           )
                                           return predictor
                                      @@ -329,12 +322,12 @@ 

                                      Source code for doctr.models.detection.zoo

                                           Example::
                                               >>> import numpy as np
                                               >>> from doctr.models import detection_predictor
                                      -        >>> model = detection_predictor(arch='db_resnet50', pretrained=True)
                                      +        >>> model = detection_predictor(pretrained=True)
                                               >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
                                               >>> out = model([input_page])
                                       
                                           Args:
                                      -        arch: name of the architecture to use (e.g. 'db_resnet50')
                                      +        arch: name of the architecture to use ('db_resnet50')
                                               pretrained: If True, returns a model pre-trained on our text detection dataset
                                       
                                           Returns:
                                      @@ -375,7 +368,7 @@ 

                                      Source code for doctr.models.detection.zoo

                                             
                                           
                                         
                                      -
                                      +
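
Note: a minimal usage sketch distilled from the detection_predictor docstring shown above (assumes a doctr install matching these docs; the default architecture is 'db_resnet50'); not part of the diff itself.

# Sketch based on the docstring above; a dummy HWC uint8 page stands in for a real document image
import numpy as np
from doctr.models import detection_predictor

model = detection_predictor(pretrained=True)  # defaults to 'db_resnet50'
input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
out = model([input_page])
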
diff --git a/v0.4.1/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.4.1/_modules/doctr/models/recognition/crnn/tensorflow.html
index 64bc9d5b7a..7b8529c26d 100644
--- a/v0.4.1/_modules/doctr/models/recognition/crnn/tensorflow.html
+++ b/v0.4.1/_modules/doctr/models/recognition/crnn/tensorflow.html
@@ -226,28 +226,21 @@
@@ -288,44 +281,35 @@

                                      Source code for doctr.models.recognition.crnn.tensorflow

 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
 
 from copy import deepcopy
-from typing import Any, Dict, List, Optional, Tuple
-
 import tensorflow as tf
 from tensorflow.keras import layers
-from tensorflow.keras.models import Model, Sequential
+from tensorflow.keras.models import Sequential, Model
+from typing import Tuple, Dict, Any, Optional, List
 
-from ....datasets import VOCABS
-from ...backbones import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn
+from ... import backbones
 from ...utils import load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
-__all__ = ['CRNN', 'crnn_vgg16_bn', 'CTCPostProcessor', 'crnn_mobilenet_v3_small',
-           'crnn_mobilenet_v3_large']
+__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
     'crnn_vgg16_bn': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': vgg16_bn, 'rnn_units': 128,
-        'input_shape': (32, 128, 3),
-        'vocab': VOCABS['legacy_french'],
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/crnn_vgg16_bn-76b7f2c6.zip',
-    },
-    'crnn_mobilenet_v3_small': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': mobilenet_v3_small_r, 'rnn_units': 128,
+        'mean': (.5, .5, .5),
+        'std': (1., 1., 1.),
+        'backbone': 'vgg16_bn', 'rnn_units': 128,
         'input_shape': (32, 128, 3),
-        'vocab': VOCABS['french'],
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.1/crnn_mobilenet_v3_small-7f36edec.zip',
+        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
+                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
+        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
     },
-    'crnn_mobilenet_v3_large': {
+    'crnn_resnet31': {
         'mean': (0.694, 0.695, 0.693),
         'std': (0.299, 0.296, 0.301),
-        'backbone': mobilenet_v3_large_r, 'rnn_units': 128,
+        'backbone': 'resnet31', 'rnn_units': 128,
         'input_shape': (32, 128, 3),
-        'vocab': VOCABS['french'],
-        'url': None,
+        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
+                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
+        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
     },
 }
@@ -433,7 +417,7 @@

                                      Source code for doctr.models.recognition.crnn.tensorflow

                                      """ gt, seq_len = self.compute_target(target) batch_len = model_output.shape[0] - input_length = tf.fill((batch_len,), model_output.shape[1]) + input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -470,15 +454,7 @@

                                      Source code for doctr.models.recognition.crnn.tensorflow

         return out
 
 
-def _crnn(
-    arch: str,
-    pretrained: bool,
-    pretrained_backbone: bool = True,
-    input_shape: Optional[Tuple[int, int, int]] = None,
-    **kwargs: Any
-) -> CRNN:
-
-    pretrained_backbone = pretrained_backbone and not pretrained
+def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN:
 
     # Patch the config
     _cfg = deepcopy(default_cfgs[arch])
@@ -487,10 +463,9 @@

                                      Source code for doctr.models.recognition.crnn.tensorflow

     _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
 
     # Feature extractor
-    feat_extractor = _cfg['backbone'](
+    feat_extractor = backbones.__dict__[_cfg['backbone']](
         input_shape=_cfg['input_shape'],
         include_top=False,
-        pretrained=pretrained_backbone,
     )
 
     kwargs['vocab'] = _cfg['vocab']
@@ -529,16 +504,14 @@

                                      Source code for doctr.models.recognition.crnn.tensorflow

-[docs]
-def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN:
-    """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based
+def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN:
+    """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based
     Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
 
     Example::
         >>> import tensorflow as tf
-        >>> from doctr.models import crnn_mobilenet_v3_small
-        >>> model = crnn_mobilenet_v3_small(pretrained=True)
+        >>> from doctr.models import crnn_resnet31
+        >>> model = crnn_resnet31(pretrained=True)
         >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
         >>> out = model(input_tensor)
 
@@ -549,32 +522,7 @@

                                      Source code for doctr.models.recognition.crnn.tensorflow

         text recognition architecture
     """
 
-    return _crnn('crnn_mobilenet_v3_small', pretrained, **kwargs)
-
-
-
-
-[docs]
-def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN:
-    """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based
-    Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import crnn_mobilenet_v3_large
-        >>> model = crnn_mobilenet_v3_large(pretrained=True)
-        >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
-
-    Returns:
-        text recognition architecture
-    """
-
-    return _crnn('crnn_mobilenet_v3_large', pretrained, **kwargs)
-
+    return _crnn('crnn_resnet31', pretrained, **kwargs)

                            @@ -607,7 +555,7 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

                            +
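
Note: a minimal sketch of the CRNN entry point that is unchanged on both sides of this diff (crnn_vgg16_bn), following the docstring pattern above; assumes a TensorFlow doctr install.

# Sketch based on the docstrings above; input shape follows the default config (32, 128, 3)
import tensorflow as tf
from doctr.models import crnn_vgg16_bn

model = crnn_vgg16_bn(pretrained=False)
input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
out = model(input_tensor)
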
diff --git a/v0.4.1/_modules/doctr/models/recognition/master/tensorflow.html b/v0.4.1/_modules/doctr/models/recognition/master/tensorflow.html
index f542b916dd..6d9bff4577 100644
--- a/v0.4.1/_modules/doctr/models/recognition/master/tensorflow.html
+++ b/v0.4.1/_modules/doctr/models/recognition/master/tensorflow.html
@@ -226,28 +226,21 @@
@@ -287,29 +280,29 @@

                            Source code for doctr.models.recognition.master.tensorflow

 # This program is licensed under the Apache License version 2.
 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
 
-import math
-from copy import deepcopy
-from typing import Any, Dict, List, Optional, Tuple
-
 import tensorflow as tf
-from tensorflow.keras import Model, Sequential, layers
+from tensorflow.keras import layers, Sequential, Model
+from typing import Tuple, List, Dict, Any, Optional
+from copy import deepcopy
 
-from ....datasets import VOCABS
+from ..core import RecognitionPostProcessor
 from ...backbones.resnet import ResnetStage
 from ...utils import conv_sequence, load_pretrained_params
-from ..transformer import Decoder, create_look_ahead_mask, create_padding_mask, positional_encoding
+from ..transformer import Decoder, positional_encoding, create_look_ahead_mask, create_padding_mask
+from ....datasets import VOCABS
 from .base import _MASTER, _MASTERPostProcessor
 
+
 __all__ = ['MASTER', 'master', 'MASTERPostProcessor']
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
     'master': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'input_shape': (32, 128, 3),
-        'vocab': VOCABS['legacy_french'],
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/master-bade6eae.zip',
+        'mean': (.5, .5, .5),
+        'std': (1., 1., 1.),
+        'input_shape': (48, 160, 3),
+        'vocab': VOCABS['french'],
+        'url': None,
     },
 }
@@ -329,9 +322,8 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            def __init__( self, inplanes: int, - headers: int = 8, + headers: int = 1, att_scale: bool = False, - ratio: float = 0.0625, # bottleneck ratio of 1/16 as described in paper **kwargs ) -> None: super().__init__(**kwargs) @@ -339,7 +331,6 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            self.headers = headers # h self.inplanes = inplanes # C self.att_scale = att_scale - self.planes = int(inplanes * ratio) self.single_header_inplanes = int(inplanes / headers) # C / h @@ -352,7 +343,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            self.transform = tf.keras.Sequential( [ tf.keras.layers.Conv2D( - filters=self.planes, + filters=self.inplanes, kernel_size=1, kernel_initializer=tf.initializers.he_normal() ), @@ -367,6 +358,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            name='transform' ) + @tf.function def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor: b, h, w, c = (tf.shape(inputs)[i] for i in range(4)) @@ -389,7 +381,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1)) # scale variance if self.att_scale and self.headers > 1: - context_mask = context_mask / math.sqrt(self.single_header_inplanes) + context_mask = context_mask / tf.sqrt(self.single_header_inplanes) # B*h, 1, H*W, 1 context_mask = tf.keras.activations.softmax(context_mask, axis=2) @@ -423,8 +415,8 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            def __init__( self, - headers: int = 8, - input_shape: Tuple[int, int, int] = (32, 128, 3), + headers: int = 1, + input_shape: Tuple[int, int, int] = (48, 160, 3), ) -> None: _layers = [ # conv_1x @@ -473,13 +465,12 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            self, vocab: str, d_model: int = 512, - headers: int = 8, # number of multi-aspect context + headers: int = 1, dff: int = 2048, - num_heads: int = 8, # number of heads in the transformer decoder + num_heads: int = 8, num_layers: int = 3, max_length: int = 50, - dropout: float = 0.2, - input_shape: Tuple[int, int, int] = (32, 128, 3), + input_shape: Tuple[int, int, int] = (48, 160, 3), cfg: Optional[Dict[str, Any]] = None, ) -> None: super().__init__() @@ -489,7 +480,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            self.cfg = cfg self.vocab_size = len(vocab) - self.feat_extractor = MAGCResnet(headers=headers, input_shape=input_shape) + self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape) self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model) # 3 more classes: EOS/PAD/SOS self.decoder = Decoder( @@ -499,13 +490,13 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            dff=dff, vocab_size=self.vocab_size, maximum_position_encoding=max_length, - dropout=dropout, ) self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model) self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform()) self.postprocessor = MASTERPostProcessor(vocab=self.vocab) + @tf.function def make_mask(self, target: tf.Tensor) -> tf.Tensor: look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1]) target_padding_mask = create_padding_mask(target, self.vocab_size + 2) # Pad symbol @@ -542,7 +533,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len - 1) # delete the last mask timestep as well masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) return tf.expand_dims(ce_loss, axis=1) @@ -567,7 +558,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            """ # Encode - feature = self.feat_extractor(x, **kwargs) + feature = self.feature_extractor(x, **kwargs) b, h, w, c = (tf.shape(feature)[i] for i in range(4)) feature = tf.reshape(feature, shape=(b, h * w, c)) encoded = feature + self.feature_pe[:, :h * w, :] @@ -621,7 +612,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            start_vector = tf.fill(dims=(b, 1), value=start_symbol) ys = tf.concat([start_vector, ys], axis=-1) - logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=encoded.dtype) # 3 symbols + logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32) # 3 symbols # max_len = len + 2 (sos + eos) for i in range(self.max_length - 1): ys_mask = self.make_mask(ys) @@ -641,7 +632,6 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            class MASTERPostProcessor(_MASTERPostProcessor): """Post processor for MASTER architectures - Args: vocab: string containing the ordered sequence of supported characters ignore_case: if True, ignore case of letters @@ -692,17 +682,14 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            [docs] def master(pretrained: bool = False, **kwargs: Any) -> MASTER: """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_. - Example:: >>> import tensorflow as tf >>> from doctr.models import master >>> model = master(pretrained=False) >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32) >>> out = model(input_tensor) - Args: pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - Returns: text recognition architecture """ @@ -741,7 +728,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            +
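
Note: a minimal sketch matching the master() docstring above (pretrained=False, since the config on the 0.4.1 side of this diff ships no weight URL); assumes a TensorFlow doctr install.

import tensorflow as tf
from doctr.models import master

model = master(pretrained=False)
# the default input_shape shown above on the 0.4.1 side is (48, 160, 3)
input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
out = model(input_tensor)
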
diff --git a/v0.4.1/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.4.1/_modules/doctr/models/recognition/parseq/tensorflow.html
index 1bbbf829b1..93a3b2ea81 100644
--- a/v0.4.1/_modules/doctr/models/recognition/parseq/tensorflow.html
+++ b/v0.4.1/_modules/doctr/models/recognition/parseq/tensorflow.html
@@ -305,7 +305,7 @@

                            Source code for doctr.models.recognition.parseq.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 
 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
@@ -462,7 +462,6 @@

                            Source code for doctr.models.recognition.parseq.tensorflow

         self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)
 
-    @tf.function
     def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor:
         # Generates permutations of the target sequence.
         # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
@@ -509,7 +508,6 @@

                            Source code for doctr.models.recognition.parseq.tensorflow

         )
         return combined
 
-    @tf.function
     def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         # Generate source and target mask for the decoder attention.
         sz = permutation.shape[0]
@@ -529,7 +527,6 @@

                            Source code for doctr.models.recognition.parseq.tensorflow

         target_mask = mask[1:, :-1]
         return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool)
 
-    @tf.function
     def decode(
         self,
         target: tf.Tensor,
diff --git a/v0.4.1/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.4.1/_modules/doctr/models/recognition/sar/tensorflow.html
index 8d93b52a15..3a9989ef30 100644
--- a/v0.4.1/_modules/doctr/models/recognition/sar/tensorflow.html
+++ b/v0.4.1/_modules/doctr/models/recognition/sar/tensorflow.html
@@ -226,28 +226,21 @@
@@ -288,28 +281,35 @@

                            Source code for doctr.models.recognition.sar.tensorflow

 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
 
 from copy import deepcopy
-from typing import Any, Dict, List, Optional, Tuple
-
 import tensorflow as tf
-from tensorflow.keras import Model, Sequential, layers
+from tensorflow.keras import Sequential, layers, Model
+from typing import Tuple, Dict, List, Any, Optional
 
-from doctr.utils.repr import NestedObject
-
-from ....datasets import VOCABS
-from ...backbones import resnet31
+from ... import backbones
 from ...utils import load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
+from doctr.utils.repr import NestedObject
 
-__all__ = ['SAR', 'SARPostProcessor', 'sar_resnet31']
+__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
+    'sar_vgg16_bn': {
+        'mean': (.5, .5, .5),
+        'std': (1., 1., 1.),
+        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
+        'input_shape': (32, 128, 3),
+        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
+                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
+        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
+    },
     'sar_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
-        'backbone': resnet31, 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
+        'mean': (.5, .5, .5),
+        'std': (1., 1., 1.),
+        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
         'input_shape': (32, 128, 3),
-        'vocab': VOCABS['legacy_french'],
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/sar_resnet31-9ee49970.zip',
+        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
+                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
+        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
     },
 }
@@ -390,7 +390,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         super().__init__()
         self.vocab_size = vocab_size
         self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_layers)]
+            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
         )
         self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
         self.attention_module = AttentionModule(attention_units)
@@ -411,12 +411,12 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         # initialize states (each of shape (N, rnn_units))
         states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=features.dtype
+            inputs=None, batch_size=features.shape[0], dtype=tf.float32
         )
         # run first step of lstm
         # holistic: shape (N, rnn_units)
         _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos> so that the one-hot is only zeros)
+        # Initialize with the index of virtual START symbol (placed after <eos>)
         symbol = tf.fill(features.shape[0], self.vocab_size + 1)
         logits_list = []
         if kwargs.get('training') and gt is None:
@@ -526,7 +526,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         mask_values = tf.zeros_like(cce)
         mask_2d = tf.sequence_mask(seq_len, input_len)
         masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype))
+        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
 
         return tf.expand_dims(ce_loss, axis=1)
 
     def call(
@@ -591,15 +591,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         return list(zip(word_values, probs.numpy().tolist()))
 
 
-def _sar(
-    arch: str,
-    pretrained: bool,
-    pretrained_backbone: bool = True,
-    input_shape: Tuple[int, int, int] = None,
-    **kwargs: Any
-) -> SAR:
-
-    pretrained_backbone = pretrained_backbone and not pretrained
+def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:
 
     # Patch the config
     _cfg = deepcopy(default_cfgs[arch])
@@ -612,9 +604,8 @@

                            Source code for doctr.models.recognition.sar.tensorflow

     _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])
 
     # Feature extractor
-    feat_extractor = default_cfgs[arch]['backbone'](
+    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
         input_shape=_cfg['input_shape'],
-        pretrained=pretrained_backbone,
         include_top=False,
     )
 
@@ -634,6 +625,30 @@

                            Source code for doctr.models.recognition.sar.tensorflow

     return model
 
+
+[docs]
+def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR:
+    """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong
+    Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
+
+    Example::
+        >>> import tensorflow as tf
+        >>> from doctr.models import sar_vgg16_bn
+        >>> model = sar_vgg16_bn(pretrained=False)
+        >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
+        >>> out = model(input_tensor)
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+
+    Returns:
+        text recognition architecture
+    """
+
+    return _sar('sar_vgg16_bn', pretrained, **kwargs)
+
+
+
 [docs]
 def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR:
@@ -688,7 +703,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

                            +
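
Note: likewise, a minimal sketch for the SAR recognizer present on both sides of this diff (sar_resnet31), following the docstring pattern above; assumes a TensorFlow doctr install.

import tensorflow as tf
from doctr.models import sar_resnet31

model = sar_resnet31(pretrained=False)
# the default config input_shape shown above is (32, 128, 3), NHWC layout
input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
out = model(input_tensor)
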
diff --git a/v0.4.1/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.4.1/_modules/doctr/models/recognition/vitstr/tensorflow.html
index 23730f6227..aecde3662a 100644
--- a/v0.4.1/_modules/doctr/models/recognition/vitstr/tensorflow.html
+++ b/v0.4.1/_modules/doctr/models/recognition/vitstr/tensorflow.html
@@ -302,7 +302,7 @@

                            Source code for doctr.models.recognition.vitstr.tensorflow

 from typing import Any, Dict, List, Optional, Tuple
 
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers
 
 from doctr.datasets import VOCABS
 
diff --git a/v0.4.1/_modules/doctr/models/recognition/zoo.html b/v0.4.1/_modules/doctr/models/recognition/zoo.html
index 84482d3f87..0f1bff8861 100644
--- a/v0.4.1/_modules/doctr/models/recognition/zoo.html
+++ b/v0.4.1/_modules/doctr/models/recognition/zoo.html
@@ -226,28 +226,21 @@
@@ -289,16 +282,19 @@

                            Source code for doctr.models.recognition.zoo

                            from typing import Any
                             
                            -from doctr.file_utils import is_tf_available
                            -from doctr.models.preprocessor import PreProcessor
                            -
                            +from doctr.file_utils import is_tf_available, is_torch_available
                            +from .core import RecognitionPredictor
                            +from ..preprocessor import PreProcessor
                             from .. import recognition
                            -from .predictor import RecognitionPredictor
                            +
                             
                             __all__ = ["recognition_predictor"]
                             
                             
                            -ARCHS = ['crnn_vgg16_bn', 'crnn_mobilenet_v3_small', 'crnn_mobilenet_v3_large', 'sar_resnet31', 'master']
                            +if is_tf_available():
                            +    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
                            +elif is_torch_available():
                            +    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
                             
                             
                             def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
                            @@ -310,9 +306,8 @@ 

                            Source code for doctr.models.recognition.zoo

                            kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
                                 kwargs['std'] = kwargs.get('std', _model.cfg['std'])
                                 kwargs['batch_size'] = kwargs.get('batch_size', 32)
                            -    input_shape = _model.cfg['input_shape'][:2] if is_tf_available() else _model.cfg['input_shape'][-2:]
                                 predictor = RecognitionPredictor(
                            -        PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs),
                            +        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
                                     _model
                                 )
                             
                            @@ -332,7 +327,7 @@ 

                            Source code for doctr.models.recognition.zoo

                                    >>> out = model([input_page])
                             
                                 Args:
                            -        arch: name of the architecture to use (e.g. 'crnn_vgg16_bn')
                            +        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
                                     pretrained: If True, returns a model pre-trained on our text recognition dataset
                             
                                 Returns:
                            @@ -373,7 +368,7 @@ 

                            Source code for doctr.models.recognition.zoo

                               
                            -
                            +
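
Note: a minimal sketch of the recognition zoo entry point documented above (default architecture 'crnn_vgg16_bn'); the crop size below is an illustrative assumption, not taken from the diff.

import numpy as np
from doctr.models import recognition_predictor

model = recognition_predictor(pretrained=True)  # defaults to 'crnn_vgg16_bn'
input_crop = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)  # a single word crop, HWC uint8
out = model([input_crop])
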
diff --git a/v0.4.1/_modules/doctr/models/zoo.html b/v0.4.1/_modules/doctr/models/zoo.html
index 42b22148a8..bfa5a6fdf4 100644
--- a/v0.4.1/_modules/doctr/models/zoo.html
+++ b/v0.4.1/_modules/doctr/models/zoo.html
@@ -226,28 +226,16 @@
@@ -288,22 +276,15 @@

                            Source code for doctr.models.zoo

                             # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                             
                             from typing import Any
                            -
                            +from .core import OCRPredictor
                             from .detection.zoo import detection_predictor
                            -from .predictor import OCRPredictor
                             from .recognition.zoo import recognition_predictor
                             
                            +
                             __all__ = ["ocr_predictor"]
                             
                             
                            -def _predictor(
                            -    det_arch: str,
                            -    reco_arch: str,
                            -    pretrained: bool,
                            -    det_bs: int = 2,
                            -    reco_bs: int = 128,
                            -    **kwargs,
                            -) -> OCRPredictor:
                            +def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
                             
                                 # Detection
                                 det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
                            @@ -311,17 +292,15 @@ 

                            Source code for doctr.models.zoo

                                 # Recognition
                                 reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
                             
                            -    return OCRPredictor(det_predictor, reco_predictor, **kwargs)
                            +    return OCRPredictor(det_predictor, reco_predictor)
                             
                             
                             
-[docs]
+[docs]
 def ocr_predictor(
     det_arch: str = 'db_resnet50',
     reco_arch: str = 'crnn_vgg16_bn',
     pretrained: bool = False,
-    assume_straight_pages: bool = True,
-    export_as_straight_boxes: bool = False,
     **kwargs: Any
 ) -> OCRPredictor:
     """End-to-end OCR architecture using one model for localization, and another for text recognition.
@@ -329,31 +308,19 @@

                            Source code for doctr.models.zoo

                                 Example::
                                     >>> import numpy as np
                                     >>> from doctr.models import ocr_predictor
                            -        >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
                            +        >>> model = ocr_predictor(pretrained=True)
                                     >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
                                     >>> out = model([input_page])
                             
                                 Args:
                            -        det_arch: name of the detection architecture to use (e.g. 'db_resnet50', 'db_mobilenet_v3_large')
                            -        reco_arch: name of the recognition architecture to use (e.g. 'crnn_vgg16_bn', 'sar_resnet31')
                            +        arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet')
                                     pretrained: If True, returns a model pre-trained on our OCR dataset
                            -        assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages
                            -            without rotated textual elements.
                            -        export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions
                            -            (potentially rotated) as straight bounding boxes.
                             
                                 Returns:
                                     OCR predictor
                                 """
                             
                            -    return _predictor(
                            -        det_arch,
                            -        reco_arch,
                            -        pretrained,
                            -        assume_straight_pages=assume_straight_pages,
                            -        export_as_straight_boxes=export_as_straight_boxes,
                            -        **kwargs,
                            -    )
                            + return _predictor(det_arch, reco_arch, pretrained, **kwargs)
                            @@ -387,7 +354,7 @@

                            Source code for doctr.models.zoo

                                   
                                 
                               
                            -
                            +
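
Note: a minimal end-to-end sketch matching the ocr_predictor docstring above (default detection and recognition pair); assumes a doctr install matching these docs.

import numpy as np
from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True)  # 'db_resnet50' detection + 'crnn_vgg16_bn' recognition by default
input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
out = model([input_page])
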
diff --git a/v0.4.1/_modules/doctr/transforms/modules/base.html b/v0.4.1/_modules/doctr/transforms/modules/base.html
index 52d1f8fb2e..e7b5ea10d9 100644
--- a/v0.4.1/_modules/doctr/transforms/modules/base.html
+++ b/v0.4.1/_modules/doctr/transforms/modules/base.html
@@ -226,28 +226,21 @@
@@ -287,17 +280,14 @@

                            Source code for doctr.transforms.modules.base

 # This program is licensed under the Apache License version 2.
 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
 
-import math
 import random
-from typing import Any, Callable, Dict, List, Tuple
-
-import numpy as np
+from typing import List, Any, Callable
 
 from doctr.utils.repr import NestedObject
-
 from .. import functional as F
 
-__all__ = ['ColorInversion', 'OneOf', 'RandomApply', 'RandomRotate', 'RandomCrop']
+
+__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
                            @@ -381,67 +371,6 @@

                            Source code for doctr.transforms.modules.base

             return self.transform(img)
         return img
-
-
-
-[docs]
-class RandomRotate(NestedObject):
-    """Randomly rotate a tensor image and its boxes
-
-    .. image:: https://github.com/mindee/doctr/releases/download/v0.4.0/rotation_illustration.png
-        :align: center
-
-    Args:
-        max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in
-            [-max_angle, max_angle]
-        expand: whether the image should be padded before the rotation
-    """
-    def __init__(self, max_angle: float = 5., expand: bool = False) -> None:
-        self.max_angle = max_angle
-        self.expand = expand
-
-    def extra_repr(self) -> str:
-        return f"max_angle={self.max_angle}, expand={self.expand}"
-
-    def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]:
-        angle = random.uniform(-self.max_angle, self.max_angle)
-        r_img, r_boxes = F.rotate(img, target, angle, self.expand)
-        return r_img, r_boxes
-
-
-
-[docs]
-class RandomCrop(NestedObject):
-    """Randomly crop a tensor image and its boxes
-
-    Args:
-        scale: tuple of floats, relative (min_area, max_area) of the crop
-        ratio: tuple of float, relative (min_ratio, max_ratio) where ratio = h/w
-    """
-    def __init__(self, scale: Tuple[float, float] = (0.08, 1.), ratio: Tuple[float, float] = (0.75, 1.33)) -> None:
-        self.scale = scale
-        self.ratio = ratio
-
-    def extra_repr(self) -> str:
-        return f"scale={self.scale}, ratio={self.ratio}"
-
-    def __call__(self, img: Any, target: Dict[str, np.ndarray]) -> Tuple[Any, Dict[str, np.ndarray]]:
-        h, w = img.shape[:2]
-        scale = random.uniform(self.scale[0], self.scale[1])
-        ratio = random.uniform(self.ratio[0], self.ratio[1])
-        crop_h = math.sqrt(scale * ratio)
-        crop_w = math.sqrt(scale / ratio)
-        start_x, start_y = random.uniform(0, 1 - crop_w), random.uniform(0, 1 - crop_h)
-        crop_box = (
-            max(0, int(round(start_x * w))),
-            max(0, int(round(start_y * h))),
-            min(int(round((start_x + crop_w) * w)), w - 1),
-            min(int(round((start_y + crop_h) * h)), h - 1)
-        )
-        croped_img, crop_boxes = F.crop_detection(img, target["boxes"], crop_box)
-        return croped_img, dict(boxes=crop_boxes)
-
                            @@ -474,7 +403,7 @@

                            Source code for doctr.transforms.modules.base

                            -
                            +
diff --git a/v0.4.1/_modules/doctr/transforms/modules/tensorflow.html b/v0.4.1/_modules/doctr/transforms/modules/tensorflow.html
index 10d4f6e1d1..51b31b4fc4 100644
--- a/v0.4.1/_modules/doctr/transforms/modules/tensorflow.html
+++ b/v0.4.1/_modules/doctr/transforms/modules/tensorflow.html
@@ -226,28 +226,21 @@
@@ -288,12 +281,12 @@

                            Source code for doctr.transforms.modules.tensorflow

 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
 
 import random
-from typing import Any, Callable, List, Tuple
-
 import tensorflow as tf
+from typing import List, Any, Tuple, Callable
 
 from doctr.utils.repr import NestedObject
 
+
 __all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'RandomBrightness',
            'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality']
@@ -362,7 +355,6 @@

                            Source code for doctr.transforms.modules.tensorflow

         return _repr
 
     def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        input_dtype = img.dtype
         img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio)
         if self.preserve_aspect_ratio:
             # pad width
@@ -373,7 +365,7 @@

                            Source code for doctr.transforms.modules.tensorflow

             else:
                 offset = (int((self.output_size[0] - img.shape[0]) / 2), 0)
             img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size)
-        return tf.cast(img, dtype=input_dtype)
+        return img
                            @@ -393,15 +385,15 @@

                            Source code for doctr.transforms.modules.tensorflow

         std: standard deviation per channel
     """
     def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None:
-        self.mean = tf.constant(mean)
-        self.std = tf.constant(std)
+        self.mean = tf.constant(mean, dtype=tf.float32)
+        self.std = tf.constant(std, dtype=tf.float32)
 
     def extra_repr(self) -> str:
         return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}"
 
     def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        img -= tf.cast(self.mean, dtype=img.dtype)
-        img /= tf.cast(self.std, dtype=img.dtype)
+        img -= self.mean
+        img /= self.std
         return img
                            @@ -647,7 +639,7 @@

                            Source code for doctr.transforms.modules.tensorflow

                            +
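
Note: a minimal sketch combining the transforms touched above (Compose, Resize and Normalize are all listed in __all__); the target size and mean/std values are illustrative assumptions.

import tensorflow as tf
from doctr.transforms import Compose, Resize, Normalize

transfo = Compose([
    Resize((32, 128)),  # output size (H, W)
    Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)),
])
img = tf.random.uniform(shape=[64, 256, 3], maxval=1, dtype=tf.float32)
out = transfo(img)
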
diff --git a/v0.4.1/_modules/doctr/utils/metrics.html b/v0.4.1/_modules/doctr/utils/metrics.html
index 5cf468803f..20af9416ea 100644
--- a/v0.4.1/_modules/doctr/utils/metrics.html
+++ b/v0.4.1/_modules/doctr/utils/metrics.html
@@ -226,28 +226,21 @@
@@ -287,17 +280,15 @@

                            Source code for doctr.utils.metrics

                             # This program is licensed under the Apache License version 2.
                             # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                             
                            -from typing import Dict, List, Optional, Tuple
                            -
                            -import cv2
                             import numpy as np
                            -from scipy.optimize import linear_sum_assignment
                            +import cv2
                            +from typing import List, Tuple, Dict, Optional
                             from unidecode import unidecode
                            -
                            +from scipy.optimize import linear_sum_assignment
                             from doctr.utils.geometry import rbbox_to_polygon
                             
                             __all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'rbox_to_mask',
                            -           'nms', 'LocalizationConfusion', 'OCRMetric', 'DetectionMetric']
                            +           'nms', 'LocalizationConfusion', 'OCRMetric']
                             
                             
                             def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
                            @@ -324,26 +315,26 @@ 

                            Source code for doctr.utils.metrics

                             
                            [docs] class TextMatch: - r"""Implements text match metric (word-level accuracy) for recognition task. + """Implements text match metric (word-level accuracy) for recognition task. The raw aggregated metric is computed as follows: .. math:: - \forall X, Y \in \mathcal{W}^N, - TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i) + \\forall X, Y \\in \\mathcal{W}^N, + TextMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i) with the indicator function :math:`f_{a}` defined as: .. math:: - \forall a, x \in \mathcal{W}, - f_a(x) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } x = a \\ - 0 & \mbox{otherwise.} - \end{array} - \right. - - where :math:`\mathcal{W}` is the set of all possible character sequences, + \\forall a, x \\in \\mathcal{W}, + f_a(x) = \\left\\{ + \\begin{array}{ll} + 1 & \\mbox{if } x = a \\\\ + 0 & \\mbox{otherwise.} + \\end{array} + \\right. + + where :math:`\\mathcal{W}` is the set of all possible character sequences, :math:`N` is a strictly positive integer. Example:: @@ -356,8 +347,6 @@

                            Source code for doctr.utils.metrics

                                 def __init__(self) -> None:
                                     self.reset()
                             
                            -
-[docs]
     def update(
         self,
         gt: List[str],
@@ -379,8 +368,7 @@

                            Source code for doctr.utils.metrics

                                         self.unidecode += int(_unidecode)
                                         self.unicase += int(_unicase)
                             
                            -        self.total += len(gt)
-
+        self.total += len(gt)
 
[docs]
@@ -566,29 +554,29 @@

                            Source code for doctr.utils.metrics

                             
                            [docs] class LocalizationConfusion: - r"""Implements common confusion metrics and mean IoU for localization evaluation. + """Implements common confusion metrics and mean IoU for localization evaluation. The aggregated metrics are computed as follows: .. math:: - \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ - Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ - Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ - meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j) + \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\ + Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ + Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ + meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`g_{X}` defined as: .. math:: - \forall y \in \mathcal{B}, - g_X(y) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ - 0 & \mbox{otherwise.} - \end{array} - \right. - - where :math:`\mathcal{B}` is the set of possible bounding boxes, + \\forall y \\in \\mathcal{B}, + g_X(y) = \\left\\{ + \\begin{array}{ll} + 1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\ + 0 & \\mbox{otherwise.} + \\end{array} + \\right. + + where :math:`\\mathcal{B}` is the set of possible bounding boxes, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. Example:: @@ -613,8 +601,6 @@

                            Source code for doctr.utils.metrics

                                     self.mask_shape = mask_shape
                                     self.reset()
                             
                            -
-[docs]
     def update(self, gts: np.ndarray, preds: np.ndarray) -> None:
 
         if preds.shape[0] > 0:
@@ -633,8 +619,7 @@

                            Source code for doctr.utils.metrics

                             
                                     # Update counts
                                     self.num_gts += gts.shape[0]
                            -        self.num_preds += preds.shape[0]
-
+        self.num_preds += preds.shape[0]
 
[docs]
@@ -668,32 +653,32 @@

                            Source code for doctr.utils.metrics

                             
                            [docs] class OCRMetric: - r"""Implements an end-to-end OCR metric. + """Implements end-to-end OCR metric. The aggregated metrics are computed as follows: .. math:: - \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, - \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ - Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ - Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ - meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) + \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N, + \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\ + Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ + Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ + meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`h_{B, L}` defined as: .. math:: - \forall (b, l) \in \mathcal{B} \times \mathcal{L}, - h_{B,L}(b, l) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ - 0 & \mbox{otherwise.} - \end{array} - \right. - - where :math:`\mathcal{B}` is the set of possible bounding boxes, - :math:`\mathcal{L}` is the set of possible character sequences, + \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L}, + h_{B,L}(b, l) = \\left\\{ + \\begin{array}{ll} + 1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\ + & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\ + 0 & \\mbox{otherwise.} + \\end{array} + \\right. + + where :math:`\\mathcal{B}` is the set of possible bounding boxes, + :math:`\\mathcal{L}` is the set of possible character sequences, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. Example:: @@ -719,8 +704,6 @@

                            Source code for doctr.utils.metrics

                                     self.mask_shape = mask_shape
                                     self.reset()
                             
                            -
-[docs]
     def update(
         self,
         gt_boxes: np.ndarray,
@@ -756,8 +739,7 @@

                            Source code for doctr.utils.metrics

                                             self.unicase_matches += int(_unicase)
                             
                                     self.num_gts += gt_boxes.shape[0]
                            -        self.num_preds += pred_boxes.shape[0]
-
+        self.num_preds += pred_boxes.shape[0]
 
[docs]
@@ -765,7 +747,7 @@

                            Source code for doctr.utils.metrics

                                     """Computes the aggregated metrics
                             
                                     Returns:
                            -            a tuple with the recall & precision for each string comparison and the mean IoU
                            +            a tuple with the recall & precision for each string comparison flexibility and the mean IoU
                                     """
                             
                                     # Recall
                            @@ -799,124 +781,6 @@ 

                            Source code for doctr.utils.metrics

                                     self.unidecode_matches = 0
                                     self.unicase_matches = 0
                            - - -
                            -[docs] -class DetectionMetric: - r"""Implements an object detection metric. - - The aggregated metrics are computed as follows: - - .. math:: - \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, - \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ - Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ - Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ - meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) - - with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and - :math:`y`, and the function :math:`h_{B, C}` defined as: - - .. math:: - \forall (b, c) \in \mathcal{B} \times \mathcal{C}, - h_{B,C}(b, c) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ - 0 & \mbox{otherwise.} - \end{array} - \right. - - where :math:`\mathcal{B}` is the set of possible bounding boxes, - :math:`\mathcal{C}` is the set of possible class indices, - :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - - Example:: - >>> import numpy as np - >>> from doctr.utils import DetectionMetric - >>> metric = DetectionMetric(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), - np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64)) - >>> metric.summary() - - Args: - iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match - """ - - def __init__( - self, - iou_thresh: float = 0.5, - rotated_bbox: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), - ) -> None: - self.iou_thresh = iou_thresh - self.rotated_bbox = rotated_bbox - self.mask_shape = mask_shape - self.reset() - -
                            -[docs] - def update( - self, - gt_boxes: np.ndarray, - pred_boxes: np.ndarray, - gt_labels: np.ndarray, - pred_labels: np.ndarray, - ) -> None: - - if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]: - raise AssertionError("there should be the same number of boxes and string both for the ground truth " - "and the predictions") - - # Compute IoU - if pred_boxes.shape[0] > 0: - if self.rotated_bbox: - mask_gts = rbox_to_mask(gt_boxes, shape=self.mask_shape) - mask_preds = rbox_to_mask(pred_boxes, shape=self.mask_shape) - iou_mat = mask_iou(mask_gts, mask_preds) - else: - iou_mat = box_iou(gt_boxes, pred_boxes) - - self.tot_iou += float(iou_mat.max(axis=1).sum()) - - # Assign pairs - gt_indices, pred_indices = linear_sum_assignment(-iou_mat) - is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh - # Category comparison - self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum()) - - self.num_gts += gt_boxes.shape[0] - self.num_preds += pred_boxes.shape[0]
                            - - -
                            -[docs] - def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: - """Computes the aggregated metrics - - Returns: - a tuple with the recall & precision for each class prediction and the mean IoU - """ - - # Recall - recall = self.num_matches / self.num_gts if self.num_gts > 0 else None - - # Precision - precision = self.num_matches / self.num_preds if self.num_preds > 0 else None - - # mean IoU (overall detected boxes) - mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None - - return recall, precision, mean_iou
                            - - - def reset(self) -> None: - self.num_gts = 0 - self.num_preds = 0 - self.tot_iou = 0. - self.num_matches = 0
                            -
                            @@ -949,7 +813,7 @@

                            Source code for doctr.utils.metrics

                                   
                                 
                               
                            -
                            +
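
Note: a minimal sketch of the metric classes kept on both sides of this diff (TextMatch and LocalizationConfusion); the strings and boxes below are illustrative, following the example style used in the docstrings above.

import numpy as np
from doctr.utils.metrics import TextMatch, LocalizationConfusion

# word-level accuracy between ground truths and predictions
text_metric = TextMatch()
text_metric.update(['Hello', 'world'], ['hello', 'world'])
print(text_metric.summary())

# IoU-based detection confusion on [xmin, ymin, xmax, ymax] boxes
loc_metric = LocalizationConfusion(iou_thresh=0.5)
loc_metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
print(loc_metric.summary())  # recall, precision, mean IoU
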
diff --git a/v0.4.1/_modules/doctr/utils/visualization.html b/v0.4.1/_modules/doctr/utils/visualization.html
index 4605453c73..21743f6182 100644
--- a/v0.4.1/_modules/doctr/utils/visualization.html
+++ b/v0.4.1/_modules/doctr/utils/visualization.html
@@ -226,28 +226,21 @@
@@ -287,136 +280,70 @@

                            Source code for doctr.utils.visualization

                             # This program is licensed under the Apache License version 2.
                             # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                             
                            -from copy import deepcopy
                            -from typing import Any, Dict, List, Optional, Tuple, Union
                            -
                            -import cv2
                            -import matplotlib.patches as patches
                             import matplotlib.pyplot as plt
                            +from matplotlib.figure import Figure
                            +import matplotlib.patches as patches
                             import mplcursors
                            +from PIL import ImageFont, ImageDraw, Image
                             import numpy as np
                            -from matplotlib.figure import Figure
                            -from PIL import Image, ImageDraw
                            -from unidecode import unidecode
                            +import cv2
                            +from typing import Tuple, List, Dict, Any, Union
                             
                             from .common_types import BoundingBox, RotatedBbox
                            -from .fonts import get_font
                             
                            -__all__ = ['visualize_page', 'synthesize_page', 'draw_boxes']
                            +__all__ = ['visualize_page', 'synthetize_page']
                             
                             
                            -def rect_patch(
                            -    geometry: BoundingBox,
                            +def create_rect_patch(
                            +    geometry: Union[BoundingBox, RotatedBbox],
                            +    label: str,
                                 page_dimensions: Tuple[int, int],
                            -    label: Optional[str] = None,
                            -    color: Tuple[float, float, float] = (0, 0, 0),
                            +    color: Tuple[int, int, int],
                                 alpha: float = 0.3,
                                 linewidth: int = 2,
                                 fill: bool = True,
                            -) -> patches.Rectangle:
                            -    """Create a matplotlib rectangular patch for the element
                            +) -> patches.Patch:
                            +    """Create a matplotlib patch (rectangle) bounding the element
                             
                                 Args:
                                     geometry: bounding box of the element
                            -        page_dimensions: dimensions of the Page in format (height, width)
                                     label: label to display when hovered
                            +        page_dimensions: dimensions of the Page
                                     color: color to draw box
                                     alpha: opacity parameter to fill the boxes, 0 = transparent
                                     linewidth: line width
                            -        fill: whether the patch should be filled
                             
                                 Returns:
                                     a rectangular Patch
                                 """
                            -
                            -    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
                            -        raise ValueError("invalid geometry format")
                            -
                            -    # Unpack
                                 height, width = page_dimensions
                            -    (xmin, ymin), (xmax, ymax) = geometry
                            -    # Switch to absolute coords
                            -    xmin, w = xmin * width, (xmax - xmin) * width
                            -    ymin, h = ymin * height, (ymax - ymin) * height
                            -
                            -    return patches.Rectangle(
                            -        (xmin, ymin),
                            -        w,
                            -        h,
                            -        fill=fill,
                            -        linewidth=linewidth,
                            -        edgecolor=(*color, alpha),
                            -        facecolor=(*color, alpha),
                            -        label=label,
                            -    )
                            -
                            -
                            -def polygon_patch(
                            -    geometry: RotatedBbox,
                            -    page_dimensions: Tuple[int, int],
                            -    label: Optional[str] = None,
                            -    color: Tuple[float, float, float] = (0, 0, 0),
                            -    alpha: float = 0.3,
                            -    linewidth: int = 2,
                            -    fill: bool = True,
                            -) -> patches.Polygon:
                            -    """Create a matplotlib polygon patch for the element
                            -
                            -    Args:
                            -        geometry: bounding box of the element
                            -        page_dimensions: dimensions of the Page in format (height, width)
                            -        label: label to display when hovered
                            -        color: color to draw box
                            -        alpha: opacity parameter to fill the boxes, 0 = transparent
                            -        linewidth: line width
                            -        fill: whether the patch should be filled
                            -
                            -    Returns:
                            -        a polygon Patch
                            -    """
                            -
                            -    if len(geometry) != 5 or any(not isinstance(elt, float) for elt in geometry):
                            -        raise ValueError("invalid geometry format")
                            -
                            -    # Unpack
                            -    height, width = page_dimensions
                            -    x, y, w, h, a = geometry
                            -    # Switch to absolute coords
                            -    x, w = x * width, w * width
                            -    y, h = y * height, h * height
                            -    points = cv2.boxPoints(((x, y), (w, h), a))
                            -
                            -    return patches.Polygon(
                            -        points,
                            -        fill=fill,
                            -        linewidth=linewidth,
                            -        edgecolor=(*color, alpha),
                            -        facecolor=(*color, alpha),
                            -        label=label,
                            -    )
                            -
                            -
                            -def create_obj_patch(
                            -    geometry: Union[BoundingBox, RotatedBbox],
                            -    page_dimensions: Tuple[int, int],
                            -    **kwargs: Any,
                            -) -> patches.Patch:
                            -    """Create a matplotlib patch for the element
                            -
                            -    Args:
                            -        geometry: bounding box (straight or rotated) of the element
                            -        page_dimensions: dimensions of the page in format (height, width)
                            -
                            -    Returns:
                            -        a matplotlib Patch
                            -    """
                            -    if isinstance(geometry, tuple):
                            -        if len(geometry) == 2:
                            -            return rect_patch(geometry, page_dimensions, **kwargs)  # type: ignore[arg-type]
                            -        elif len(geometry) == 5:
                            -            return polygon_patch(geometry, page_dimensions, **kwargs)  # type: ignore[arg-type]
                            -
                            -    raise ValueError("invalid geometry format")
                            +    if len(geometry) == 5:
                            +        x, y, w, h, a = geometry  # type: ignore[misc]
                            +        x, w = x * width, w * width
                            +        y, h = y * height, h * height
                            +        points = cv2.boxPoints(((x, y), (w, h), a))
                            +        return patches.Polygon(
                            +            points,
                            +            fill=fill,
                            +            linewidth=linewidth,
                            +            edgecolor=(*color, alpha),
                            +            facecolor=(*color, alpha),
                            +            label=label
                            +        )
                            +    else:
                            +        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
                            +        xmin, xmax = xmin * width, xmax * width
                            +        ymin, ymax = ymin * height, ymax * height
                            +        return patches.Rectangle(
                            +            (xmin, ymin),
                            +            xmax - xmin,
                            +            ymax - ymin,
                            +            fill=fill,
                            +            linewidth=linewidth,
                            +            edgecolor=(*color, alpha),
                            +            facecolor=(*color, alpha),
                            +            label=label
                            +        )
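To make the patched helper concrete, here is a hedged usage sketch based on the signature shown in this hunk. The page size and word geometry are illustrative values; coordinates are relative and page_dimensions follows the (height, width) convention used above.

import matplotlib.pyplot as plt

from doctr.utils.visualization import create_rect_patch

page_dims = (842, 595)                    # (height, width) of the page, in pixels
word_geom = ((0.13, 0.04), (0.51, 0.16))  # ((xmin, ymin), (xmax, ymax)), relative coords

fig, ax = plt.subplots()
# Straight bounding box -> Rectangle branch of create_rect_patch
patch = create_rect_patch(word_geom, "RECEIPT", page_dims, (0, 0, 1), alpha=0.3, linewidth=1)
ax.add_patch(patch)
ax.set_xlim(0, page_dims[1])
ax.set_ylim(page_dims[0], 0)              # image-style y axis (origin at the top)
plt.show()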
                             
                             
                             
                            @@ -467,8 +394,7 @@

                            Source code for doctr.utils.visualization

                             
                                 for block in page['blocks']:
                                     if not words_only:
                            -            rect = create_obj_patch(block['geometry'], page['dimensions'],
                            -                                    label='block', color=(0, 1, 0), linewidth=1, **kwargs)
                            +            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
                                         # add patch on figure
                                         ax.add_patch(rect)
                                         if interactive:
                            @@ -477,16 +403,14 @@ 

                            Source code for doctr.utils.visualization

                             
                                     for line in block['lines']:
                                         if not words_only:
                            -                rect = create_obj_patch(line['geometry'], page['dimensions'],
                            -                                        label='line', color=(1, 0, 0), linewidth=1, **kwargs)
                            +                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
                                             ax.add_patch(rect)
                                             if interactive:
                                                 artists.append(rect)
                             
                                         for word in line['words']:
                            -                rect = create_obj_patch(word['geometry'], page['dimensions'],
                            -                                        label=f"{word['value']} (confidence: {word['confidence']:.2%})",
                            -                                        color=(0, 0, 1), **kwargs)
                            +                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
                            +                                         page['dimensions'], (0, 0, 1), **kwargs)
                                             ax.add_patch(rect)
                                             if interactive:
                                                 artists.append(rect)
                            @@ -511,11 +435,11 @@ 

                            Source code for doctr.utils.visualization

                             
                                     if display_artefacts:
                                         for artefact in block['artefacts']:
                            -                rect = create_obj_patch(
                            +                rect = create_rect_patch(
                                                 artefact['geometry'],
                            +                    'artefact',
                                                 page['dimensions'],
                            -                    label='artefact',
                            -                    color=(0.5, 0.5, 0.5),
                            +                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
                                                 linewidth=1,
                                                 **kwargs
                                             )
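Putting the pieces together, a hedged end-to-end sketch of how visualize_page consumes these patches. The predictor import path and the pretrained default weights are assumptions about this documentation version and are not shown in the hunks above.

import numpy as np
import matplotlib.pyplot as plt

from doctr.models import ocr_predictor
from doctr.utils.visualization import visualize_page

model = ocr_predictor(pretrained=True)
page_img = (255 * np.random.rand(1024, 768, 3)).astype(np.uint8)  # stand-in page image
result = model([page_img])
# Overlay block/line/word patches; hover labels come from mplcursors when interactive=True
visualize_page(result.pages[0].export(), page_img, words_only=False)
plt.show()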
                            @@ -532,13 +456,10 @@ 

                            Source code for doctr.utils.visualization

                             
                             
                             
                            -
-[docs]
-def synthesize_page(
+def synthetize_page(
     page: Dict[str, Any],
     draw_proba: bool = False,
     font_size: int = 13,
-    font_family: Optional[str] = None,
 ) -> np.ndarray:
     """Draw the content of the element page (OCR response) on a blank page.
@@ -546,12 +467,10 @@

                            Source code for doctr.utils.visualization

                                     page: exported Page object to represent
                                     draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
                                     font_size: size of the font, default font = 13
                            -        font_family: family of the font
                             
                                 Return:
                            -        the synthesized page
                            +        A np array (drawn page)
                                 """
                            -
                                 # Draw template
                                 h, w = page["dimensions"]
                                 response = 255 * np.ones((h, w, 3), dtype=np.int32)
                            @@ -562,19 +481,20 @@ 

                            Source code for doctr.utils.visualization

                                         for word in line["words"]:
                # Get absolute word geometry
                                             (xmin, ymin), (xmax, ymax) = word["geometry"]
                            -                xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
                            -                ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
                            +                xmin, xmax = int(w * xmin), int(w * xmax)
                            +                ymin, ymax = int(h * ymin), int(h * ymax)
                             
                                             # White drawing context adapted to font size, 0.75 factor to convert pts --> pix
                            -                font = get_font(font_family, int(0.75 * (ymax - ymin)))
                            -                img = Image.new('RGB', (xmax - xmin, ymax - ymin), color=(255, 255, 255))
                            +                h_box, w_box = ymax - ymin, xmax - xmin
                            +                h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75))
                            +                img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255))
                                             d = ImageDraw.Draw(img)
                            +
                                             # Draw in black the value of the word
                            -                try:
                            -                    d.text((0, 0), word["value"], font=font, fill=(0, 0, 0))
                            -                except UnicodeEncodeError:
                            -                    # When character cannot be encoded, use its unidecode version
                            -                    d.text((0, 0), unidecode(word["value"]), font=font, fill=(0, 0, 0))
                            +                d.text((0, 0), word["value"], font=ImageFont.load_default(), fill=(0, 0, 0))
                            +
                            +                # Resize back to box size
                            +                img = img.resize((w_box, h_box), Image.NEAREST)
                             
                                             # Colorize if draw_proba
                                             if draw_proba:
                            @@ -588,39 +508,7 @@ 

                            Source code for doctr.utils.visualization

                                             # Write to response page
                                             response[ymin:ymax, xmin:xmax, :] = np.array(img)
                             
                            -    return response
-
-
-
-def draw_boxes(
-    boxes: np.ndarray,
-    image: np.ndarray,
-    color: Optional[Tuple] = None,
-    **kwargs
-) -> None:
-    """Draw an array of relative straight boxes on an image
-
-    Args:
-        boxes: array of relative boxes, of shape (*, 4)
-        image: np array, float32 or uint8
-    """
-    h, w = image.shape[:2]
-    # Convert boxes to absolute coords
-    _boxes = deepcopy(boxes)
-    _boxes[:, [0, 2]] *= w
-    _boxes[:, [1, 3]] *= h
-    _boxes = _boxes.astype(np.int32)
-    for box in _boxes.tolist():
-        xmin, ymin, xmax, ymax = box
-        image = cv2.rectangle(
-            image,
-            (xmin, ymin),
-            (xmax, ymax),
-            color=color if isinstance(color, tuple) else (0, 0, 255),
-            thickness=2
-        )
-    plt.imshow(image)
-    plt.plot(**kwargs)
+    return response
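As a standalone illustration of the word-rendering strategy used by the patched synthetize_page above (draw the value with the default PIL font, then resize the crop to the absolute word box): the helper name render_word and the sample values are hypothetical, and only the draw-then-resize logic mirrors the hunk.

import numpy as np
from PIL import Image, ImageDraw, ImageFont

def render_word(value: str, box_hw: tuple, font_size: int = 13) -> np.ndarray:
    # box_hw is the absolute (height, width) of the target word box, in pixels
    h_box, w_box = box_hw
    # Drawing context sized from the font; 0.75 roughly converts pt -> px
    h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75))
    img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255))
    ImageDraw.Draw(img).text((0, 0), value, font=ImageFont.load_default(), fill=(0, 0, 0))
    # Resize back to the box size before pasting into the response page
    return np.array(img.resize((w_box, h_box), Image.NEAREST))

word_crop = render_word("Hello", (24, 96))  # -> array of shape (24, 96, 3)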
                            @@ -653,7 +541,7 @@

                            Source code for doctr.utils.visualization

                                   
                                 
                               
                            -
                            +
                            diff --git a/v0.4.1/_modules/index.html b/v0.4.1/_modules/index.html index 7a2005aecd..c887b618c2 100644 --- a/v0.4.1/_modules/index.html +++ b/v0.4.1/_modules/index.html @@ -226,28 +226,21 @@ @@ -282,28 +275,19 @@

                            All modules for which code is available

                            -
                            -
                            +
                            diff --git a/v0.4.1/_sources/changelog.rst.txt b/v0.4.1/_sources/changelog.rst.txt index 0ab898b83e..430097d6c8 100644 --- a/v0.4.1/_sources/changelog.rst.txt +++ b/v0.4.1/_sources/changelog.rst.txt @@ -1,18 +1,6 @@ Changelog ========= -v0.4.0 (2021-10-01) -------------------- -Release note: `v0.4.0 `_ - -v0.3.1 (2021-08-27) -------------------- -Release note: `v0.3.1 `_ - -v0.3.0 (2021-07-02) -------------------- -Release note: `v0.3.0 `_ - v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.4.1/_sources/datasets.rst.txt b/v0.4.1/_sources/datasets.rst.txt index d0bd5c4358..354122f1e5 100644 --- a/v0.4.1/_sources/datasets.rst.txt +++ b/v0.4.1/_sources/datasets.rst.txt @@ -11,21 +11,22 @@ can be a significant save of time. Available Datasets ------------------ -Here are all datasets that are available through docTR: +The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. + +.. autoclass:: doctr.datasets.datasets.VisionDataset + + +Here are all datasets that are available through DocTR: .. autoclass:: FUNSD .. autoclass:: SROIE .. autoclass:: CORD .. autoclass:: OCRDataset -.. autoclass:: CharacterGenerator -.. autoclass:: DocArtefacts -.. autoclass:: IIIT5K -.. autoclass:: SVT Data Loading ------------ -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR. +Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. .. autoclass:: doctr.datasets.loader.DataLoader @@ -35,10 +36,10 @@ Each dataset has its specific way to load a sample, but handling batch aggregati Supported Vocabs ---------------- -Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets of vocabs. -.. list-table:: docTR Vocabs +.. list-table:: DocTR Vocabs :widths: 20 5 50 :header-rows: 1 @@ -58,25 +59,10 @@ of vocabs. - 5 - £€¥¢฿ * - latin - - 94 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ - * - english - - 100 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ - * - legacy_french - - 123 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ + - 96 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° * - french - - 126 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ - * - portuguese - - 131 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿ - * - spanish - - 116 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿ - * - german - - 108 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ + - 154 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ .. 
autofunction:: encode_sequences diff --git a/v0.4.1/_sources/index.rst.txt b/v0.4.1/_sources/index.rst.txt index ecb49b2b12..fc3ff89fdf 100644 --- a/v0.4.1/_sources/index.rst.txt +++ b/v0.4.1/_sources/index.rst.txt @@ -1,7 +1,7 @@ -docTR: Document Text Recognition +DocTR: Document Text Recognition ================================ -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta) .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -12,6 +12,9 @@ DocTR provides an easy and powerful way to extract valuable information from you * |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. +Welcome to the documentation of `DocTR `_! + + Main Features ------------- @@ -20,18 +23,24 @@ Main Features * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor * |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, minimal dependencies -* |:tools:| Actively maintained by Mindee -* |:factory:| Easy integration (available templates for browser demo & API deployment) +* |:bird:| Light package, small dependencies +* |:tools:| Daily maintained +* |:factory:| Easy integration +Getting Started +--------------- + .. toctree:: :maxdepth: 2 - :caption: Getting started - :hidden: installing - notebooks + + +Build & train your predictor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) +* Fine-tune or train from scratch any detection or recognition model to specialize on your data Model zoo @@ -39,14 +48,14 @@ Model zoo Text detection models """"""""""""""""""""" - * DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ - * LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ + * `DBNet `_ (Differentiable Binarization) + * `LinkNet `_ Text recognition models """"""""""""""""""""""" - * SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ - * CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ - * MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ + * `SAR `_ (Show, Attend and Read) + * `CRNN `_ (Convolutional Recurrent Neural Network) + * `MASTER `_ (Multi-Aspect Non-local Network for Scene Text Recognition) Supported datasets @@ -58,28 +67,17 @@ Supported datasets .. toctree:: :maxdepth: 2 - :caption: Using docTR - :hidden: + :caption: Notes - using_models - using_model_export + changelog .. toctree:: :maxdepth: 2 :caption: Package Reference - :hidden: datasets - io + documents models transforms utils - - -.. 
toctree:: - :maxdepth: 2 - :caption: Notes - :hidden: - - changelog diff --git a/v0.4.1/_sources/installing.rst.txt b/v0.4.1/_sources/installing.rst.txt index 8197df660d..5c8779dc1c 100644 --- a/v0.4.1/_sources/installing.rst.txt +++ b/v0.4.1/_sources/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.6 or higher. +This library requires Python 3.6 or higher. Prerequisites @@ -11,12 +11,12 @@ Prerequisites Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: -* `TensorFlow 2 `_ -* `PyTorch `_ +* TensorFlow: `installation page `_. +* PyTorch: `installation page `_. If you are running another OS than Linux, you will need a few extra dependencies. -For MacOS users, you can install them using `Homebrew `_ as follows: +For MacOS users, you can install them as follows: .. code:: shell @@ -28,23 +28,13 @@ For Windows users, those dependencies are included in GTK. You can find the late Via Python Package ================== -Install the last stable release of the package using `pip `_: +Install the last stable release of the package using pip: .. code:: bash pip install python-doctr -We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows: - -.. code:: bash - - # for TensorFlow - pip install "python-doctr[tf]" - # for PyTorch - pip install "python-doctr[torch]" - - Via Git ======= @@ -54,13 +44,3 @@ Install the library in developper mode: git clone https://github.com/mindee/doctr.git pip install -e doctr/. - -Again, for framework-specific builds: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - # for TensorFlow - pip install -e doctr/.[tf] - # for PyTorch - pip install -e doctr/.[torch] diff --git a/v0.4.1/_sources/io.rst.txt b/v0.4.1/_sources/io.rst.txt deleted file mode 100644 index d23e11bdb9..0000000000 --- a/v0.4.1/_sources/io.rst.txt +++ /dev/null @@ -1,92 +0,0 @@ -doctr.io -======== - - -.. currentmodule:: doctr.io - -The io module enables users to easily access content from documents and export analysis -results to structured formats. - -.. _document_structure: - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img_as_numpy - -.. autofunction:: read_img_as_tensor - -.. 
autofunction:: decode_img_as_tensor - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_artefacts diff --git a/v0.4.1/_sources/models.rst.txt b/v0.4.1/_sources/models.rst.txt index 77ec8c16e8..9830c6c153 100644 --- a/v0.4.1/_sources/models.rst.txt +++ b/v0.4.1/_sources/models.rst.txt @@ -1,54 +1,215 @@ doctr.models ============ +The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. +Either performed at once or separately, to each task corresponds a type of deep learning architecture. + .. currentmodule:: doctr.models +For a given task, DocTR provides a Predictor, which is composed of 2 components: -doctr.models.backbones ----------------------- +* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. +* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. -.. autofunction:: doctr.models.backbones.vgg16_bn -.. autofunction:: doctr.models.backbones.resnet31 +Text Detection +-------------- +Localizing text elements in images -.. autofunction:: doctr.models.backbones.mobilenet_v3_small ++---------------------------------------------------+----------------------------+----------------------------+---------+ +| | FUNSD | CORD | | ++==================+=================+==============+============+===============+============+===============+=========+ +| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ +| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -.. autofunction:: doctr.models.backbones.mobilenet_v3_large +All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. -.. autofunction:: doctr.models.backbones.mobilenet_v3_small_r +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* -.. autofunction:: doctr.models.backbones.mobilenet_v3_large_r +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. +Pre-processing for detection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for detection is the following: -doctr.models.detection ----------------------- +1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. +2. batch images together +3. normalize the batch using the training data statistics -.. autofunction:: doctr.models.detection.linknet16 + +Detection models +^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. 
DocTR includes implementations and pretrained versions of the following models: .. autofunction:: doctr.models.detection.db_resnet50 +.. autofunction:: doctr.models.detection.linknet16 -.. autofunction:: doctr.models.detection.db_mobilenet_v3_large +Detection predictors +^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. .. autofunction:: doctr.models.detection.detection_predictor -doctr.models.recognition ------------------------- +Text Recognition +---------------- +Identifying strings in images + +.. list-table:: Text recognition model zoo + :widths: 20 20 15 10 10 10 + :header-rows: 1 + + * - Architecture + - Input shape + - # params + - FUNSD + - CORD + - FPS + * - crnn_vgg16_bn + - (32, 128, 3) + - 15.8M + - 86.02 + - 91.3 + - 12.8 + * - sar_vgg16_bn + - (32, 128, 3) + - 21.5M + - 86.2 + - 91.7 + - 3.3 + * - sar_resnet31 + - (32, 128, 3) + - 53.1M + - **86.3** + - **92.1** + - 2.7 + +All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. + +All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Pre-processing for recognition +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for recognition is the following: + +1. resize each input image to the target size (bilinear interpolation by default) without deformation. +2. pad the image to the target size (with zeros by default) +3. batch images together +4. normalize the batch using the training data statistics + +Recognition models +^^^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: + .. autofunction:: doctr.models.recognition.crnn_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_small -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_large +Recognition predictors +^^^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage. -.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.recognition_predictor -.. autofunction:: doctr.models.recognition.master -.. 
autofunction:: doctr.models.recognition.recognition_predictor +End-to-End OCR +-------------- +Predictors that localize and identify text elements in images ++-----------------------------+--------------------------------------+--------------------------------------+ +| | FUNSD | CORD | ++=============================+============+===============+=========+============+===============+=========+ +| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ + +All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. + +All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Results on private ocr datasets + ++------------------------------------+----------------------------+----------------------------+----------------------------+ +| | Receipts | Invoices | IDs | ++====================================+============+===============+============+===============+============+===============+ +| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| Gvision doc. 
text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ + + +Two-stage approaches +^^^^^^^^^^^^^^^^^^^^ +Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. + +.. autofunction:: doctr.models.zoo.ocr_predictor + + +Model export +------------ +Utility functions to make the most of document analysis models. + +.. currentmodule:: doctr.models.export + +Model compression +^^^^^^^^^^^^^^^^^ + +.. autofunction:: convert_to_tflite + +.. autofunction:: convert_to_fp16 + +.. autofunction:: quantize_model + +Using SavedModel +^^^^^^^^^^^^^^^^ + +Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to +`SavedModel `_ format as follows: + + + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> _ = model(input_t, training=False) + >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') + +And loaded just as easily: -doctr.models.zoo ----------------- -.. autofunction:: doctr.models.ocr_predictor + >>> import tensorflow as tf + >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.4.1/_sources/notebooks.md.txt b/v0.4.1/_sources/notebooks.md.txt deleted file mode 100644 index bf88396c85..0000000000 --- a/v0.4.1/_sources/notebooks.md.txt +++ /dev/null @@ -1,8 +0,0 @@ -# docTR Notebooks - -Here are some notebooks compiled for users to better leverage the library capabilities: - -| Notebook | Description | | -|:----------|:-------------|------:| -| [Quicktour](https://github.com/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | A presentation of the main features of docTR | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | - diff --git a/v0.4.1/_sources/transforms.rst.txt b/v0.4.1/_sources/transforms.rst.txt index cc83c8310b..0230fe75f5 100644 --- a/v0.4.1/_sources/transforms.rst.txt +++ b/v0.4.1/_sources/transforms.rst.txt @@ -8,7 +8,7 @@ Data transformations are part of both training and inference procedure. Drawing Supported transformations ------------------------- -Here are all transformations that are available through docTR: +Here are all transformations that are available through DocTR: .. autoclass:: Resize .. autoclass:: Normalize @@ -21,8 +21,6 @@ Here are all transformations that are available through docTR: .. autoclass:: RandomHue .. autoclass:: RandomGamma .. autoclass:: RandomJpegQuality -.. autoclass:: RandomRotate -.. autoclass:: RandomCrop Composing transformations diff --git a/v0.4.1/_sources/using_doctr/using_model_export.rst.txt b/v0.4.1/_sources/using_doctr/using_model_export.rst.txt index 48f570f699..c62c36169b 100644 --- a/v0.4.1/_sources/using_doctr/using_model_export.rst.txt +++ b/v0.4.1/_sources/using_doctr/using_model_export.rst.txt @@ -31,7 +31,7 @@ Advantages: .. 
code:: python3 import tensorflow as tf - from keras import mixed_precision + from tensorflow.keras import mixed_precision mixed_precision.set_global_policy('mixed_float16') predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True) diff --git a/v0.4.1/_sources/using_model_export.rst.txt b/v0.4.1/_sources/using_model_export.rst.txt deleted file mode 100644 index 992f4e9866..0000000000 --- a/v0.4.1/_sources/using_model_export.rst.txt +++ /dev/null @@ -1,71 +0,0 @@ -Preparing your model for inference -================================== - -A well-trained model is a good achievement but you might want to tune a few things to make it production-ready! - -.. currentmodule:: doctr.models.export - - -Model compression ------------------ - -This section is meant to help you perform inference with compressed versions of your model. - - -TensorFlow Lite -^^^^^^^^^^^^^^^ - -TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows: - - >>> import tensorflow as tf - >>> from tensorflow.keras import Sequential - >>> from doctr.models import conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - >>> serialized_model = converter.convert() - -Half-precision -^^^^^^^^^^^^^^ - -If you want to convert it to half-precision using your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> converter.target_spec.supported_types = [tf.float16] - >>> serialized_model = converter.convert() - - -Post-training quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Finally if you wish to quantize the model with your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> # Float fallback for operators that do not have an integer implementation - >>> def representative_dataset(): - >>> for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)] - >>> converter.representative_dataset = representative_dataset - >>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - >>> converter.inference_input_type = tf.int8 - >>> converter.inference_output_type = tf.int8 - >>> serialized_model = converter.convert() - - -Using SavedModel ----------------- - -Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.4.1/_sources/using_models.rst.txt b/v0.4.1/_sources/using_models.rst.txt deleted file mode 100644 index c44627f466..0000000000 --- a/v0.4.1/_sources/using_models.rst.txt +++ /dev/null @@ -1,329 +0,0 @@ -Choosing the right model -======================== - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. 
currentmodule:: doctr.models - -For a given task, docTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the deep learning model. -* Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable. - - -Text Detection --------------- - -The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don't). - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `linknet16 `_ -* `db_resnet50 `_ -* `db_mobilenet_v3_large `_ - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -+------------------------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+=================================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 79.35 | 84.03 | 81.14 | 66.85 | | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ - -`detection_predictor `_ wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> predictor = detection_predictor('db_resnet50') - >>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -Text Recognition ----------------- - -The task consists of transcribing the character sequence in a given image. 
- - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `crnn_vgg16_bn `_ -* `crnn_mobilenet_v3_small `_ -* `crnn_mobilenet_v3_large `_ -* `sar_resnet31 `_ -* `master `_ - - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -.. list-table:: Text recognition model zoo - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 87.15 - - 92.92 - - 12.8 - * - crnn_mobilenet_v3_small - - (32, 128, 3) - - 2.1M - - 86.21 - - 90.56 - - - * - crnn_mobilenet_v3_large - - (32, 128, 3) - - 4.5M - - 86.95 - - 92.03 - - - * - sar_resnet31 - - (32, 128, 3) - - 56.2M - - **87.70** - - **93.41** - - 2.7 - * - master - - (32, 128, 3) - - 67.7M - - 87.62 - - 93.27 - - - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metric being used (exact match) are available in :ref:`metrics`. - -While most of our recognition models were trained on our french vocab (cf. :ref:`vocabs`), you can easily access the vocab of any model as follows: - - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> print(predictor.model.cfg['vocab']) - - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -`recognition_predictor `_ wraps your recognition model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -End-to-End OCR --------------- - -The task consists of both localizing and transcribing textual elements in a given image. - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -You can use any combination of detection and recognition models supporte by docTR. 
- -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - -+----------------------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+========================================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 71.00 | 76.02 | 0.85 | 83.87 | 81.34 | 1.6 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + master | 71.03 | 76.06 | | 84.49 | 81.94 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | 71.25 | 76.29 | 0.27 | 84.50 | **81.96** | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_small | 69.85 | 74.80 | | 80.85 | 78.42 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_large | 70.57 | 75.57 | | 82.57 | 80.08 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_mobilenet_v3_large + crnn_vgg16_bn | 67.73 | 71.73 | | 71.65 | 59.03 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - -Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information. 
- - -+----------------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | US Tax Forms | Resumes | Road Fines | -+==============================================+============+===============+============+===============+============+===============+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | 78.70 | 81.12 | 65.80 | 70.70 | 50.25 | 51.78 | 79.08 | 92.83 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + master (ours) | **79.00** | **81.42** | 65.57 | 69.86 | 51.34 | 52.90 | 78.86 | 92.57 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + sar_resnet31 (ours) | 78.94 | 81.37 | 65.89 | **70.79** | **51.78** | **53.35** | 79.04 | 92.78 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_small (ours) | 76.81 | 79.15 | 64.89 | 69.61 | 45.03 | 46.38 | 78.96 | 92.11 | 85.91 | 87.20 | 84.85 | 85.86 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_large (ours) | 78.01 | 80.39 | 65.36 | 70.11 | 48.00 | 49.43 | 79.39 | 92.62 | 87.68 | 89.00 | 85.65 | 86.67 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 | 74.93 | 63.04 | 68.41 | 39.36 | 41.75 | 72.14 | 89.97 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. 
text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. Everything is wrapped up with `ocr_predictor `_. - - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) - >>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([input_page]) - - -What should I do with the output? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ocr_predictor returns a `Document` object with a nested structure (with `Page`, `Block`, `Line`, `Word`, `Artefact`). -To get a better understanding of our document model, check our :ref:`document_structure` section - -Here is a typical `Document` layout:: - - Document( - (pages): [Page( - dimensions=(340, 600) - (blocks): [Block( - (lines): [Line( - (words): [ - Word(value='No.', confidence=0.91), - Word(value='RECEIPT', confidence=0.99), - Word(value='DATE', confidence=0.96), - ] - )] - (artefacts): [] - )] - )] - ) - -You can also export them as a nested dict, more appropriate for JSON format:: - - json_output = result.export() - -For reference, here is the JSON export for the same `Document` as above:: - - { - 'pages': [ - { - 'page_idx': 0, - 'dimensions': (340, 600), - 'orientation': {'value': None, 'confidence': None}, - 'language': {'value': None, 'confidence': None}, - 'blocks': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'lines': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'words': [ - { - 'value': 'No.', - 'confidence': 0.914085328578949, - 'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)) - }, - { - 'value': 'RECEIPT', - 'confidence': 0.9949972033500671, - 'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)) - }, - { - 'value': 'DATE', - 'confidence': 0.9578408598899841, - 'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)) - } - ] - } - ], - 'artefacts': [] - } - ] - } - ] - } - -To export the outpout as XML (hocr-format) you can use the `export_as_xml` method:: - - xml_output = result.export_as_xml() - for output in xml_output: - xml_bytes_string = output[0] - xml_element = output[1] - -For reference, here is a sample XML byte string output:: - - - - - docTR - hOCR - - - - - -
                            -
                            -

                            - - Hello - XML - World - -

                            -
                            - - \ No newline at end of file diff --git a/v0.4.1/_sources/utils.rst.txt b/v0.4.1/_sources/utils.rst.txt index ac0b13d9df..69c1abe0eb 100644 --- a/v0.4.1/_sources/utils.rst.txt +++ b/v0.4.1/_sources/utils.rst.txt @@ -14,8 +14,6 @@ Easy-to-use functions to make sense of your model's predictions. .. autofunction:: visualize_page -.. autofunction:: synthesize_page - .. _metrics: @@ -27,20 +25,12 @@ Implementations of task-specific metrics to easily assess your model performance .. autoclass:: TextMatch - .. automethod:: update .. automethod:: summary .. autoclass:: LocalizationConfusion - .. automethod:: update .. automethod:: summary .. autoclass:: OCRMetric - .. automethod:: update - .. automethod:: summary - -.. autoclass:: DetectionMetric - - .. automethod:: update .. automethod:: summary diff --git a/v0.4.1/_static/documentation_options.js b/v0.4.1/_static/documentation_options.js index 83231357df..a7b5cbe04a 100644 --- a/v0.4.1/_static/documentation_options.js +++ b/v0.4.1/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.4.1a0-git', + VERSION: '0.3.0a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.4.1/changelog.html b/v0.4.1/changelog.html index 030f1f2f73..6ed2620fb7 100644 --- a/v0.4.1/changelog.html +++ b/v0.4.1/changelog.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + Changelog - docTR documentation @@ -227,28 +227,21 @@ @@ -290,18 +283,6 @@

                            Changelog

                            -
                            -

                            v0.4.0 (2021-10-01)

                            -

                            Release note: v0.4.0

                            -
                            -
                            -

                            v0.3.1 (2021-08-27)

                            -

                            Release note: v0.3.1

                            -
                            -
                            -

                            v0.3.0 (2021-07-02)

                            -

                            Release note: v0.3.0

                            -

                            v0.2.1 (2021-05-28)

                            Release note: v0.2.1

                            @@ -325,15 +306,23 @@

                            v0.1.0 (2021-03-05) - - + +
                            +
                            + Next +
                            +
                            doctr.datasets
                            +
                            + +
                            +
                            Previous
                            -
                            doctr.utils
                            +
                            Installation
                            @@ -368,9 +357,6 @@

                            v0.1.0 (2021-03-05)

                            diff --git a/v0.4.1/datasets.html b/v0.4.1/datasets.html index 60ae87ed1a..640791680a 100644 --- a/v0.4.1/datasets.html +++ b/v0.4.1/datasets.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.datasets - docTR documentation @@ -227,28 +227,21 @@ @@ -294,7 +287,13 @@

                            doctr.datasets

                            Available Datasets

                            -

                            Here are all datasets that are available through docTR:

                            +

                            The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

                            +
                            +
                            +class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
                            +
                            + +

                            Here are all datasets that are available through DocTR:

                            class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
                            @@ -369,7 +368,7 @@

                            doctr.datasets
                            -class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None)[source]
                            +class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

                            Implements an OCR dataset

                            Parameters:
                            @@ -377,103 +376,6 @@

                            doctr.datasets -
                            -class doctr.datasets.CharacterGenerator(*args, **kwargs)[source]
                            -

                            Implements a character image generation dataset

                            -
                            -
                            Example::
                            >>> from doctr.datasets import CharacterGenerator
                            ->>> ds = CharacterGenerator(vocab='abdef')
                            ->>> img, target = ds[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • vocab – vocabulary to take the character from

                            • -
                            • num_samples – number of samples that will be generated iterating over the dataset

                            • -
                            • cache_samples – whether generated images should be cached firsthand

                            • -
                            • sample_transforms – composable transformations that will be applied to each image

                            • -
                            -
                            -
                            -

                            - -
                            -
                            -class doctr.datasets.DocArtefacts(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
                            -

                            Object detection dataset for non-textual elements in documents. -The dataset includes a variety of synthetic document pages with non-textual elements.

                            -
                            -
                            Example::
                            >>> from doctr.datasets import DocArtefacts
                            ->>> train_set = DocArtefacts(download=True)
                            ->>> img, target = train_set[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • train – whether the subset should be the training one

                            • -
                            • sample_transforms – composable transformations that will be applied to each image

                            • -
                            • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • -
                            • **kwargs – keyword arguments from VisionDataset.

                            • -
                            -
                            -
                            -
                            - -
                            -
                            -class doctr.datasets.IIIT5K(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
                            -

                            IIIT-5K character-level localization dataset from -“BMVC 2012 Scene Text Recognition using Higher Order Language Priors”.

                            -
                            -
                            Example::
                            >>> # NOTE: this dataset is for character-level localization
                            ->>> from doctr.datasets import IIIT5K
                            ->>> train_set = IIIT5K(train=True, download=True)
                            ->>> img, target = train_set[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • train – whether the subset should be the training one

                            • -
                            • sample_transforms – composable transformations that will be applied to each image

                            • -
                            • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • -
                            • **kwargs – keyword arguments from VisionDataset.

                            • -
                            -
                            -
                            -
                            - -
                            -
                            -class doctr.datasets.SVT(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]
                            -

                            SVT dataset from “The Street View Text Dataset - UCSD Computer Vision”.

                            -
                            -
                            Example::
                            >>> from doctr.datasets import SVT
                            ->>> train_set = SVT(train=True, download=True)
                            ->>> img, target = train_set[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • train – whether the subset should be the training one

                            • -
                            • sample_transforms – composable transformations that will be applied to each image

                            • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • **kwargs – keyword arguments from VisionDataset.

                            @@ -484,10 +386,10 @@

                            doctr.datasets

                            Data Loading

                            -

                            Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR.

                            +

                            Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

                            -class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None, collate_fn: Callable | None = None)[source]
                            +class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]

                            Implements a dataset wrapper for fast data loading

                            Example::
                            >>> from doctr.datasets import FUNSD, DataLoader
                            @@ -515,11 +417,11 @@ 

                            Data Loading

                            Supported Vocabs

                            -

                            Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +

                            Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets of vocabs.

                            - +@@ -549,39 +451,19 @@

                            Data Loading

                            - - - - - - - - - - + + - - - - - - - - - - - - - - + +
                            docTR VocabsDocTR Vocabs

                            latin

                            94

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~

                            english

                            100

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿

                            legacy_french

                            123

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

                            96

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

                            french

                            126

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ

                            portuguese

                            131

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿

                            spanish

                            116

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿

                            german

                            108

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ

                            154

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

                            -doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, dynamic_seq_length: bool = False, **kwargs: Any) ndarray[source]
                            +doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]

                            Encode character sequences using a given vocab as mapping

                            Parameters:
                            @@ -592,7 +474,6 @@

                            Data LoadingReturns: @@ -609,23 +490,23 @@

                            Data Loading - +
                            Next
                            -
                            doctr.io
                            +
                            doctr.documents
                            - +
                            Previous
                            -
                            Preparing your model for inference
                            +
                            Changelog
                            @@ -661,14 +542,11 @@

                            Data Loadingdoctr.datasets

                            diff --git a/v0.4.1/genindex.html b/v0.4.1/genindex.html index ca3225362a..10d0739337 100644 --- a/v0.4.1/genindex.html +++ b/v0.4.1/genindex.html @@ -225,28 +225,21 @@ @@ -283,17 +276,17 @@

                            Index

                            -
                            A | B | C | D | E | F | G | I | L | M | N | O | P | R | S | T | U | V | W
                            +
                            A | B | C | D | E | F | G | L | M | N | O | P | Q | R | S | T | V | W

                            A

                            @@ -303,7 +296,7 @@

                            A

                            B

                            @@ -313,19 +306,17 @@

                            B

                            C

                            -
                            -

                            U

                            - - -
                            -
                            -

                            V

                            +
                            +

                        • Composing transformations
                            @@ -713,7 +674,7 @@

                            Composing transformations + diff --git a/v0.4.1/using_doctr/using_model_export.html b/v0.4.1/using_doctr/using_model_export.html index d467663403..75c81caa7c 100644 --- a/v0.4.1/using_doctr/using_model_export.html +++ b/v0.4.1/using_doctr/using_model_export.html @@ -316,7 +316,7 @@

                            Half-precision
                            import tensorflow as tf
                            -from keras import mixed_precision
                            +from tensorflow.keras import mixed_precision
                             mixed_precision.set_global_policy('mixed_float16')
                             predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
                             
                            diff --git a/v0.4.1/using_model_export.html b/v0.4.1/using_model_export.html deleted file mode 100644 index d96b24d6a0..0000000000 --- a/v0.4.1/using_model_export.html +++ /dev/null @@ -1,436 +0,0 @@ - - - - - - - - - - - - - Preparing your model for inference - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                            Skip to content - - - -
                            -
                            -
                            - -
                            - -
                            -
                            - -
                            - -
                            -
                            - -
                            -
                            -
                            - - - - - Back to top - -
                            - -
                            - -
                            - -
                            -
                            -
                            -

                            Preparing your model for inference

                            -

                            A well-trained model is a good achievement but you might want to tune a few things to make it production-ready!

                            -
                            -

                            Model compression

                            -

                            This section is meant to help you perform inference with compressed versions of your model.

                            -
                            -

                            TensorFlow Lite

                            -

                            TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows:

                            -
                            >>> import tensorflow as tf
                            ->>> from tensorflow.keras import Sequential
                            ->>> from doctr.models import conv_sequence
                            ->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
                            ->>> converter = tf.lite.TFLiteConverter.from_keras_model(model)
                            ->>> serialized_model = converter.convert()
                            -
                            -
                            -
                            -
                            -

                            Half-precision

                            -

                            If you want to convert it to half-precision, configure your TFLite converter as follows:

                            -
                            >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
                            ->>> converter.target_spec.supported_types = [tf.float16]
                            ->>> serialized_model = converter.convert()
                            -
                            -
                            -
                            -
                            -

                            Post-training quantization

                            -

                            Finally, if you wish to quantize the model, set up your TFLite converter as follows:

                            -
                            >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
                            ->>> # Float fallback for operators that do not have an integer implementation
                            ->>> import numpy as np
                            ->>> input_shape = (224, 224, 3)  # must match the input shape of the model being converted
                            ->>> def representative_dataset():
                            ->>>     for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)]
                            ->>> converter.representative_dataset = representative_dataset
                            ->>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
                            ->>> converter.inference_input_type = tf.int8
                            ->>> converter.inference_output_type = tf.int8
                            ->>> serialized_model = converter.convert()
                            -
                            -
                            -
                            -
                            -
                            -
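                            As a quick sanity check, the serialized bytes can be loaded back with the TFLite interpreter. Below is a minimal sketch, assuming the serialized_model produced above and a (224, 224, 3) input; the random test batch is purely illustrative:
                            >>> import numpy as np
                            >>> import tensorflow as tf
                            >>> # Load the serialized TFLite model and allocate its tensors
                            >>> interpreter = tf.lite.Interpreter(model_content=serialized_model)
                            >>> interpreter.allocate_tensors()
                            >>> input_details = interpreter.get_input_details()
                            >>> output_details = interpreter.get_output_details()
                            >>> # Feed a dummy batch cast to whatever dtype the converter settled on (float16 or int8)
                            >>> dummy = np.random.rand(1, 224, 224, 3).astype(input_details[0]['dtype'])
                            >>> interpreter.set_tensor(input_details[0]['index'], dummy)
                            >>> interpreter.invoke()
                            >>> out = interpreter.get_tensor(output_details[0]['index'])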

                            Using SavedModel

                            -

                            Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

                            -
                            >>> import tensorflow as tf
                            ->>> from doctr.models import db_resnet50
                            ->>> model = db_resnet50(pretrained=True)
                            ->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
                            ->>> _ = model(input_t, training=False)
                            ->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
                            -
                            -
                            -

                            And loaded just as easily:

                            -
                            >>> import tensorflow as tf
                            ->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
                            -
                            -
                            -
                            -
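                            The reloaded object is a generic SavedModel handle rather than a Keras model: inference goes through the signatures traced at save time, which are called with keyword arguments matching the traced input names. A minimal sketch for inspecting them, assuming a default serving signature was generated when the model was saved:
                            >>> import tensorflow as tf
                            >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
                            >>> # Inspect which signatures were exported (typically 'serving_default' for Keras models)
                            >>> print(list(model.signatures.keys()))
                            >>> infer = model.signatures['serving_default']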
                            - -
                            -
                            - -
                            - -
                            -
                            - - - - - - - - \ No newline at end of file diff --git a/v0.4.1/using_models.html b/v0.4.1/using_models.html deleted file mode 100644 index cd9f4516b8..0000000000 --- a/v0.4.1/using_models.html +++ /dev/null @@ -1,909 +0,0 @@ - - - - - - - - - - - - - Choosing the right model - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
                            -
                            -
                            - -
                            - -
                            -
                            - -
                            - -
                            -
                            - -
                            -
                            -
                            - - - - - Back to top - -
                            - -
                            - -
                            - -
                            -
                            -
                            -

                            Choosing the right model

                            -

                            The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

                            -

                            For a given task, docTR provides a Predictor, which is composed of 2 components:

                            -
                              -
                            • PreProcessor: a module in charge of making inputs directly usable by the deep learning model.

                            • -
                            • Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable.

                            • -
                            -
                            -

                            Text Detection

                            -

                            The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don’t).

                            -
                            -

                            Available architectures

                            -

                            The following architectures are currently supported:

                            - -

                            For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

                            -
                            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

                            FUNSD

                            CORD

                            Architecture

                            Input shape

                            # params

                            Recall

                            Precision

                            Recall

                            Precision

                            FPS

                            db_resnet50

                            (1024, 1024, 3)

                            25.2 M

                            82.14

                            87.64

                            92.49

                            89.66

                            2.1

                            db_mobilenet_v3_large

                            (1024, 1024, 3)

                            4.2 M

                            79.35

                            84.03

                            81.14

                            66.85

                            -
                            -

                            All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

                            -

                            Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

                            -

                            FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

                            -
                            -
                            -

                            Detection predictors

                            -

                            detection_predictor wraps your detection model so that it can be used seamlessly with your favorite deep learning framework.

                            -
                            >>> import numpy as np
                            ->>> from doctr.models import detection_predictor
                            ->>> predictor = detection_predictor('db_resnet50')
                            ->>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
                            ->>> out = predictor([dummy_img])
                            -
                            -
                            -
                            -
                            -
                            -

                            Text Recognition

                            -

                            The task consists of transcribing the character sequence in a given image.

                            -
                            -

                            Available architectures

                            -

                            The following architectures are currently supported:

                            - -

                            For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

                            -
                            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                            Text recognition model zoo

                            Architecture

                            Input shape

                            # params

                            FUNSD

                            CORD

                            FPS

                            crnn_vgg16_bn

                            (32, 128, 3)

                            15.8M

                            87.15

                            92.92

                            12.8

                            crnn_mobilenet_v3_small

                            (32, 128, 3)

                            2.1M

                            86.21

                            90.56

                            crnn_mobilenet_v3_large

                            (32, 128, 3)

                            4.5M

                            86.95

                            92.03

                            sar_resnet31

                            (32, 128, 3)

                            56.2M

                            87.70

                            93.41

                            2.7

                            master

                            (32, 128, 3)

                            67.7M

                            87.62

                            93.27

                            -
                            -

                            All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metric being used (exact match) are available in Task evaluation.

                            -

                            While most of our recognition models were trained on our French vocab (cf. Supported Vocabs), you can easily access the vocab of any model as follows:

                            -
                            >>> from doctr.models import recognition_predictor
                            ->>> predictor = recognition_predictor('crnn_vgg16_bn')
                            ->>> print(predictor.model.cfg['vocab'])
                            -
                            -
                            -

                            Disclaimer: both FUNSD subsets combined have 30595 word-level crops which might not be representative enough of the model capabilities

                            -

                            FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

                            -
                            -
                            -

                            Recognition predictors

                            -

                            recognition_predictor wraps your recognition model so that it can be used seamlessly with your favorite deep learning framework.

                            -
                            >>> import numpy as np
                            ->>> from doctr.models import recognition_predictor
                            ->>> predictor = recognition_predictor('crnn_vgg16_bn')
                            ->>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8)
                            ->>> out = predictor([dummy_img])
                            -
                            -
                            -
                            -
                            -
                            -

                            End-to-End OCR

                            -

                            The task consists of both localizing and transcribing textual elements in a given image.

                            -
                            -

                            Available architectures

                            -

                            You can use any combination of detection and recognition models supported by docTR.

                            -

                            For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

                            -
                            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

                            FUNSD

                            CORD

                            Architecture

                            Recall

                            Precision

                            FPS

                            Recall

                            Precision

                            FPS

                            db_resnet50 + crnn_vgg16_bn

                            71.00

                            76.02

                            0.85

                            83.87

                            81.34

                            1.6

                            db_resnet50 + master

                            71.03

                            76.06

                            84.49

                            81.94

                            db_resnet50 + sar_resnet31

                            71.25

                            76.29

                            0.27

                            84.50

                            81.96

                            0.83

                            db_resnet50 + crnn_mobilenet_v3_small

                            69.85

                            74.80

                            80.85

                            78.42

                            0.83

                            db_resnet50 + crnn_mobilenet_v3_large

                            70.57

                            75.57

                            82.57

                            80.08

                            0.83

                            db_mobilenet_v3_large + crnn_vgg16_bn

                            67.73

                            71.73

                            71.65

                            59.03

                            Gvision text detection

                            59.50

                            62.50

                            75.30

                            70.00

                            Gvision doc. text detection

                            64.00

                            53.30

                            68.90

                            61.10

                            AWS textract

                            78.10

                            83.00

                            87.50

                            66.00

                            -
                            -

                            All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

                            -

                            Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

                            -

                            FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

                            -

                            Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information.

                            -
                            - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

                            Receipts

                            Invoices

                            IDs

                            US Tax Forms

                            Resumes

                            Road Fines

                            Architecture

                            Recall

                            Precision

                            Recall

                            Precision

                            Recall

                            Precision

                            Recall

                            Precision

                            Recall

                            Precision

                            Recall

                            Precision

                            db_resnet50 + crnn_vgg16_bn (ours)

                            78.70

                            81.12

                            65.80

                            70.70

                            50.25

                            51.78

                            79.08

                            92.83

                            db_resnet50 + master (ours)

                            79.00

                            81.42

                            65.57

                            69.86

                            51.34

                            52.90

                            78.86

                            92.57

                            db_resnet50 + sar_resnet31 (ours)

                            78.94

                            81.37

                            65.89

                            70.79

                            51.78

                            53.35

                            79.04

                            92.78

                            db_resnet50 + crnn_mobilenet_v3_small (ours)

                            76.81

                            79.15

                            64.89

                            69.61

                            45.03

                            46.38

                            78.96

                            92.11

                            85.91

                            87.20

                            84.85

                            85.86

                            db_resnet50 + crnn_mobilenet_v3_large (ours)

                            78.01

                            80.39

                            65.36

                            70.11

                            48.00

                            49.43

                            79.39

                            92.62

                            87.68

                            89.00

                            85.65

                            86.67

                            db_mobilenet_v3_large + crnn_vgg16_bn (ours)

                            78.36

                            74.93

                            63.04

                            68.41

                            39.36

                            41.75

                            72.14

                            89.97

                            Gvision doc. text detection

                            68.91

                            59.89

                            63.20

                            52.85

                            43.70

                            29.21

                            69.79

                            65.68

                            AWS textract

                            75.77

                            77.70

                            70.47

                            69.13

                            46.39

                            43.32

                            84.31

                            98.11

                            -
                            -
                            -
                            -

                            Two-stage approaches

                            -

                            Those architectures involve one stage of text detection, and one stage of text recognition. The text detection output is used to produce cropped images that are then passed to the text recognition block. Everything is wrapped up with ocr_predictor.

                            -
                            >>> import numpy as np
                            ->>> from doctr.models import ocr_predictor
                            ->>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
                            ->>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
                            ->>> out = model([input_page])
                            -
                            -
                            -
                            -
                            -

                            What should I do with the output?

                            -

                            The ocr_predictor returns a Document object with a nested structure (with Page, Block, Line, Word, Artefact). -To get a better understanding of our document model, check our Document structure section.

                            -

                            Here is a typical Document layout:

                            -
                            Document(
                            -  (pages): [Page(
                            -    dimensions=(340, 600)
                            -    (blocks): [Block(
                            -      (lines): [Line(
                            -        (words): [
                            -          Word(value='No.', confidence=0.91),
                            -          Word(value='RECEIPT', confidence=0.99),
                            -          Word(value='DATE', confidence=0.96),
                            -        ]
                            -      )]
                            -      (artefacts): []
                            -    )]
                            -  )]
                            -)
                            -
                            -
                            -

                            You can also export them as a nested dict, more appropriate for JSON format:

                            -
                            json_output = result.export()
                            -
                            -
                            -

                            For reference, here is the JSON export for the same Document as above:

                            -
                            {
                            -  'pages': [
                            -      {
                            -          'page_idx': 0,
                            -          'dimensions': (340, 600),
                            -          'orientation': {'value': None, 'confidence': None},
                            -          'language': {'value': None, 'confidence': None},
                            -          'blocks': [
                            -              {
                            -                  'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
                            -                  'lines': [
                            -                      {
                            -                          'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
                            -                          'words': [
                            -                              {
                            -                                  'value': 'No.',
                            -                                  'confidence': 0.914085328578949,
                            -                                  'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875))
                            -                              },
                            -                              {
                            -                                  'value': 'RECEIPT',
                            -                                  'confidence': 0.9949972033500671,
                            -                                  'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375))
                            -                              },
                            -                              {
                            -                                  'value': 'DATE',
                            -                                  'confidence': 0.9578408598899841,
                            -                                  'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625))
                            -                              }
                            -                          ]
                            -                      }
                            -                  ],
                            -                  'artefacts': []
                            -              }
                            -          ]
                            -      }
                            -  ]
                            -}
                            -
                            -
                            -
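                            If you prefer working with the structured objects rather than the exported dict, the nested attributes can be walked directly. Below is a minimal sketch; the 0.5 confidence threshold is an arbitrary illustrative choice:
                            # Collect all recognized words above a confidence threshold
                            words = [
                                (word.value, word.confidence)
                                for page in result.pages
                                for block in page.blocks
                                for line in block.lines
                                for word in line.words
                                if word.confidence > 0.5
                            ]
                            print(words)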

                            To export the output as XML (hOCR format), you can use the export_as_xml method:

                            -
                            xml_output = result.export_as_xml()
                            -for output in xml_output:
                            -  xml_bytes_string = output[0]
                            -  xml_element = output[1]
                            -
                            -
                            -

                            For reference, here is a sample XML byte string output:

                            -
                            <?xml version="1.0" encoding="UTF-8"?>
                            -<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
                            -  <head>
                            -    <title>docTR - hOCR</title>
                            -    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
                            -    <meta name="ocr-system" content="doctr 0.5.0" />
                            -    <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
                            -  </head>
                            -  <body>
                            -    <div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
                            -    <div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
                            -      <p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
                            -        <span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
                            -          <span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
                            -          <span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
                            -          <span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
                            -        </span>
                            -      </p>
                            -    </div>
                            -  </body>
                            -</html>
                            -
                            -
                            -
                            -
                            -
                            - -
                            -
                            - -
                            - -
                            -
                            - - - - - - - - \ No newline at end of file diff --git a/v0.4.1/utils.html b/v0.4.1/utils.html index bd308760a3..1908ef4ff4 100644 --- a/v0.4.1/utils.html +++ b/v0.4.1/utils.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.utils - docTR documentation @@ -227,28 +227,21 @@ @@ -327,25 +320,6 @@

                            Visualization -
                            -doctr.utils.visualization.synthesize_page(page: Dict[str, Any], draw_proba: bool = False, font_size: int = 13, font_family: str | None = None) ndarray[source]
                            -

                            Draw the content of the element page (OCR response) on a blank page.

                            -
                            -
                            Parameters:
                            -
                              -
                            • page – exported Page object to represent

                            • -
                            • draw_proba – if True, draw words in colors to represent confidence. Blue: p=1, red: p=0

                            • -
                            • font_size – size of the font, default font = 13

                            • -
                            • font_family – family of the font

                            • -
                            -
                            -
                            Returns:
                            -

                            the synthesized page

                            -
                            -
                            -
                            -

                            Task evaluation

                            @@ -382,20 +356,6 @@

                            Visualization -
                            -update(gt: List[str], pred: List[str]) None[source]
                            -

                            Update the state of the metric with new predictions

                            -
                            -
                            Parameters:
                            -
                              -
                            • gt – list of ground-truth character sequences

                            • -
                            • pred – list of predicted character sequences

                            • -
                            -
                            -
                            -
                            -
                            summary() Dict[str, float][source]
                            @@ -451,11 +411,6 @@

                            Visualization

                            iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

                            -
                            -
                            -update(gts: ndarray, preds: ndarray) None[source]
                            -
                            -
                            summary() Tuple[float | None, float | None, float | None][source]
                            @@ -472,7 +427,7 @@

                            Visualization
                            class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
                            -

                            Implements an end-to-end OCR metric.

                            +

                            Implements end-to-end OCR metric.

                            The aggregated metrics are computed as follows:

                            -
                            -
                            -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: List[str], pred_labels: List[str]) None[source]
                            -
                            -
                            summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]

                            Computes the aggregated metrics

                            Returns:
                            -

                            a tuple with the recall & precision for each string comparison and the mean IoU

                            -
                            -
                            -
                            - - - -
                            -
                            -class doctr.utils.metrics.DetectionMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]
                            -

                            Implements an object detection metric.

                            -

                            The aggregated metrics are computed as follows:

                            -
                            -
                            -\[\begin{split}\forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, -\forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ -Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ -Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
                            -
                            -

                            with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, C}\) defined as:

                            -
                            -
                            -\[\begin{split}\forall (b, c) \in \mathcal{B} \times \mathcal{C}, -h_{B,C}(b, c) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
                            -
                            -

                            where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{C}\) is the set of possible class indices, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

                            -
                            -
                            Example::
                            >>> import numpy as np
                            ->>> from doctr.utils import DetectionMetric
                            ->>> metric = DetectionMetric(iou_thresh=0.5)
                            ->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
                            -np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64))
                            ->>> metric.summary()
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -

                            iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

                            -
                            -
                            -
                            -
                            -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: ndarray, pred_labels: ndarray) None[source]
                            -
                            - -
                            -
                            -summary() Tuple[float | None, float | None, float | None][source]
                            -

                            Computes the aggregated metrics

                            -
                            -
                            Returns:
                            -

                            a tuple with the recall & precision for each class prediction and the mean IoU

                            +

                            a tuple with the recall & precision for each string comparison flexibility and the mean IoU
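For reference, a minimal NumPy sketch of the recall, precision and mean-IoU aggregation described by the DetectionMetric formulas in the hunk above; it uses a simple greedy best match per prediction, so it is illustrative only and not the library implementation:

import numpy as np

def box_iou(gts: np.ndarray, preds: np.ndarray) -> np.ndarray:
    # Pairwise IoU between (N, 4) ground-truth and (M, 4) predicted [xmin, ymin, xmax, ymax] boxes
    lt = np.maximum(gts[:, None, :2], preds[None, :, :2])
    rb = np.minimum(gts[:, None, 2:], preds[None, :, 2:])
    inter = np.clip(rb - lt, 0, None).prod(axis=-1)
    area_gt = (gts[:, 2:] - gts[:, :2]).prod(axis=-1)
    area_pred = (preds[:, 2:] - preds[:, :2]).prod(axis=-1)
    return inter / (area_gt[:, None] + area_pred[None, :] - inter)

def detection_summary(gt_boxes, pred_boxes, gt_labels, pred_labels, iou_thresh=0.5):
    iou = box_iou(gt_boxes, pred_boxes)              # (N, M)
    best_gt = iou.argmax(axis=0)                     # greedy best ground truth for each prediction
    hits = (iou.max(axis=0) >= iou_thresh) & (gt_labels[best_gt] == pred_labels)
    recall = hits.sum() / gt_boxes.shape[0]          # 1/N * number of matched predictions
    precision = hits.sum() / pred_boxes.shape[0]     # 1/M * number of matched predictions
    mean_iou = iou.max(axis=0).mean()                # mean best IoU per prediction
    return recall, precision, mean_iou

# Same toy inputs as the DetectionMetric example above
print(detection_summary(
    np.asarray([[0, 0, 100, 100]], dtype=float),
    np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]], dtype=float),
    np.zeros(1, dtype=np.int64),
    np.array([0, 1], dtype=np.int64),
))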

                            @@ -603,15 +490,7 @@

                            Visualization - -
                            -
+ diff --git a/v0.5.0/_modules/doctr/datasets/classification/tensorflow.html b/v0.5.0/_modules/doctr/datasets/classification/tensorflow.html deleted file mode 100644 index 829b6efb9d..0000000000 --- a/v0.5.0/_modules/doctr/datasets/classification/tensorflow.html +++ /dev/null @@ -1,366 +0,0 @@ - doctr.datasets.classification.tensorflow - docTR documentation

                            Source code for doctr.datasets.classification.tensorflow

                            -# Copyright (C) 2021, Mindee.
                            -
                            -# This program is licensed under the Apache License version 2.
                            -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                            -
                            -import tensorflow as tf
                            -
                            -from .base import _CharacterGenerator
                            -
                            -__all__ = ['CharacterGenerator']
                            -
                            -
                            -
                            -[docs] -class CharacterGenerator(_CharacterGenerator): - """Implements a character image generation dataset - - Example:: - >>> from doctr.datasets import CharacterGenerator - >>> ds = CharacterGenerator(vocab='abdef') - >>> img, target = ds[0] - - Args: - vocab: vocabulary to take the character from - num_samples: number of samples that will be generated iterating over the dataset - cache_samples: whether generated images should be cached firsthand - sample_transforms: composable transformations that will be applied to each image - """ - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - @staticmethod - def collate_fn(samples): - - images, targets = zip(*samples) - images = tf.stack(images, axis=0) - - return images, tf.convert_to_tensor(targets)
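A hedged batching sketch for the CharacterGenerator shown in this deleted module, assuming the v0.5.0-era constructor arguments listed in its docstring (vocab, num_samples); collate_fn stacks the individual images with tf.stack and converts the targets to a tensor:

Example::
>>> from doctr.datasets import CharacterGenerator
>>> ds = CharacterGenerator(vocab='abdef', num_samples=8)
>>> samples = [ds[i] for i in range(4)]              # list of (image, target) pairs
>>> images, targets = CharacterGenerator.collate_fn(samples)
>>> images.shape                                     # (4, H, W, C) after tf.stack
>>> targets                                          # tf.Tensor with one target per sample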
                            - - - - - - - - - \ No newline at end of file diff --git a/v0.5.0/_modules/doctr/datasets/cord.html b/v0.5.0/_modules/doctr/datasets/cord.html index c846254bad..3b89955bd8 100644 --- a/v0.5.0/_modules/doctr/datasets/cord.html +++ b/v0.5.0/_modules/doctr/datasets/cord.html @@ -226,28 +226,21 @@ @@ -287,15 +280,14 @@

                            Source code for doctr.datasets.cord

                             # This program is licensed under the Apache License version 2.
                             # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                             
                            -import json
                             import os
                            -from pathlib import Path
                            -from typing import Any, Dict, List, Tuple
                            -
                            +import json
                             import numpy as np
                            +from pathlib import Path
                            +from typing import List, Dict, Any, Tuple, Optional, Callable
                             
                             from .datasets import VisionDataset
                            -from .utils import convert_target_to_relative
                            +from doctr.utils.geometry import fit_rbbox
                             
                             __all__ = ['CORD']
                             
                            @@ -313,7 +305,8 @@ 

                            Source code for doctr.datasets.cord

                             
                                 Args:
                                     train: whether the subset should be the training one
                            -        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                            +        sample_transforms: composable transformations that will be applied to each image
                            +        rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones)
                                     **kwargs: keyword arguments from `VisionDataset`.
                                 """
                                 TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip',
                            @@ -325,38 +318,39 @@ 

                            Source code for doctr.datasets.cord

                                 def __init__(
                                     self,
                                     train: bool = True,
                            -        use_polygons: bool = False,
                            +        sample_transforms: Optional[Callable[[Any], Any]] = None,
                            +        rotated_bbox: bool = False,
                                     **kwargs: Any,
                                 ) -> None:
                             
                                     url, sha256 = self.TRAIN if train else self.TEST
                            -        super().__init__(url, None, sha256, True, pre_transforms=convert_target_to_relative, **kwargs)
                            +        super().__init__(url, None, sha256, True, **kwargs)
                             
                                     # # List images
                            -        tmp_root = os.path.join(self.root, 'image')
                            +        self.root = os.path.join(self._root, 'image')
                                     self.data: List[Tuple[str, Dict[str, Any]]] = []
                                     self.train = train
                            -        for img_path in os.listdir(tmp_root):
                            +        self.sample_transforms = sample_transforms
                            +        for img_path in os.listdir(self.root):
                                         # File existence check
                            -            if not os.path.exists(os.path.join(tmp_root, img_path)):
                            -                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
                            -
                            +            if not os.path.exists(os.path.join(self.root, img_path)):
                            +                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
                                         stem = Path(img_path).stem
                                         _targets = []
                            -            with open(os.path.join(self.root, 'json', f"{stem}.json"), 'rb') as f:
                            +            with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f:
                                             label = json.load(f)
                                             for line in label["valid_line"]:
                                                 for word in line["words"]:
                                                     if len(word["text"]) > 0:
                                                         x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"]
                                                         y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"]
                            -                            if use_polygons:
                            -                                box = np.array([
                            +                            if rotated_bbox:
                            +                                box = list(fit_rbbox(np.array([
                                                                 [x[0], y[0]],
                                                                 [x[1], y[1]],
                                                                 [x[2], y[2]],
                                                                 [x[3], y[3]],
                            -                                ], dtype=np.float32)
                            +                                ], dtype=np.float32)))
                                                         else:
                                                             # Reduce 8 coords to 4
                                                             box = [min(x), min(y), max(x), max(y)]
                            @@ -366,9 +360,8 @@ 

                            Source code for doctr.datasets.cord

                             
                                         self.data.append((
                                             img_path,
                            -                dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets))
                            +                dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets)
                                         ))
                            -        self.root = tmp_root
                             
                                 def extra_repr(self) -> str:
                                     return f"train={self.train}"
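For clarity, the straight-box branch above collapses the four corner points of a CORD word quad into one axis-aligned box; a standalone sketch of that reduction with a made-up "valid_line" word entry:

word = {"quad": {"x1": 10, "x2": 42, "x3": 41, "x4": 9,
                 "y1": 5, "y2": 6, "y3": 20, "y4": 19}}   # hypothetical word entry
x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"]
y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"]
box = [min(x), min(y), max(x), max(y)]   # reduce 8 coords to 4: [xmin, ymin, xmax, ymax]
print(box)                               # [9, 5, 42, 20]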
                            @@ -405,7 +398,7 @@

                            Source code for doctr.datasets.cord

                                   
                                 
                               
                            -
                            +
                            diff --git a/v0.5.0/_modules/doctr/datasets/datasets/tensorflow.html b/v0.5.0/_modules/doctr/datasets/datasets/tensorflow.html index 8a191ecfc7..fddca20034 100644 --- a/v0.5.0/_modules/doctr/datasets/datasets/tensorflow.html +++ b/v0.5.0/_modules/doctr/datasets/datasets/tensorflow.html @@ -236,7 +236,7 @@

                            Package Reference

                            • doctr.datasets
                            • -
                            • doctr.io
                            • +
                            • doctr.documents
                            • doctr.models
                            • doctr.transforms
                            • doctr.utils
                            • @@ -284,7 +284,6 @@

                              Source code for doctr.datasets.datasets.tensorflow

                              from typing import List, Any, Tuple import tensorflow as tf -from doctr.io import read_img_as_tensor from .base import _AbstractDataset, _VisionDataset @@ -293,14 +292,11 @@

                              Source code for doctr.datasets.datasets.tensorflow

                              class AbstractDataset(_AbstractDataset): - @staticmethod - def _get_img_shape(img: Any) -> Tuple[int, int]: - return img.shape[:2] - def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]: img_name, target = self.data[index] # Read image - img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float16 if self.fp16 else tf.float32) + img = tf.io.read_file(os.path.join(self.root, img_name)) + img = tf.image.decode_jpeg(img, channels=3) return img, target @@ -350,7 +346,7 @@
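A minimal, self-contained version of the image read now performed in _read_sample above (the file path is a placeholder):

Example::
>>> import tensorflow as tf
>>> img = tf.io.read_file("path/to/sample.jpg")      # placeholder path
>>> img = tf.image.decode_jpeg(img, channels=3)
>>> img.shape, img.dtype                             # (H, W, 3), uint8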

                              Source code for doctr.datasets.datasets.tensorflow

                              +
                              diff --git a/v0.5.0/_modules/doctr/datasets/detection.html b/v0.5.0/_modules/doctr/datasets/detection.html index 3beb1cffa4..43e148dc88 100644 --- a/v0.5.0/_modules/doctr/datasets/detection.html +++ b/v0.5.0/_modules/doctr/datasets/detection.html @@ -228,21 +228,32 @@ +
                              diff --git a/v0.5.0/_modules/doctr/datasets/doc_artefacts.html b/v0.5.0/_modules/doctr/datasets/doc_artefacts.html index fd37b61369..172122a216 100644 --- a/v0.5.0/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.5.0/_modules/doctr/datasets/doc_artefacts.html @@ -228,21 +228,32 @@
                            @@ -691,7 +599,7 @@

Source code for doctr.models.detection.differentiable_binarization.tensorflow

                            -
                            + diff --git a/v0.5.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.5.0/_modules/doctr/models/detection/fast/tensorflow.html index dc7d8f50f2..af51a9abeb 100644 --- a/v0.5.0/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/detection/fast/tensorflow.html @@ -305,7 +305,7 @@

                            Source code for doctr.models.detection.fast.tensorflow

                            import numpy as np import tensorflow as tf -from keras import Model, Sequential, layers +from tensorflow.keras import Model, Sequential, layers from doctr.file_utils import CLASS_NAME from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params diff --git a/v0.5.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.5.0/_modules/doctr/models/detection/linknet/tensorflow.html index cb2c85edd5..9f836ce462 100644 --- a/v0.5.0/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/detection/linknet/tensorflow.html @@ -226,28 +226,21 @@ @@ -290,41 +283,40 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization from copy import deepcopy -from typing import Any, Dict, List, Optional, Tuple - -import numpy as np import tensorflow as tf from tensorflow import keras -from tensorflow.keras import Model, Sequential, layers +from tensorflow.keras import layers, Sequential +from typing import Dict, Any, Tuple, Optional, List -from doctr.models.classification import resnet18 -from doctr.models.utils import IntermediateLayerGetter, conv_sequence, load_pretrained_params from doctr.utils.repr import NestedObject - +from doctr.models.backbones import ResnetStage +from doctr.models.utils import conv_sequence, load_pretrained_params from .base import LinkNetPostProcessor, _LinkNet -__all__ = ['LinkNet', 'linknet_resnet18'] +__all__ = ['LinkNet', 'linknet16'] default_cfgs: Dict[str, Dict[str, Any]] = { - 'linknet_resnet18': { + 'linknet16': { 'mean': (0.798, 0.785, 0.772), 'std': (0.264, 0.2749, 0.287), + 'num_classes': 1, 'input_shape': (1024, 1024, 3), + 'rotated_bbox': False, 'url': None, }, } -def decoder_block(in_chan: int, out_chan: int, stride: int, **kwargs: Any) -> Sequential: +def decoder_block(in_chan: int, out_chan: int) -> Sequential: """Creates a LinkNet decoder block""" return Sequential([ - *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1, **kwargs), + *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1), layers.Conv2DTranspose( filters=in_chan // 4, kernel_size=3, - strides=stride, + strides=2, padding="same", use_bias=False, kernel_initializer='he_normal' @@ -335,36 +327,36 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            ]) -class LinkNetFPN(Model, NestedObject): - """LinkNet Decoder module""" +class LinkNetFPN(layers.Layer, NestedObject): + """LinkNet Encoder-Decoder module""" def __init__( self, - out_chans: int, - in_shapes: List[Tuple[int, ...]], ) -> None: super().__init__() - self.out_chans = out_chans - strides = [2] * (len(in_shapes) - 1) + [1] - i_chans = [s[-1] for s in in_shapes[::-1]] - o_chans = i_chans[1:] + [out_chans] - self.decoders = [ - decoder_block(in_chan, out_chan, s, input_shape=in_shape) - for in_chan, out_chan, s, in_shape in zip(i_chans, o_chans, strides, in_shapes[::-1]) - ] + self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True) + self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True) + self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True) + self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True) + self.decoder_1 = decoder_block(in_chan=64, out_chan=64) + self.decoder_2 = decoder_block(in_chan=128, out_chan=64) + self.decoder_3 = decoder_block(in_chan=256, out_chan=128) + self.decoder_4 = decoder_block(in_chan=512, out_chan=256) def call( self, - x: List[tf.Tensor] + x: tf.Tensor ) -> tf.Tensor: - out = 0 - for decoder, fmap in zip(self.decoders, x[::-1]): - out = decoder(out + fmap) - return out - - def extra_repr(self) -> str: - return f"out_chans={self.out_chans}" + x_1 = self.encoder_1(x) + x_2 = self.encoder_2(x_1) + x_3 = self.encoder_3(x_2) + x_4 = self.encoder_4(x_3) + y_4 = self.decoder_4(x_4) + y_3 = self.decoder_3(y_4 + x_3) + y_2 = self.decoder_2(y_3 + x_2) + y_1 = self.decoder_1(y_2 + x_1) + return y_1 class LinkNet(_LinkNet, keras.Model): @@ -375,24 +367,25 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            num_classes: number of channels for the output """ - _children_names: List[str] = ['feat_extractor', 'fpn', 'classifier', 'postprocessor'] + _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor'] def __init__( self, - feat_extractor: IntermediateLayerGetter, - fpn_channels: int = 64, num_classes: int = 1, - assume_straight_pages: bool = True, + input_shape: Tuple[int, int, int] = (512, 512, 3), + rotated_bbox: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: super().__init__(cfg=cfg) - self.assume_straight_pages = assume_straight_pages + self.rotated_bbox = rotated_bbox - self.feat_extractor = feat_extractor + self.stem = Sequential([ + *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape), + layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'), + ]) - self.fpn = LinkNetFPN(fpn_channels, [_shape[1:] for _shape in self.feat_extractor.output_shape]) - self.fpn.build(self.feat_extractor.output_shape) + self.fpn = LinkNetFPN() self.classifier = Sequential([ layers.Conv2DTranspose( @@ -401,28 +394,30 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            strides=2, padding="same", use_bias=False, - kernel_initializer='he_normal', - input_shape=self.fpn.decoders[-1].output_shape[1:], + kernel_initializer='he_normal' ), layers.BatchNormalization(), layers.Activation('relu'), - *conv_sequence(32, 'relu', True, kernel_size=3, strides=1), + *conv_sequence(32, 'relu', True, strides=1, kernel_size=3), layers.Conv2DTranspose( filters=num_classes, kernel_size=2, strides=2, padding="same", - use_bias=True, + use_bias=False, kernel_initializer='he_normal' ), ]) - self.postprocessor = LinkNetPostProcessor(assume_straight_pages=assume_straight_pages) + self.postprocessor = LinkNetPostProcessor(rotated_bbox=rotated_bbox) def compute_loss( self, out_map: tf.Tensor, - target: List[np.ndarray], + target: List[Dict[str, Any]], + focal_loss: bool = False, + alpha: float = .5, + gamma: float = 2., edge_factor: float = 2., ) -> tf.Tensor: """Compute linknet loss, BCE with boosted box edges or focal loss. Focal loss implementation based on @@ -431,88 +426,94 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            Args: out_map: output feature map of the model of shape N x H x W x 1 target: list of dictionary where each dict has a `boxes` and a `flags` entry + focal_loss: if True, use focal loss instead of BCE edge_factor: boost factor for box edges (in case of BCE) + alpha: balancing factor in the focal loss formula + gammma: modulating factor in the focal loss formula Returns: A loss tensor """ - seg_target, seg_mask, edge_mask = self.build_target(target, out_map.shape[1:3]) - - seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) + seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) + seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) + edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - if edge_factor > 0: - edge_mask = tf.convert_to_tensor(edge_mask, dtype=tf.bool) # Get the cross_entropy for each entry - loss = tf.keras.losses.binary_crossentropy(seg_target, out_map, from_logits=True)[..., None] + bce = tf.keras.losses.binary_crossentropy( + seg_target[seg_mask], + tf.squeeze(out_map, axis=[-1])[seg_mask], + from_logits=True) + + if focal_loss: + if gamma and gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") - # Compute BCE loss with highlighted edges - if edge_factor > 0: + # Convert logits to prob, compute gamma factor + pred_prob = tf.sigmoid(tf.squeeze(out_map, axis=[-1])[seg_mask]) + p_t = (seg_target[seg_mask] * pred_prob) + ((1 - seg_target[seg_mask]) * (1 - pred_prob)) + modulating_factor = tf.pow((1.0 - p_t), gamma) + + # Compute alpha factor + alpha_factor = seg_target[seg_mask] * alpha + (1 - seg_target[seg_mask]) * (1 - alpha) + + # compute the final loss + loss = tf.reduce_mean(alpha_factor * modulating_factor * bce) + + else: + # Compute BCE loss with highlighted edges loss = tf.math.multiply( - 1 + (edge_factor - 1) * tf.cast(edge_mask, out_map.dtype), - loss + 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), + bce ) + loss = tf.reduce_mean(loss) - return tf.reduce_mean(loss[seg_mask]) + return loss def call( self, x: tf.Tensor, - target: Optional[List[np.ndarray]] = None, + target: Optional[List[Dict[str, Any]]] = None, return_model_output: bool = False, - return_preds: bool = False, + return_boxes: bool = False, + focal_loss: bool = True, **kwargs: Any, ) -> Dict[str, Any]: - feat_maps = self.feat_extractor(x, **kwargs) - logits = self.fpn(feat_maps, **kwargs) - logits = self.classifier(logits, **kwargs) + logits = self.stem(x) + logits = self.fpn(logits) + logits = self.classifier(logits) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_preds: + if return_model_output or target is None or return_boxes: prob_map = tf.math.sigmoid(logits) if return_model_output: out["out_map"] = prob_map - if target is None or return_preds: + if target is None or return_boxes: # Post-process boxes - out["preds"] = [preds[0] for preds in self.postprocessor(prob_map.numpy())] + out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) if target is not None: - loss = self.compute_loss(logits, target) + loss = self.compute_loss(logits, target, focal_loss) out['loss'] = loss return out -def _linknet( - arch: str, - pretrained: bool, - backbone_fn, - fpn_layers: List[str], - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any -) -> LinkNet: - - pretrained_backbone = pretrained_backbone and not pretrained +def 
_linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet: # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or default_cfgs[arch]['input_shape'] - - # Feature extractor - feat_extractor = IntermediateLayerGetter( - backbone_fn( - pretrained=pretrained_backbone, - include_top=False, - input_shape=_cfg['input_shape'], - ), - fpn_layers, - ) + _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) + _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + kwargs['num_classes'] = _cfg['num_classes'] + kwargs['input_shape'] = _cfg['input_shape'] + kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model - model = LinkNet(feat_extractor, cfg=_cfg, **kwargs) + model = LinkNet(cfg=_cfg, **kwargs) # Load pretrained parameters if pretrained: load_pretrained_params(model, _cfg['url']) @@ -520,16 +521,16 @@
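A standalone sketch of the focal-loss branch introduced in the hunk above: element-wise binary cross-entropy on logits rescaled by an alpha balancing factor and a (1 - p_t)^gamma modulating factor. This is illustrative only; it uses tf.nn.sigmoid_cross_entropy_with_logits to keep the loss per element, whereas the model code above masks and squeezes the segmentation map first.

import tensorflow as tf

def sigmoid_focal_loss(targets: tf.Tensor, logits: tf.Tensor,
                       alpha: float = 0.5, gamma: float = 2.0) -> tf.Tensor:
    # Element-wise binary cross-entropy computed on raw logits
    bce = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits)
    # p_t is the probability assigned to the true class of each element
    pred_prob = tf.sigmoid(logits)
    p_t = targets * pred_prob + (1 - targets) * (1 - pred_prob)
    modulating_factor = tf.pow(1.0 - p_t, gamma)
    alpha_factor = targets * alpha + (1 - targets) * (1 - alpha)
    return tf.reduce_mean(alpha_factor * modulating_factor * bce)

targets = tf.constant([1.0, 0.0, 1.0])
logits = tf.constant([2.0, -1.0, 0.5])
print(float(sigmoid_focal_loss(targets, logits)))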

                            Source code for doctr.models.detection.linknet.tensorflow

                            return model -
                            -[docs] -def linknet_resnet18(pretrained: bool = False, **kwargs: Any) -> LinkNet: +
                            +[docs] +def linknet16(pretrained: bool = False, **kwargs: Any) -> LinkNet: """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" <https://arxiv.org/pdf/1707.03718.pdf>`_. Example:: >>> import tensorflow as tf - >>> from doctr.models import linknet_resnet18 - >>> model = linknet_resnet18(pretrained=True) + >>> from doctr.models import linknet16 + >>> model = linknet16(pretrained=True) >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) >>> out = model(input_tensor) @@ -540,13 +541,7 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            text detection architecture """ - return _linknet( - 'linknet_resnet18', - pretrained, - resnet18, - ['resnet_block_1', 'resnet_block_3', 'resnet_block_5', 'resnet_block_7'], - **kwargs, - )
                            + return _linknet('linknet16', pretrained, **kwargs)
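Assuming the call() signature shown in this file's hunks, the model returns a dictionary whose keys depend on the arguments: "preds" when no target is passed, "out_map" with return_model_output=True, and "loss" when targets are provided. A hypothetical inference sketch, mirroring the docstring example:

Example::
>>> import tensorflow as tf
>>> from doctr.models import linknet16
>>> model = linknet16(pretrained=False)
>>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_tensor, return_model_output=True)
>>> out["out_map"].shape     # sigmoid probability map
>>> out["preds"]             # post-processed boxes from LinkNetPostProcessor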
                            @@ -580,7 +575,7 @@

                            Source code for doctr.models.detection.linknet.tensorflow

                            +
                            diff --git a/v0.5.0/_modules/doctr/models/detection/zoo.html b/v0.5.0/_modules/doctr/models/detection/zoo.html index cc6d954559..23a2f451e3 100644 --- a/v0.5.0/_modules/doctr/models/detection/zoo.html +++ b/v0.5.0/_modules/doctr/models/detection/zoo.html @@ -226,28 +226,21 @@ @@ -290,44 +283,32 @@

                            Source code for doctr.models.detection.zoo

                             from typing import Any
                             
                             from doctr.file_utils import is_tf_available, is_torch_available
                            -
                            -from .. import detection
                            +from .core import DetectionPredictor
                             from ..preprocessor import PreProcessor
                            -from .predictor import DetectionPredictor
                            +from .. import detection
                            +
                             
                             __all__ = ["detection_predictor"]
                             
                             
                             if is_tf_available():
                            -    ARCHS = ['db_resnet50', 'db_mobilenet_v3_large', 'linknet_resnet18']
                            -    ROT_ARCHS = []
                            +    ARCHS = ['db_resnet50', 'linknet16']
                             elif is_torch_available():
                            -    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3_large', 'linknet_resnet18', 'db_resnet50_rotation']
                            -    ROT_ARCHS = ['db_resnet50_rotation']
                            +    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
                             
                             
                            -def _predictor(
                            -    arch: str,
                            -    pretrained: bool,
                            -    assume_straight_pages: bool = True,
                            -    **kwargs: Any
                            -) -> DetectionPredictor:
                            +def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
                             
                                 if arch not in ARCHS:
                                     raise ValueError(f"unknown architecture '{arch}'")
                             
                            -    if arch not in ROT_ARCHS and not assume_straight_pages:
                            -        raise AssertionError("You are trying to use a model trained on straight pages while not assuming"
                            -                             " your pages are straight. If you have only straight documents, don't pass"
                            -                             f" assume_straight_pages=False, otherwise you should use one of these archs: {ROT_ARCHS}")
                            -
                                 # Detection
                            -    _model = detection.__dict__[arch](pretrained=pretrained, assume_straight_pages=assume_straight_pages)
                            +    _model = detection.__dict__[arch](pretrained=pretrained)
                                 kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
                                 kwargs['std'] = kwargs.get('std', _model.cfg['std'])
                                 kwargs['batch_size'] = kwargs.get('batch_size', 1)
                                 predictor = DetectionPredictor(
                            -        PreProcessor(_model.cfg['input_shape'][:-1] if is_tf_available() else _model.cfg['input_shape'][1:], **kwargs),
                            +        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
                                     _model
                                 )
                                 return predictor
                            @@ -335,31 +316,25 @@ 

                            Source code for doctr.models.detection.zoo

                             
                             
                            [docs] -def detection_predictor( - arch: str = 'db_resnet50', - pretrained: bool = False, - assume_straight_pages: bool = True, - **kwargs: Any -) -> DetectionPredictor: +def detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) -> DetectionPredictor: """Text detection architecture. Example:: >>> import numpy as np >>> from doctr.models import detection_predictor - >>> model = detection_predictor(arch='db_resnet50', pretrained=True) + >>> model = detection_predictor(pretrained=True) >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) >>> out = model([input_page]) Args: - arch: name of the architecture to use (e.g. 'db_resnet50') + arch: name of the architecture to use ('db_resnet50') pretrained: If True, returns a model pre-trained on our text detection dataset - assume_straight_pages: If True, fit straight boxes to the page Returns: Detection predictor """ - return _predictor(arch, pretrained, assume_straight_pages, **kwargs)
                            + return _predictor(arch, pretrained, **kwargs)
                            @@ -393,7 +368,7 @@

                            Source code for doctr.models.detection.zoo

                                   
                                 
                               
                            -
                            +
                            diff --git a/v0.5.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.5.0/_modules/doctr/models/recognition/crnn/tensorflow.html index 975fbde8d0..7b8529c26d 100644 --- a/v0.5.0/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -226,28 +226,21 @@ @@ -288,42 +281,35 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

                            # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details. from copy import deepcopy -from typing import Any, Dict, List, Optional, Tuple - import tensorflow as tf from tensorflow.keras import layers -from tensorflow.keras.models import Model, Sequential - -from doctr.datasets import VOCABS +from tensorflow.keras.models import Sequential, Model +from typing import Tuple, Dict, Any, Optional, List -from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r -from ...utils.tensorflow import load_pretrained_params +from ... import backbones +from ...utils import load_pretrained_params from ..core import RecognitionModel, RecognitionPostProcessor -__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_mobilenet_v3_small', - 'crnn_mobilenet_v3_large'] +__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor'] default_cfgs: Dict[str, Dict[str, Any]] = { 'crnn_vgg16_bn': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), + 'mean': (.5, .5, .5), + 'std': (1., 1., 1.), + 'backbone': 'vgg16_bn', 'rnn_units': 128, 'input_shape': (32, 128, 3), - 'vocab': VOCABS['legacy_french'], - 'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/crnn_vgg16_bn-76b7f2c6.zip', + 'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-' + 'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'), + 'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip', }, - 'crnn_mobilenet_v3_small': { + 'crnn_resnet31': { 'mean': (0.694, 0.695, 0.693), 'std': (0.299, 0.296, 0.301), + 'backbone': 'resnet31', 'rnn_units': 128, 'input_shape': (32, 128, 3), - 'vocab': VOCABS['french'], - 'url': 'https://github.com/mindee/doctr/releases/download/v0.3.1/crnn_mobilenet_v3_small-7f36edec.zip', - }, - 'crnn_mobilenet_v3_large': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'input_shape': (32, 128, 3), - 'vocab': VOCABS['french'], - 'url': None, + 'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-' + 'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'), + 'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip', }, } @@ -422,15 +408,16 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

                            """Compute CTC loss for the model. Args: + gt: the encoded tensor with gt labels model_output: predicted logits of the model - target: lengths of each gt word inside the batch + seq_len: lengths of each gt word inside the batch Returns: The loss of the model on the batch """ - gt, seq_len = self.build_target(target) + gt, seq_len = self.compute_target(target) batch_len = model_output.shape[0] - input_length = tf.fill((batch_len,), model_output.shape[1]) + input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -467,29 +454,23 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

                            return out -def _crnn( - arch: str, - pretrained: bool, - backbone_fn, - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any -) -> CRNN: - - pretrained_backbone = pretrained_backbone and not pretrained - - kwargs['vocab'] = kwargs.get('vocab', default_cfgs[arch]['vocab']) +def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN: + # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['vocab'] = kwargs['vocab'] - _cfg['input_shape'] = input_shape or default_cfgs[arch]['input_shape'] + _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) + _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) - feat_extractor = backbone_fn( + # Feature extractor + feat_extractor = backbones.__dict__[_cfg['backbone']]( input_shape=_cfg['input_shape'], include_top=False, - pretrained=pretrained_backbone, ) + kwargs['vocab'] = _cfg['vocab'] + kwargs['rnn_units'] = _cfg['rnn_units'] + # Build the model model = CRNN(feat_extractor, cfg=_cfg, **kwargs) # Load pretrained parameters @@ -519,20 +500,18 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

                            text recognition architecture """ - return _crnn('crnn_vgg16_bn', pretrained, vgg16_bn_r, **kwargs)
                            + return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
                            -
                            -[docs] -def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based +def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. Example:: >>> import tensorflow as tf - >>> from doctr.models import crnn_mobilenet_v3_small - >>> model = crnn_mobilenet_v3_small(pretrained=True) + >>> from doctr.models import crnn_resnet31 + >>> model = crnn_resnet31(pretrained=True) >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) >>> out = model(input_tensor) @@ -543,32 +522,7 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

                            text recognition architecture """ - return _crnn('crnn_mobilenet_v3_small', pretrained, mobilenet_v3_small_r, **kwargs)
                            - - - -
                            -[docs] -def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import crnn_mobilenet_v3_large - >>> model = crnn_mobilenet_v3_large(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_mobilenet_v3_large', pretrained, mobilenet_v3_large_r, **kwargs)
                            - + return _crnn('crnn_resnet31', pretrained, **kwargs)
                            @@ -601,7 +555,7 @@

                            Source code for doctr.models.recognition.crnn.tensorflow

                            +
                            diff --git a/v0.5.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.5.0/_modules/doctr/models/recognition/master/tensorflow.html index 30dde4ac88..6d9bff4577 100644 --- a/v0.5.0/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/recognition/master/tensorflow.html @@ -226,28 +226,21 @@ @@ -287,62 +280,197 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            # This program is licensed under the Apache License version 2. # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details. -from copy import deepcopy -from typing import Any, Dict, List, Optional, Tuple - import tensorflow as tf -from tensorflow.keras import Model, layers - -from doctr.datasets import VOCABS -from doctr.models.classification import magc_resnet31 +from tensorflow.keras import layers, Sequential, Model +from typing import Tuple, List, Dict, Any, Optional +from copy import deepcopy -from ...utils.tensorflow import load_pretrained_params -from ..transformer.tensorflow import Decoder, create_look_ahead_mask, create_padding_mask, positional_encoding +from ..core import RecognitionPostProcessor +from ...backbones.resnet import ResnetStage +from ...utils import conv_sequence, load_pretrained_params +from ..transformer import Decoder, positional_encoding, create_look_ahead_mask, create_padding_mask +from ....datasets import VOCABS from .base import _MASTER, _MASTERPostProcessor -__all__ = ['MASTER', 'master'] + +__all__ = ['MASTER', 'master', 'MASTERPostProcessor'] default_cfgs: Dict[str, Dict[str, Any]] = { 'master': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'input_shape': (32, 128, 3), - 'vocab': VOCABS['legacy_french'], - 'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/master-bade6eae.zip', + 'mean': (.5, .5, .5), + 'std': (1., 1., 1.), + 'input_shape': (48, 160, 3), + 'vocab': VOCABS['french'], + 'url': None, }, } +class MAGC(layers.Layer): + + """Implements the Multi-Aspect Global Context Attention, as described in + <https://arxiv.org/pdf/1910.02562.pdf>`_. + + Args: + inplanes: input channels + headers: number of headers to split channels + att_scale: if True, re-scale attention to counteract the variance distibutions + **kwargs + """ + + def __init__( + self, + inplanes: int, + headers: int = 1, + att_scale: bool = False, + **kwargs + ) -> None: + super().__init__(**kwargs) + + self.headers = headers # h + self.inplanes = inplanes # C + self.att_scale = att_scale + + self.single_header_inplanes = int(inplanes / headers) # C / h + + self.conv_mask = tf.keras.layers.Conv2D( + filters=1, + kernel_size=1, + kernel_initializer=tf.initializers.he_normal() + ) + + self.transform = tf.keras.Sequential( + [ + tf.keras.layers.Conv2D( + filters=self.inplanes, + kernel_size=1, + kernel_initializer=tf.initializers.he_normal() + ), + tf.keras.layers.LayerNormalization([1, 2, 3]), + tf.keras.layers.ReLU(), + tf.keras.layers.Conv2D( + filters=self.inplanes, + kernel_size=1, + kernel_initializer=tf.initializers.he_normal() + ), + ], + name='transform' + ) + + @tf.function + def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor: + b, h, w, c = (tf.shape(inputs)[i] for i in range(4)) + + # B, H, W, C -->> B*h, H, W, C/h + x = tf.reshape(inputs, shape=(b, h, w, self.headers, self.single_header_inplanes)) + x = tf.transpose(x, perm=(0, 3, 1, 2, 4)) + x = tf.reshape(x, shape=(b * self.headers, h, w, self.single_header_inplanes)) + + # Compute shorcut + shortcut = x + # B*h, 1, H*W, C/h + shortcut = tf.reshape(shortcut, shape=(b * self.headers, 1, h * w, self.single_header_inplanes)) + # B*h, 1, C/h, H*W + shortcut = tf.transpose(shortcut, perm=[0, 1, 3, 2]) + + # Compute context mask + # B*h, H, W, 1, + context_mask = self.conv_mask(x) + # B*h, 1, H*W, 1 + context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1)) + # scale variance + if self.att_scale and 
self.headers > 1: + context_mask = context_mask / tf.sqrt(self.single_header_inplanes) + # B*h, 1, H*W, 1 + context_mask = tf.keras.activations.softmax(context_mask, axis=2) + + # Compute context + # B*h, 1, C/h, 1 + context = tf.matmul(shortcut, context_mask) + context = tf.reshape(context, shape=(b, 1, c, 1)) + # B, 1, 1, C + context = tf.transpose(context, perm=(0, 1, 3, 2)) + # Set shape to resolve shape when calling this module in the Sequential MAGCResnet + batch, chan = inputs.get_shape().as_list()[0], inputs.get_shape().as_list()[-1] + context.set_shape([batch, 1, 1, chan]) + return context + + def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor: + # Context modeling: B, H, W, C -> B, 1, 1, C + context = self.context_modeling(inputs) + # Transform: B, 1, 1, C -> B, 1, 1, C + transformed = self.transform(context) + return inputs + transformed + + +class MAGCResnet(Sequential): + + """Implements the modified resnet with MAGC layers, as described in paper. + + Args: + headers: number of header to split channels in MAGC layers + input_shape: shape of the model input (without batch dim) + """ + + def __init__( + self, + headers: int = 1, + input_shape: Tuple[int, int, int] = (48, 160, 3), + ) -> None: + _layers = [ + # conv_1x + *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape), + *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3), + layers.MaxPooling2D((2, 2), (2, 2)), + # conv_2x + ResnetStage(num_blocks=1, output_channels=256), + MAGC(inplanes=256, headers=headers, att_scale=True), + *conv_sequence(out_channels=256, activation='relu', bn=True, kernel_size=3), + layers.MaxPooling2D((2, 2), (2, 2)), + # conv_3x + ResnetStage(num_blocks=2, output_channels=512), + MAGC(inplanes=512, headers=headers, att_scale=True), + *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3), + layers.MaxPooling2D((2, 1), (2, 1)), + # conv_4x + ResnetStage(num_blocks=5, output_channels=512), + MAGC(inplanes=512, headers=headers, att_scale=True), + *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3), + # conv_5x + ResnetStage(num_blocks=3, output_channels=512), + MAGC(inplanes=512, headers=headers, att_scale=True), + *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3), + ] + super().__init__(_layers) + + class MASTER(_MASTER, Model): """Implements MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_. Implementation based on the official TF implementation: <https://github.com/jiangxiluning/MASTER-TF>`_. 
Args: - feature_extractor: the backbone serving as feature extractor vocab: vocabulary, (without EOS, SOS, PAD) d_model: d parameter for the transformer decoder + headers: headers for the MAGC module dff: depth of the pointwise feed-forward layer num_heads: number of heads for the mutli-head attention module num_layers: number of decoder layers to stack max_length: maximum length of character sequence handled by the model - dropout: dropout probability of the decoder - input_shape: size of the image inputs - cfg: dictionary containing information about the model + input_size: size of the image inputs """ def __init__( self, - feature_extractor: tf.keras.Model, vocab: str, d_model: int = 512, + headers: int = 1, dff: int = 2048, - num_heads: int = 8, # number of heads in the transformer decoder + num_heads: int = 8, num_layers: int = 3, max_length: int = 50, - dropout: float = 0.2, - input_shape: Tuple[int, int, int] = (32, 128, 3), + input_shape: Tuple[int, int, int] = (48, 160, 3), cfg: Optional[Dict[str, Any]] = None, ) -> None: super().__init__() @@ -352,7 +480,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            self.cfg = cfg self.vocab_size = len(vocab) - self.feat_extractor = feature_extractor + self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape) self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model) # 3 more classes: EOS/PAD/SOS self.decoder = Decoder( @@ -362,21 +490,21 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            dff=dff, vocab_size=self.vocab_size, maximum_position_encoding=max_length, - dropout=dropout, ) self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model) self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform()) self.postprocessor = MASTERPostProcessor(vocab=self.vocab) + @tf.function def make_mask(self, target: tf.Tensor) -> tf.Tensor: look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1]) target_padding_mask = create_padding_mask(target, self.vocab_size + 2) # Pad symbol combined_mask = tf.maximum(target_padding_mask, look_ahead_mask) return combined_mask - @staticmethod def compute_loss( + self, model_output: tf.Tensor, gt: tf.Tensor, seq_len: List[int], @@ -405,7 +533,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len - 1) # delete the last mask timestep as well masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) return tf.expand_dims(ce_loss, axis=1) @@ -430,16 +558,16 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            """ # Encode - feature = self.feat_extractor(x, **kwargs) + feature = self.feature_extractor(x, **kwargs) b, h, w, c = (tf.shape(feature)[i] for i in range(4)) feature = tf.reshape(feature, shape=(b, h * w, c)) - encoded = feature + tf.cast(self.feature_pe[:, :h * w, :], dtype=feature.dtype) + encoded = feature + self.feature_pe[:, :h * w, :] out: Dict[str, tf.Tensor] = {} if target is not None: # Compute target: tensor of gts and sequence lengths - gt, seq_len = self.build_target(target) + gt, seq_len = self.compute_target(target) if kwargs.get('training', False): if target is None: @@ -484,7 +612,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            start_vector = tf.fill(dims=(b, 1), value=start_symbol) ys = tf.concat([start_vector, ys], axis=-1) - logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=encoded.dtype) # 3 symbols + logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32) # 3 symbols # max_len = len + 2 (sos + eos) for i in range(self.max_length - 1): ys_mask = self.make_mask(ys) @@ -504,7 +632,6 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            class MASTERPostProcessor(_MASTERPostProcessor): """Post processor for MASTER architectures - Args: vocab: string containing the ordered sequence of supported characters ignore_case: if True, ignore case of letters @@ -533,30 +660,17 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            return list(zip(word_values, probs.numpy().tolist())) -def _master( - arch: str, - pretrained: bool, - backbone_fn, - pretrained_backbone: bool = True, - **kwargs: Any -) -> MASTER: - - pretrained_backbone = pretrained_backbone and not pretrained +def _master(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> MASTER: # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = kwargs.get('input_shape', _cfg['input_shape']) + _cfg['input_shape'] = input_shape or _cfg['input_shape'] _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) kwargs['vocab'] = _cfg['vocab'] - kwargs['input_shape'] = _cfg['input_shape'] # Build the model - model = MASTER( - backbone_fn(pretrained=pretrained_backbone, input_shape=_cfg['input_shape'], include_top=False), - cfg=_cfg, - **kwargs, - ) + model = MASTER(cfg=_cfg, **kwargs) # Load pretrained parameters if pretrained: load_pretrained_params(model, default_cfgs[arch]['url']) @@ -568,22 +682,19 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            [docs] def master(pretrained: bool = False, **kwargs: Any) -> MASTER: """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_. - Example:: >>> import tensorflow as tf >>> from doctr.models import master >>> model = master(pretrained=False) >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32) >>> out = model(input_tensor) - Args: pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - Returns: text recognition architecture """ - return _master('master', pretrained, magc_resnet31, **kwargs)
                            + return _master('master', pretrained, **kwargs)
                            @@ -617,7 +728,7 @@

                            Source code for doctr.models.recognition.master.tensorflow

                            +
                            diff --git a/v0.5.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.5.0/_modules/doctr/models/recognition/parseq/tensorflow.html index 1bbbf829b1..93a3b2ea81 100644 --- a/v0.5.0/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -305,7 +305,7 @@

                            Source code for doctr.models.recognition.parseq.tensorflow

                            import numpy as np import tensorflow as tf -from keras import Model, layers +from tensorflow.keras import Model, layers from doctr.datasets import VOCABS from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward @@ -462,7 +462,6 @@

                            Source code for doctr.models.recognition.parseq.tensorflow

                            self.postprocessor = PARSeqPostProcessor(vocab=self.vocab) - @tf.function def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor: # Generates permutations of the target sequence. # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py @@ -509,7 +508,6 @@

                            Source code for doctr.models.recognition.parseq.tensorflow

                            ) return combined - @tf.function def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: # Generate source and target mask for the decoder attention. sz = permutation.shape[0] @@ -529,7 +527,6 @@

                            Source code for doctr.models.recognition.parseq.tensorflow

                            target_mask = mask[1:, :-1] return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool) - @tf.function def decode( self, target: tf.Tensor, diff --git a/v0.5.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.5.0/_modules/doctr/models/recognition/sar/tensorflow.html index 189ae983b7..3a9989ef30 100644 --- a/v0.5.0/_modules/doctr/models/recognition/sar/tensorflow.html +++ b/v0.5.0/_modules/doctr/models/recognition/sar/tensorflow.html @@ -226,28 +226,21 @@ @@ -288,27 +281,35 @@

                            Source code for doctr.models.recognition.sar.tensorflow

 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.

 from copy import deepcopy
-from typing import Any, Dict, List, Optional, Tuple
-
 import tensorflow as tf
-from tensorflow.keras import Model, Sequential, layers
+from tensorflow.keras import Sequential, layers, Model
+from typing import Tuple, Dict, List, Any, Optional

-from doctr.datasets import VOCABS
-from doctr.utils.repr import NestedObject
-
-from ...classification import resnet31
-from ...utils.tensorflow import load_pretrained_params
+from ... import backbones
+from ...utils import load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
+from doctr.utils.repr import NestedObject

-__all__ = ['SAR', 'sar_resnet31']
+__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']

 default_cfgs: Dict[str, Dict[str, Any]] = {
+    'sar_vgg16_bn': {
+        'mean': (.5, .5, .5),
+        'std': (1., 1., 1.),
+        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
+        'input_shape': (32, 128, 3),
+        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
+                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
+        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
+    },
     'sar_resnet31': {
-        'mean': (0.694, 0.695, 0.693),
-        'std': (0.299, 0.296, 0.301),
+        'mean': (.5, .5, .5),
+        'std': (1., 1., 1.),
+        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
         'input_shape': (32, 128, 3),
-        'vocab': VOCABS['legacy_french'],
-        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/sar_resnet31-9ee49970.zip',
+        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
+                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
+        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
     },
 }
@@ -389,7 +390,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         super().__init__()
         self.vocab_size = vocab_size
         self.lstm_decoder = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_layers)]
+            [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)]
         )
         self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1))
         self.attention_module = AttentionModule(attention_units)
@@ -410,12 +411,12 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         # initialize states (each of shape (N, rnn_units))
         states = self.lstm_decoder.get_initial_state(
-            inputs=None, batch_size=features.shape[0], dtype=features.dtype
+            inputs=None, batch_size=features.shape[0], dtype=tf.float32
         )
         # run first step of lstm
         # holistic: shape (N, rnn_units)
         _, states = self.lstm_decoder(holistic, states, **kwargs)
-        # Initialize with the index of virtual START symbol (placed after <eos> so that the one-hot is only zeros)
+        # Initialize with the index of virtual START symbol (placed after <eos>)
         symbol = tf.fill(features.shape[0], self.vocab_size + 1)
         logits_list = []
         if kwargs.get('training') and gt is None:
@@ -496,8 +497,8 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         self.postprocessor = SARPostProcessor(vocab=vocab)

-    @staticmethod
     def compute_loss(
+        self,
         model_output: tf.Tensor,
         gt: tf.Tensor,
         seq_len: tf.Tensor,
@@ -525,7 +526,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         mask_values = tf.zeros_like(cce)
         mask_2d = tf.sequence_mask(seq_len, input_len)
         masked_loss = tf.where(mask_2d, cce, mask_values)
-        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype))
+        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))

         return tf.expand_dims(ce_loss, axis=1)

     def call(
@@ -541,7 +542,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

         pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
         encoded = self.encoder(pooled_features, **kwargs)
         if target is not None:
-            gt, seq_len = self.build_target(target)
+            gt, seq_len = self.compute_target(target)
             seq_len = tf.cast(seq_len, tf.int32)
         decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
@@ -590,30 +591,30 @@

                            Source code for doctr.models.recognition.sar.tensorflow

                            return list(zip(word_values, probs.numpy().tolist()))


-def _sar(
-    arch: str,
-    pretrained: bool,
-    backbone_fn,
-    pretrained_backbone: bool = True,
-    input_shape: Optional[Tuple[int, int, int]] = None,
-    **kwargs: Any
-) -> SAR:
-
-    pretrained_backbone = pretrained_backbone and not pretrained
+def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR:

     # Patch the config
     _cfg = deepcopy(default_cfgs[arch])
     _cfg['input_shape'] = input_shape or _cfg['input_shape']
     _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])
+    _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units'])
+    _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units'])
+    _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units'])
+    _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length'])
+    _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders'])

     # Feature extractor
-    feat_extractor = backbone_fn(
-        pretrained=pretrained_backbone,
+    feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']](
         input_shape=_cfg['input_shape'],
         include_top=False,
     )

     kwargs['vocab'] = _cfg['vocab']
+    kwargs['rnn_units'] = _cfg['rnn_units']
+    kwargs['embedding_units'] = _cfg['embedding_units']
+    kwargs['attention_units'] = _cfg['attention_units']
+    kwargs['max_length'] = _cfg['max_length']
+    kwargs['num_decoders'] = _cfg['num_decoders']

     # Build the model
     model = SAR(feat_extractor, cfg=_cfg, **kwargs)
@@ -624,6 +625,30 @@

                            Source code for doctr.models.recognition.sar.tensorflow

                            return model +
+[docs]
+def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR:
+    """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong
+    Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
+
+    Example::
+        >>> import tensorflow as tf
+        >>> from doctr.models import sar_vgg16_bn
+        >>> model = sar_vgg16_bn(pretrained=False)
+        >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
+        >>> out = model(input_tensor)
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+
+    Returns:
+        text recognition architecture
+    """
+
+    return _sar('sar_vgg16_bn', pretrained, **kwargs)
+
+
+
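In `_sar` above, every architecture hyper-parameter is routed through the config: a keyword argument overrides the default, and the resolved value is written back into `kwargs` before the constructor is called. A condensed, standalone sketch of that round-trip (dictionary contents are illustrative, not doctr's):

    from copy import deepcopy

    defaults = {'rnn_units': 512, 'max_length': 30, 'num_decoders': 2}

    def resolve(**kwargs):
        cfg = deepcopy(defaults)
        for key in defaults:
            cfg[key] = kwargs.get(key, cfg[key])  # caller value wins over the default
        kwargs.update(cfg)                        # constructor then receives the resolved values
        return cfg, kwargs

    print(resolve(rnn_units=256))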
                            [docs] def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: @@ -644,7 +669,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

        text recognition architecture
    """

-    return _sar('sar_resnet31', pretrained, resnet31, **kwargs)
+    return _sar('sar_resnet31', pretrained, **kwargs)
                            @@ -678,7 +703,7 @@

                            Source code for doctr.models.recognition.sar.tensorflow

                            +
diff --git a/v0.5.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.5.0/_modules/doctr/models/recognition/vitstr/tensorflow.html
index 23730f6227..aecde3662a 100644
--- a/v0.5.0/_modules/doctr/models/recognition/vitstr/tensorflow.html
+++ b/v0.5.0/_modules/doctr/models/recognition/vitstr/tensorflow.html
@@ -302,7 +302,7 @@

                            Source code for doctr.models.recognition.vitstr.tensorflow

 from typing import Any, Dict, List, Optional, Tuple

 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers

 from doctr.datasets import VOCABS
diff --git a/v0.5.0/_modules/doctr/models/recognition/zoo.html b/v0.5.0/_modules/doctr/models/recognition/zoo.html
index 84482d3f87..0f1bff8861 100644
--- a/v0.5.0/_modules/doctr/models/recognition/zoo.html
+++ b/v0.5.0/_modules/doctr/models/recognition/zoo.html
@@ -226,28 +226,21 @@
@@ -289,16 +282,19 @@

                            Source code for doctr.models.recognition.zoo

                            from typing import Any
                             
                            -from doctr.file_utils import is_tf_available
                            -from doctr.models.preprocessor import PreProcessor
                            -
                            +from doctr.file_utils import is_tf_available, is_torch_available
                            +from .core import RecognitionPredictor
                            +from ..preprocessor import PreProcessor
                             from .. import recognition
                            -from .predictor import RecognitionPredictor
                            +
                             
                             __all__ = ["recognition_predictor"]
                             
                             
                            -ARCHS = ['crnn_vgg16_bn', 'crnn_mobilenet_v3_small', 'crnn_mobilenet_v3_large', 'sar_resnet31', 'master']
                            +if is_tf_available():
                            +    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
                            +elif is_torch_available():
                            +    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
                             
                             
                             def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
                            @@ -310,9 +306,8 @@ 

                            Source code for doctr.models.recognition.zoo

                            kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
                                 kwargs['std'] = kwargs.get('std', _model.cfg['std'])
                                 kwargs['batch_size'] = kwargs.get('batch_size', 32)
                            -    input_shape = _model.cfg['input_shape'][:2] if is_tf_available() else _model.cfg['input_shape'][-2:]
                                 predictor = RecognitionPredictor(
                            -        PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs),
                            +        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
                                     _model
                                 )
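The predictor built here is a thin composition: a `PreProcessor` that resizes and normalizes crops to the model's `input_shape`, followed by the recognition model itself. A hedged usage sketch of the resulting public entry point (mirroring the docstring below; availability of pretrained weights depends on the installed release):

    import numpy as np
    from doctr.models import recognition_predictor

    predictor = recognition_predictor('crnn_vgg16_bn', pretrained=True)
    crop = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)  # a single word crop
    out = predictor([crop])  # recognized text for each crop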
                             
                            @@ -332,7 +327,7 @@ 

                            Source code for doctr.models.recognition.zoo

                                    >>> out = model([input_page])
                             
                                 Args:
                            -        arch: name of the architecture to use (e.g. 'crnn_vgg16_bn')
                            +        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
                                     pretrained: If True, returns a model pre-trained on our text recognition dataset
                             
                                 Returns:
                            @@ -373,7 +368,7 @@ 

                            Source code for doctr.models.recognition.zoo

                               
                            -
                            +
                            diff --git a/v0.5.0/_modules/doctr/models/zoo.html b/v0.5.0/_modules/doctr/models/zoo.html index b359dea4f2..bfa5a6fdf4 100644 --- a/v0.5.0/_modules/doctr/models/zoo.html +++ b/v0.5.0/_modules/doctr/models/zoo.html @@ -226,28 +226,16 @@ @@ -288,54 +276,31 @@

                            Source code for doctr.models.zoo

                             # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                             
                             from typing import Any
                            -
                            +from .core import OCRPredictor
                             from .detection.zoo import detection_predictor
                            -from .predictor import OCRPredictor
                             from .recognition.zoo import recognition_predictor
                             
                            +
                             __all__ = ["ocr_predictor"]
                             
                             
                            -def _predictor(
                            -    det_arch: str,
                            -    reco_arch: str,
                            -    pretrained: bool,
                            -    assume_straight_pages: bool = True,
                            -    preserve_aspect_ratio: bool = False,
                            -    det_bs: int = 2,
                            -    reco_bs: int = 128,
                            -    **kwargs,
                            -) -> OCRPredictor:
                            +def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
                             
                                 # Detection
                            -    det_predictor = detection_predictor(
                            -        det_arch,
                            -        pretrained=pretrained,
                            -        batch_size=det_bs,
                            -        assume_straight_pages=assume_straight_pages,
                            -        preserve_aspect_ratio=preserve_aspect_ratio,
                            -    )
                            +    det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
                             
                                 # Recognition
                                 reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
                             
                            -    return OCRPredictor(
                            -        det_predictor,
                            -        reco_predictor,
                            -        assume_straight_pages=assume_straight_pages,
                            -        **kwargs
                            -    )
                            +    return OCRPredictor(det_predictor, reco_predictor)
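The end-to-end predictor is just the two stages above chained together. A sketch of the equivalent manual wiring, using the import paths shown in this version of the module (batch sizes follow the defaults above):

    from doctr.models.core import OCRPredictor
    from doctr.models.detection.zoo import detection_predictor
    from doctr.models.recognition.zoo import recognition_predictor

    det_predictor = detection_predictor('db_resnet50', pretrained=True, batch_size=2)
    reco_predictor = recognition_predictor('crnn_vgg16_bn', pretrained=True, batch_size=128)
    predictor = OCRPredictor(det_predictor, reco_predictor)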
                             
                             
                             
-[docs]
+[docs]
 def ocr_predictor(
     det_arch: str = 'db_resnet50',
     reco_arch: str = 'crnn_vgg16_bn',
     pretrained: bool = False,
-    assume_straight_pages: bool = True,
-    export_as_straight_boxes: bool = False,
-    preserve_aspect_ratio: bool = False,
     **kwargs: Any
 ) -> OCRPredictor:
     """End-to-end OCR architecture using one model for localization, and another for text recognition.
@@ -343,34 +308,19 @@

                            Source code for doctr.models.zoo

                                 Example::
                                     >>> import numpy as np
                                     >>> from doctr.models import ocr_predictor
                            -        >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
                            +        >>> model = ocr_predictor(pretrained=True)
                                     >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
                                     >>> out = model([input_page])
                             
                                 Args:
                            -        det_arch: name of the detection architecture to use (e.g. 'db_resnet50', 'db_mobilenet_v3_large')
                            -        reco_arch: name of the recognition architecture to use (e.g. 'crnn_vgg16_bn', 'sar_resnet31')
                            +        arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet')
                                     pretrained: If True, returns a model pre-trained on our OCR dataset
                            -        assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages
                            -            without rotated textual elements.
                            -        export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions
                            -            (potentially rotated) as straight bounding boxes.
                            -        preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before
                            -            running the detection model on it.
                             
                                 Returns:
                                     OCR predictor
                                 """
                             
                            -    return _predictor(
                            -        det_arch,
                            -        reco_arch,
                            -        pretrained,
                            -        assume_straight_pages=assume_straight_pages,
                            -        export_as_straight_boxes=export_as_straight_boxes,
                            -        preserve_aspect_ratio=preserve_aspect_ratio,
                            -        **kwargs,
                            -    )
                            + return _predictor(det_arch, reco_arch, pretrained, **kwargs)
                            @@ -404,7 +354,7 @@

                            Source code for doctr.models.zoo

                                   
                                 
                               
                            -
                            +
diff --git a/v0.5.0/_modules/doctr/transforms/modules/base.html b/v0.5.0/_modules/doctr/transforms/modules/base.html
index 1beac0790e..e7b5ea10d9 100644
--- a/v0.5.0/_modules/doctr/transforms/modules/base.html
+++ b/v0.5.0/_modules/doctr/transforms/modules/base.html
@@ -226,28 +226,21 @@
@@ -287,62 +280,14 @@

                            Source code for doctr.transforms.modules.base

                            # This program is licensed under the Apache License version 2. # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details. -import math import random -from typing import Any, Callable, Dict, List, Tuple - -import numpy as np +from typing import List, Any, Callable from doctr.utils.repr import NestedObject - from .. import functional as F -__all__ = ['SampleCompose', 'ImageTransform', 'ColorInversion', 'OneOf', 'RandomApply', 'RandomRotate', 'RandomCrop'] - -class SampleCompose(NestedObject): - """Implements a wrapper that will apply transformations sequentially on both image and target - Example:: - >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate - >>> import tensorflow as tf - >>> import numpy as np - >>> transfos = SampleCompose([ImageTransform(ColorInversion((32, 32))), RandomRotate(30)]) - >>> out, out_boxes = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.zeros((2, 4))) - Args: - transforms: list of transformation modules - """ - - _children_names: List[str] = ['sample_transforms'] - - def __init__(self, transforms: List[Callable[[Any, Any], Tuple[Any, Any]]]) -> None: - self.sample_transforms = transforms - - def __call__(self, x: Any, target: Any) -> Tuple[Any, Any]: - for t in self.sample_transforms: - x, target = t(x, target) - - return x, target - - -class ImageTransform(NestedObject): - """Implements a transform wrapper to turn an image-only transformation into an image+target transform - Example:: - >>> from doctr.transforms import ImageTransform, ColorInversion - >>> import tensorflow as tf - >>> transfo = ImageTransform(ColorInversion((32, 32))) - >>> out, _ = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), None) - Args: - transform: the image transformation module to wrap - """ - - _children_names: List[str] = ['img_transform'] - - def __init__(self, transform: Callable[[Any], Any]) -> None: - self.img_transform = transform - - def __call__(self, img: Any, target: Any) -> Tuple[Any, Any]: - img = self.img_transform(img) - return img, target +__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
                            @@ -426,66 +371,6 @@

                            Source code for doctr.transforms.modules.base

             return self.transform(img)
         return img
                            - - -
                            -[docs] -class RandomRotate(NestedObject): - """Randomly rotate a tensor image and its boxes - - .. image:: https://github.com/mindee/doctr/releases/download/v0.4.0/rotation_illustration.png - :align: center - - Args: - max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in - [-max_angle, max_angle] - expand: whether the image should be padded before the rotation - """ - def __init__(self, max_angle: float = 5., expand: bool = False) -> None: - self.max_angle = max_angle - self.expand = expand - - def extra_repr(self) -> str: - return f"max_angle={self.max_angle}, expand={self.expand}" - - def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: - angle = random.uniform(-self.max_angle, self.max_angle) - r_img, r_boxes = F.rotate(img, target, angle, self.expand) - return r_img, r_boxes
                            - - - -
                            -[docs] -class RandomCrop(NestedObject): - """Randomly crop a tensor image and its boxes - - Args: - scale: tuple of floats, relative (min_area, max_area) of the crop - ratio: tuple of float, relative (min_ratio, max_ratio) where ratio = h/w - """ - def __init__(self, scale: Tuple[float, float] = (0.08, 1.), ratio: Tuple[float, float] = (0.75, 1.33)) -> None: - self.scale = scale - self.ratio = ratio - - def extra_repr(self) -> str: - return f"scale={self.scale}, ratio={self.ratio}" - - def __call__(self, img: Any, target: Dict[str, np.ndarray]) -> Tuple[Any, Dict[str, np.ndarray]]: - scale = random.uniform(self.scale[0], self.scale[1]) - ratio = random.uniform(self.ratio[0], self.ratio[1]) - # Those might overflow - crop_h = math.sqrt(scale * ratio) - crop_w = math.sqrt(scale / ratio) - xmin, ymin = random.uniform(0, 1 - crop_w), random.uniform(0, 1 - crop_h) - xmax, ymax = xmin + crop_w, ymin + crop_h - # Clip them - xmin, ymin = max(xmin, 0), max(ymin, 0) - xmax, ymax = min(xmax, 1), min(ymax, 1) - - croped_img, crop_boxes = F.crop_detection(img, target["boxes"], (xmin, ymin, xmax, ymax)) - return croped_img, dict(boxes=crop_boxes)
                            -
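The removed `RandomCrop` draws a relative area (`scale`) and aspect ratio (`ratio`), turns them into a crop window in relative coordinates, and clips it to the unit square. A standalone sketch of that geometry, with no doctr dependency:

    import math
    import random

    def random_crop_box(scale=(0.08, 1.0), ratio=(0.75, 1.33)):
        area = random.uniform(*scale)
        aspect = random.uniform(*ratio)      # aspect = h / w
        crop_h = math.sqrt(area * aspect)    # these may overflow the unit square
        crop_w = math.sqrt(area / aspect)
        xmin = random.uniform(0, max(0.0, 1 - crop_w))
        ymin = random.uniform(0, max(0.0, 1 - crop_h))
        # clip so the box stays inside [0, 1] x [0, 1]
        return xmin, ymin, min(xmin + crop_w, 1.0), min(ymin + crop_h, 1.0)

    print(random_crop_box())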
                            @@ -518,7 +403,7 @@

                            Source code for doctr.transforms.modules.base

                            -
                            +
diff --git a/v0.5.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.5.0/_modules/doctr/transforms/modules/tensorflow.html
index c641ce9879..51b31b4fc4 100644
--- a/v0.5.0/_modules/doctr/transforms/modules/tensorflow.html
+++ b/v0.5.0/_modules/doctr/transforms/modules/tensorflow.html
@@ -226,28 +226,21 @@
@@ -288,16 +281,14 @@

                            Source code for doctr.transforms.modules.tensorflow

 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.

 import random
-from typing import Any, Callable, Iterable, List, Tuple, Union
-
 import tensorflow as tf
-import tensorflow_addons as tfa
+from typing import List, Any, Tuple, Callable

 from doctr.utils.repr import NestedObject
+

 __all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'RandomBrightness',
-           'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality', 'GaussianBlur',
-           'ChannelShuffle', 'GaussianNoise']
+           'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality']
                            @@ -364,7 +355,6 @@

                            Source code for doctr.transforms.modules.tensorflow

         return _repr

     def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        input_dtype = img.dtype
         img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio)
         if self.preserve_aspect_ratio:
             # pad width
@@ -375,7 +365,7 @@

                            Source code for doctr.transforms.modules.tensorflow

             else:
                 offset = (int((self.output_size[0] - img.shape[0]) / 2), 0)
             img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size)
-        return tf.cast(img, dtype=input_dtype)
+        return img
                            @@ -395,15 +385,15 @@

                            Source code for doctr.transforms.modules.tensorflow

         std: standard deviation per channel
     """
     def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None:
-        self.mean = tf.constant(mean)
-        self.std = tf.constant(std)
+        self.mean = tf.constant(mean, dtype=tf.float32)
+        self.std = tf.constant(std, dtype=tf.float32)

     def extra_repr(self) -> str:
         return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}"

     def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        img -= tf.cast(self.mean, dtype=img.dtype)
-        img /= tf.cast(self.std, dtype=img.dtype)
+        img -= self.mean
+        img /= self.std
         return img
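Normalization here is a per-channel affine rescaling broadcast over the last axis. A minimal standalone check of what the transform computes (values are illustrative):

    import tensorflow as tf

    mean = tf.constant((0.5, 0.5, 0.5), dtype=tf.float32)
    std = tf.constant((1.0, 1.0, 1.0), dtype=tf.float32)

    img = tf.random.uniform(shape=[32, 128, 3], minval=0, maxval=1)
    normalized = (img - mean) / std  # broadcast over the channel axis
    print(normalized.shape)          # (32, 128, 3)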
                            @@ -441,12 +431,8 @@

                            Source code for doctr.transforms.modules.tensorflow

         >>> transfo = ToGray()
         >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
     """
-    def __init__(self, num_output_channels: int = 1):
-        self.num_output_channels = num_output_channels
-
     def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        img = tf.image.rgb_to_grayscale(img)
-        return img if self.num_output_channels == 1 else tf.repeat(img, self.num_output_channels, axis=-1)
+        return tf.image.rgb_to_grayscale(img)
                            @@ -621,86 +607,6 @@

                            Source code for doctr.transforms.modules.tensorflow

             img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality
         )
                            - - -
                            -[docs] -class GaussianBlur(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - Example:: - >>> from doctr.transforms import GaussianBlur - >>> import tensorflow as tf - >>> transfo = GaussianBlur(3, (.1, 5)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - kernel_shape: size of the blurring kernel - std: min and max value of the standard deviation - """ - def __init__(self, kernel_shape: Union[int, Iterable[int]], std: Tuple[float, float]) -> None: - self.kernel_shape = kernel_shape - self.std = std - - def extra_repr(self) -> str: - return f"kernel_shape={self.kernel_shape}, std={self.std}" - - @tf.function - def __call__(self, img: tf.Tensor) -> tf.Tensor: - sigma = random.uniform(self.std[0], self.std[1]) - return tfa.image.gaussian_filter2d( - img, filter_shape=self.kernel_shape, sigma=sigma, - )
                            - - - -
                            -[docs] -class ChannelShuffle(NestedObject): - """Randomly shuffle channel order of a given image""" - - def __init__(self): - pass - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.transpose(tf.random.shuffle(tf.transpose(img, perm=[2, 0, 1])), perm=[1, 2, 0])
                            - - - -
                            -[docs] -class GaussianNoise(NestedObject): - """Adds Gaussian Noise to the input tensor - - Example:: - >>> from doctr.transforms import GaussianNoise - >>> import tensorflow as tf - >>> transfo = GaussianNoise(0., 1.) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - mean : mean of the gaussian distribution - std : std of the gaussian distribution - """ - def __init__(self, mean: float = 0., std: float = 1.) -> None: - super().__init__() - self.std = std - self.mean = mean - - def __call__(self, x: tf.Tensor) -> tf.Tensor: - # Reshape the distribution - noise = self.mean + 2 * self.std * tf.random.uniform(x.shape) - self.std - if x.dtype == tf.uint8: - return tf.cast( - tf.clip_by_value(tf.math.round(tf.cast(x, dtype=tf.float32) + 255 * noise), 0, 255), - dtype=tf.uint8 - ) - else: - return tf.cast(tf.clip_by_value(x + noise, 0, 1), dtype=x.dtype) - - def extra_repr(self) -> str: - return f"mean={self.mean}, std={self.std}"
                            -
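The removed `GaussianNoise` perturbs the input and clips it back to a valid range. A simplified standalone sketch using `tf.random.normal` (the original class uses a uniform-based perturbation and also handles uint8 inputs; this float-only variant is purely illustrative):

    import tensorflow as tf

    def add_noise(img: tf.Tensor, mean: float = 0., std: float = 0.1) -> tf.Tensor:
        # assumes a float image with values in [0, 1]
        noise = tf.random.normal(tf.shape(img), mean=mean, stddev=std)
        return tf.clip_by_value(img + noise, 0., 1.)

    out = add_noise(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))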
                            @@ -733,7 +639,7 @@

                            Source code for doctr.transforms.modules.tensorflow

                            +
diff --git a/v0.5.0/_modules/doctr/utils/metrics.html b/v0.5.0/_modules/doctr/utils/metrics.html
index 4c86cec23d..20af9416ea 100644
--- a/v0.5.0/_modules/doctr/utils/metrics.html
+++ b/v0.5.0/_modules/doctr/utils/metrics.html
@@ -226,28 +226,21 @@
@@ -287,19 +280,19 @@

                            Source code for doctr.utils.metrics

                             # This program is licensed under the Apache License version 2.
                             # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                             
                            -from typing import Dict, List, Optional, Tuple
                            -
                            -import cv2
                             import numpy as np
                            -from scipy.optimize import linear_sum_assignment
                            +import cv2
                            +from typing import List, Tuple, Dict, Optional
                             from unidecode import unidecode
                            +from scipy.optimize import linear_sum_assignment
                            +from doctr.utils.geometry import rbbox_to_polygon
                             
                            -__all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'polygon_iou',
                            -           'nms', 'LocalizationConfusion', 'OCRMetric', 'DetectionMetric']
                            +__all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'rbox_to_mask',
                            +           'nms', 'LocalizationConfusion', 'OCRMetric']
                             
                             
                             def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
                            -    """Performs string comparison with multiple levels of tolerance
                            +    """Perform string comparison with multiple levels of tolerance
                             
                                 Args:
                                     word1: a string
                            @@ -322,26 +315,26 @@ 

                            Source code for doctr.utils.metrics

                             
                            [docs] class TextMatch: - r"""Implements text match metric (word-level accuracy) for recognition task. + """Implements text match metric (word-level accuracy) for recognition task. The raw aggregated metric is computed as follows: .. math:: - \forall X, Y \in \mathcal{W}^N, - TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i) + \\forall X, Y \\in \\mathcal{W}^N, + TextMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i) with the indicator function :math:`f_{a}` defined as: .. math:: - \forall a, x \in \mathcal{W}, - f_a(x) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } x = a \\ - 0 & \mbox{otherwise.} - \end{array} - \right. - - where :math:`\mathcal{W}` is the set of all possible character sequences, + \\forall a, x \\in \\mathcal{W}, + f_a(x) = \\left\\{ + \\begin{array}{ll} + 1 & \\mbox{if } x = a \\\\ + 0 & \\mbox{otherwise.} + \\end{array} + \\right. + + where :math:`\\mathcal{W}` is the set of all possible character sequences, :math:`N` is a strictly positive integer. Example:: @@ -354,8 +347,6 @@

                            Source code for doctr.utils.metrics

                                 def __init__(self) -> None:
                                     self.reset()
                             
                            -
-[docs]
     def update(
         self,
         gt: List[str],
@@ -365,8 +356,7 @@

                            Source code for doctr.utils.metrics

                             
                                     Args:
                                         gt: list of groung-truth character sequences
                            -            pred: list of predicted character sequences
                            -        """
                            +            pred: list of predicted character sequences"""
                             
                                     if len(gt) != len(pred):
                                         raise AssertionError("prediction size does not match with ground-truth labels size")
                            @@ -378,8 +368,7 @@ 

                            Source code for doctr.utils.metrics

                                         self.unidecode += int(_unidecode)
                                         self.unicase += int(_unicase)
                             
                            -        self.total += len(gt)
                            - + self.total += len(gt)
                            [docs] @@ -411,7 +400,7 @@

                            Source code for doctr.utils.metrics

                             
                             
                             def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
                            -    """Computes the IoU between two sets of bounding boxes
                            +    """Compute the IoU between two sets of bounding boxes
                             
                                 Args:
                                     boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
                            @@ -439,7 +428,7 @@ 

                            Source code for doctr.utils.metrics

                             
                             
                             def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
                            -    """Computes the IoA (intersection over area) between two sets of bounding boxes:
                            +    """Compute the IoA (intersection over area) between two sets of bounding boxes:
                                 ioa(i, j) = inter(i, j) / area(i)
                             
                                 Args:
                            @@ -468,7 +457,7 @@ 

                            Source code for doctr.utils.metrics

                             
                             
                             def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
                            -    """Computes the IoU between two sets of boolean masks
                            +    """Compute the IoU between two sets of boolean masks
                             
                                 Args:
                                     masks_1: boolean masks of shape (N, H, W)
                            @@ -484,84 +473,19 @@ 

                            Source code for doctr.utils.metrics

                                 iou_mat = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
                             
                                 if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
                            +        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...])
                            +        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...])
                                     axes = tuple(range(2, masks_1.ndim + 1))
                            -        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
                            -        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
                            -        iou_mat = intersection / union
                            +        iou_mat = intersection.sum(axis=axes) / union.sum(axis=axes)
                             
                                 return iou_mat
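The broadcasting above expands both mask sets to shape (N, M, H, W) before reducing over the spatial axes, which is fast but memory-hungry. A standalone numpy check of the same computation on toy masks:

    import numpy as np

    masks_1 = np.zeros((2, 8, 8), dtype=bool)
    masks_2 = np.zeros((3, 8, 8), dtype=bool)
    masks_1[0, :4, :4] = True   # 16 pixels
    masks_2[0, :4, :8] = True   # 32 pixels

    inter = np.logical_and(masks_1[:, None], masks_2[None]).sum(axis=(2, 3))
    union = np.logical_or(masks_1[:, None], masks_2[None]).sum(axis=(2, 3))
    iou = np.where(union > 0, inter / np.maximum(union, 1), 0.)
    print(iou[0, 0])  # 0.5 (16 / 32)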
                             
                             
                            -def polygon_iou(
                            -    polys_1: np.ndarray,
                            -    polys_2: np.ndarray,
                            -    mask_shape: Tuple[int, int],
                            -    use_broadcasting: bool = False
                            -) -> np.ndarray:
                            -    """Computes the IoU between two sets of rotated bounding boxes
                            -
                            -    Args:
                            -        polys_1: rotated bounding boxes of shape (N, 4, 2)
                            -        polys_2: rotated bounding boxes of shape (M, 4, 2)
                            -        mask_shape: spatial shape of the intermediate masks
                            -        use_broadcasting: if set to True, leverage broadcasting speedup by consuming more memory
                            -
                            -    Returns:
                            -        the IoU matrix of shape (N, M)
                            -    """
                            -
                            -    if polys_1.ndim != 3 or polys_2.ndim != 3:
                            -        raise AssertionError("expects boxes to be in format (N, 4, 2)")
                            -
                            -    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
                            -
                            -    if polys_1.shape[0] > 0 and polys_2.shape[0] > 0:
                            -        if use_broadcasting:
                            -            masks_1 = rbox_to_mask(polys_1, shape=mask_shape)
                            -            masks_2 = rbox_to_mask(polys_2, shape=mask_shape)
                            -            iou_mat = mask_iou(masks_1, masks_2)
                            -        else:
                            -            # Save memory by doing the computation for each pair
                            -            for idx, b1 in enumerate(polys_1):
                            -                m1 = _rbox_to_mask(b1, mask_shape)
                            -                for _idx, b2 in enumerate(polys_2):
                            -                    m2 = _rbox_to_mask(b2, mask_shape)
                            -                    iou_mat[idx, _idx] = np.logical_and(m1, m2).sum() / np.logical_or(m1, m2).sum()
                            -
                            -    return iou_mat
                            -
                            -
                            -def _rbox_to_mask(box: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
                            -    """Converts a rotated bounding box to a boolean mask
                            -
                            -    Args:
                            -        box: rotated bounding box of shape (4, 2)
                            -        shape: spatial shapes of the output masks
                            -
                            -    Returns:
                            -        the boolean mask of the specified shape
                            -    """
                            -
                            -    mask = np.zeros(shape, dtype=np.uint8)
                            -    # Get absolute coords
                            -    if box.dtype != int:
                            -        abs_box = box.copy()
                            -        abs_box[:, 0] = abs_box[:, 0] * shape[1]
                            -        abs_box[:, 1] = abs_box[:, 1] * shape[0]
                            -        abs_box = abs_box.round().astype(int)
                            -    else:
                            -        abs_box = box
                            -        abs_box[2:] = abs_box[2:] + 1
                            -    cv2.fillPoly(mask, [abs_box - 1], 1)
                            -
                            -    return mask.astype(bool)
                            -
                            -
                             def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
                            -    """Converts rotated bounding boxes to boolean masks
                            +    """Convert boxes to masks
                             
                                 Args:
                            -        boxes: rotated bounding boxes of shape (N, 4, 2)
                            +        boxes: rotated bounding boxes of shape (N, 5) in format (x, y, w, h, alpha)
                                     shape: spatial shapes of the output masks
                             
                                 Returns:
                            @@ -574,8 +498,8 @@ 

                            Source code for doctr.utils.metrics

                                     # Get absolute coordinates
                                     if boxes.dtype != np.int:
                                         abs_boxes = boxes.copy()
                            -            abs_boxes[:, :, 0] = abs_boxes[:, :, 0] * shape[1]
                            -            abs_boxes[:, :, 1] = abs_boxes[:, :, 1] * shape[0]
                            +            abs_boxes[:, [0, 2]] = abs_boxes[:, [0, 2]] * shape[1]
                            +            abs_boxes[:, [1, 3]] = abs_boxes[:, [1, 3]] * shape[0]
                                         abs_boxes = abs_boxes.round().astype(np.int)
                                     else:
                                         abs_boxes = boxes
                            @@ -583,7 +507,9 @@ 

                            Source code for doctr.utils.metrics

                             
                                     # TODO: optimize slicing to improve vectorization
                                     for idx, _box in enumerate(abs_boxes):
                            -            cv2.fillPoly(masks[idx], [_box - 1], 1)
                            +            box = rbbox_to_polygon(_box)
                            +            cv2.fillPoly(masks[idx], [np.array(box, np.int32)], 1)
                            +
                                 return masks.astype(bool)
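Rasterizing each box to a boolean mask (as above) turns the rotated-box IoU into a purely pixel-wise computation. A small standalone sketch with OpenCV, assuming the polygon is already given as absolute (x, y) pixel corners:

    import cv2
    import numpy as np

    def polygon_to_mask(polygon: np.ndarray, shape=(1024, 1024)) -> np.ndarray:
        # polygon: (4, 2) array of absolute corner coordinates
        mask = np.zeros(shape, dtype=np.uint8)
        cv2.fillPoly(mask, [polygon.astype(np.int32)], 1)
        return mask.astype(bool)

    poly = np.array([[10, 10], [60, 10], [60, 40], [10, 40]])
    print(polygon_to_mask(poly, shape=(64, 128)).sum())  # number of filled pixels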
                             
                             
                            @@ -628,29 +554,29 @@ 

                            Source code for doctr.utils.metrics

                             
                            [docs] class LocalizationConfusion: - r"""Implements common confusion metrics and mean IoU for localization evaluation. + """Implements common confusion metrics and mean IoU for localization evaluation. The aggregated metrics are computed as follows: .. math:: - \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ - Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ - Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ - meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j) + \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\ + Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ + Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ + meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`g_{X}` defined as: .. math:: - \forall y \in \mathcal{B}, - g_X(y) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ - 0 & \mbox{otherwise.} - \end{array} - \right. - - where :math:`\mathcal{B}` is the set of possible bounding boxes, + \\forall y \\in \\mathcal{B}, + g_X(y) = \\left\\{ + \\begin{array}{ll} + 1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\ + 0 & \\mbox{otherwise.} + \\end{array} + \\right. + + where :math:`\\mathcal{B}` is the set of possible bounding boxes, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. Example:: @@ -662,41 +588,30 @@

                            Source code for doctr.utils.metrics

                             
                                 Args:
                                     iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
                            -        use_polygons: if set to True, predictions and targets will be expected to have rotated format
                            -        mask_shape: if use_polygons is True, describes the spatial shape of the image used
                            -        use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
                                 """
                             
                                 def __init__(
                                     self,
                                     iou_thresh: float = 0.5,
                            -        use_polygons: bool = False,
                            +        rotated_bbox: bool = False,
                                     mask_shape: Tuple[int, int] = (1024, 1024),
                            -        use_broadcasting: bool = True,
                                 ) -> None:
                                     self.iou_thresh = iou_thresh
                            -        self.use_polygons = use_polygons
                            +        self.rotated_bbox = rotated_bbox
                                     self.mask_shape = mask_shape
                            -        self.use_broadcasting = use_broadcasting
                                     self.reset()
                             
                            -
                            -[docs] def update(self, gts: np.ndarray, preds: np.ndarray) -> None: - """Updates the metric - - Args: - gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones - preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones - """ if preds.shape[0] > 0: # Compute IoU - if self.use_polygons: - iou_mat = polygon_iou(gts, preds, self.mask_shape, self.use_broadcasting) + if self.rotated_bbox: + mask_gts = rbox_to_mask(gts, shape=self.mask_shape) + mask_preds = rbox_to_mask(preds, shape=self.mask_shape) + iou_mat = mask_iou(mask_gts, mask_preds) else: iou_mat = box_iou(gts, preds) - self.tot_iou += float(iou_mat.max(axis=0).sum()) + self.tot_iou += float(iou_mat.max(axis=1).sum()) # Assign pairs gt_indices, pred_indices = linear_sum_assignment(-iou_mat) @@ -704,8 +619,7 @@

                            Source code for doctr.utils.metrics

                             
                                     # Update counts
                                     self.num_gts += gts.shape[0]
                            -        self.num_preds += preds.shape[0]
                            - + self.num_preds += preds.shape[0]
                            [docs] @@ -739,32 +653,32 @@

                            Source code for doctr.utils.metrics

                             
                            [docs] class OCRMetric: - r"""Implements an end-to-end OCR metric. + """Implements end-to-end OCR metric. The aggregated metrics are computed as follows: .. math:: - \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, - \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ - Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ - Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ - meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) + \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N, + \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\ + Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ + Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ + meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`h_{B, L}` defined as: .. math:: - \forall (b, l) \in \mathcal{B} \times \mathcal{L}, - h_{B,L}(b, l) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ - 0 & \mbox{otherwise.} - \end{array} - \right. - - where :math:`\mathcal{B}` is the set of possible bounding boxes, - :math:`\mathcal{L}` is the set of possible character sequences, + \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L}, + h_{B,L}(b, l) = \\left\\{ + \\begin{array}{ll} + 1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\ + & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\ + 0 & \\mbox{otherwise.} + \\end{array} + \\right. + + where :math:`\\mathcal{B}` is the set of possible bounding boxes, + :math:`\\mathcal{L}` is the set of possible character sequences, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. Example:: @@ -777,26 +691,19 @@

                            Source code for doctr.utils.metrics

                             
                                 Args:
                                     iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
                            -        use_polygons: if set to True, predictions and targets will be expected to have rotated format
                            -        mask_shape: if use_polygons is True, describes the spatial shape of the image used
                            -        use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
                                 """
                             
                                 def __init__(
                                     self,
                                     iou_thresh: float = 0.5,
                            -        use_polygons: bool = False,
                            +        rotated_bbox: bool = False,
                                     mask_shape: Tuple[int, int] = (1024, 1024),
                            -        use_broadcasting: bool = True,
                                 ) -> None:
                                     self.iou_thresh = iou_thresh
                            -        self.use_polygons = use_polygons
                            +        self.rotated_bbox = rotated_bbox
                                     self.mask_shape = mask_shape
                            -        self.use_broadcasting = use_broadcasting
                                     self.reset()
                             
                            -
                            -[docs] def update( self, gt_boxes: np.ndarray, @@ -804,14 +711,6 @@

                            Source code for doctr.utils.metrics

                                     gt_labels: List[str],
                                     pred_labels: List[str],
                                 ) -> None:
                            -        """Updates the metric
                            -
                            -        Args:
                            -            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
                            -            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
                            -            gt_labels: a list of N string labels
                            -            pred_labels: a list of M string labels
                            -        """
                             
                                     if gt_boxes.shape[0] != len(gt_labels) or pred_boxes.shape[0] != len(pred_labels):
                                         raise AssertionError("there should be the same number of boxes and string both for the ground truth "
                            @@ -819,12 +718,14 @@ 

                            Source code for doctr.utils.metrics

                             
                                     # Compute IoU
                                     if pred_boxes.shape[0] > 0:
                            -            if self.use_polygons:
                            -                iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
                            +            if self.rotated_bbox:
                            +                mask_gts = rbox_to_mask(gt_boxes, shape=self.mask_shape)
                            +                mask_preds = rbox_to_mask(pred_boxes, shape=self.mask_shape)
                            +                iou_mat = mask_iou(mask_gts, mask_preds)
                                         else:
                                             iou_mat = box_iou(gt_boxes, pred_boxes)
                             
                            -            self.tot_iou += float(iou_mat.max(axis=0).sum())
                            +            self.tot_iou += float(iou_mat.max(axis=1).sum())
                             
                                         # Assign pairs
                                         gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
                            @@ -838,8 +739,7 @@ 

                            Source code for doctr.utils.metrics

                                             self.unicase_matches += int(_unicase)
                             
                                     self.num_gts += gt_boxes.shape[0]
                            -        self.num_preds += pred_boxes.shape[0]
                            - + self.num_preds += pred_boxes.shape[0]
                            [docs] @@ -847,7 +747,7 @@

                            Source code for doctr.utils.metrics

                                     """Computes the aggregated metrics
                             
                                     Returns:
                            -            a tuple with the recall & precision for each string comparison and the mean IoU
                            +            a tuple with the recall & precision for each string comparison flexibility and the mean IoU
                                     """
                             
                                     # Recall
                            @@ -881,135 +781,6 @@ 

                            Source code for doctr.utils.metrics

                                     self.unidecode_matches = 0
                                     self.unicase_matches = 0
-
-
-
-class DetectionMetric:
-    r"""Implements an object detection metric.
-
-    The aggregated metrics are computed as follows:
-
-    .. math::
-        \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N,
-        \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\
-        Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\
-        Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\
-        meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)
-
-    with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and
-    :math:`y`, and the function :math:`h_{B, C}` defined as:
-
-    .. math::
-        \forall (b, c) \in \mathcal{B} \times \mathcal{C},
-        h_{B,C}(b, c) = \left\{
-            \begin{array}{ll}
-                1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\
-                & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\
-                0 & \mbox{otherwise.}
-            \end{array}
-        \right.
-
-    where :math:`\mathcal{B}` is the set of possible bounding boxes,
-    :math:`\mathcal{C}` is the set of possible class indices,
-    :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers.
-
-    Example::
-        >>> import numpy as np
-        >>> from doctr.utils import DetectionMetric
-        >>> metric = DetectionMetric(iou_thresh=0.5)
-        >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-                          np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64))
-        >>> metric.summary()
-
-    Args:
-        iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
-        use_polygons: if set to True, predictions and targets will be expected to have rotated format
-        mask_shape: if use_polygons is True, describes the spatial shape of the image used
-        use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
-    """
-
-    def __init__(
-        self,
-        iou_thresh: float = 0.5,
-        use_polygons: bool = False,
-        mask_shape: Tuple[int, int] = (1024, 1024),
-        use_broadcasting: bool = True,
-    ) -> None:
-        self.iou_thresh = iou_thresh
-        self.use_polygons = use_polygons
-        self.mask_shape = mask_shape
-        self.use_broadcasting = use_broadcasting
-        self.reset()
-
-    def update(
-        self,
-        gt_boxes: np.ndarray,
-        pred_boxes: np.ndarray,
-        gt_labels: np.ndarray,
-        pred_labels: np.ndarray,
-    ) -> None:
-        """Updates the metric
-
-        Args:
-            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
-            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
-            gt_labels: an array of class indices of shape (N,)
-            pred_labels: an array of class indices of shape (M,)
-        """
-
-        if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]:
-            raise AssertionError("there should be the same number of boxes and string both for the ground truth "
-                                 "and the predictions")
-
-        # Compute IoU
-        if pred_boxes.shape[0] > 0:
-            if self.use_polygons:
-                iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
-            else:
-                iou_mat = box_iou(gt_boxes, pred_boxes)
-
-            self.tot_iou += float(iou_mat.max(axis=0).sum())
-
-            # Assign pairs
-            gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
-            is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
-            # Category comparison
-            self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum())
-
-        self.num_gts += gt_boxes.shape[0]
-        self.num_preds += pred_boxes.shape[0]
-
-    def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]:
-        """Computes the aggregated metrics
-
-        Returns:
-            a tuple with the recall & precision for each class prediction and the mean IoU
-        """
-
-        # Recall
-        recall = self.num_matches / self.num_gts if self.num_gts > 0 else None
-
-        # Precision
-        precision = self.num_matches / self.num_preds if self.num_preds > 0 else None
-
-        # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
-
-        return recall, precision, mean_iou
-
-    def reset(self) -> None:
-        self.num_gts = 0
-        self.num_preds = 0
-        self.tot_iou = 0.
-        self.num_matches = 0
-
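For reference, the removed ``DetectionMetric`` above can be exercised as in the sketch below; the box and class values simply mirror the docstring example, and the import path is the one given there.

.. code:: python3

    import numpy as np
    from doctr.utils import DetectionMetric

    # One ground-truth box of class 0, two predicted boxes of classes 0 and 1
    gt_boxes = np.asarray([[0, 0, 100, 100]])
    gt_labels = np.zeros(1, dtype=np.int64)
    pred_boxes = np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])
    pred_labels = np.array([0, 1], dtype=np.int64)

    metric = DetectionMetric(iou_thresh=0.5)
    metric.update(gt_boxes, pred_boxes, gt_labels, pred_labels)

    # recall = matches / number of ground truths, precision = matches / number of predictions,
    # mean IoU = best IoU per prediction, averaged over predictions
    recall, precision, mean_iou = metric.summary()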
                            @@ -1042,7 +813,7 @@

                            Source code for doctr.utils.metrics

                                   
                                 
                               
                            -
                            +
                            diff --git a/v0.5.0/_modules/doctr/utils/visualization.html b/v0.5.0/_modules/doctr/utils/visualization.html index 74860cacb0..21743f6182 100644 --- a/v0.5.0/_modules/doctr/utils/visualization.html +++ b/v0.5.0/_modules/doctr/utils/visualization.html @@ -226,28 +226,21 @@ @@ -287,140 +280,70 @@

                            Source code for doctr.utils.visualization

                             # This program is licensed under the Apache License version 2.
                             # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                             
                            -from copy import deepcopy
                            -from typing import Any, Dict, List, Optional, Tuple, Union
                            -
                            -import cv2
                            -import matplotlib.patches as patches
                             import matplotlib.pyplot as plt
                            +from matplotlib.figure import Figure
                            +import matplotlib.patches as patches
                             import mplcursors
                            +from PIL import ImageFont, ImageDraw, Image
                             import numpy as np
                            -from matplotlib.figure import Figure
                            -from PIL import Image, ImageDraw
                            -from unidecode import unidecode
                            +import cv2
                            +from typing import Tuple, List, Dict, Any, Union
                             
                            -from .common_types import BoundingBox, Polygon4P
                            -from .fonts import get_font
                            +from .common_types import BoundingBox, RotatedBbox
                             
                            -__all__ = ['visualize_page', 'synthesize_page', 'draw_boxes']
                            +__all__ = ['visualize_page', 'synthetize_page']
                             
                             
                            -def rect_patch(
                            -    geometry: BoundingBox,
                            +def create_rect_patch(
                            +    geometry: Union[BoundingBox, RotatedBbox],
                            +    label: str,
                                 page_dimensions: Tuple[int, int],
                            -    label: Optional[str] = None,
                            -    color: Tuple[float, float, float] = (0, 0, 0),
                            +    color: Tuple[int, int, int],
                                 alpha: float = 0.3,
                                 linewidth: int = 2,
                                 fill: bool = True,
                            -    preserve_aspect_ratio: bool = False
                            -) -> patches.Rectangle:
                            -    """Create a matplotlib rectangular patch for the element
                            +) -> patches.Patch:
                            +    """Create a matplotlib patch (rectangle) bounding the element
                             
                                 Args:
                                     geometry: bounding box of the element
                            -        page_dimensions: dimensions of the Page in format (height, width)
                                     label: label to display when hovered
                            +        page_dimensions: dimensions of the Page
                                     color: color to draw box
                                     alpha: opacity parameter to fill the boxes, 0 = transparent
                                     linewidth: line width
                            -        fill: whether the patch should be filled
                            -        preserve_aspect_ratio: pass True if you passed True to the predictor
                             
                                 Returns:
                                     a rectangular Patch
                                 """
                            -
                            -    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
                            -        raise ValueError("invalid geometry format")
                            -
                            -    # Unpack
                                 height, width = page_dimensions
                            -    (xmin, ymin), (xmax, ymax) = geometry
                            -    # Switch to absolute coords
                            -    if preserve_aspect_ratio:
                            -        width = height = max(height, width)
                            -    xmin, w = xmin * width, (xmax - xmin) * width
                            -    ymin, h = ymin * height, (ymax - ymin) * height
                            -
                            -    return patches.Rectangle(
                            -        (xmin, ymin),
                            -        w,
                            -        h,
                            -        fill=fill,
                            -        linewidth=linewidth,
                            -        edgecolor=(*color, alpha),
                            -        facecolor=(*color, alpha),
                            -        label=label,
                            -    )
                            -
                            -
                            -def polygon_patch(
                            -    geometry: np.ndarray,
                            -    page_dimensions: Tuple[int, int],
                            -    label: Optional[str] = None,
                            -    color: Tuple[float, float, float] = (0, 0, 0),
                            -    alpha: float = 0.3,
                            -    linewidth: int = 2,
                            -    fill: bool = True,
                            -    preserve_aspect_ratio: bool = False
                            -) -> patches.Polygon:
                            -    """Create a matplotlib polygon patch for the element
                            -
                            -    Args:
                            -        geometry: bounding box of the element
                            -        page_dimensions: dimensions of the Page in format (height, width)
                            -        label: label to display when hovered
                            -        color: color to draw box
                            -        alpha: opacity parameter to fill the boxes, 0 = transparent
                            -        linewidth: line width
                            -        fill: whether the patch should be filled
                            -        preserve_aspect_ratio: pass True if you passed True to the predictor
                            -
                            -    Returns:
                            -        a polygon Patch
                            -    """
                            -
                            -    if not geometry.shape == (4, 2):
                            -        raise ValueError("invalid geometry format")
                            -
                            -    # Unpack
                            -    height, width = page_dimensions
                            -    geometry[:, 0] = geometry[:, 0] * (max(width, height) if preserve_aspect_ratio else width)
                            -    geometry[:, 1] = geometry[:, 1] * (max(width, height) if preserve_aspect_ratio else height)
                            -
                            -    return patches.Polygon(
                            -        geometry,
                            -        fill=fill,
                            -        linewidth=linewidth,
                            -        edgecolor=(*color, alpha),
                            -        facecolor=(*color, alpha),
                            -        label=label,
                            -    )
                            -
                            -
                            -def create_obj_patch(
                            -    geometry: Union[BoundingBox, Polygon4P, np.ndarray],
                            -    page_dimensions: Tuple[int, int],
                            -    **kwargs: Any,
                            -) -> patches.Patch:
                            -    """Create a matplotlib patch for the element
                            -
                            -    Args:
                            -        geometry: bounding box (straight or rotated) of the element
                            -        page_dimensions: dimensions of the page in format (height, width)
                            -
                            -    Returns:
                            -        a matplotlib Patch
                            -    """
                            -    if isinstance(geometry, tuple):
                            -        if len(geometry) == 2:  # straight word BB (2 pts)
                            -            return rect_patch(geometry, page_dimensions, **kwargs)  # type: ignore[arg-type]
                            -        elif len(geometry) == 4:  # rotated word BB (4 pts)
                            -            return polygon_patch(np.asarray(geometry), page_dimensions, **kwargs)  # type: ignore[arg-type]
                            -    elif isinstance(geometry, np.ndarray) and geometry.shape == (4, 2):  # rotated line
                            -        return polygon_patch(geometry, page_dimensions, **kwargs)  # type: ignore[arg-type]
                            -    raise ValueError("invalid geometry format")
                            +    if len(geometry) == 5:
                            +        x, y, w, h, a = geometry  # type: ignore[misc]
                            +        x, w = x * width, w * width
                            +        y, h = y * height, h * height
                            +        points = cv2.boxPoints(((x, y), (w, h), a))
                            +        return patches.Polygon(
                            +            points,
                            +            fill=fill,
                            +            linewidth=linewidth,
                            +            edgecolor=(*color, alpha),
                            +            facecolor=(*color, alpha),
                            +            label=label
                            +        )
                            +    else:
                            +        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
                            +        xmin, xmax = xmin * width, xmax * width
                            +        ymin, ymax = ymin * height, ymax * height
                            +        return patches.Rectangle(
                            +            (xmin, ymin),
                            +            xmax - xmin,
                            +            ymax - ymin,
                            +            fill=fill,
                            +            linewidth=linewidth,
                            +            edgecolor=(*color, alpha),
                            +            facecolor=(*color, alpha),
                            +            label=label
                            +        )
                             
                             
                             
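A minimal sketch of how ``create_rect_patch`` maps a relative straight box to an absolute matplotlib patch, following the rewritten function above; the box values and page size are illustrative, and the import assumes the function is accessible at module level in ``doctr.utils.visualization``.

.. code:: python3

    import matplotlib.pyplot as plt
    import numpy as np

    from doctr.utils.visualization import create_rect_patch  # assumed accessible at module level

    # A straight box in relative coords ((xmin, ymin), (xmax, ymax)) on a page of shape (height, width)
    geometry = ((0.1, 0.2), (0.4, 0.3))
    patch = create_rect_patch(geometry, "word", (1024, 768), (0, 0, 1), alpha=0.3, linewidth=2)

    fig, ax = plt.subplots()
    ax.imshow(255 * np.ones((1024, 768, 3), dtype=np.uint8))  # blank page as background
    ax.add_patch(patch)
    plt.show()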
                            @@ -471,8 +394,7 @@

                            Source code for doctr.utils.visualization

                             
                                 for block in page['blocks']:
                                     if not words_only:
                            -            rect = create_obj_patch(block['geometry'], page['dimensions'],
                            -                                    label='block', color=(0, 1, 0), linewidth=1, **kwargs)
                            +            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
                                         # add patch on figure
                                         ax.add_patch(rect)
                                         if interactive:
                            @@ -481,16 +403,14 @@ 

                            Source code for doctr.utils.visualization

                             
                                     for line in block['lines']:
                                         if not words_only:
                            -                rect = create_obj_patch(line['geometry'], page['dimensions'],
                            -                                        label='line', color=(1, 0, 0), linewidth=1, **kwargs)
                            +                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
                                             ax.add_patch(rect)
                                             if interactive:
                                                 artists.append(rect)
                             
                                         for word in line['words']:
                            -                rect = create_obj_patch(word['geometry'], page['dimensions'],
                            -                                        label=f"{word['value']} (confidence: {word['confidence']:.2%})",
                            -                                        color=(0, 0, 1), **kwargs)
                            +                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
                            +                                         page['dimensions'], (0, 0, 1), **kwargs)
                                             ax.add_patch(rect)
                                             if interactive:
                                                 artists.append(rect)
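When ``interactive`` is True, the collected ``artists`` are wired to hover tooltips via ``mplcursors`` (imported at the top of the module). The exact call is not shown in this hunk, so the snippet below is only an assumption of how the hook-up is typically done:

.. code:: python3

    import mplcursors

    # `artists` is the list of patches collected in the loops above.
    # Hovering a patch displays its label; hover=2 keeps the annotation persistent.
    mplcursors.Cursor(artists, hover=2).connect(
        "add", lambda sel: sel.annotation.set_text(sel.artist.get_label())
    )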
                            @@ -515,11 +435,11 @@ 

                            Source code for doctr.utils.visualization

                             
                                     if display_artefacts:
                                         for artefact in block['artefacts']:
                            -                rect = create_obj_patch(
                            +                rect = create_rect_patch(
                                                 artefact['geometry'],
                            +                    'artefact',
                                                 page['dimensions'],
                            -                    label='artefact',
                            -                    color=(0.5, 0.5, 0.5),
                            +                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
                                                 linewidth=1,
                                                 **kwargs
                                             )
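Put together, ``visualize_page`` is called on an exported page dictionary plus the corresponding image, as sketched below; the predictor call and the ``.pages[0].export()`` accessor are assumptions (they come from other modules), and the random image is a placeholder.

.. code:: python3

    import matplotlib.pyplot as plt
    import numpy as np

    from doctr.models import ocr_predictor
    from doctr.utils.visualization import visualize_page

    predictor = ocr_predictor(pretrained=True)
    page_img = (255 * np.random.rand(1024, 768, 3)).astype(np.uint8)  # placeholder page image
    result = predictor([page_img])

    # Overlay block / line / word patches (as built in the loops above) on the page image
    visualize_page(result.pages[0].export(), page_img, words_only=False)
    plt.show()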
                            @@ -536,13 +456,10 @@ 

                            Source code for doctr.utils.visualization

                             
                             
                             
                            -
-
-def synthesize_page(
+def synthetize_page(
     page: Dict[str, Any],
     draw_proba: bool = False,
     font_size: int = 13,
-    font_family: Optional[str] = None,
 ) -> np.ndarray:
     """Draw the content of the element page (OCR response) on a blank page.
@@ -550,12 +467,10 @@

                            Source code for doctr.utils.visualization

                                     page: exported Page object to represent
                                     draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
                                     font_size: size of the font, default font = 13
                            -        font_family: family of the font
                             
                                 Return:
                            -        the synthesized page
                            +        A np array (drawn page)
                                 """
                            -
                                 # Draw template
                                 h, w = page["dimensions"]
                                 response = 255 * np.ones((h, w, 3), dtype=np.int32)
                            @@ -566,19 +481,20 @@ 

                            Source code for doctr.utils.visualization

                                         for word in line["words"]:
                # Get absolute word geometry
                                             (xmin, ymin), (xmax, ymax) = word["geometry"]
                            -                xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
                            -                ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
                            +                xmin, xmax = int(w * xmin), int(w * xmax)
                            +                ymin, ymax = int(h * ymin), int(h * ymax)
                             
                                             # White drawing context adapted to font size, 0.75 factor to convert pts --> pix
                            -                font = get_font(font_family, int(0.75 * (ymax - ymin)))
                            -                img = Image.new('RGB', (xmax - xmin, ymax - ymin), color=(255, 255, 255))
                            +                h_box, w_box = ymax - ymin, xmax - xmin
                            +                h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75))
                            +                img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255))
                                             d = ImageDraw.Draw(img)
                            +
                                             # Draw in black the value of the word
                            -                try:
                            -                    d.text((0, 0), word["value"], font=font, fill=(0, 0, 0))
                            -                except UnicodeEncodeError:
                            -                    # When character cannot be encoded, use its unidecode version
                            -                    d.text((0, 0), unidecode(word["value"]), font=font, fill=(0, 0, 0))
                            +                d.text((0, 0), word["value"], font=ImageFont.load_default(), fill=(0, 0, 0))
                            +
                            +                # Resize back to box size
                            +                img = img.resize((w_box, h_box), Image.NEAREST)
                             
                                             # Colorize if draw_proba
                                             if draw_proba:
                            @@ -592,40 +508,7 @@ 

                            Source code for doctr.utils.visualization

                                             # Write to response page
                                             response[ymin:ymax, xmin:xmax, :] = np.array(img)
                             
                            -    return response
-    return response
-
-
-def draw_boxes(
-    boxes: np.ndarray,
-    image: np.ndarray,
-    color: Optional[Tuple[int, int, int]] = None,
-    **kwargs
-) -> None:
-    """Draw an array of relative straight boxes on an image
-
-    Args:
-        boxes: array of relative boxes, of shape (*, 4)
-        image: np array, float32 or uint8
-        color: color to use for bounding box edges
-    """
-    h, w = image.shape[:2]
-    # Convert boxes to absolute coords
-    _boxes = deepcopy(boxes)
-    _boxes[:, [0, 2]] *= w
-    _boxes[:, [1, 3]] *= h
-    _boxes = _boxes.astype(np.int32)
-    for box in _boxes.tolist():
-        xmin, ymin, xmax, ymax = box
-        image = cv2.rectangle(
-            image,
-            (xmin, ymin),
-            (xmax, ymax),
-            color=color if isinstance(color, tuple) else (0, 0, 255),
-            thickness=2
-        )
-    plt.imshow(image)
-    plt.plot(**kwargs)
+    return response
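As a usage sketch for the rewritten ``synthetize_page``: given a page dictionary exported by an OCR predictor, it redraws the recognized words on a blank page of the same dimensions. Only the ``synthetize_page`` call itself comes from this module; the predictor call, the ``.pages[0].export()`` accessor and the random input image are illustrative assumptions.

.. code:: python3

    import matplotlib.pyplot as plt
    import numpy as np

    from doctr.models import ocr_predictor
    from doctr.utils.visualization import synthetize_page

    predictor = ocr_predictor(pretrained=True)
    page_img = (255 * np.random.rand(1024, 768, 3)).astype(np.uint8)  # placeholder input page
    page_export = predictor([page_img]).pages[0].export()

    drawn = synthetize_page(page_export, draw_proba=True)  # blue = confident, red = uncertain
    plt.imshow(drawn)
    plt.axis("off")
    plt.show()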
                            @@ -658,7 +541,7 @@

                            Source code for doctr.utils.visualization

                                   
                                 
                               
                            -
                            +
                            diff --git a/v0.5.0/_modules/index.html b/v0.5.0/_modules/index.html index b49f0d077f..c887b618c2 100644 --- a/v0.5.0/_modules/index.html +++ b/v0.5.0/_modules/index.html @@ -226,28 +226,21 @@ @@ -283,35 +276,18 @@

                            All modules for which code is available

                            -
                            +
                            diff --git a/v0.5.0/_sources/changelog.rst.txt b/v0.5.0/_sources/changelog.rst.txt index d98e3c66b6..430097d6c8 100644 --- a/v0.5.0/_sources/changelog.rst.txt +++ b/v0.5.0/_sources/changelog.rst.txt @@ -1,22 +1,6 @@ Changelog ========= -v0.4.1 (2021-11-22) -------------------- -Release note: `v0.4.1 `_ - -v0.4.0 (2021-10-01) -------------------- -Release note: `v0.4.0 `_ - -v0.3.1 (2021-08-27) -------------------- -Release note: `v0.3.1 `_ - -v0.3.0 (2021-07-02) -------------------- -Release note: `v0.3.0 `_ - v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.5.0/_sources/datasets.rst.txt b/v0.5.0/_sources/datasets.rst.txt index 8a00eeaedd..354122f1e5 100644 --- a/v0.5.0/_sources/datasets.rst.txt +++ b/v0.5.0/_sources/datasets.rst.txt @@ -11,42 +11,22 @@ can be a significant save of time. Available Datasets ------------------ -Here are all datasets that are available through docTR: +The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. +.. autoclass:: doctr.datasets.datasets.VisionDataset -Public datasets -^^^^^^^^^^^^^^^ + +Here are all datasets that are available through DocTR: .. autoclass:: FUNSD .. autoclass:: SROIE .. autoclass:: CORD -.. autoclass:: IIIT5K -.. autoclass:: SVT -.. autoclass:: SVHN -.. autoclass:: SynthText -.. autoclass:: IC03 -.. autoclass:: IC13 - -docTR synthetic datasets -^^^^^^^^^^^^^^^^^^^^^^^^ - -.. autoclass:: DocArtefacts -.. autoclass:: CharacterGenerator -.. autoclass:: WordGenerator - -docTR private datasets -^^^^^^^^^^^^^^^^^^^^^^ - -Since many documents include sensitive / personal information, we are not able to share all the data that has been used for this project. However, we provide some guidance on how to format your own dataset into the same format so that you can use all docTR tools all the same. - -.. autoclass:: DetectionDataset -.. autoclass:: RecognitionDataset .. autoclass:: OCRDataset Data Loading ------------ -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR. +Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. .. autoclass:: doctr.datasets.loader.DataLoader @@ -56,10 +36,10 @@ Each dataset has its specific way to load a sample, but handling batch aggregati Supported Vocabs ---------------- -Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets of vocabs. -.. list-table:: docTR Vocabs +.. list-table:: DocTR Vocabs :widths: 20 5 50 :header-rows: 1 @@ -79,25 +59,10 @@ of vocabs. 
- 5 - £€¥¢฿ * - latin - - 94 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ - * - english - - 100 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ - * - legacy_french - - 123 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ + - 96 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° * - french - - 126 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ - * - portuguese - - 131 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿ - * - spanish - - 116 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿ - * - german - - 108 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ + - 154 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ .. autofunction:: encode_sequences diff --git a/v0.5.0/_sources/index.rst.txt b/v0.5.0/_sources/index.rst.txt index 2be367403c..fc3ff89fdf 100644 --- a/v0.5.0/_sources/index.rst.txt +++ b/v0.5.0/_sources/index.rst.txt @@ -1,7 +1,7 @@ -docTR: Document Text Recognition +DocTR: Document Text Recognition ================================ -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta) .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -12,6 +12,9 @@ DocTR provides an easy and powerful way to extract valuable information from you * |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. +Welcome to the documentation of `DocTR `_! + + Main Features ------------- @@ -20,18 +23,24 @@ Main Features * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor * |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, minimal dependencies -* |:tools:| Actively maintained by Mindee -* |:factory:| Easy integration (available templates for browser demo & API deployment) +* |:bird:| Light package, small dependencies +* |:tools:| Daily maintained +* |:factory:| Easy integration +Getting Started +--------------- + .. 
toctree:: :maxdepth: 2 - :caption: Getting started - :hidden: installing - notebooks + + +Build & train your predictor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) +* Fine-tune or train from scratch any detection or recognition model to specialize on your data Model zoo @@ -39,14 +48,14 @@ Model zoo Text detection models """"""""""""""""""""" - * DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ - * LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ + * `DBNet `_ (Differentiable Binarization) + * `LinkNet `_ Text recognition models """"""""""""""""""""""" - * SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ - * CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ - * MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ + * `SAR `_ (Show, Attend and Read) + * `CRNN `_ (Convolutional Recurrent Neural Network) + * `MASTER `_ (Multi-Aspect Non-local Network for Scene Text Recognition) Supported datasets @@ -54,38 +63,21 @@ Supported datasets * FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. * CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. * SROIE from `ICDAR 2019 `_. - * IIIT-5k from `CVIT `_. - * Street View Text from `"End-to-End Scene Text Recognition" `_. - * SynthText from `Visual Geometry Group `_. - * SVHN from `"Reading Digits in Natural Images with Unsupervised Feature Learning" `_. - * IC03 from `ICDAR 2003 `_. - * IC13 from `ICDAR 2013 `_. .. toctree:: :maxdepth: 2 - :caption: Using docTR - :hidden: + :caption: Notes - using_models - using_model_export + changelog .. toctree:: :maxdepth: 2 :caption: Package Reference - :hidden: datasets - io + documents models transforms utils - - -.. toctree:: - :maxdepth: 2 - :caption: Notes - :hidden: - - changelog diff --git a/v0.5.0/_sources/installing.rst.txt b/v0.5.0/_sources/installing.rst.txt index 8197df660d..5c8779dc1c 100644 --- a/v0.5.0/_sources/installing.rst.txt +++ b/v0.5.0/_sources/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.6 or higher. +This library requires Python 3.6 or higher. Prerequisites @@ -11,12 +11,12 @@ Prerequisites Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: -* `TensorFlow 2 `_ -* `PyTorch `_ +* TensorFlow: `installation page `_. +* PyTorch: `installation page `_. If you are running another OS than Linux, you will need a few extra dependencies. -For MacOS users, you can install them using `Homebrew `_ as follows: +For MacOS users, you can install them as follows: .. code:: shell @@ -28,23 +28,13 @@ For Windows users, those dependencies are included in GTK. You can find the late Via Python Package ================== -Install the last stable release of the package using `pip `_: +Install the last stable release of the package using pip: .. code:: bash pip install python-doctr -We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows: - -.. 
code:: bash - - # for TensorFlow - pip install "python-doctr[tf]" - # for PyTorch - pip install "python-doctr[torch]" - - Via Git ======= @@ -54,13 +44,3 @@ Install the library in developper mode: git clone https://github.com/mindee/doctr.git pip install -e doctr/. - -Again, for framework-specific builds: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - # for TensorFlow - pip install -e doctr/.[tf] - # for PyTorch - pip install -e doctr/.[torch] diff --git a/v0.5.0/_sources/io.rst.txt b/v0.5.0/_sources/io.rst.txt deleted file mode 100644 index 8fa887e9f9..0000000000 --- a/v0.5.0/_sources/io.rst.txt +++ /dev/null @@ -1,94 +0,0 @@ -doctr.io -======== - - -.. currentmodule:: doctr.io - -The io module enables users to easily access content from documents and export analysis -results to structured formats. - -.. _document_structure: - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img_as_numpy - -.. autofunction:: read_img_as_tensor - -.. autofunction:: decode_img_as_tensor - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_lines - - .. automethod:: get_artefacts diff --git a/v0.5.0/_sources/models.rst.txt b/v0.5.0/_sources/models.rst.txt index d4f36df9bb..9830c6c153 100644 --- a/v0.5.0/_sources/models.rst.txt +++ b/v0.5.0/_sources/models.rst.txt @@ -1,62 +1,215 @@ doctr.models ============ -.. currentmodule:: doctr.models - - -doctr.models.classification ----------------------- +The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. +Either performed at once or separately, to each task corresponds a type of deep learning architecture. -.. autofunction:: doctr.models.classification.vgg16_bn_r +.. currentmodule:: doctr.models -.. autofunction:: doctr.models.classification.resnet18 +For a given task, DocTR provides a Predictor, which is composed of 2 components: -.. autofunction:: doctr.models.classification.resnet31 +* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. +* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. -.. autofunction:: doctr.models.classification.mobilenet_v3_small -.. 
autofunction:: doctr.models.classification.mobilenet_v3_large +Text Detection +-------------- +Localizing text elements in images -.. autofunction:: doctr.models.classification.mobilenet_v3_small_r ++---------------------------------------------------+----------------------------+----------------------------+---------+ +| | FUNSD | CORD | | ++==================+=================+==============+============+===============+============+===============+=========+ +| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ +| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -.. autofunction:: doctr.models.classification.mobilenet_v3_large_r +All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. -.. autofunction:: doctr.models.classification.mobilenet_v3_small_orientation +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* -.. autofunction:: doctr.models.classification.magc_resnet31 +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. -.. autofunction:: doctr.models.classification.crop_orientation_predictor +Pre-processing for detection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for detection is the following: +1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. +2. batch images together +3. normalize the batch using the training data statistics -doctr.models.detection ----------------------- -.. autofunction:: doctr.models.detection.linknet_resnet18 +Detection models +^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: .. autofunction:: doctr.models.detection.db_resnet50 +.. autofunction:: doctr.models.detection.linknet16 -.. autofunction:: doctr.models.detection.db_mobilenet_v3_large +Detection predictors +^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. .. autofunction:: doctr.models.detection.detection_predictor -doctr.models.recognition ------------------------- +Text Recognition +---------------- +Identifying strings in images + +.. 
list-table:: Text recognition model zoo + :widths: 20 20 15 10 10 10 + :header-rows: 1 + + * - Architecture + - Input shape + - # params + - FUNSD + - CORD + - FPS + * - crnn_vgg16_bn + - (32, 128, 3) + - 15.8M + - 86.02 + - 91.3 + - 12.8 + * - sar_vgg16_bn + - (32, 128, 3) + - 21.5M + - 86.2 + - 91.7 + - 3.3 + * - sar_resnet31 + - (32, 128, 3) + - 53.1M + - **86.3** + - **92.1** + - 2.7 + +All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. + +All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Pre-processing for recognition +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for recognition is the following: + +1. resize each input image to the target size (bilinear interpolation by default) without deformation. +2. pad the image to the target size (with zeros by default) +3. batch images together +4. normalize the batch using the training data statistics + +Recognition models +^^^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: + .. autofunction:: doctr.models.recognition.crnn_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_small -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_large +Recognition predictors +^^^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage. -.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.recognition_predictor -.. autofunction:: doctr.models.recognition.master -.. 
autofunction:: doctr.models.recognition.recognition_predictor +End-to-End OCR +-------------- +Predictors that localize and identify text elements in images ++-----------------------------+--------------------------------------+--------------------------------------+ +| | FUNSD | CORD | ++=============================+============+===============+=========+============+===============+=========+ +| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ + +All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. + +All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Results on private ocr datasets + ++------------------------------------+----------------------------+----------------------------+----------------------------+ +| | Receipts | Invoices | IDs | ++====================================+============+===============+============+===============+============+===============+ +| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| Gvision doc. 
text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ + + +Two-stage approaches +^^^^^^^^^^^^^^^^^^^^ +Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. + +.. autofunction:: doctr.models.zoo.ocr_predictor + + +Model export +------------ +Utility functions to make the most of document analysis models. + +.. currentmodule:: doctr.models.export + +Model compression +^^^^^^^^^^^^^^^^^ + +.. autofunction:: convert_to_tflite + +.. autofunction:: convert_to_fp16 + +.. autofunction:: quantize_model + +Using SavedModel +^^^^^^^^^^^^^^^^ + +Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to +`SavedModel `_ format as follows: + + + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> _ = model(input_t, training=False) + >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') + +And loaded just as easily: -doctr.models.zoo ----------------- -.. autofunction:: doctr.models.ocr_predictor + >>> import tensorflow as tf + >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.5.0/_sources/notebooks.md.txt b/v0.5.0/_sources/notebooks.md.txt deleted file mode 100644 index ea43ac0f39..0000000000 --- a/v0.5.0/_sources/notebooks.md.txt +++ /dev/null @@ -1,9 +0,0 @@ -# docTR Notebooks - -Here are some notebooks compiled for users to better leverage the library capabilities: - -| Notebook | Description | | -|:----------|:-------------|------:| -| [Quicktour](https://github.com/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | A presentation of the main features of docTR | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | -| [Export as PDF/A](https://github.com/mindee/notebooks/blob/main/doctr/export_as_pdfa.ipynb) | Produce searchable PDFs from docTR results | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/export_as_pdfa.ipynb) | -[Artefact detection](https://github.com/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb) | Object detection for artefacts in documents | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb) | diff --git a/v0.5.0/_sources/transforms.rst.txt b/v0.5.0/_sources/transforms.rst.txt index ff11a3a38e..0230fe75f5 100644 --- a/v0.5.0/_sources/transforms.rst.txt +++ b/v0.5.0/_sources/transforms.rst.txt @@ -8,7 +8,7 @@ Data transformations are part of both training and inference procedure. Drawing Supported transformations ------------------------- -Here are all transformations that are available through docTR: +Here are all transformations that are available through DocTR: .. autoclass:: Resize .. 
autoclass:: Normalize @@ -21,11 +21,6 @@ Here are all transformations that are available through docTR: .. autoclass:: RandomHue .. autoclass:: RandomGamma .. autoclass:: RandomJpegQuality -.. autoclass:: RandomRotate -.. autoclass:: RandomCrop -.. autoclass:: GaussianBlur -.. autoclass:: ChannelShuffle -.. autoclass:: GaussianNoise Composing transformations diff --git a/v0.5.0/_sources/using_doctr/using_model_export.rst.txt b/v0.5.0/_sources/using_doctr/using_model_export.rst.txt index 48f570f699..c62c36169b 100644 --- a/v0.5.0/_sources/using_doctr/using_model_export.rst.txt +++ b/v0.5.0/_sources/using_doctr/using_model_export.rst.txt @@ -31,7 +31,7 @@ Advantages: .. code:: python3 import tensorflow as tf - from keras import mixed_precision + from tensorflow.keras import mixed_precision mixed_precision.set_global_policy('mixed_float16') predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True) diff --git a/v0.5.0/_sources/using_model_export.rst.txt b/v0.5.0/_sources/using_model_export.rst.txt deleted file mode 100644 index 992f4e9866..0000000000 --- a/v0.5.0/_sources/using_model_export.rst.txt +++ /dev/null @@ -1,71 +0,0 @@ -Preparing your model for inference -================================== - -A well-trained model is a good achievement but you might want to tune a few things to make it production-ready! - -.. currentmodule:: doctr.models.export - - -Model compression ------------------ - -This section is meant to help you perform inference with compressed versions of your model. - - -TensorFlow Lite -^^^^^^^^^^^^^^^ - -TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows: - - >>> import tensorflow as tf - >>> from tensorflow.keras import Sequential - >>> from doctr.models import conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - >>> serialized_model = converter.convert() - -Half-precision -^^^^^^^^^^^^^^ - -If you want to convert it to half-precision using your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> converter.target_spec.supported_types = [tf.float16] - >>> serialized_model = converter.convert() - - -Post-training quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Finally if you wish to quantize the model with your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> # Float fallback for operators that do not have an integer implementation - >>> def representative_dataset(): - >>> for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)] - >>> converter.representative_dataset = representative_dataset - >>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - >>> converter.inference_input_type = tf.int8 - >>> converter.inference_output_type = tf.int8 - >>> serialized_model = converter.convert() - - -Using SavedModel ----------------- - -Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 
'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.5.0/_sources/using_models.rst.txt b/v0.5.0/_sources/using_models.rst.txt deleted file mode 100644 index 1c0752463f..0000000000 --- a/v0.5.0/_sources/using_models.rst.txt +++ /dev/null @@ -1,329 +0,0 @@ -Choosing the right model -======================== - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, docTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the deep learning model. -* Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable. - - -Text Detection --------------- - -The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don't). - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `linknet_resnet18 `_ -* `db_resnet50 `_ -* `db_mobilenet_v3_large `_ - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -+------------------------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+=================================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 79.35 | 84.03 | 81.14 | 66.85 | | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). 
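The warm-up / timing protocol described above can be sketched roughly as follows; the model name and input shape follow the text, while the loop itself is an illustration rather than the exact benchmark script:

.. code:: python3

    import time

    import numpy as np
    from doctr.models import detection_predictor

    predictor = detection_predictor('db_resnet50', pretrained=True)
    dummy_img = (255 * np.random.rand(1024, 1024, 3)).astype(np.uint8)

    # Warm-up: 100 forward passes with a batch size of 1
    for _ in range(100):
        predictor([dummy_img])

    # Timed run: average number of processed samples per second over 1000 samples
    start = time.time()
    for _ in range(1000):
        predictor([dummy_img])
    print(f"{1000 / (time.time() - start):.2f} FPS")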
- - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ - -`detection_predictor `_ wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> predictor = detection_predictor('db_resnet50') - >>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -Text Recognition ----------------- - -The task consists of transcribing the character sequence in a given image. - - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `crnn_vgg16_bn `_ -* `crnn_mobilenet_v3_small `_ -* `crnn_mobilenet_v3_large `_ -* `sar_resnet31 `_ -* `master `_ - - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -.. list-table:: Text recognition model zoo - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 87.18 - - 92.93 - - 12.8 - * - crnn_mobilenet_v3_small - - (32, 128, 3) - - 2.1M - - 86.21 - - 90.56 - - - * - crnn_mobilenet_v3_large - - (32, 128, 3) - - 4.5M - - 86.95 - - 92.03 - - - * - sar_resnet31 - - (32, 128, 3) - - 56.2M - - **87.70** - - **93.41** - - 2.7 - * - master - - (32, 128, 3) - - 67.7M - - 87.62 - - 93.27 - - - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metric being used (exact match) are available in :ref:`metrics`. - -While most of our recognition models were trained on our french vocab (cf. :ref:`vocabs`), you can easily access the vocab of any model as follows: - - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> print(predictor.model.cfg['vocab']) - - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -`recognition_predictor `_ wraps your recognition model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -End-to-End OCR --------------- - -The task consists of both localizing and transcribing textual elements in a given image. - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -You can use any combination of detection and recognition models supporte by docTR. 
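A short sketch of composing such a detection/recognition pair explicitly with ``ocr_predictor``; the architecture names are taken from the tables in this section, and the ``det_arch`` / ``reco_arch`` keywords follow the usage shown elsewhere in these docs, while the random input image is a placeholder.

.. code:: python3

    import numpy as np
    from doctr.models import ocr_predictor

    # Mix and match any supported detection / recognition architectures
    predictor = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)

    dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
    result = predictor([dummy_img])
    print(result.pages[0].export())  # nested blocks / lines / words with geometries and confidences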
- -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - -+----------------------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+========================================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 71.25 | 76.02 | 0.85 | 84.00 | 81.42 | 1.6 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + master | 71.03 | 76.06 | | 84.49 | 81.94 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | 71.25 | 76.29 | 0.27 | 84.50 | **81.96** | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_small | 69.85 | 74.80 | | 80.85 | 78.42 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_large | 70.57 | 75.57 | | 82.57 | 80.08 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_mobilenet_v3_large + crnn_vgg16_bn | 67.73 | 71.73 | | 71.65 | 59.03 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - -Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information. 
- - -+----------------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | US Tax Forms | Resumes | Road Fines | -+==============================================+============+===============+============+===============+============+===============+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | 78.70 | 81.12 | 65.80 | 70.70 | 50.25 | 51.78 | 79.08 | 92.83 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + master (ours) | **79.00** | **81.42** | 65.57 | 69.86 | 51.34 | 52.90 | 78.86 | 92.57 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + sar_resnet31 (ours) | 78.94 | 81.37 | 65.89 | **70.79** | **51.78** | **53.35** | 79.04 | 92.78 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_small (ours) | 76.81 | 79.15 | 64.89 | 69.61 | 45.03 | 46.38 | 78.96 | 92.11 | 85.91 | 87.20 | 84.85 | 85.86 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_large (ours) | 78.01 | 80.39 | 65.36 | 70.11 | 48.00 | 49.43 | 79.39 | 92.62 | 87.68 | 89.00 | 85.65 | 86.67 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 | 74.93 | 63.04 | 68.41 | 39.36 | 41.75 | 72.14 | 89.97 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. 
text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. Everything is wrapped up with `ocr_predictor `_. - - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) - >>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([input_page]) - - -What should I do with the output? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ocr_predictor returns a `Document` object with a nested structure (with `Page`, `Block`, `Line`, `Word`, `Artefact`). -To get a better understanding of our document model, check our :ref:`document_structure` section - -Here is a typical `Document` layout:: - - Document( - (pages): [Page( - dimensions=(340, 600) - (blocks): [Block( - (lines): [Line( - (words): [ - Word(value='No.', confidence=0.91), - Word(value='RECEIPT', confidence=0.99), - Word(value='DATE', confidence=0.96), - ] - )] - (artefacts): [] - )] - )] - ) - -You can also export them as a nested dict, more appropriate for JSON format:: - - json_output = result.export() - -For reference, here is the JSON export for the same `Document` as above:: - - { - 'pages': [ - { - 'page_idx': 0, - 'dimensions': (340, 600), - 'orientation': {'value': None, 'confidence': None}, - 'language': {'value': None, 'confidence': None}, - 'blocks': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'lines': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'words': [ - { - 'value': 'No.', - 'confidence': 0.914085328578949, - 'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)) - }, - { - 'value': 'RECEIPT', - 'confidence': 0.9949972033500671, - 'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)) - }, - { - 'value': 'DATE', - 'confidence': 0.9578408598899841, - 'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)) - } - ] - } - ], - 'artefacts': [] - } - ] - } - ] - } - -To export the outpout as XML (hocr-format) you can use the `export_as_xml` method:: - - xml_output = result.export_as_xml() - for output in xml_output: - xml_bytes_string = output[0] - xml_element = output[1] - -For reference, here is a sample XML byte string output:: - - - - - docTR - hOCR - - - - - -
                            -
                            -

                            - - Hello - XML - World - -

                            -
                            - - \ No newline at end of file diff --git a/v0.5.0/_sources/utils.rst.txt b/v0.5.0/_sources/utils.rst.txt index ac0b13d9df..69c1abe0eb 100644 --- a/v0.5.0/_sources/utils.rst.txt +++ b/v0.5.0/_sources/utils.rst.txt @@ -14,8 +14,6 @@ Easy-to-use functions to make sense of your model's predictions. .. autofunction:: visualize_page -.. autofunction:: synthesize_page - .. _metrics: @@ -27,20 +25,12 @@ Implementations of task-specific metrics to easily assess your model performance .. autoclass:: TextMatch - .. automethod:: update .. automethod:: summary .. autoclass:: LocalizationConfusion - .. automethod:: update .. automethod:: summary .. autoclass:: OCRMetric - .. automethod:: update - .. automethod:: summary - -.. autoclass:: DetectionMetric - - .. automethod:: update .. automethod:: summary diff --git a/v0.5.0/_static/documentation_options.js b/v0.5.0/_static/documentation_options.js index d02336db0b..a7b5cbe04a 100644 --- a/v0.5.0/_static/documentation_options.js +++ b/v0.5.0/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.5.0a0-git', + VERSION: '0.3.0a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.5.0/changelog.html b/v0.5.0/changelog.html index 55482bbcda..6ed2620fb7 100644 --- a/v0.5.0/changelog.html +++ b/v0.5.0/changelog.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + Changelog - docTR documentation @@ -227,28 +227,21 @@ @@ -290,22 +283,6 @@

                            Changelog

                            -
                            -

                            v0.4.1 (2021-11-22)

                            -

                            Release note: v0.4.1

                            -
                            -
                            -

                            v0.4.0 (2021-10-01)

                            -

                            Release note: v0.4.0

                            -
                            -
                            -

                            v0.3.1 (2021-08-27)

                            -

                            Release note: v0.3.1

                            -
                            -
                            -

                            v0.3.0 (2021-07-02)

                            -

                            Release note: v0.3.0

                            -

                            v0.2.1 (2021-05-28)

                            Release note: v0.2.1

                            @@ -329,15 +306,23 @@

                            v0.1.0 (2021-03-05) - - + +
                            +
                            + Next +
                            +
                            doctr.datasets
                            +
                            + +
                            +
                            Previous
                            -
                            doctr.utils
                            +
                            Installation
                            @@ -372,10 +357,6 @@

                            v0.1.0 (2021-03-05)

                            diff --git a/v0.5.0/datasets.html b/v0.5.0/datasets.html index 1f5855cc82..640791680a 100644 --- a/v0.5.0/datasets.html +++ b/v0.5.0/datasets.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.datasets - docTR documentation @@ -227,28 +227,21 @@ @@ -294,12 +287,16 @@

                            doctr.datasets

                            Available Datasets

                            -

                            Here are all datasets that are available through docTR:

                            -
                            -

                            Public datasets

                            +

                            The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

                            +
                            +
                            +class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
                            +
                            + +

                            Here are all datasets that are available through DocTR:

                            -class doctr.datasets.FUNSD(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                            +class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

                            FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

                            Example::
                            >>> from doctr.datasets import FUNSD
                            @@ -313,7 +310,8 @@ 

                            Public datasetsParameters:
                            • train – whether the subset should be the training one

                            • -
                            • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • +
                            • sample_transforms – composable transformations that will be applied to each image

                            • +
                            • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • **kwargs – keyword arguments from VisionDataset.

                            @@ -322,7 +320,7 @@

                            Public datasets
                            -class doctr.datasets.SROIE(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                            +class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

                            SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

                            Example::
                            - -
                            -
                            -class doctr.datasets.IIIT5K(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                            -

                            IIIT-5K character-level localization dataset from -“BMVC 2012 Scene Text Recognition using Higher Order Language Priors”.

                            -
                            -
                            Example::
                            >>> # NOTE: this dataset is for character-level localization
                            ->>> from doctr.datasets import IIIT5K
                            ->>> train_set = IIIT5K(train=True, download=True)
                            ->>> img, target = train_set[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • train – whether the subset should be the training one

                            • -
                            • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • -
                            • **kwargs – keyword arguments from VisionDataset.

                            • -
                            -
                            -
                            -
                            - -
                            -
                            -class doctr.datasets.SVT(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                            -

                            SVT dataset from “The Street View Text Dataset - UCSD Computer Vision”.

                            -
                            -
                            Example::
                            >>> from doctr.datasets import SVT
                            ->>> train_set = SVT(train=True, download=True)
                            ->>> img, target = train_set[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • train – whether the subset should be the training one

                            • -
                            • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • -
                            • **kwargs – keyword arguments from VisionDataset.

                            • -
                            -
                            -
                            -
                            - -
                            -
                            -class doctr.datasets.SVHN(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                            -

                            SVHN dataset from “The Street View House Numbers (SVHN) Dataset”.

                            -
                            -
                            Example::
                            >>> from doctr.datasets import SVHN
                            ->>> train_set = SVHN(train=True, download=True)
                            ->>> img, target = train_set[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • train – whether the subset should be the training one

                            • -
                            • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • -
                            • **kwargs – keyword arguments from VisionDataset.

                            • -
                            -
                            -
                            -
                            - -
                            -
                            -class doctr.datasets.SynthText(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                            -

                            SynthText dataset from “Synthetic Data for Text Localisation in Natural Images” | “repository” | -“website”.

                            -
                            -
                            Example::
                            >>> from doctr.datasets import SynthText
                            ->>> train_set = SynthText(train=True, download=True)
                            ->>> img, target = train_set[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • train – whether the subset should be the training one

                            • -
                            • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • -
                            • **kwargs – keyword arguments from VisionDataset.

                            • -
                            -
                            -
                            -
                            - -
                            -
                            -class doctr.datasets.IC03(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                            -

                            IC03 dataset from “ICDAR 2003 Robust Reading Competitions: Entries, Results and Future Directions”.

                            -
                            -
                            Example::
                            >>> from doctr.datasets import IC03
                            ->>> train_set = IC03(train=True, download=True)
                            ->>> img, target = train_set[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • train – whether the subset should be the training one

                            • -
                            • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • -
                            • **kwargs – keyword arguments from VisionDataset.

                            • -
                            -
                            -
                            -
                            - -
                            -
                            -class doctr.datasets.IC13(img_folder: str, label_folder: str, use_polygons: bool = False, **kwargs: Any)[source]
                            -

                            IC13 dataset from “ICDAR 2013 Robust Reading Competition”. -Example:

                            -
                            >>> # NOTE: You need to download both image and label parts from Focused Scene Text challenge Task2.1 2013-2015.
                            ->>> from doctr.datasets import IC13
                            ->>> train_set = IC13(img_folder="/path/to/Challenge2_Training_Task12_Images",
                            ->>>                  label_folder="/path/to/Challenge2_Training_Task1_GT")
                            ->>> img, target = train_set[0]
                            ->>> test_set = IC13(img_folder="/path/to/Challenge2_Test_Task12_Images",
                            ->>>                 label_folder="/path/to/Challenge2_Test_Task1_GT")
                            ->>> img, target = test_set[0]
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • img_folder – folder with all the images of the dataset

                            • -
                            • label_folder – folder with all annotation files for the images

                            • -
                            • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • -
                            -
                            -
                            -
                            - -

                            -
                            -

                            docTR synthetic datasets

                            -
                            -
                            -class doctr.datasets.DocArtefacts(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                            -

                            Object detection dataset for non-textual elements in documents. -The dataset includes a variety of synthetic document pages with non-textual elements.

                            -
                            -
                            Example::
                            >>> from doctr.datasets import DocArtefacts
                            ->>> train_set = DocArtefacts(download=True)
                            ->>> img, target = train_set[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • train – whether the subset should be the training one

                            • -
                            • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • +
                            • sample_transforms – composable transformations that will be applied to each image

                            • +
                            • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • **kwargs – keyword arguments from VisionDataset.

                            -
                            -
                            -class doctr.datasets.CharacterGenerator(*args, **kwargs)[source]
                            -

                            Implements a character image generation dataset

                            -
                            -
                            Example::
                            >>> from doctr.datasets import CharacterGenerator
                            ->>> ds = CharacterGenerator(vocab='abdef')
                            ->>> img, target = ds[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • vocab – vocabulary to take the character from

                            • -
                            • num_samples – number of samples that will be generated iterating over the dataset

                            • -
                            • cache_samples – whether generated images should be cached firsthand

                            • -
                            • font_family – font to use to generate the text images

                            • -
                            • img_transforms – composable transformations that will be applied to each image

                            • -
                            • sample_transforms – composable transformations that will be applied to both the image and the target

                            • -
                            -
                            -
                            -
                            - -
                            -
                            -class doctr.datasets.WordGenerator(vocab: str, min_chars: int, max_chars: int, num_samples: int, cache_samples: bool = False, font_family: str | List[str] | None = None, img_transforms: Callable[[Any], Any] | None = None, sample_transforms: Callable[[Any, Any], Tuple[Any, Any]] | None = None)[source]
                            -

                            Implements a character image generation dataset

                            -
                            -
                            Example::
                            >>> from doctr.datasets import WordGenerator
                            ->>> ds = WordGenerator(vocab='abdef')
                            ->>> img, target = ds[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • vocab – vocabulary to take the character from

                            • -
                            • min_chars – minimum number of characters in a word

                            • -
                            • max_chars – maximum number of characters in a word

                            • -
                            • num_samples – number of samples that will be generated iterating over the dataset

                            • -
                            • cache_samples – whether generated images should be cached firsthand

                            • -
                            • font_family – font to use to generate the text images

                            • -
                            • img_transforms – composable transformations that will be applied to each image

                            • -
                            • sample_transforms – composable transformations that will be applied to both the image and the target

                            • -
                            -
                            -
                            -
                            - -
                            -
                            -

                            docTR private datasets

                            -

                            Since many documents include sensitive / personal information, we are not able to share all the data that has been used for this project. However, we provide some guidance on how to format your own dataset into the same format so that you can use all docTR tools all the same.

                            -
                            -
                            -class doctr.datasets.DetectionDataset(img_folder: str, label_path: str, use_polygons: bool = False, **kwargs: Any)[source]
                            -

                            Implements a text detection dataset

                            -
                            -
                            Example::
                            >>> from doctr.datasets import DetectionDataset
                            ->>> train_set = DetectionDataset(img_folder="/path/to/images", label_path="/path/to/labels.json")
                            ->>> img, target = train_set[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • img_folder – folder with all the images of the dataset

                            • -
                            • label_path – path to the annotations of each image

                            • -
                            • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • -
                            -
                            -
                            -
                            - -
                            -
                            -class doctr.datasets.RecognitionDataset(img_folder: str, labels_path: str, **kwargs: Any)[source]
                            -

                            Dataset implementation for text recognition tasks

                            -
                            -
                            Example::
                            >>> from doctr.datasets import RecognitionDataset
                            ->>> train_set = RecognitionDataset(img_folder="/path/to/images", labels_path="/path/to/labels.json")
                            ->>> img, target = train_set[0]
                            -
                            -
                            -
                            -
                            -
                            -
                            Parameters:
                            -
                              -
                            • img_folder – path to the images folder

                            • -
• labels_path – path to the json file containing all labels (character sequences)

                            • -
                            -
                            -
                            -
                            -
                            -class doctr.datasets.OCRDataset(img_folder: str, label_file: str, use_polygons: bool = False, **kwargs: Any)[source]
                            +class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

                            Implements an OCR dataset

                            Parameters:
                            • img_folder – local path to image folder (all jpg at the root)

                            • label_file – local path to the label file

                            • -
                            • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • +
                            • sample_transforms – composable transformations that will be applied to each image

                            • +
                            • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

                            • +
                            • **kwargs – keyword arguments from VisionDataset.

                            -

                            Data Loading

                            -

                            Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR.

                            +

                            Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

                            -class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, num_workers: int | None = None, collate_fn: Callable | None = None)[source]
                            +class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]

                            Implements a dataset wrapper for fast data loading

                            Example::
                            >>> from doctr.datasets import FUNSD, DataLoader
                            @@ -681,7 +408,7 @@ 

                            Data Loading

                            Supported Vocabs

                            -

                            Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +

                            Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets of vocabs.

                            - +@@ -724,39 +451,19 @@

                            Data Loading

                            - - - - - - - - - - + + - - - - - - - - - - - - - - + +
                            docTR VocabsDocTR Vocabs

                            latin

                            94

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~

                            english

                            100

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿

                            legacy_french

                            123

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

                            96

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

                            french

                            126

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ

                            portuguese

                            131

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿

                            spanish

                            116

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿

                            german

                            108

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ

                            154

                            0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

                            -doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, dynamic_seq_length: bool = False, **kwargs: Any) ndarray[source]
                            +doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]

                            Encode character sequences using a given vocab as mapping

                            Parameters:
                            @@ -767,7 +474,6 @@

                            Data LoadingReturns: @@ -784,23 +490,23 @@

                            Data Loading - +
                            Next
                            -
                            doctr.io
                            +
                            doctr.documents
                            - +
                            Previous
                            -
                            Preparing your model for inference
                            +
                            Changelog
                            @@ -836,32 +542,13 @@

                            Data Loadingdoctr.datasets

                            diff --git a/v0.5.0/genindex.html b/v0.5.0/genindex.html index 71543108f3..10d0739337 100644 --- a/v0.5.0/genindex.html +++ b/v0.5.0/genindex.html @@ -225,28 +225,21 @@ @@ -283,17 +276,17 @@

                            Index

                            -
                            A | B | C | D | E | F | G | I | L | M | N | O | P | R | S | T | U | V | W
                            +
                            A | B | C | D | E | F | G | L | M | N | O | P | Q | R | S | T | V | W

                            A

                            @@ -303,7 +296,7 @@

                            A

                            B

                            @@ -313,25 +306,19 @@

                            B

                            C

                            @@ -342,26 +329,16 @@

                            D

                            @@ -381,13 +358,13 @@

                            E

                            F

                            + -
                            @@ -613,29 +544,11 @@

                            T

                        • -
                          -

                          U

                          - - -
                          -
                          -

                          V

                          + +

                      • Composing transformations
                          @@ -768,7 +674,7 @@

                          Composing transformations + diff --git a/v0.5.0/using_doctr/using_model_export.html b/v0.5.0/using_doctr/using_model_export.html index d467663403..75c81caa7c 100644 --- a/v0.5.0/using_doctr/using_model_export.html +++ b/v0.5.0/using_doctr/using_model_export.html @@ -316,7 +316,7 @@

                          Half-precision
                          import tensorflow as tf
                          -from keras import mixed_precision
                          +from tensorflow.keras import mixed_precision
                           mixed_precision.set_global_policy('mixed_float16')
                           predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
                           
                          diff --git a/v0.5.0/using_model_export.html b/v0.5.0/using_model_export.html deleted file mode 100644 index 9b0acb00fe..0000000000 --- a/v0.5.0/using_model_export.html +++ /dev/null @@ -1,436 +0,0 @@ - - - - - - - - - - - - - Preparing your model for inference - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                          -
                          -
                          - -
                          - -
                          -
                          - -
                          - -
                          -
                          - -
                          -
                          -
                          - - - - - Back to top - -
                          - -
                          - -
                          - -
                          -
                          -
                          -

                          Preparing your model for inference

                          -

                          A well-trained model is a good achievement but you might want to tune a few things to make it production-ready!

                          -
                          -

                          Model compression

                          -

                          This section is meant to help you perform inference with compressed versions of your model.

                          -
                          -

                          TensorFlow Lite

                          -

                          TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows:

                          -
                          >>> import tensorflow as tf
                          ->>> from tensorflow.keras import Sequential
                          ->>> from doctr.models import conv_sequence
                          ->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> converter = tf.lite.TFLiteConverter.from_keras_model(model)
                          ->>> serialized_model = converter.convert()
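The converter returns the serialized model as raw bytes, so persisting it is a plain file write; a minimal sketch (the destination path below is only illustrative):
>>> with open('path/to/your/folder/model.tflite', 'wb') as f:
>>>     f.write(serialized_model)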
                          -
                          -
                          -
                          -
                          -

                          Half-precision

                          -

                          If you want to convert it to half-precision using your TFLite converter

                          -
                          >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
                          ->>> converter.target_spec.supported_types = [tf.float16]
                          ->>> serialized_model = converter.convert()
                          -
                          -
                          -
                          -
                          -

                          Post-training quantization

                          -

Finally, if you wish to quantize the model with your TFLite converter

                          -
>>> import numpy as np
->>> input_shape = (224, 224, 3)  # matches the input shape of the Keras model defined above
->>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
                          ->>> # Float fallback for operators that do not have an integer implementation
                          ->>> def representative_dataset():
                          ->>>     for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)]
                          ->>> converter.representative_dataset = representative_dataset
                          ->>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
                          ->>> converter.inference_input_type = tf.int8
                          ->>> converter.inference_output_type = tf.int8
                          ->>> serialized_model = converter.convert()
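To sanity-check the quantized artifact, you can reload it with the TFLite interpreter; this is only a minimal sketch, and feeding real inputs will depend on your model:
>>> interpreter = tf.lite.Interpreter(model_content=serialized_model)
>>> interpreter.allocate_tensors()
>>> print(interpreter.get_input_details()[0]['dtype'])  # expected to report int8 after full integer quantization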
                          -
                          -
                          -
                          -
                          -
                          -

                          Using SavedModel

                          -

                          Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

                          -
                          >>> import tensorflow as tf
                          ->>> from doctr.models import db_resnet50
                          ->>> model = db_resnet50(pretrained=True)
                          ->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
                          ->>> _ = model(input_t, training=False)
                          ->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
                          -
                          -
                          -

                          And loaded just as easily:

                          -
                          >>> import tensorflow as tf
                          ->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
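Objects restored with tf.saved_model.load expose the signatures that were exported; whether a serving_default entry is present depends on how the model was traced before saving, so treat this as a quick, hedged sanity check:
>>> print(list(model.signatures.keys()))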
                          -
                          -
                          -
                          -
                          - -
                          -
                          - -
                          - -
                          -
                          - - - - - - - - \ No newline at end of file diff --git a/v0.5.0/using_models.html b/v0.5.0/using_models.html deleted file mode 100644 index 53cad99cac..0000000000 --- a/v0.5.0/using_models.html +++ /dev/null @@ -1,909 +0,0 @@ - - - - - - - - - - - - - Choosing the right model - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
                          -
                          -
                          - -
                          - -
                          -
                          - -
                          - -
                          -
                          - -
                          -
                          -
                          - - - - - Back to top - -
                          - -
                          - -
                          - -
                          -
                          -
                          -

                          Choosing the right model

                          -

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Whether they are performed at once or separately, each task corresponds to a specific type of deep learning architecture.

                          -

                          For a given task, docTR provides a Predictor, which is composed of 2 components:

                          -
                            -
                          • PreProcessor: a module in charge of making inputs directly usable by the deep learning model.

                          • -
                          • Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable.

                          • -
                          -
                          -

                          Text Detection

                          -

                          The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don’t).

                          -
                          -

                          Available architectures

                          -

                          The following architectures are currently supported:

                          - -

                          For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

                          -
                          - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

                          FUNSD

                          CORD

                          Architecture

                          Input shape

                          # params

                          Recall

                          Precision

                          Recall

                          Precision

                          FPS

                          db_resnet50

                          (1024, 1024, 3)

                          25.2 M

                          82.14

                          87.64

                          92.49

                          89.66

                          2.1

                          db_mobilenet_v3_large

                          (1024, 1024, 3)

                          4.2 M

                          79.35

                          84.03

                          81.14

                          66.85

                          -
                          -

                          All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

                          -

                          Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

                          -

                          FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.x12large AWS instance (CPU Xeon Platinum 8275L).

                          -
                          -
                          -

                          Detection predictors

                          -

detection_predictor wraps your detection model so that it can be used seamlessly with your favorite deep learning framework.

                          -
                          >>> import numpy as np
                          ->>> from doctr.models import detection_predictor
                          ->>> predictor = detection_predictor('db_resnet50')
                          ->>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
->>> out = predictor([dummy_img])
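Beyond dummy arrays, you will typically feed real pages. A minimal sketch, assuming doctr.io.DocumentFile is available in your version (the file path is hypothetical):
>>> from doctr.io import DocumentFile
>>> pages = DocumentFile.from_images("path/to/your/page.jpg")  # one numpy array per page
>>> out = predictor(pages)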
                          -
                          -
                          -
                          -
                          -
                          -

                          Text Recognition

                          -

                          The task consists of transcribing the character sequence in a given image.

                          -
                          -

                          Available architectures

                          -

                          The following architectures are currently supported:

                          - -

                          For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

                          -
                          - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                          Text recognition model zoo

                          Architecture

                          Input shape

                          # params

                          FUNSD

                          CORD

                          FPS

                          crnn_vgg16_bn

                          (32, 128, 3)

                          15.8M

                          87.18

                          92.93

                          12.8

                          crnn_mobilenet_v3_small

                          (32, 128, 3)

                          2.1M

                          86.21

                          90.56

                          crnn_mobilenet_v3_large

                          (32, 128, 3)

                          4.5M

                          86.95

                          92.03

                          sar_resnet31

                          (32, 128, 3)

                          56.2M

                          87.70

                          93.41

                          2.7

                          master

                          (32, 128, 3)

                          67.7M

                          87.62

                          93.27

                          -
                          -

                          All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metric being used (exact match) are available in Task evaluation.

                          -

                          While most of our recognition models were trained on our french vocab (cf. Supported Vocabs), you can easily access the vocab of any model as follows:

                          -
                          >>> from doctr.models import recognition_predictor
                          ->>> predictor = recognition_predictor('crnn_vgg16_bn')
                          ->>> print(predictor.model.cfg['vocab'])
                          -
                          -
                          -

Disclaimer: both FUNSD subsets combined have 30595 word-level crops which might not be representative enough of the model capabilities

                          -

                          FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.x12large AWS instance (CPU Xeon Platinum 8275L).


                          Recognition predictors

                          -

recognition_predictor wraps your recognition model so that you can use it seamlessly with your favorite deep learning framework.

>>> import numpy as np
>>> from doctr.models import recognition_predictor
>>> predictor = recognition_predictor('crnn_vgg16_bn')
>>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8)
>>> out = predictor([dummy_img])

                          End-to-End OCR

                          -

                          The task consists of both localizing and transcribing textual elements in a given image.


                          Available architectures

                          -

You can use any combination of detection and recognition models supported by docTR.

                          -

                          For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

| Architecture                          | FUNSD Recall | FUNSD Precision | FUNSD FPS | CORD Recall | CORD Precision | CORD FPS |
|---------------------------------------|--------------|-----------------|-----------|-------------|----------------|----------|
| db_resnet50 + crnn_vgg16_bn           | 71.25        | 76.02           | 0.85      | 84.00       | 81.42          | 1.6      |
| db_resnet50 + master                  | 71.03        | 76.06           |           | 84.49       | 81.94          |          |
| db_resnet50 + sar_resnet31            | 71.25        | 76.29           | 0.27      | 84.50       | 81.96          | 0.83     |
| db_resnet50 + crnn_mobilenet_v3_small | 69.85        | 74.80           |           | 80.85       | 78.42          | 0.83     |
| db_resnet50 + crnn_mobilenet_v3_large | 70.57        | 75.57           |           | 82.57       | 80.08          | 0.83     |
| db_mobilenet_v3_large + crnn_vgg16_bn | 67.73        | 71.73           |           | 71.65       | 59.03          |          |
| Gvision text detection                | 59.50        | 62.50           |           | 75.30       | 70.00          |          |
| Gvision doc. text detection           | 64.00        | 53.30           |           | 68.90       | 61.10          |          |
| AWS textract                          | 78.10        | 83.00           |           | 87.50       | 66.00          |          |

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

                          -

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities.

                          -

FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

                          -

                          Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information.

| Architecture                                 | Receipts Recall | Receipts Precision | Invoices Recall | Invoices Precision | IDs Recall | IDs Precision | US Tax Forms Recall | US Tax Forms Precision | Resumes Recall | Resumes Precision | Road Fines Recall | Road Fines Precision |
|----------------------------------------------|-----------------|--------------------|-----------------|--------------------|------------|---------------|---------------------|------------------------|----------------|-------------------|-------------------|----------------------|
| db_resnet50 + crnn_vgg16_bn (ours)           | 78.70           | 81.12              | 65.80           | 70.70              | 50.25      | 51.78         | 79.08               | 92.83                  |                |                   |                   |                      |
| db_resnet50 + master (ours)                  | 79.00           | 81.42              | 65.57           | 69.86              | 51.34      | 52.90         | 78.86               | 92.57                  |                |                   |                   |                      |
| db_resnet50 + sar_resnet31 (ours)            | 78.94           | 81.37              | 65.89           | 70.79              | 51.78      | 53.35         | 79.04               | 92.78                  |                |                   |                   |                      |
| db_resnet50 + crnn_mobilenet_v3_small (ours) | 76.81           | 79.15              | 64.89           | 69.61              | 45.03      | 46.38         | 78.96               | 92.11                  | 85.91          | 87.20             | 84.85             | 85.86                |
| db_resnet50 + crnn_mobilenet_v3_large (ours) | 78.01           | 80.39              | 65.36           | 70.11              | 48.00      | 49.43         | 79.39               | 92.62                  | 87.68          | 89.00             | 85.65             | 86.67                |
| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36           | 74.93              | 63.04           | 68.41              | 39.36      | 41.75         | 72.14               | 89.97                  |                |                   |                   |                      |
| Gvision doc. text detection                  | 68.91           | 59.89              | 63.20           | 52.85              | 43.70      | 29.21         | 69.79               | 65.68                  |                |                   |                   |                      |
| AWS textract                                 | 75.77           | 77.70              | 70.47           | 69.13              | 46.39      | 43.32         | 84.31               | 98.11                  |                |                   |                   |                      |

                          Two-stage approaches

                          -

Those architectures involve one stage of text detection and one stage of text recognition. The text detection output is used to produce cropped images that are then passed to the text recognition block. Everything is wrapped up with ocr_predictor.

>>> import numpy as np
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
>>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
>>> out = model([input_page])

                          What should I do with the output?

                          -

The ocr_predictor returns a Document object with a nested structure (with Page, Block, Line, Word, Artefact). To get a better understanding of our document model, check our Document structure section.

                          -

                          Here is a typical Document layout:

Document(
  (pages): [Page(
    dimensions=(340, 600)
    (blocks): [Block(
      (lines): [Line(
        (words): [
          Word(value='No.', confidence=0.91),
          Word(value='RECEIPT', confidence=0.99),
          Word(value='DATE', confidence=0.96),
        ]
      )]
      (artefacts): []
    )]
  )]
)
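Since this is a nested Python object, you can simply walk the structure to collect every recognized word. A small sketch, assuming out is the Document returned by the predictor in the previous example:

>>> for page in out.pages:
...     for block in page.blocks:
...         for line in block.lines:
...             for word in line.words:
...                 print(word.value, word.confidence)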

                          You can also export them as a nested dict, more appropriate for JSON format:

json_output = result.export()

                          For reference, here is the JSON export for the same Document as above:

{
  'pages': [
      {
          'page_idx': 0,
          'dimensions': (340, 600),
          'orientation': {'value': None, 'confidence': None},
          'language': {'value': None, 'confidence': None},
          'blocks': [
              {
                  'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
                  'lines': [
                      {
                          'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
                          'words': [
                              {
                                  'value': 'No.',
                                  'confidence': 0.914085328578949,
                                  'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875))
                              },
                              {
                                  'value': 'RECEIPT',
                                  'confidence': 0.9949972033500671,
                                  'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375))
                              },
                              {
                                  'value': 'DATE',
                                  'confidence': 0.9578408598899841,
                                  'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625))
                              }
                          ]
                      }
                  ],
                  'artefacts': []
              }
          ]
      }
  ]
}
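If you want to persist this export, here is a minimal sketch using the standard library (the file name is arbitrary; tuples such as dimensions and geometry are serialized as JSON arrays):

>>> import json
>>> with open('ocr_output.json', 'w', encoding='utf-8') as f:
...     json.dump(json_output, f, ensure_ascii=False, indent=2)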

To export the output as XML (hOCR format), you can use the export_as_xml method:

xml_output = result.export_as_xml()
for output in xml_output:
  xml_bytes_string = output[0]
  xml_element = output[1]

                          For reference, here is a sample XML byte string output:

<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
  <head>
    <title>docTR - hOCR</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <meta name="ocr-system" content="doctr 0.5.0" />
    <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
  </head>
  <body>
    <div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
    <div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
      <p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
        <span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
          <span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
          <span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
          <span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
        </span>
      </p>
    </div>
  </body>
</html>
diff --git a/v0.5.0/utils.html b/v0.5.0/utils.html
index 21f708c953..1908ef4ff4 100644
--- a/v0.5.0/utils.html
+++ b/v0.5.0/utils.html

Visualization

doctr.utils.visualization.synthesize_page(page: Dict[str, Any], draw_proba: bool = False, font_size: int = 13, font_family: str | None = None) → ndarray[source]

Draw the content of the element page (OCR response) on a blank page.

Parameters:

• page – exported Page object to represent
• draw_proba – if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
• font_size – size of the font, default font = 13
• font_family – family of the font

Returns:

the synthesized page
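As a quick illustration, here is a minimal usage sketch, assuming result is the Document returned by an OCR predictor as in the examples above (matplotlib is only used for display):

>>> import matplotlib.pyplot as plt
>>> from doctr.utils.visualization import synthesize_page
>>> page_export = result.export()['pages'][0]
>>> img = synthesize_page(page_export, draw_proba=True)
>>> plt.imshow(img); plt.axis('off'); plt.show()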

                      -

                      Task evaluation

                      @@ -382,20 +356,6 @@

update(gt: List[str], pred: List[str]) → None[source]

Update the state of the metric with new predictions

Parameters:

• gt – list of ground-truth character sequences
• pred – list of predicted character sequences
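For illustration, here is a minimal sketch of how this update/summary pair is typically driven, assuming it belongs to the exact-match recognition metric (doctr.utils.metrics.TextMatch):

>>> from doctr.utils.metrics import TextMatch
>>> metric = TextMatch()
>>> metric.update(['Hello', 'world'], ['Hello', 'world!'])
>>> metric.summary()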

summary() → Dict[str, float][source]
                      @@ -412,14 +372,14 @@

                      -class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
                      +class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]

                      Implements common confusion metrics and mean IoU for localization evaluation.

                      The aggregated metrics are computed as follows:

                      \[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ +Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]

                      with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and @@ -448,28 +408,9 @@

Parameters:

• iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match
• use_polygons – if set to True, predictions and targets will be expected to have rotated format
• mask_shape – if use_polygons is True, describes the spatial shape of the image used
• use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory
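As a quick illustration, here is a minimal usage sketch for this metric, mirroring the DetectionMetric example further down (box values are taken from that example):

>>> import numpy as np
>>> from doctr.utils.metrics import LocalizationConfusion
>>> metric = LocalizationConfusion(iou_thresh=0.5)
>>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
>>> metric.summary()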
update(gts: ndarray, preds: ndarray) → None[source]

Updates the metric

Parameters:

• gts – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
• preds – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
summary() → Tuple[float | None, float | None, float | None][source]
                      @@ -485,15 +426,15 @@

                      -class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
                      -

                      Implements an end-to-end OCR metric.

                      +class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source] +

                      Implements end-to-end OCR metric.

                      The aggregated metrics are computed as follows:

                      \[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ +Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]

                      with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and @@ -525,116 +466,16 @@

Parameters:

• iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match
• use_polygons – if set to True, predictions and targets will be expected to have rotated format
• mask_shape – if use_polygons is True, describes the spatial shape of the image used
• use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: List[str], pred_labels: List[str]) → None[source]

Updates the metric

Parameters:

• gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
• pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
• gt_labels – a list of N string labels
• pred_labels – a list of M string labels
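Similarly, here is a minimal usage sketch for this end-to-end metric, with one ground-truth box/label and two predictions (values adapted from the DetectionMetric example further down):

>>> import numpy as np
>>> from doctr.utils.metrics import OCRMetric
>>> metric = OCRMetric(iou_thresh=0.5)
>>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
...               ['hello'], ['hello', 'world'])
>>> metric.summary()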
                      summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]

                      Computes the aggregated metrics

                      Returns:
                      -

                      a tuple with the recall & precision for each string comparison and the mean IoU

class doctr.utils.metrics.DetectionMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]

Implements an object detection metric.

The aggregated metrics are computed as follows:

\[\begin{split}\forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]

with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and \(y\), and the function \(h_{B, C}\) defined as:

\[\begin{split}\forall (b, c) \in \mathcal{B} \times \mathcal{C}, h_{B,C}(b, c) = \left\{ \begin{array}{ll} 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ 0 & \mbox{otherwise.} \end{array} \right.\end{split}\]

where \(\mathcal{B}\) is the set of possible bounding boxes, \(\mathcal{C}\) is the set of possible class indices, \(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

Example::
    >>> import numpy as np
    >>> from doctr.utils import DetectionMetric
    >>> metric = DetectionMetric(iou_thresh=0.5)
    >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
    ...               np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64))
    >>> metric.summary()
Parameters:

• iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match
• use_polygons – if set to True, predictions and targets will be expected to have rotated format
• mask_shape – if use_polygons is True, describes the spatial shape of the image used
• use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory
update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: ndarray, pred_labels: ndarray) → None[source]

Updates the metric

Parameters:

• gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
• pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
• gt_labels – an array of class indices of shape (N,)
• pred_labels – an array of class indices of shape (M,)
summary() → Tuple[float | None, float | None, float | None][source]

Computes the aggregated metrics

Returns:

a tuple with the recall & precision for each class prediction and the mean IoU

                      @@ -649,15 +490,7 @@

diff --git a/v0.5.1/_modules/doctr/datasets/classification/tensorflow.html b/v0.5.1/_modules/doctr/datasets/classification/tensorflow.html
deleted file mode 100644
index 829b6efb9d..0000000000
--- a/v0.5.1/_modules/doctr/datasets/classification/tensorflow.html
+++ /dev/null

                      Source code for doctr.datasets.classification.tensorflow

# Copyright (C) 2021, Mindee.

# This program is licensed under the Apache License version 2.
# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.

import tensorflow as tf

from .base import _CharacterGenerator

__all__ = ['CharacterGenerator']


class CharacterGenerator(_CharacterGenerator):
    """Implements a character image generation dataset

    Example::
        >>> from doctr.datasets import CharacterGenerator
        >>> ds = CharacterGenerator(vocab='abdef')
        >>> img, target = ds[0]

    Args:
        vocab: vocabulary to take the character from
        num_samples: number of samples that will be generated iterating over the dataset
        cache_samples: whether generated images should be cached firsthand
        sample_transforms: composable transformations that will be applied to each image
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

    @staticmethod
    def collate_fn(samples):

        images, targets = zip(*samples)
        images = tf.stack(images, axis=0)

        return images, tf.convert_to_tensor(targets)
diff --git a/v0.5.1/_modules/doctr/datasets/cord.html b/v0.5.1/_modules/doctr/datasets/cord.html
index 34524b2f5c..3b89955bd8 100644
--- a/v0.5.1/_modules/doctr/datasets/cord.html
+++ b/v0.5.1/_modules/doctr/datasets/cord.html
@@ -226,32 +226,20 @@

                      Source code for doctr.datasets.cord

                      -# Copyright (C) 2021-2022, Mindee.
                      +# Copyright (C) 2021, Mindee.
                       
                       # This program is licensed under the Apache License version 2.
                       # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                       
                      -import json
                       import os
                      -from pathlib import Path
                      -from typing import Any, Dict, List, Tuple
                      -
                      +import json
                       import numpy as np
                      +from pathlib import Path
                      +from typing import List, Dict, Any, Tuple, Optional, Callable
                       
                       from .datasets import VisionDataset
                      -from .utils import convert_target_to_relative
                      +from doctr.utils.geometry import fit_rbbox
                       
                       __all__ = ['CORD']
                       
                       
                       
                      -[docs] +[docs] class CORD(VisionDataset): """CORD dataset from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" <https://openreview.net/pdf?id=SJl3z659UH>`_. - .. image:: https://github.com/mindee/doctr/releases/download/v0.5.0/cord-grid.png - :align: center - - >>> from doctr.datasets import CORD - >>> train_set = CORD(train=True, download=True) - >>> img, target = train_set[0] + Example:: + >>> from doctr.datasets import CORD + >>> train_set = CORD(train=True, download=True) + >>> img, target = train_set[0] Args: train: whether the subset should be the training one - use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + sample_transforms: composable transformations that will be applied to each image + rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) **kwargs: keyword arguments from `VisionDataset`. """ TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip', @@ -332,42 +318,41 @@

                      Source code for doctr.datasets.cord

                           def __init__(
                               self,
                               train: bool = True,
                      -        use_polygons: bool = False,
                      +        sample_transforms: Optional[Callable[[Any], Any]] = None,
                      +        rotated_bbox: bool = False,
                               **kwargs: Any,
                           ) -> None:
                       
                               url, sha256 = self.TRAIN if train else self.TEST
                      -        super().__init__(url, None, sha256, True, pre_transforms=convert_target_to_relative, **kwargs)
                      +        super().__init__(url, None, sha256, True, **kwargs)
                       
                               # # List images
                      -        tmp_root = os.path.join(self.root, 'image')
                      +        self.root = os.path.join(self._root, 'image')
                               self.data: List[Tuple[str, Dict[str, Any]]] = []
                               self.train = train
                      -        np_dtype = np.float32
                      -        for img_path in os.listdir(tmp_root):
                      +        self.sample_transforms = sample_transforms
                      +        for img_path in os.listdir(self.root):
                                   # File existence check
                      -            if not os.path.exists(os.path.join(tmp_root, img_path)):
                      -                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
                      -
                      +            if not os.path.exists(os.path.join(self.root, img_path)):
                      +                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
                                   stem = Path(img_path).stem
                                   _targets = []
                      -            with open(os.path.join(self.root, 'json', f"{stem}.json"), 'rb') as f:
                      +            with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f:
                                       label = json.load(f)
                                       for line in label["valid_line"]:
                                           for word in line["words"]:
                                               if len(word["text"]) > 0:
                                                   x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"]
                                                   y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"]
                      -                            if use_polygons:
                      -                                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                      -                                box = np.array([
                      +                            if rotated_bbox:
                      +                                box = list(fit_rbbox(np.array([
                                                           [x[0], y[0]],
                                                           [x[1], y[1]],
                                                           [x[2], y[2]],
                                                           [x[3], y[3]],
                      -                                ], dtype=np_dtype)
                      +                                ], dtype=np.float32)))
                                                   else:
                      -                                # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax
                      +                                # Reduce 8 coords to 4
                                                       box = [min(x), min(y), max(x), max(y)]
                                                   _targets.append((word['text'], box))
                       
                      @@ -375,9 +360,8 @@ 

                      Source code for doctr.datasets.cord

                       
                                   self.data.append((
                                       img_path,
                      -                dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets))
                      +                dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets)
                                   ))
                      -        self.root = tmp_root
                       
                           def extra_repr(self) -> str:
                               return f"train={self.train}"
                      @@ -414,7 +398,7 @@

                      Source code for doctr.datasets.cord

                             
                           
                         
                      -
                      +
                      diff --git a/v0.5.1/_modules/doctr/datasets/datasets/tensorflow.html b/v0.5.1/_modules/doctr/datasets/datasets/tensorflow.html index 8a191ecfc7..fddca20034 100644 --- a/v0.5.1/_modules/doctr/datasets/datasets/tensorflow.html +++ b/v0.5.1/_modules/doctr/datasets/datasets/tensorflow.html @@ -236,7 +236,7 @@

                      Package Reference

                      • doctr.datasets
                      • -
                      • doctr.io
                      • +
                      • doctr.documents
                      • doctr.models
                      • doctr.transforms
                      • doctr.utils
                      • @@ -284,7 +284,6 @@

                        Source code for doctr.datasets.datasets.tensorflow

                        from typing import List, Any, Tuple import tensorflow as tf -from doctr.io import read_img_as_tensor from .base import _AbstractDataset, _VisionDataset @@ -293,14 +292,11 @@

                        Source code for doctr.datasets.datasets.tensorflow

                        class AbstractDataset(_AbstractDataset): - @staticmethod - def _get_img_shape(img: Any) -> Tuple[int, int]: - return img.shape[:2] - def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]: img_name, target = self.data[index] # Read image - img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float16 if self.fp16 else tf.float32) + img = tf.io.read_file(os.path.join(self.root, img_name)) + img = tf.image.decode_jpeg(img, channels=3) return img, target @@ -350,7 +346,7 @@

                        Source code for doctr.datasets.datasets.tensorflow

                        +
                        diff --git a/v0.5.1/_modules/doctr/datasets/detection.html b/v0.5.1/_modules/doctr/datasets/detection.html index d9e7c7eb17..43e148dc88 100644 --- a/v0.5.1/_modules/doctr/datasets/detection.html +++ b/v0.5.1/_modules/doctr/datasets/detection.html @@ -234,10 +234,16 @@

                        Using docTR

                        Package Reference

                        Source code for doctr.datasets.detection

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021-2024, Mindee.
                         
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        +# This program is licensed under the Apache License 2.0.
                        +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                         
                         import json
                         import os
                        -from typing import Any, List, Tuple
                        +from typing import Any, Dict, List, Tuple, Type, Union
                         
                         import numpy as np
                         
                        -from doctr.io.image import get_img_shape
                        -from doctr.utils.geometry import convert_to_relative_coords
                        +from doctr.file_utils import CLASS_NAME
                         
                         from .datasets import AbstractDataset
                        +from .utils import pre_transform_multiclass
                         
                         __all__ = ["DetectionDataset"]
                         
                        @@ -317,6 +323,7 @@ 

                        Source code for doctr.datasets.detection

                             >>> img, target = train_set[0]
                         
                             Args:
                        +    ----
                                 img_folder: folder with all the images of the dataset
                                 label_path: path to the annotations of each image
                                 use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                        @@ -332,27 +339,60 @@ 

                        Source code for doctr.datasets.detection

                             ) -> None:
                                 super().__init__(
                                     img_folder,
                        -            pre_transforms=lambda img, boxes: (img, convert_to_relative_coords(boxes, get_img_shape(img))),
                        -            **kwargs
                        +            pre_transforms=pre_transform_multiclass,
                        +            **kwargs,
                                 )
                         
                                 # File existence check
                        +        self._class_names: List = []
                                 if not os.path.exists(label_path):
                                     raise FileNotFoundError(f"unable to locate {label_path}")
                        -        with open(label_path, 'rb') as f:
                        +        with open(label_path, "rb") as f:
                                     labels = json.load(f)
                         
                        -        self.data: List[Tuple[str, np.ndarray]] = []
                        +        self.data: List[Tuple[str, Tuple[np.ndarray, List[str]]]] = []
                                 np_dtype = np.float32
                                 for img_name, label in labels.items():
                                     # File existence check
                                     if not os.path.exists(os.path.join(self.root, img_name)):
                                         raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
                         
                        -            polygons = np.asarray(label['polygons'], dtype=np_dtype)
                        -            geoms = polygons if use_polygons else np.concatenate((polygons.min(axis=1), polygons.max(axis=1)), axis=1)
                        +            geoms, polygons_classes = self.format_polygons(label["polygons"], use_polygons, np_dtype)
                         
                        -            self.data.append((img_name, np.asarray(geoms, dtype=np_dtype)))
                        + self.data.append((img_name, (np.asarray(geoms, dtype=np_dtype), polygons_classes))) + + def format_polygons( + self, polygons: Union[List, Dict], use_polygons: bool, np_dtype: Type + ) -> Tuple[np.ndarray, List[str]]: + """Format polygons into an array + + Args: + ---- + polygons: the bounding boxes + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + np_dtype: dtype of array + + Returns: + ------- + geoms: bounding boxes as np array + polygons_classes: list of classes for each bounding box + """ + if isinstance(polygons, list): + self._class_names += [CLASS_NAME] + polygons_classes = [CLASS_NAME for _ in polygons] + _polygons: np.ndarray = np.asarray(polygons, dtype=np_dtype) + elif isinstance(polygons, dict): + self._class_names += list(polygons.keys()) + polygons_classes = [k for k, v in polygons.items() for _ in v] + _polygons = np.concatenate([np.asarray(poly, dtype=np_dtype) for poly in polygons.values() if poly], axis=0) + else: + raise TypeError(f"polygons should be a dictionary or list, it was {type(polygons)}") + geoms = _polygons if use_polygons else np.concatenate((_polygons.min(axis=1), _polygons.max(axis=1)), axis=1) + return geoms, polygons_classes + + @property + def class_names(self): + return sorted(set(self._class_names))
                        @@ -386,7 +426,7 @@

                        Source code for doctr.datasets.detection

                               
                             
                           
                        -
                        +
                      diff --git a/v0.5.1/_modules/doctr/datasets/doc_artefacts.html b/v0.5.1/_modules/doctr/datasets/doc_artefacts.html index fc02b9e048..172122a216 100644 --- a/v0.5.1/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.5.1/_modules/doctr/datasets/doc_artefacts.html @@ -234,10 +234,16 @@

                      Using docTR

                      Package Reference

                        +
                      • doctr.contrib
                      • doctr.datasets
                      • doctr.io
                      • doctr.models
                      • @@ -287,10 +293,10 @@

                        Source code for doctr.datasets.doc_artefacts

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021-2024, Mindee.
                         
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        +# This program is licensed under the Apache License 2.0.
                        +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                         
                         import json
                         import os
                        @@ -300,7 +306,7 @@ 

                        Source code for doctr.datasets.doc_artefacts

                        from .datasets import VisionDataset
                         
                        -__all__ = ['DocArtefacts']
                        +__all__ = ["DocArtefacts"]
                         
                         
                         
                        @@ -309,7 +315,7 @@

                        Source code for doctr.datasets.doc_artefacts

                            """Object detection dataset for non-textual elements in documents.
                             The dataset includes a variety of synthetic document pages with non-textual elements.
                         
                        -    .. image:: https://github.com/mindee/doctr/releases/download/v0.5.0/artefacts-grid.png
                        +    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/artefacts-grid.png&src=0
                                 :align: center
                         
                             >>> from doctr.datasets import DocArtefacts
                        @@ -317,13 +323,14 @@ 

                        Source code for doctr.datasets.doc_artefacts

                            >>> img, target = train_set[0]
                         
                             Args:
                        +    ----
                                 train: whether the subset should be the training one
                                 use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                                 **kwargs: keyword arguments from `VisionDataset`.
                             """
                         
                        -    URL = 'https://github.com/mindee/doctr/releases/download/v0.4.0/artefact_detection-13fab8ce.zip'
                        -    SHA256 = '13fab8ced7f84583d9dccd0c634f046c3417e62a11fe1dea6efbbaba5052471b'
                        +    URL = "https://doctr-static.mindee.com/models?id=v0.4.0/artefact_detection-13fab8ce.zip&src=0"
                        +    SHA256 = "13fab8ced7f84583d9dccd0c634f046c3417e62a11fe1dea6efbbaba5052471b"
                             CLASSES = ["background", "qr_code", "bar_code", "logo", "photo"]
                         
                             def __init__(
                        @@ -332,20 +339,19 @@ 

                        Source code for doctr.datasets.doc_artefacts

                        use_polygons: bool = False,
                                 **kwargs: Any,
                             ) -> None:
                        -
                                 super().__init__(self.URL, None, self.SHA256, True, **kwargs)
                                 self.train = train
                         
                                 # Update root
                                 self.root = os.path.join(self.root, "train" if train else "val")
                                 # List images
                        -        tmp_root = os.path.join(self.root, 'images')
                        +        tmp_root = os.path.join(self.root, "images")
                                 with open(os.path.join(self.root, "labels.json"), "rb") as f:
                                     labels = json.load(f)
                                 self.data: List[Tuple[str, Dict[str, Any]]] = []
                                 img_list = os.listdir(tmp_root)
                                 if len(labels) != len(img_list):
                        -            raise AssertionError('the number of images and labels do not match')
                        +            raise AssertionError("the number of images and labels do not match")
                                 np_dtype = np.float32
                                 for img_name, label in labels.items():
                                     # File existence check
                        @@ -353,8 +359,8 @@ 

                        Source code for doctr.datasets.doc_artefacts

                        raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_name)}")
                         
                                     # xmin, ymin, xmax, ymax
                        -            boxes = np.asarray([obj['geometry'] for obj in label], dtype=np_dtype)
                        -            classes = np.asarray([self.CLASSES.index(obj['label']) for obj in label], dtype=np.int64)
                        +            boxes: np.ndarray = np.asarray([obj["geometry"] for obj in label], dtype=np_dtype)
                        +            classes: np.ndarray = np.asarray([self.CLASSES.index(obj["label"]) for obj in label], dtype=np.int64)
                                     if use_polygons:
                                         # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                                         boxes = np.stack(
                        @@ -363,7 +369,8 @@ 

                        Source code for doctr.datasets.doc_artefacts

                        np.stack([boxes[:, 2], boxes[:, 1]], axis=-1),
                                                 np.stack([boxes[:, 2], boxes[:, 3]], axis=-1),
                                                 np.stack([boxes[:, 0], boxes[:, 3]], axis=-1),
                        -                    ], axis=1
                        +                    ],
                        +                    axis=1,
                                         )
                                     self.data.append((img_name, dict(boxes=boxes, labels=classes)))
                                 self.root = tmp_root
                        @@ -403,7 +410,7 @@ 

                        Source code for doctr.datasets.doc_artefacts

                           
                        -
                        +
                        diff --git a/v0.5.1/_modules/doctr/datasets/funsd.html b/v0.5.1/_modules/doctr/datasets/funsd.html index 56ca18a07a..2f5494dc2a 100644 --- a/v0.5.1/_modules/doctr/datasets/funsd.html +++ b/v0.5.1/_modules/doctr/datasets/funsd.html @@ -226,32 +226,20 @@

                        Source code for doctr.datasets.funsd

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021, Mindee.
                         
                         # This program is licensed under the Apache License version 2.
                         # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                         
                        -import json
                         import os
                        -from pathlib import Path
                        -from typing import Any, Dict, List, Tuple
                        -
                        +import json
                         import numpy as np
                        +from pathlib import Path
                        +from typing import List, Dict, Any, Tuple, Optional, Callable
                         
                         from .datasets import VisionDataset
                        -from .utils import convert_target_to_relative
                         
                         __all__ = ['FUNSD']
                         
                         
                         
-[docs]
+[docs]
 class FUNSD(VisionDataset):
     """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents"
     <https://arxiv.org/pdf/1905.13538.pdf>`_.
 
-    .. image:: https://github.com/mindee/doctr/releases/download/v0.5.0/funsd-grid.png
-        :align: center
-
-    >>> from doctr.datasets import FUNSD
-    >>> train_set = FUNSD(train=True, download=True)
-    >>> img, target = train_set[0]
+    Example::
+        >>> from doctr.datasets import FUNSD
+        >>> train_set = FUNSD(train=True, download=True)
+        >>> img, target = train_set[0]
 
     Args:
         train: whether the subset should be the training one
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
+        sample_transforms: composable transformations that will be applied to each image
+        rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones)
         **kwargs: keyword arguments from `VisionDataset`.
     """
@@ -331,56 +316,41 @@

                        Source code for doctr.datasets.funsd

                             def __init__(
                                 self,
                                 train: bool = True,
                        -        use_polygons: bool = False,
                        +        sample_transforms: Optional[Callable[[Any], Any]] = None,
                        +        rotated_bbox: bool = False,
                                 **kwargs: Any,
                             ) -> None:
                         
                        -        super().__init__(
                        -            self.URL,
                        -            self.FILE_NAME,
                        -            self.SHA256,
                        -            True,
                        -            pre_transforms=convert_target_to_relative,
                        -            **kwargs
                        -        )
                        +        super().__init__(self.URL, self.FILE_NAME, self.SHA256, True, **kwargs)
                                 self.train = train
                        -        np_dtype = np.float32
                        +        self.sample_transforms = sample_transforms
                         
                                 # Use the subset
                                 subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data')
                         
                                 # # List images
                        -        tmp_root = os.path.join(self.root, subfolder, 'images')
                        +        self.root = os.path.join(self._root, subfolder, 'images')
                                 self.data: List[Tuple[str, Dict[str, Any]]] = []
                        -        for img_path in os.listdir(tmp_root):
                        +        for img_path in os.listdir(self.root):
                                     # File existence check
                        -            if not os.path.exists(os.path.join(tmp_root, img_path)):
                        -                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
                        -
                        +            if not os.path.exists(os.path.join(self.root, img_path)):
                        +                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
                                     stem = Path(img_path).stem
                        -            with open(os.path.join(self.root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f:
                        +            with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f:
                                         data = json.load(f)
                         
                                     _targets = [(word['text'], word['box']) for block in data['form']
                                                 for word in block['words'] if len(word['text']) > 0]
                                     text_targets, box_targets = zip(*_targets)
                        -            if use_polygons:
                        -                # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners
                        +            if rotated_bbox:
                        +                # box_targets: xmin, ymin, xmax, ymax -> x, y, w, h, alpha = 0
                                         box_targets = [
                                             [
                        -                        [box[0], box[1]],
                        -                        [box[2], box[1]],
                        -                        [box[2], box[3]],
                        -                        [box[0], box[3]],
                        +                        (box[0] + box[2]) / 2, (box[1] + box[3]) / 2, box[2] - box[0], box[3] - box[1], 0
                                             ] for box in box_targets
                                         ]
                         
                        -            self.data.append((
                        -                img_path,
                        -                dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)),
                        -            ))
                        -
                        -        self.root = tmp_root
                        +            self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=int), labels=text_targets)))
                         
                             def extra_repr(self) -> str:
                                 return f"train={self.train}"
                        @@ -417,7 +387,7 @@

                        Source code for doctr.datasets.funsd

                               
                             
                           
                        -
                        +
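The funsd.html hunks above swap the newer `use_polygons` signature for the older `sample_transforms`/`rotated_bbox` one. Basic usage is the same in both versions; a sketch taken from the docstring example (only the keyword controlling rotated boxes differs between the two APIs):

>>> from doctr.datasets import FUNSD
>>> train_set = FUNSD(train=True, download=True)
>>> img, target = train_set[0]  # target contains "boxes" and the word "labels"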
                        diff --git a/v0.5.1/_modules/doctr/datasets/generator/tensorflow.html b/v0.5.1/_modules/doctr/datasets/generator/tensorflow.html index b7afbbe7a5..1d6494d28c 100644 --- a/v0.5.1/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.5.1/_modules/doctr/datasets/generator/tensorflow.html @@ -234,10 +234,16 @@


                        Source code for doctr.datasets.generator.tensorflow

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021-2024, Mindee.
                         
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        +# This program is licensed under the Apache License 2.0.
                        +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                         
                         import tensorflow as tf
                         
                         from .base import _CharacterGenerator, _WordGenerator
                         
                        -__all__ = ['CharacterGenerator', 'WordGenerator']
                        +__all__ = ["CharacterGenerator", "WordGenerator"]
                         
                         
                         
                        @@ -305,10 +311,11 @@

                        Source code for doctr.datasets.generator.tensorflow

                        """Implements a character image generation dataset >>> from doctr.datasets import CharacterGenerator - >>> ds = CharacterGenerator(vocab='abdef') + >>> ds = CharacterGenerator(vocab='abdef', num_samples=100) >>> img, target = ds[0] Args: + ---- vocab: vocabulary to take the character from num_samples: number of samples that will be generated iterating over the dataset cache_samples: whether generated images should be cached firsthand @@ -322,7 +329,6 @@

                        Source code for doctr.datasets.generator.tensorflow

     @staticmethod
     def collate_fn(samples):
-
         images, targets = zip(*samples)
         images = tf.stack(images, axis=0)
@@ -336,10 +342,11 @@

                        Source code for doctr.datasets.generator.tensorflow

                        """Implements a character image generation dataset >>> from doctr.datasets import WordGenerator - >>> ds = WordGenerator(vocab='abdef') + >>> ds = WordGenerator(vocab='abdef', min_chars=1, max_chars=32, num_samples=100) >>> img, target = ds[0] Args: + ---- vocab: vocabulary to take the character from min_chars: minimum number of characters in a word max_chars: maximum number of characters in a word @@ -384,7 +391,7 @@

                        Source code for doctr.datasets.generator.tensorflow

                        +
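The updated generator docstrings above spell out the required constructor arguments. A short sketch combining both documented examples:

>>> from doctr.datasets import CharacterGenerator, WordGenerator
>>> char_ds = CharacterGenerator(vocab='abdef', num_samples=100)
>>> word_ds = WordGenerator(vocab='abdef', min_chars=1, max_chars=32, num_samples=100)
>>> img, target = char_ds[0]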
                        diff --git a/v0.5.1/_modules/doctr/datasets/ic03.html b/v0.5.1/_modules/doctr/datasets/ic03.html index e7efe719d0..6680bbc6d7 100644 --- a/v0.5.1/_modules/doctr/datasets/ic03.html +++ b/v0.5.1/_modules/doctr/datasets/ic03.html @@ -234,10 +234,16 @@


                        Source code for doctr.datasets.ic03

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021-2024, Mindee.
                         
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        +# This program is licensed under the Apache License 2.0.
                        +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                         
                         import os
                        -from typing import Any, Dict, List, Tuple
                        +from typing import Any, Dict, List, Tuple, Union
                         
                         import defusedxml.ElementTree as ET
                         import numpy as np
                        +from tqdm import tqdm
                         
                         from .datasets import VisionDataset
                        +from .utils import convert_target_to_relative, crop_bboxes_from_image
                         
                        -__all__ = ['IC03']
                        +__all__ = ["IC03"]
                         
                         
                         
                        @@ -309,7 +317,7 @@

                        Source code for doctr.datasets.ic03

                             """IC03 dataset from `"ICDAR 2003 Robust Reading Competitions: Entries, Results and Future Directions"
                             <http://www.iapr-tc11.org/mediawiki/index.php?title=ICDAR_2003_Robust_Reading_Competitions>`_.
                         
                        -    .. image:: https://github.com/mindee/doctr/releases/download/v0.5.0/ic03-grid.png
                        +    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/ic03-grid.png&src=0
                                 :align: center
                         
                             >>> from doctr.datasets import IC03
                        @@ -317,39 +325,61 @@ 

                        Source code for doctr.datasets.ic03

                             >>> img, target = train_set[0]
                         
                             Args:
                        +    ----
                                 train: whether the subset should be the training one
                                 use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                        +        recognition_task: whether the dataset should be used for recognition task
                        +        detection_task: whether the dataset should be used for detection task
                                 **kwargs: keyword arguments from `VisionDataset`.
                             """
                         
                        -    TRAIN = ('http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTrain/scene.zip',
                        -             '9d86df514eb09dd693fb0b8c671ef54a0cfe02e803b1bbef9fc676061502eb94',
                        -             'ic03_train.zip')
                        -    TEST = ('http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTest/scene.zip',
                        -            'dbc4b5fd5d04616b8464a1b42ea22db351ee22c2546dd15ac35611857ea111f8',
                        -            'ic03_test.zip')
                        +    TRAIN = (
                        +        "http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTrain/scene.zip",
                        +        "9d86df514eb09dd693fb0b8c671ef54a0cfe02e803b1bbef9fc676061502eb94",
                        +        "ic03_train.zip",
                        +    )
                        +    TEST = (
                        +        "http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTest/scene.zip",
                        +        "dbc4b5fd5d04616b8464a1b42ea22db351ee22c2546dd15ac35611857ea111f8",
                        +        "ic03_test.zip",
                        +    )
                         
                             def __init__(
                                 self,
                                 train: bool = True,
                                 use_polygons: bool = False,
                        +        recognition_task: bool = False,
                        +        detection_task: bool = False,
                                 **kwargs: Any,
                             ) -> None:
                        -
                                 url, sha256, file_name = self.TRAIN if train else self.TEST
                        -        super().__init__(url, file_name, sha256, True, **kwargs)
                        +        super().__init__(
                        +            url,
                        +            file_name,
                        +            sha256,
                        +            True,
                        +            pre_transforms=convert_target_to_relative if not recognition_task else None,
                        +            **kwargs,
                        +        )
                        +        if recognition_task and detection_task:
                        +            raise ValueError(
                        +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                        +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                        +            )
                        +
                                 self.train = train
                        -        self.data: List[Tuple[str, Dict[str, Any]]] = []
                        +        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                 np_dtype = np.float32
                         
                                 # Load xml data
                        -        tmp_root = os.path.join(
                        -            self.root, 'SceneTrialTrain' if self.train else 'SceneTrialTest') if sha256 else self.root
                        -        xml_tree = ET.parse(os.path.join(tmp_root, 'words.xml'))
                        +        tmp_root = (
                        +            os.path.join(self.root, "SceneTrialTrain" if self.train else "SceneTrialTest") if sha256 else self.root
                        +        )
                        +        xml_tree = ET.parse(os.path.join(tmp_root, "words.xml"))
                                 xml_root = xml_tree.getroot()
                         
                        -        for image in xml_root:
                        -            name, resolution, rectangles = image
                        +        for image in tqdm(iterable=xml_root, desc="Unpacking IC03", total=len(xml_root)):
                        +            name, _resolution, rectangles = image
                         
                                     # File existence check
                                     if not os.path.exists(os.path.join(tmp_root, name.text)):
                        @@ -359,41 +389,43 @@ 

                        Source code for doctr.datasets.ic03

                                         # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                                         _boxes = [
                                             [
                        -                        [float(rect.attrib['x']), float(rect.attrib['y'])],
                        -                        [float(rect.attrib['x']) + float(rect.attrib['width']), float(rect.attrib['y'])],
                        +                        [float(rect.attrib["x"]), float(rect.attrib["y"])],
                        +                        [float(rect.attrib["x"]) + float(rect.attrib["width"]), float(rect.attrib["y"])],
                                                 [
                        -                            float(rect.attrib['x']) + float(rect.attrib['width']),
                        -                            float(rect.attrib['y']) + float(rect.attrib['height'])
                        +                            float(rect.attrib["x"]) + float(rect.attrib["width"]),
                        +                            float(rect.attrib["y"]) + float(rect.attrib["height"]),
                                                 ],
                        -                        [float(rect.attrib['x']), float(rect.attrib['y']) + float(rect.attrib['height'])],
                        +                        [float(rect.attrib["x"]), float(rect.attrib["y"]) + float(rect.attrib["height"])],
                                             ]
                                             for rect in rectangles
                                         ]
                                     else:
                                         # x_min, y_min, x_max, y_max
                                         _boxes = [
                        -                    [float(rect.attrib['x']), float(rect.attrib['y']),  # type: ignore[list-item]
                        -                     float(rect.attrib['x']) + float(rect.attrib['width']),  # type: ignore[list-item]
                        -                     float(rect.attrib['y']) + float(rect.attrib['height'])]  # type: ignore[list-item]
                        +                    [
                        +                        float(rect.attrib["x"]),  # type: ignore[list-item]
                        +                        float(rect.attrib["y"]),  # type: ignore[list-item]
                        +                        float(rect.attrib["x"]) + float(rect.attrib["width"]),  # type: ignore[list-item]
                        +                        float(rect.attrib["y"]) + float(rect.attrib["height"]),  # type: ignore[list-item]
                        +                    ]
                                             for rect in rectangles
                                         ]
                         
                                     # filter images without boxes
                                     if len(_boxes) > 0:
                        -                # Convert them to relative
                        -                w, h = int(resolution.attrib['x']), int(resolution.attrib['y'])
                        -                boxes = np.asarray(_boxes, dtype=np_dtype)
                        -                if use_polygons:
                        -                    boxes[:, :, 0] /= w
                        -                    boxes[:, :, 1] /= h
                        -                else:
                        -                    boxes[:, [0, 2]] /= w
                        -                    boxes[:, [1, 3]] /= h
                        -
                        +                boxes: np.ndarray = np.asarray(_boxes, dtype=np_dtype)
                                         # Get the labels
                                         labels = [lab.text for rect in rectangles for lab in rect if lab.text]
                         
                        -                self.data.append((name.text, dict(boxes=boxes, labels=labels)))
                        +                if recognition_task:
                        +                    crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, name.text), geoms=boxes)
                        +                    for crop, label in zip(crops, labels):
                        +                        if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                        +                            self.data.append((crop, label))
                        +                elif detection_task:
                        +                    self.data.append((name.text, boxes))
                        +                else:
                        +                    self.data.append((name.text, dict(boxes=boxes, labels=labels)))
                         
                                 self.root = tmp_root
                         
                        @@ -432,7 +464,7 @@ 

                        Source code for doctr.datasets.ic03

                               
                             
                           
                        -
                        +
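The ic03.html changes above add `recognition_task` and `detection_task` flags, which are mutually exclusive per the new ValueError. A sketch of the three modes, assuming the usual `download=True` handling from `VisionDataset`:

>>> from doctr.datasets import IC03
>>> full_set = IC03(train=True, download=True)                          # (image, {"boxes", "labels"})
>>> reco_set = IC03(train=True, download=True, recognition_task=True)   # (crop, word) pairs
>>> det_set = IC03(train=True, download=True, detection_task=True)      # (image path, boxes)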
                        diff --git a/v0.5.1/_modules/doctr/datasets/ic13.html b/v0.5.1/_modules/doctr/datasets/ic13.html index 40e534577a..b7c4d9612e 100644 --- a/v0.5.1/_modules/doctr/datasets/ic13.html +++ b/v0.5.1/_modules/doctr/datasets/ic13.html @@ -234,10 +234,16 @@


                        Source code for doctr.datasets.ic13

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021-2024, Mindee.
                         
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        +# This program is licensed under the Apache License 2.0.
                        +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                         
                         import csv
                         import os
                         from pathlib import Path
                        -from typing import Any, Dict, List, Tuple
                        +from typing import Any, Dict, List, Tuple, Union
                         
                         import numpy as np
                        +from tqdm import tqdm
                         
                         from .datasets import AbstractDataset
                        -from .utils import convert_target_to_relative
                        +from .utils import convert_target_to_relative, crop_bboxes_from_image
                         
                         __all__ = ["IC13"]
                         
                        @@ -310,7 +317,7 @@ 

                        Source code for doctr.datasets.ic13

                         class IC13(AbstractDataset):
                             """IC13 dataset from `"ICDAR 2013 Robust Reading Competition" <https://rrc.cvc.uab.es/>`_.
                         
                        -    .. image:: https://github.com/mindee/doctr/releases/download/v0.5.0/ic13-grid.png
                        +    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/ic13-grid.png&src=0
                                 :align: center
                         
                             >>> # NOTE: You need to download both image and label parts from Focused Scene Text challenge Task2.1 2013-2015.
                        @@ -323,9 +330,12 @@ 

                        Source code for doctr.datasets.ic13

                             >>> img, target = test_set[0]
                         
                             Args:
                        +    ----
                                 img_folder: folder with all the images of the dataset
                                 label_folder: folder with all annotation files for the images
                                 use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                        +        recognition_task: whether the dataset should be used for recognition task
                        +        detection_task: whether the dataset should be used for detection task
                                 **kwargs: keyword arguments from `AbstractDataset`.
                             """
                         
                        @@ -334,33 +344,42 @@ 

                        Source code for doctr.datasets.ic13

                                 img_folder: str,
                                 label_folder: str,
                                 use_polygons: bool = False,
                        +        recognition_task: bool = False,
                        +        detection_task: bool = False,
                                 **kwargs: Any,
                             ) -> None:
                        -        super().__init__(img_folder, pre_transforms=convert_target_to_relative, **kwargs)
                        +        super().__init__(
                        +            img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
                        +        )
                        +        if recognition_task and detection_task:
                        +            raise ValueError(
                        +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                        +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                        +            )
                         
                                 # File existence check
                                 if not os.path.exists(label_folder) or not os.path.exists(img_folder):
                                     raise FileNotFoundError(
                        -                f"unable to locate {label_folder if not os.path.exists(label_folder) else img_folder}")
                        +                f"unable to locate {label_folder if not os.path.exists(label_folder) else img_folder}"
                        +            )
                         
                        -        self.data: List[Tuple[Path, Dict[str, Any]]] = []
                        +        self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                 np_dtype = np.float32
                         
                                 img_names = os.listdir(img_folder)
                         
                        -        for img_name in img_names:
                        -
                        +        for img_name in tqdm(iterable=img_names, desc="Unpacking IC13", total=len(img_names)):
                                     img_path = Path(img_folder, img_name)
                                     label_path = Path(label_folder, "gt_" + Path(img_name).stem + ".txt")
                         
                        -            with open(label_path, newline='\n') as f:
                        +            with open(label_path, newline="\n") as f:
                                         _lines = [
                                             [val[:-1] if val.endswith(",") else val for val in row]
                        -                    for row in csv.reader(f, delimiter=' ', quotechar="'")
                        +                    for row in csv.reader(f, delimiter=" ", quotechar="'")
                                         ]
                        -            labels = [line[-1] for line in _lines]
                        +            labels = [line[-1].replace('"', "") for line in _lines]
                                     # xmin, ymin, xmax, ymax
                        -            box_targets = np.array([list(map(int, line[:4])) for line in _lines], dtype=np_dtype)
                        +            box_targets: np.ndarray = np.array([list(map(int, line[:4])) for line in _lines], dtype=np_dtype)
                                     if use_polygons:
                                         # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                                         box_targets = np.array(
                        @@ -370,10 +389,20 @@ 

                        Source code for doctr.datasets.ic13

                                                     [coords[2], coords[1]],
                                                     [coords[2], coords[3]],
                                                     [coords[0], coords[3]],
                        -                        ] for coords in box_targets
                        -                    ], dtype=np_dtype
                        +                        ]
                        +                        for coords in box_targets
                        +                    ],
                        +                    dtype=np_dtype,
                                         )
                        -            self.data.append((img_path, dict(boxes=box_targets, labels=labels)))
+
+            if recognition_task:
+                crops = crop_bboxes_from_image(img_path=img_path, geoms=box_targets)
+                for crop, label in zip(crops, labels):
+                    self.data.append((crop, label))
+            elif detection_task:
+                self.data.append((img_path, box_targets))
+            else:
+                self.data.append((img_path, dict(boxes=box_targets, labels=labels)))
                        @@ -407,7 +436,7 @@

                        Source code for doctr.datasets.ic13

                               
                             
                           
                        -
                        +
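IC13 gains the same pair of task flags. A sketch with placeholder paths for the separately downloaded image and ground-truth folders:

>>> from doctr.datasets import IC13
>>> test_set = IC13(img_folder="path/to/images", label_folder="path/to/gt")
>>> img, target = test_set[0]
>>> reco_set = IC13(img_folder="path/to/images", label_folder="path/to/gt", recognition_task=True)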
                        diff --git a/v0.5.1/_modules/doctr/datasets/iiit5k.html b/v0.5.1/_modules/doctr/datasets/iiit5k.html index 7348b1215d..4759d20b24 100644 --- a/v0.5.1/_modules/doctr/datasets/iiit5k.html +++ b/v0.5.1/_modules/doctr/datasets/iiit5k.html @@ -234,10 +234,16 @@


                        Source code for doctr.datasets.iiit5k

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021-2024, Mindee.
                         
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        +# This program is licensed under the Apache License 2.0.
                        +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                         
                         import os
                        -from pathlib import Path
                        -from typing import Any, Dict, List, Tuple
                        +from typing import Any, Dict, List, Tuple, Union
                         
                         import numpy as np
                         import scipy.io as sio
                        +from tqdm import tqdm
                         
                         from .datasets import VisionDataset
                         from .utils import convert_target_to_relative
                         
                        -__all__ = ['IIIT5K']
                        +__all__ = ["IIIT5K"]
                         
                         
                         
                        @@ -312,7 +318,7 @@

                        Source code for doctr.datasets.iiit5k

                             `"BMVC 2012 Scene Text Recognition using Higher Order Language Priors"
                             <https://cdn.iiit.ac.in/cdn/cvit.iiit.ac.in/images/Projects/SceneTextUnderstanding/home/mishraBMVC12.pdf>`_.
                         
                        -    .. image:: https://github.com/mindee/doctr/releases/download/v0.5.0/iiit5k-grid.png
                        +    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/iiit5k-grid.png&src=0
                                 :align: center
                         
                             >>> # NOTE: this dataset is for character-level localization
                        @@ -321,40 +327,50 @@ 

                        Source code for doctr.datasets.iiit5k

                             >>> img, target = train_set[0]
                         
                             Args:
                        +    ----
                                 train: whether the subset should be the training one
                                 use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                        +        recognition_task: whether the dataset should be used for recognition task
                        +        detection_task: whether the dataset should be used for detection task
                                 **kwargs: keyword arguments from `VisionDataset`.
                             """
                         
                        -    URL = 'https://cvit.iiit.ac.in/images/Projects/SceneTextUnderstanding/IIIT5K-Word_V3.0.tar.gz'
                        -    SHA256 = '7872c9efbec457eb23f3368855e7738f72ce10927f52a382deb4966ca0ffa38e'
                        +    URL = "https://cvit.iiit.ac.in/images/Projects/SceneTextUnderstanding/IIIT5K-Word_V3.0.tar.gz"
                        +    SHA256 = "7872c9efbec457eb23f3368855e7738f72ce10927f52a382deb4966ca0ffa38e"
                         
                             def __init__(
                                 self,
                                 train: bool = True,
                                 use_polygons: bool = False,
                        +        recognition_task: bool = False,
                        +        detection_task: bool = False,
                                 **kwargs: Any,
                             ) -> None:
                        -
                                 super().__init__(
                                     self.URL,
                                     None,
                                     file_hash=self.SHA256,
                                     extract_archive=True,
                        -            pre_transforms=convert_target_to_relative,
                        -            **kwargs
                        +            pre_transforms=convert_target_to_relative if not recognition_task else None,
                        +            **kwargs,
                                 )
                        +        if recognition_task and detection_task:
                        +            raise ValueError(
                        +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                        +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                        +            )
                        +
                                 self.train = train
                         
                                 # Load mat data
                        -        tmp_root = os.path.join(self.root, 'IIIT5K') if self.SHA256 else self.root
                        -        mat_file = 'trainCharBound' if self.train else 'testCharBound'
                        -        mat_data = sio.loadmat(os.path.join(tmp_root, f'{mat_file}.mat'))[mat_file][0]
                        +        tmp_root = os.path.join(self.root, "IIIT5K") if self.SHA256 else self.root
                        +        mat_file = "trainCharBound" if self.train else "testCharBound"
                        +        mat_data = sio.loadmat(os.path.join(tmp_root, f"{mat_file}.mat"))[mat_file][0]
                         
                        -        self.data: List[Tuple[Path, Dict[str, Any]]] = []
                        +        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                 np_dtype = np.float32
                         
                        -        for img_path, label, box_targets in mat_data:
                        +        for img_path, label, box_targets in tqdm(iterable=mat_data, desc="Unpacking IIIT5K", total=len(mat_data)):
                                     _raw_path = img_path[0]
                                     _raw_label = label[0]
                         
                        @@ -370,15 +386,23 @@ 

                        Source code for doctr.datasets.iiit5k

                                                 [box[0] + box[2], box[1]],
                                                 [box[0] + box[2], box[1] + box[3]],
                                                 [box[0], box[1] + box[3]],
                        -                    ] for box in box_targets
                        +                    ]
                        +                    for box in box_targets
                                         ]
                                     else:
                                         # xmin, ymin, xmax, ymax
                                         box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]
                         
                        -            # label are casted to list where each char corresponds to the character's bounding box
                        -            self.data.append((_raw_path, dict(boxes=np.asarray(
                        -                box_targets, dtype=np_dtype), labels=list(_raw_label))))
                        +            if recognition_task:
                        +                self.data.append((_raw_path, _raw_label))
                        +            elif detection_task:
                        +                self.data.append((_raw_path, np.asarray(box_targets, dtype=np_dtype)))
                        +            else:
                        +                # label are casted to list where each char corresponds to the character's bounding box
                        +                self.data.append((
                        +                    _raw_path,
                        +                    dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(_raw_label)),
                        +                ))
                         
                                 self.root = tmp_root
                         
                        @@ -417,7 +441,7 @@ 

                        Source code for doctr.datasets.iiit5k

                               
                             
                           
                        -
                        +
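As with IC03, the IIIT5K loader above now supports recognition and detection modes on top of the default character-level targets. A minimal sketch, again assuming the standard `download=True` keyword:

>>> from doctr.datasets import IIIT5K
>>> train_set = IIIT5K(train=True, download=True)
>>> img, target = train_set[0]  # character-level boxes + list of characters
>>> reco_set = IIIT5K(train=True, download=True, recognition_task=True)  # (path, word) pairs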
                        diff --git a/v0.5.1/_modules/doctr/datasets/imgur5k.html b/v0.5.1/_modules/doctr/datasets/imgur5k.html index 4b1f5713be..10d5c082d2 100644 --- a/v0.5.1/_modules/doctr/datasets/imgur5k.html +++ b/v0.5.1/_modules/doctr/datasets/imgur5k.html @@ -234,10 +234,16 @@


                        Source code for doctr.datasets.imgur5k

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021-2024, Mindee.
                         
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        +# This program is licensed under the Apache License 2.0.
                        +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                         
                        +import glob
                         import json
                         import os
                         from pathlib import Path
                        -from typing import Any, Dict, List, Tuple
                        +from typing import Any, Dict, List, Tuple, Union
                         
                         import cv2
                         import numpy as np
                        +from PIL import Image
                        +from tqdm import tqdm
                         
                         from .datasets import AbstractDataset
                        -from .utils import convert_target_to_relative
                        +from .utils import convert_target_to_relative, crop_bboxes_from_image
                         
                         __all__ = ["IMGUR5K"]
                         
                        @@ -313,7 +322,7 @@ 

                        Source code for doctr.datasets.imgur5k

                             <https://arxiv.org/abs/2106.08385>`_ |
                             `repository <https://github.com/facebookresearch/IMGUR5K-Handwriting-Dataset>`_.
                         
                        -    .. image:: https://github.com/mindee/doctr/releases/download/v0.5.0/imgur5k-grid.png
                        +    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/imgur5k-grid.png&src=0
                                 :align: center
                                 :width: 630
                                 :height: 400
                        @@ -328,10 +337,13 @@ 

                        Source code for doctr.datasets.imgur5k

                             >>> img, target = test_set[0]
                         
                             Args:
                        +    ----
                                 img_folder: folder with all the images of the dataset
                                 label_path: path to the annotations file of the dataset
                                 train: whether the subset should be the training one
                                 use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                        +        recognition_task: whether the dataset should be used for recognition task
                        +        detection_task: whether the dataset should be used for detection task
                                 **kwargs: keyword arguments from `AbstractDataset`.
                             """
                         
                        @@ -341,16 +353,24 @@ 

                        Source code for doctr.datasets.imgur5k

                                 label_path: str,
                                 train: bool = True,
                                 use_polygons: bool = False,
                        +        recognition_task: bool = False,
                        +        detection_task: bool = False,
                                 **kwargs: Any,
                             ) -> None:
                        -        super().__init__(img_folder, pre_transforms=convert_target_to_relative, **kwargs)
                        +        super().__init__(
                        +            img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
                        +        )
                        +        if recognition_task and detection_task:
                        +            raise ValueError(
                        +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                        +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                        +            )
                         
                                 # File existence check
                                 if not os.path.exists(label_path) or not os.path.exists(img_folder):
                        -            raise FileNotFoundError(
                        -                f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
                        +            raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
                         
                        -        self.data: List[Tuple[Path, Dict[str, Any]]] = []
                        +        self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                 self.train = train
                                 np_dtype = np.float32
                         
                        @@ -358,10 +378,22 @@ 

                        Source code for doctr.datasets.imgur5k

                                 train_samples = int(len(img_names) * 0.9)
                                 set_slice = slice(train_samples) if self.train else slice(train_samples, None)
                         
                        +        # define folder to write IMGUR5K recognition dataset
                        +        reco_folder_name = "IMGUR5K_recognition_train" if self.train else "IMGUR5K_recognition_test"
                        +        reco_folder_name = "Poly_" + reco_folder_name if use_polygons else reco_folder_name
                        +        reco_folder_path = os.path.join(os.path.dirname(self.root), reco_folder_name)
                        +        reco_images_counter = 0
                        +
                        +        if recognition_task and os.path.isdir(reco_folder_path):
                        +            self._read_from_folder(reco_folder_path)
                        +            return
                        +        elif recognition_task and not os.path.isdir(reco_folder_path):
                        +            os.makedirs(reco_folder_path, exist_ok=False)
                        +
                                 with open(label_path) as f:
                                     annotation_file = json.load(f)
                         
                        -        for img_name in img_names[set_slice]:
                        +        for img_name in tqdm(iterable=img_names[set_slice], desc="Unpacking IMGUR5K", total=len(img_names[set_slice])):
                                     img_path = Path(img_folder, img_name)
                                     img_id = img_name.split(".")[0]
                         
                        @@ -371,15 +403,18 @@ 

                        Source code for doctr.datasets.imgur5k

                         
                                     # some files have no annotations which are marked with only a dot in the 'word' key
                                     # ref: https://github.com/facebookresearch/IMGUR5K-Handwriting-Dataset/blob/main/README.md
                        -            if img_id not in annotation_file['index_to_ann_map'].keys():
                        +            if img_id not in annotation_file["index_to_ann_map"].keys():
                                         continue
                        -            ann_ids = annotation_file['index_to_ann_map'][img_id]
                        -            annotations = [annotation_file['ann_id'][a_id] for a_id in ann_ids]
                        +            ann_ids = annotation_file["index_to_ann_map"][img_id]
                        +            annotations = [annotation_file["ann_id"][a_id] for a_id in ann_ids]
                         
                        -            labels = [ann['word'] for ann in annotations if ann['word'] != '.']
                        +            labels = [ann["word"] for ann in annotations if ann["word"] != "."]
                                     # x_center, y_center, width, height, angle
                        -            _boxes = [list(map(float, ann['bounding_box'].strip('[ ]').split(', ')))
                        -                      for ann in annotations if ann['word'] != '.']
                        +            _boxes = [
                        +                list(map(float, ann["bounding_box"].strip("[ ]").split(", ")))
                        +                for ann in annotations
                        +                if ann["word"] != "."
                        +            ]
                                     # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                                     box_targets = [cv2.boxPoints(((box[0], box[1]), (box[2], box[3]), box[4])) for box in _boxes]
                         
                        @@ -389,10 +424,33 @@ 

                        Source code for doctr.datasets.imgur5k

                         
                                     # filter images without boxes
                                     if len(box_targets) > 0:
                        -                self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=labels)))
                        +                if recognition_task:
                        +                    crops = crop_bboxes_from_image(
                        +                        img_path=os.path.join(self.root, img_name), geoms=np.asarray(box_targets, dtype=np_dtype)
                        +                    )
                        +                    for crop, label in zip(crops, labels):
                        +                        if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                        +                            # write data to disk
                        +                            with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f:
                        +                                f.write(label)
                        +                                tmp_img = Image.fromarray(crop)
                        +                                tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
                        +                                reco_images_counter += 1
                        +                elif detection_task:
                        +                    self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
                        +                else:
                        +                    self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=labels)))
                        +
                        +        if recognition_task:
                        +            self._read_from_folder(reco_folder_path)
                         
                             def extra_repr(self) -> str:
                        -        return f"train={self.train}"
+        return f"train={self.train}"
+
+    def _read_from_folder(self, path: str) -> None:
+        for img_path in glob.glob(os.path.join(path, "*.png")):
+            with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f:
+                self.data.append((img_path, f.read()))
                        @@ -426,7 +484,7 @@

                        Source code for doctr.datasets.imgur5k

                               
                             
                           
                        -
                        +
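For IMGUR5K, note that `recognition_task=True` now materialises word crops on disk (in an `IMGUR5K_recognition_*` folder next to the image folder) and reloads them through `_read_from_folder`. A usage sketch with placeholder paths (the annotation file name is illustrative):

>>> from doctr.datasets import IMGUR5K
>>> train_set = IMGUR5K(img_folder="path/to/images", label_path="path/to/imgur5k_annotations.json", train=True)
>>> img, target = train_set[0]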
diff --git a/v0.5.1/_modules/doctr/datasets/loader.html b/v0.5.1/_modules/doctr/datasets/loader.html
index a54cb68d72..ba5bc217e0 100644
--- a/v0.5.1/_modules/doctr/datasets/loader.html
+++ b/v0.5.1/_modules/doctr/datasets/loader.html
@@ -226,32 +226,20 @@

                        Source code for doctr.datasets.loader

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021, Mindee.
                         
                         # This program is licensed under the Apache License version 2.
                         # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                         
                         import math
                        -from typing import Callable, Optional
                        -
                        -import numpy as np
                         import tensorflow as tf
                        +import numpy as np
                        +from typing import Optional
                         
                        -from doctr.utils.multithreading import multithread_exec
                        +from .multithreading import multithread_exec
                         
                         __all__ = ["DataLoader"]
                         
                        @@ -321,23 +308,23 @@ 

                        Source code for doctr.datasets.loader

                         
                         
                         
-[docs]
+[docs]
 class DataLoader:
     """Implements a dataset wrapper for fast data loading
 
-    >>> from doctr.datasets import FUNSD, DataLoader
-    >>> train_set = CORD(train=True, download=True)
-    >>> train_loader = DataLoader(train_set, batch_size=32)
-    >>> train_iter = iter(train_loader)
-    >>> images, targets = next(train_iter)
+    Example::
+        >>> from doctr.datasets import FUNSD, DataLoader
+        >>> train_set = CORD(train=True, download=True)
+        >>> train_loader = DataLoader(train_set, batch_size=32)
+        >>> train_iter = iter(train_loader)
+        >>> images, targets = next(train_iter)
 
     Args:
         dataset: the dataset
         shuffle: whether the samples should be shuffled before passing it to the iterator
         batch_size: number of elements in each batch
         drop_last: if `True`, drops the last batch if it isn't full
-        num_workers: number of workers to use for data loading
-        collate_fn: function to merge samples into a batch
+        workers: number of workers to use for data loading
     """
 
     def __init__(
@@ -346,24 +333,17 @@

                        Source code for doctr.datasets.loader

                                 shuffle: bool = True,
                                 batch_size: int = 1,
                                 drop_last: bool = False,
                        -        num_workers: Optional[int] = None,
                        -        collate_fn: Optional[Callable] = None,
                        +        workers: Optional[int] = None,
                             ) -> None:
                                 self.dataset = dataset
                                 self.shuffle = shuffle
                                 self.batch_size = batch_size
                                 nb = len(self.dataset) / batch_size
                                 self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
                        -        if collate_fn is None:
                        -            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
                        -        else:
                        -            self.collate_fn = collate_fn
                        -        self.num_workers = num_workers
                        +        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
                        +        self.workers = workers
                                 self.reset()
                         
                        -    def __len__(self) -> int:
                        -        return self.num_batches
                        -
                             def reset(self) -> None:
                                 # Updates indices after each epoch
                                 self._num_yielded = 0
                        @@ -381,7 +361,7 @@ 

                        Source code for doctr.datasets.loader

                                     idx = self._num_yielded * self.batch_size
                                     indices = self.indices[idx: min(len(self.dataset), idx + self.batch_size)]
                         
                        -            samples = multithread_exec(self.dataset.__getitem__, indices, threads=self.num_workers)
                        +            samples = multithread_exec(self.dataset.__getitem__, indices, threads=self.workers)
                         
                                     batch_data = self.collate_fn(samples)
                         
                        @@ -422,7 +402,7 @@ 

                        Source code for doctr.datasets.loader

                               
                             
                           
                        -
                        +
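For reference, a minimal usage sketch of the DataLoader signature shown in the hunks above (dataset, shuffle, batch_size, drop_last, workers); the FUNSD download and the worker count are illustrative, not taken from this diff:

# Hedged sketch: exercises the v0.5.1 DataLoader signature shown above.
# The dataset choice and worker count are illustrative only.
from doctr.datasets import FUNSD, DataLoader

train_set = FUNSD(train=True, download=True)
# shuffle / batch_size / drop_last / workers match the __init__ shown in this hunk
train_loader = DataLoader(train_set, shuffle=True, batch_size=32, drop_last=False, workers=4)

train_iter = iter(train_loader)
images, targets = next(train_iter)   # batches are produced by the dataset's collate_fn
train_loader.reset()                 # re-shuffles indices for the next epoch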
diff --git a/v0.5.1/_modules/doctr/datasets/ocr.html b/v0.5.1/_modules/doctr/datasets/ocr.html
index 8c97421a48..2c4fb1b838 100644
--- a/v0.5.1/_modules/doctr/datasets/ocr.html
+++ b/v0.5.1/_modules/doctr/datasets/ocr.html
@@ -226,32 +226,20 @@

                        Source code for doctr.datasets.ocr

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021, Mindee.
                         
                         # This program is licensed under the Apache License version 2.
                         # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                         
                        -import json
                         import os
                        -from pathlib import Path
                        -from typing import Any, Dict, List, Tuple
                        -
                        +import json
                         import numpy as np
                        +from pathlib import Path
                        +from typing import List, Dict, Any, Tuple, Optional, Callable
                         
                         from .datasets import AbstractDataset
                        +from doctr.utils.geometry import fit_rbbox
                        +
                         
                         __all__ = ['OCRDataset']
                         
                         
                         
-[docs]
+[docs]
 class OCRDataset(AbstractDataset):
     """Implements an OCR dataset

     Args:
         img_folder: local path to image folder (all jpg at the root)
         label_file: local path to the label file
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        **kwargs: keyword arguments from `AbstractDataset`.
+        sample_transforms: composable transformations that will be applied to each image
+        rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones)
+        **kwargs: keyword arguments from `VisionDataset`.
     """

     def __init__(
         self,
         img_folder: str,
         label_file: str,
-        use_polygons: bool = False,
+        sample_transforms: Optional[Callable[[Any], Any]] = None,
+        rotated_bbox: bool = False,
         **kwargs: Any,
     ) -> None:
-        super().__init__(img_folder, **kwargs)
+
+        self.sample_transforms = sample_transforms
+        self.root = img_folder

         # List images
         self.data: List[Tuple[str, Dict[str, Any]]] = []
-        np_dtype = np.float32
         with open(label_file, 'rb') as f:
             data = json.load(f)

-        for img_name, annotations in data.items():
+        for file_dic in data:
             # Get image path
-            img_name = Path(img_name)
+            img_name = Path(os.path.basename(file_dic["raw-archive-filepath"])).stem + '.jpg'
             # File existence check
             if not os.path.exists(os.path.join(self.root, img_name)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")

             # handle empty images
-            if len(annotations["typed_words"]) == 0:
-                self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[])))
+            if (len(file_dic["coordinates"]) == 0 or
+                    (len(file_dic["coordinates"]) == 1 and file_dic["coordinates"][0] == "N/A")):
+                self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[])))
                 continue
-            # Unpack the straight boxes (xmin, ymin, xmax, ymax)
-            geoms = [list(map(float, obj['geometry'][:4])) for obj in annotations['typed_words']]
-            if use_polygons:
-                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                geoms = [
-                    [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]]  # type: ignore[list-item]
-                    for geom in geoms
-                ]
-
-            text_targets = [obj['value'] for obj in annotations['typed_words']]
+            is_valid: List[bool] = []
+            box_targets: List[List[float]] = []
+            for box in file_dic["coordinates"]:
+                if rotated_bbox:
+                    x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np.float32))
+                    box = [x, y, w, h, alpha]
+                    is_valid.append(w > 0 and h > 0)
+                else:
+                    xs, ys = zip(*box)
+                    box = [min(xs), min(ys), max(xs), max(ys)]
+                    is_valid.append(box[0] < box[2] and box[1] < box[3])
+                if is_valid[-1]:
+                    box_targets.append(box)

-            self.data.append((img_name, dict(boxes=np.asarray(geoms, dtype=np_dtype), labels=text_targets)))
+            text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid]
+            self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
                        @@ -387,7 +384,7 @@

                        Source code for doctr.datasets.ocr

                               
                             
                           
                        -
                        +
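For reference, a hedged sketch of the label-file layout the rewritten loop above expects (a JSON list of records keyed by "raw-archive-filepath", "coordinates" and "string"); file names and coordinate values are illustrative only:

# Hedged sketch: builds a label file shaped like the one parsed above.
import json

labels = [
    {
        # the basename stem + '.jpg' must exist in img_folder
        "raw-archive-filepath": "archive/sample_01.tif",
        # one polygon (list of (x, y) points) per word; values illustrative
        "coordinates": [[[0.10, 0.20], [0.30, 0.20], [0.30, 0.40], [0.10, 0.40]]],
        # one transcription per polygon
        "string": ["hello"],
    },
]
with open("labels.json", "w") as f:
    json.dump(labels, f)

# from doctr.datasets import OCRDataset
# ds = OCRDataset(img_folder="images/", label_file="labels.json", rotated_bbox=False)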
diff --git a/v0.5.1/_modules/doctr/datasets/recognition.html b/v0.5.1/_modules/doctr/datasets/recognition.html
index 5baf716dfe..52424168a9 100644
--- a/v0.5.1/_modules/doctr/datasets/recognition.html
+++ b/v0.5.1/_modules/doctr/datasets/recognition.html
@@ -234,10 +234,16 @@

                        Using docTR

                        Package Reference

                        Source code for doctr.datasets.recognition

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021-2024, Mindee.
                         
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        +# This program is licensed under the Apache License 2.0.
                        +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                         
                         import json
                         import os
                        @@ -313,6 +319,7 @@ 

                        Source code for doctr.datasets.recognition

                             >>> img, target = train_set[0]
                         
                             Args:
                        +    ----
                                 img_folder: path to the images folder
        labels_path: path to the json file containing all labels (character sequences)
                                 **kwargs: keyword arguments from `AbstractDataset`.
                        @@ -327,7 +334,7 @@ 

                        Source code for doctr.datasets.recognition

                                 super().__init__(img_folder, **kwargs)
                         
                                 self.data: List[Tuple[str, str]] = []
                        -        with open(labels_path) as f:
                        +        with open(labels_path, encoding="utf-8") as f:
                                     labels = json.load(f)
                         
                                 for img_name, label in labels.items():
                        @@ -377,7 +384,7 @@ 

                        Source code for doctr.datasets.recognition

                               
                             
                           
                        -
                        +
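For reference, a hedged sketch of the labels file consumed by the loop above (a flat {image_name: transcription} mapping, now opened as UTF-8); the entries are illustrative only:

# Hedged sketch: writes a labels file in the shape read by RecognitionDataset above.
import json

labels = {"img_001.png": "Mindee", "img_002.png": "docTR"}   # illustrative entries
with open("labels.json", "w", encoding="utf-8") as f:
    json.dump(labels, f, ensure_ascii=False)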
diff --git a/v0.5.1/_modules/doctr/datasets/sroie.html b/v0.5.1/_modules/doctr/datasets/sroie.html
index fa22acbeca..0425870abb 100644
--- a/v0.5.1/_modules/doctr/datasets/sroie.html
+++ b/v0.5.1/_modules/doctr/datasets/sroie.html
@@ -226,32 +226,20 @@

                        Source code for doctr.datasets.sroie

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021, Mindee.
                         
                         # This program is licensed under the Apache License version 2.
                         # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                         
                        -import csv
                         import os
                        -from pathlib import Path
                        -from typing import Any, Dict, List, Tuple
                        -
                        +import csv
                         import numpy as np
                        +from pathlib import Path
                        +from typing import List, Dict, Any, Tuple, Optional, Callable
                         
                         from .datasets import VisionDataset
                        -from .utils import convert_target_to_relative
                         
                         __all__ = ['SROIE']
                         
                         
                         
-[docs]
+[docs]
 class SROIE(VisionDataset):
     """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction"
     <https://arxiv.org/pdf/2103.10213.pdf>`_.

-    .. image:: https://github.com/mindee/doctr/releases/download/v0.5.0/sroie-grid.png
-        :align: center
-
-    >>> from doctr.datasets import SROIE
-    >>> train_set = SROIE(train=True, download=True)
-    >>> img, target = train_set[0]
+    Example::
+        >>> from doctr.datasets import SROIE
+        >>> train_set = SROIE(train=True, download=True)
+        >>> img, target = train_set[0]

     Args:
         train: whether the subset should be the training one
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
+        sample_transforms: composable transformations that will be applied to each image
+        rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones)
         **kwargs: keyword arguments from `VisionDataset`.
     """
@@ -332,41 +317,44 @@

                        Source code for doctr.datasets.sroie

                             def __init__(
                                 self,
                                 train: bool = True,
                        -        use_polygons: bool = False,
                        +        sample_transforms: Optional[Callable[[Any], Any]] = None,
                        +        rotated_bbox: bool = False,
                                 **kwargs: Any,
                             ) -> None:
                         
                                 url, sha256 = self.TRAIN if train else self.TEST
                        -        super().__init__(url, None, sha256, True, pre_transforms=convert_target_to_relative, **kwargs)
                        +        super().__init__(url, None, sha256, True, **kwargs)
                        +        self.sample_transforms = sample_transforms
                                 self.train = train
                         
                        -        tmp_root = os.path.join(self.root, 'images')
                        -        self.data: List[Tuple[str, Dict[str, Any]]] = []
                        -        np_dtype = np.float32
                        -
                        -        for img_path in os.listdir(tmp_root):
                        +        if rotated_bbox:
                        +            raise NotImplementedError
                         
                        +        # # List images
                        +        self.root = os.path.join(self._root, 'images')
                        +        self.data: List[Tuple[str, Dict[str, Any]]] = []
                        +        for img_path in os.listdir(self.root):
                                     # File existence check
                        -            if not os.path.exists(os.path.join(tmp_root, img_path)):
                        -                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
                        -
                        +            if not os.path.exists(os.path.join(self.root, img_path)):
                        +                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
                                     stem = Path(img_path).stem
                        -            with open(os.path.join(self.root, 'annotations', f"{stem}.txt"), encoding='latin') as f:
                        -                _rows = [row for row in list(csv.reader(f, delimiter=',')) if len(row) > 0]
                        -
                        -            labels = [",".join(row[8:]) for row in _rows]
                        -            # reorder coordinates (8 -> (4,2) ->
                        -            # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines
                        -            coords = np.stack([np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2))
                        -                              for row in _rows], axis=0)
                        -
                        -            if not use_polygons:
                        -                # xmin, ymin, xmax, ymax
                        -                coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1)
                        -
                        -            self.data.append((img_path, dict(boxes=coords, labels=labels)))
                        -
                        -        self.root = tmp_root
                        +            _targets = []
                        +            with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f:
                        +                for row in csv.reader(f, delimiter=','):
                        +                    # Safeguard for blank lines
                        +                    if len(row) > 0:
                        +                        # Label may contain commas
                        +                        label = ",".join(row[8:])
                        +                        # Reduce 8 coords to 4
                        +                        p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8])
                        +                        left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x)
                        +                        top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y)
                        +                        if len(label) > 0:
                        +                            _targets.append((label, [left, top, right, bot]))
                        +
                        +            text_targets, box_targets = zip(*_targets)
                        +
                        +            self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
                         
                             def extra_repr(self) -> str:
                                 return f"train={self.train}"
                        @@ -403,7 +391,7 @@

                        Source code for doctr.datasets.sroie

                               
                             
                           
                        -
                        +
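For reference, a hedged sketch of the coordinate reduction performed in the added lines above: each SROIE annotation row carries 8 values (4 corner points) followed by the label, and they are reduced to an axis-aligned (left, top, right, bottom) box; the sample row is illustrative:

# Hedged sketch: mirrors the 8-coordinate -> 4-coordinate reduction shown above.
row = ["72", "25", "326", "25", "326", "64", "72", "64", "TAN", "WOON", "YANN"]

label = ",".join(row[8:])                       # the label may itself contain commas
p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8])
left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x)
top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y)
assert (left, top, right, bot) == (72, 25, 326, 64)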
diff --git a/v0.5.1/_modules/doctr/datasets/svhn.html b/v0.5.1/_modules/doctr/datasets/svhn.html
index 9d501619b3..44f36099fa 100644
--- a/v0.5.1/_modules/doctr/datasets/svhn.html
+++ b/v0.5.1/_modules/doctr/datasets/svhn.html
@@ -234,10 +234,16 @@

                        Using docTR

                        Package Reference

                        Source code for doctr.datasets.svhn

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021-2024, Mindee.
                         
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        +# This program is licensed under the Apache License 2.0.
                        +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                         
                         import os
                        -from typing import Any, Dict, List, Tuple
                        +from typing import Any, Dict, List, Tuple, Union
                         
                         import h5py
                         import numpy as np
                         from tqdm import tqdm
                         
                         from .datasets import VisionDataset
                        -from .utils import convert_target_to_relative
                        +from .utils import convert_target_to_relative, crop_bboxes_from_image
                         
                        -__all__ = ['SVHN']
                        +__all__ = ["SVHN"]
                         
                         
                         
                        @@ -311,7 +317,7 @@

                        Source code for doctr.datasets.svhn

                             """SVHN dataset from `"The Street View House Numbers (SVHN) Dataset"
                             <http://ufldl.stanford.edu/housenumbers/>`_.
                         
                        -    .. image:: https://github.com/mindee/doctr/releases/download/v0.5.0/svhn-grid.png
                        +    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/svhn-grid.png&src=0
                                 :align: center
                         
                             >>> from doctr.datasets import SVHN
                        @@ -319,45 +325,60 @@ 

                        Source code for doctr.datasets.svhn

                             >>> img, target = train_set[0]
                         
                             Args:
                        +    ----
                                 train: whether the subset should be the training one
                                 use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                        +        recognition_task: whether the dataset should be used for recognition task
                        +        detection_task: whether the dataset should be used for detection task
                                 **kwargs: keyword arguments from `VisionDataset`.
                             """
                        -    TRAIN = ('http://ufldl.stanford.edu/housenumbers/train.tar.gz',
                        -             '4b17bb33b6cd8f963493168f80143da956f28ec406cc12f8e5745a9f91a51898',
                        -             'svhn_train.tar')
                         
                        -    TEST = ('http://ufldl.stanford.edu/housenumbers/test.tar.gz',
                        -            '57ac9ceb530e4aa85b55d991be8fc49c695b3d71c6f6a88afea86549efde7fb5',
                        -            'svhn_test.tar')
                        +    TRAIN = (
                        +        "http://ufldl.stanford.edu/housenumbers/train.tar.gz",
                        +        "4b17bb33b6cd8f963493168f80143da956f28ec406cc12f8e5745a9f91a51898",
                        +        "svhn_train.tar",
                        +    )
                        +
                        +    TEST = (
                        +        "http://ufldl.stanford.edu/housenumbers/test.tar.gz",
                        +        "57ac9ceb530e4aa85b55d991be8fc49c695b3d71c6f6a88afea86549efde7fb5",
                        +        "svhn_test.tar",
                        +    )
                         
                             def __init__(
                                 self,
                                 train: bool = True,
                                 use_polygons: bool = False,
                        +        recognition_task: bool = False,
                        +        detection_task: bool = False,
                                 **kwargs: Any,
                             ) -> None:
                        -
                                 url, sha256, name = self.TRAIN if train else self.TEST
                                 super().__init__(
                                     url,
                                     file_name=name,
                                     file_hash=sha256,
                                     extract_archive=True,
                        -            pre_transforms=convert_target_to_relative,
                        -            **kwargs
                        +            pre_transforms=convert_target_to_relative if not recognition_task else None,
                        +            **kwargs,
                                 )
                        +        if recognition_task and detection_task:
                        +            raise ValueError(
                        +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                        +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                        +            )
                        +
                                 self.train = train
                        -        self.data: List[Tuple[str, Dict[str, Any]]] = []
                        +        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                 np_dtype = np.float32
                         
                        -        tmp_root = os.path.join(self.root, 'train' if train else 'test')
                        +        tmp_root = os.path.join(self.root, "train" if train else "test")
                         
                                 # Load mat data (matlab v7.3 - can not be loaded with scipy)
                        -        with h5py.File(os.path.join(tmp_root, 'digitStruct.mat'), 'r') as f:
                        -            img_refs = f['digitStruct/name']
                        -            box_refs = f['digitStruct/bbox']
                        -            for img_ref, box_ref in tqdm(iterable=zip(img_refs, box_refs), desc='Unpacking SVHN', total=len(img_refs)):
                        +        with h5py.File(os.path.join(tmp_root, "digitStruct.mat"), "r") as f:
                        +            img_refs = f["digitStruct/name"]
                        +            box_refs = f["digitStruct/bbox"]
                        +            for img_ref, box_ref in tqdm(iterable=zip(img_refs, box_refs), desc="Unpacking SVHN", total=len(img_refs)):
                                         # convert ascii matrix to string
                                         img_name = "".join(map(chr, f[img_ref[0]][()].flatten()))
                         
                        @@ -367,39 +388,49 @@ 

                        Source code for doctr.datasets.svhn

                         
                                         # Unpack the information
                                         box = f[box_ref[0]]
                        -                if box['left'].shape[0] == 1:
                        +                if box["left"].shape[0] == 1:
                                             box_dict = {k: [int(vals[0][0])] for k, vals in box.items()}
                                         else:
                                             box_dict = {k: [int(f[v[0]][()].item()) for v in vals] for k, vals in box.items()}
                         
                                         # Convert it to the right format
                        -                coords = np.array([
                        -                    box_dict['left'],
                        -                    box_dict['top'],
                        -                    box_dict['width'],
                        -                    box_dict['height']
                        -                ], dtype=np_dtype).transpose()
                        -                label_targets = list(map(str, box_dict['label']))
                        +                coords: np.ndarray = np.array(
                        +                    [box_dict["left"], box_dict["top"], box_dict["width"], box_dict["height"]], dtype=np_dtype
                        +                ).transpose()
                        +                label_targets = list(map(str, box_dict["label"]))
                         
                                         if use_polygons:
                                             # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                        -                    box_targets = np.stack(
                        +                    box_targets: np.ndarray = np.stack(
                                                 [
                                                     np.stack([coords[:, 0], coords[:, 1]], axis=-1),
                                                     np.stack([coords[:, 0] + coords[:, 2], coords[:, 1]], axis=-1),
                                                     np.stack([coords[:, 0] + coords[:, 2], coords[:, 1] + coords[:, 3]], axis=-1),
                                                     np.stack([coords[:, 0], coords[:, 1] + coords[:, 3]], axis=-1),
                        -                        ], axis=1
                        +                        ],
                        +                        axis=1,
                                             )
                                         else:
                                             # x, y, width, height -> xmin, ymin, xmax, ymax
                        -                    box_targets = np.stack([
                        -                        coords[:, 0],
                        -                        coords[:, 1],
                        -                        coords[:, 0] + coords[:, 2],
                        -                        coords[:, 1] + coords[:, 3],
                        -                    ], axis=-1)
                        -                self.data.append((img_name, dict(boxes=box_targets, labels=label_targets)))
                        +                    box_targets = np.stack(
                        +                        [
                        +                            coords[:, 0],
                        +                            coords[:, 1],
                        +                            coords[:, 0] + coords[:, 2],
                        +                            coords[:, 1] + coords[:, 3],
                        +                        ],
                        +                        axis=-1,
                        +                    )
                        +
                        +                if recognition_task:
                        +                    crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_name), geoms=box_targets)
                        +                    for crop, label in zip(crops, label_targets):
                        +                        if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                        +                            self.data.append((crop, label))
                        +                elif detection_task:
                        +                    self.data.append((img_name, box_targets))
                        +                else:
                        +                    self.data.append((img_name, dict(boxes=box_targets, labels=label_targets)))
                         
                                 self.root = tmp_root
                         
                        @@ -438,7 +469,7 @@ 

                        Source code for doctr.datasets.svhn

                               
                             
                           
                        -
                        +
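For reference, a hedged usage sketch of the flags introduced in the hunks above (recognition_task / detection_task); sample shapes and download details are not considered here:

# Hedged sketch: the three modes exposed by the updated SVHN constructor above.
from doctr.datasets import SVHN

full_set = SVHN(train=True, download=True)                          # (img_name, {boxes, labels}) samples
reco_set = SVHN(train=True, download=True, recognition_task=True)   # (crop, label) samples
det_set = SVHN(train=True, download=True, detection_task=True)      # (img_name, boxes) samples
# SVHN(recognition_task=True, detection_task=True) raises ValueError, as guarded above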
diff --git a/v0.5.1/_modules/doctr/datasets/svt.html b/v0.5.1/_modules/doctr/datasets/svt.html
index f981b19657..ff75309df4 100644
--- a/v0.5.1/_modules/doctr/datasets/svt.html
+++ b/v0.5.1/_modules/doctr/datasets/svt.html
@@ -234,10 +234,16 @@

                        Using docTR

                        Package Reference

                        Source code for doctr.datasets.svt

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021-2024, Mindee.
                         
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        +# This program is licensed under the Apache License 2.0.
                        +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                         
                         import os
                        -from typing import Any, Dict, List, Tuple
                        +from typing import Any, Dict, List, Tuple, Union
                         
                         import defusedxml.ElementTree as ET
                         import numpy as np
                        +from tqdm import tqdm
                         
                         from .datasets import VisionDataset
                        +from .utils import convert_target_to_relative, crop_bboxes_from_image
                         
                        -__all__ = ['SVT']
                        +__all__ = ["SVT"]
                         
                         
                         
                        @@ -309,7 +317,7 @@

                        Source code for doctr.datasets.svt

                             """SVT dataset from `"The Street View Text Dataset - UCSD Computer Vision"
                             <http://vision.ucsd.edu/~kai/svt/>`_.
                         
                        -    .. image:: https://github.com/mindee/doctr/releases/download/v0.5.0/svt-grid.png
                        +    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/svt-grid.png&src=0
                                 :align: center
                         
                             >>> from doctr.datasets import SVT
                        @@ -317,34 +325,54 @@ 

                        Source code for doctr.datasets.svt

                             >>> img, target = train_set[0]
                         
                             Args:
                        +    ----
                                 train: whether the subset should be the training one
                                 use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                        +        recognition_task: whether the dataset should be used for recognition task
                        +        detection_task: whether the dataset should be used for detection task
                                 **kwargs: keyword arguments from `VisionDataset`.
                             """
                         
                        -    URL = 'http://vision.ucsd.edu/~kai/svt/svt.zip'
                        -    SHA256 = '63b3d55e6b6d1e036e2a844a20c034fe3af3c32e4d914d6e0c4a3cd43df3bebf'
                        +    URL = "http://vision.ucsd.edu/~kai/svt/svt.zip"
                        +    SHA256 = "63b3d55e6b6d1e036e2a844a20c034fe3af3c32e4d914d6e0c4a3cd43df3bebf"
                         
                             def __init__(
                                 self,
                                 train: bool = True,
                                 use_polygons: bool = False,
                        +        recognition_task: bool = False,
                        +        detection_task: bool = False,
                                 **kwargs: Any,
                             ) -> None:
                        +        super().__init__(
                        +            self.URL,
                        +            None,
                        +            self.SHA256,
                        +            True,
                        +            pre_transforms=convert_target_to_relative if not recognition_task else None,
                        +            **kwargs,
                        +        )
                        +        if recognition_task and detection_task:
                        +            raise ValueError(
                        +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                        +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                        +            )
                         
                        -        super().__init__(self.URL, None, self.SHA256, True, **kwargs)
                                 self.train = train
                        -        self.data: List[Tuple[str, Dict[str, Any]]] = []
                        +        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                 np_dtype = np.float32
                         
                                 # Load xml data
                        -        tmp_root = os.path.join(self.root, 'svt1') if self.SHA256 else self.root
                        -        xml_tree = ET.parse(os.path.join(tmp_root, 'train.xml')) if self.train else ET.parse(
                        -            os.path.join(tmp_root, 'test.xml'))
                        +        tmp_root = os.path.join(self.root, "svt1") if self.SHA256 else self.root
                        +        xml_tree = (
                        +            ET.parse(os.path.join(tmp_root, "train.xml"))
                        +            if self.train
                        +            else ET.parse(os.path.join(tmp_root, "test.xml"))
                        +        )
                                 xml_root = xml_tree.getroot()
                         
                        -        for image in xml_root:
                        -            name, _, _, resolution, rectangles = image
                        +        for image in tqdm(iterable=xml_root, desc="Unpacking SVT", total=len(xml_root)):
                        +            name, _, _, _resolution, rectangles = image
                         
                                     # File existence check
                                     if not os.path.exists(os.path.join(tmp_root, name.text)):
                        @@ -354,38 +382,41 @@ 

                        Source code for doctr.datasets.svt

                                         # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                                         _boxes = [
                                             [
                        -                        [float(rect.attrib['x']), float(rect.attrib['y'])],
                        -                        [float(rect.attrib['x']) + float(rect.attrib['width']), float(rect.attrib['y'])],
                        +                        [float(rect.attrib["x"]), float(rect.attrib["y"])],
                        +                        [float(rect.attrib["x"]) + float(rect.attrib["width"]), float(rect.attrib["y"])],
                                                 [
                        -                            float(rect.attrib['x']) + float(rect.attrib['width']),
                        -                            float(rect.attrib['y']) + float(rect.attrib['height'])
                        +                            float(rect.attrib["x"]) + float(rect.attrib["width"]),
                        +                            float(rect.attrib["y"]) + float(rect.attrib["height"]),
                                                 ],
                        -                        [float(rect.attrib['x']), float(rect.attrib['y']) + float(rect.attrib['height'])],
                        +                        [float(rect.attrib["x"]), float(rect.attrib["y"]) + float(rect.attrib["height"])],
                                             ]
                                             for rect in rectangles
                                         ]
                                     else:
                                         # x_min, y_min, x_max, y_max
                                         _boxes = [
                        -                    [float(rect.attrib['x']), float(rect.attrib['y']),  # type: ignore[list-item]
                        -                     float(rect.attrib['x']) + float(rect.attrib['width']),  # type: ignore[list-item]
                        -                     float(rect.attrib['y']) + float(rect.attrib['height'])]  # type: ignore[list-item]
                        +                    [
                        +                        float(rect.attrib["x"]),  # type: ignore[list-item]
                        +                        float(rect.attrib["y"]),  # type: ignore[list-item]
                        +                        float(rect.attrib["x"]) + float(rect.attrib["width"]),  # type: ignore[list-item]
                        +                        float(rect.attrib["y"]) + float(rect.attrib["height"]),  # type: ignore[list-item]
                        +                    ]
                                             for rect in rectangles
                                         ]
                        -            # Convert them to relative
                        -            w, h = int(resolution.attrib['x']), int(resolution.attrib['y'])
                        -            boxes = np.asarray(_boxes, dtype=np_dtype)
                        -            if use_polygons:
                        -                boxes[:, :, 0] /= w
                        -                boxes[:, :, 1] /= h
                        -            else:
                        -                boxes[:, [0, 2]] /= w
                        -                boxes[:, [1, 3]] /= h
                         
                        +            boxes: np.ndarray = np.asarray(_boxes, dtype=np_dtype)
                                     # Get the labels
                                     labels = [lab.text for rect in rectangles for lab in rect]
                         
                        -            self.data.append((name.text, dict(boxes=boxes, labels=labels)))
                        +            if recognition_task:
                        +                crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, name.text), geoms=boxes)
                        +                for crop, label in zip(crops, labels):
                        +                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                        +                        self.data.append((crop, label))
                        +            elif detection_task:
                        +                self.data.append((name.text, boxes))
                        +            else:
                        +                self.data.append((name.text, dict(boxes=boxes, labels=labels)))
                         
                                 self.root = tmp_root
                         
                        @@ -424,7 +455,7 @@ 

                        Source code for doctr.datasets.svt

                               
                             
                           
                        -
                        +
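For reference, a hedged sketch of the corner computation used above when `use_polygons=True`: each SVT rectangle (x, y, width, height) is expanded to its four corners in (top-left, top-right, bottom-right, bottom-left) order; the attribute values are illustrative:

# Hedged sketch: expands one rect's attributes into polygon corners, as in the hunk above.
rect = {"x": "10", "y": "20", "width": "100", "height": "30"}

x, y = float(rect["x"]), float(rect["y"])
w, h = float(rect["width"]), float(rect["height"])
polygon = [
    [x, y],          # top left
    [x + w, y],      # top right
    [x + w, y + h],  # bottom right
    [x, y + h],      # bottom left
]
assert polygon == [[10.0, 20.0], [110.0, 20.0], [110.0, 50.0], [10.0, 50.0]]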
diff --git a/v0.5.1/_modules/doctr/datasets/synthtext.html b/v0.5.1/_modules/doctr/datasets/synthtext.html
index 6278697719..b3cef0e63f 100644
--- a/v0.5.1/_modules/doctr/datasets/synthtext.html
+++ b/v0.5.1/_modules/doctr/datasets/synthtext.html
@@ -234,10 +234,16 @@

                        Using docTR

                        Package Reference

                        Source code for doctr.datasets.synthtext

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021-2024, Mindee.
                         
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        +# This program is licensed under the Apache License 2.0.
                        +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                         
                        +import glob
                         import os
                        -from typing import Any, Dict, List, Tuple
                        +from typing import Any, Dict, List, Tuple, Union
                         
                         import numpy as np
                        +from PIL import Image
                         from scipy import io as sio
                         from tqdm import tqdm
                         
                         from .datasets import VisionDataset
                        -from .utils import convert_target_to_relative
                        +from .utils import convert_target_to_relative, crop_bboxes_from_image
                         
                        -__all__ = ['SynthText']
                        +__all__ = ["SynthText"]
                         
                         
                         
                        @@ -312,7 +320,7 @@

                        Source code for doctr.datasets.synthtext

                             <https://arxiv.org/abs/1604.06646>`_ | `"repository" <https://github.com/ankush-me/SynthText>`_ |
                             `"website" <https://www.robots.ox.ac.uk/~vgg/data/scenetext/>`_.
                         
                        -    .. image:: https://github.com/mindee/doctr/releases/download/v0.5.0/svt-grid.png
                        +    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/svt-grid.png&src=0
                                 :align: center
                         
                             >>> from doctr.datasets import SynthText
                        @@ -320,65 +328,111 @@ 

                        Source code for doctr.datasets.synthtext

                             >>> img, target = train_set[0]
                         
                             Args:
                        +    ----
                                 train: whether the subset should be the training one
                                 use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                        +        recognition_task: whether the dataset should be used for recognition task
                        +        detection_task: whether the dataset should be used for detection task
                                 **kwargs: keyword arguments from `VisionDataset`.
                             """
                         
                        -    URL = 'https://thor.robots.ox.ac.uk/~vgg/data/scenetext/SynthText.zip'
                        -    SHA256 = '28ab030485ec8df3ed612c568dd71fb2793b9afbfa3a9d9c6e792aef33265bf1'
                        +    URL = "https://thor.robots.ox.ac.uk/~vgg/data/scenetext/SynthText.zip"
                        +    SHA256 = "28ab030485ec8df3ed612c568dd71fb2793b9afbfa3a9d9c6e792aef33265bf1"
                         
                             def __init__(
                                 self,
                                 train: bool = True,
                                 use_polygons: bool = False,
                        +        recognition_task: bool = False,
                        +        detection_task: bool = False,
                                 **kwargs: Any,
                             ) -> None:
                        -
                                 super().__init__(
                                     self.URL,
                                     None,
                                     file_hash=None,
                                     extract_archive=True,
                        -            pre_transforms=convert_target_to_relative,
                        -            **kwargs
                        +            pre_transforms=convert_target_to_relative if not recognition_task else None,
                        +            **kwargs,
                                 )
                        +        if recognition_task and detection_task:
                        +            raise ValueError(
                        +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                        +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                        +            )
                        +
                                 self.train = train
                        +        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                        +        np_dtype = np.float32
                         
                                 # Load mat data
                        -        tmp_root = os.path.join(self.root, 'SynthText') if self.SHA256 else self.root
                        -        mat_data = sio.loadmat(os.path.join(tmp_root, 'gt.mat'))
                        -        train_samples = int(len(mat_data['imnames'][0]) * 0.9)
                        +        tmp_root = os.path.join(self.root, "SynthText") if self.SHA256 else self.root
                        +        # define folder to write SynthText recognition dataset
                        +        reco_folder_name = "SynthText_recognition_train" if self.train else "SynthText_recognition_test"
                        +        reco_folder_name = "Poly_" + reco_folder_name if use_polygons else reco_folder_name
                        +        reco_folder_path = os.path.join(tmp_root, reco_folder_name)
                        +        reco_images_counter = 0
                        +
                        +        if recognition_task and os.path.isdir(reco_folder_path):
                        +            self._read_from_folder(reco_folder_path)
                        +            return
                        +        elif recognition_task and not os.path.isdir(reco_folder_path):
                        +            os.makedirs(reco_folder_path, exist_ok=False)
                        +
                        +        mat_data = sio.loadmat(os.path.join(tmp_root, "gt.mat"))
                        +        train_samples = int(len(mat_data["imnames"][0]) * 0.9)
                                 set_slice = slice(train_samples) if self.train else slice(train_samples, None)
                        -        paths = mat_data['imnames'][0][set_slice]
                        -        boxes = mat_data['wordBB'][0][set_slice]
                        -        labels = mat_data['txt'][0][set_slice]
                        +        paths = mat_data["imnames"][0][set_slice]
                        +        boxes = mat_data["wordBB"][0][set_slice]
                        +        labels = mat_data["txt"][0][set_slice]
                                 del mat_data
                         
                        -        self.data: List[Tuple[str, Dict[str, Any]]] = []
                        -        np_dtype = np.float32
                        -
                        -        for img_path, word_boxes, txt in tqdm(iterable=zip(paths, boxes, labels),
                        -                                              desc='Unpacking SynthText', total=len(paths)):
                        +        for img_path, word_boxes, txt in tqdm(
                        +            iterable=zip(paths, boxes, labels), desc="Unpacking SynthText", total=len(paths)
                        +        ):
                                     # File existence check
                                     if not os.path.exists(os.path.join(tmp_root, img_path[0])):
                                         raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path[0])}")
                         
                                     labels = [elt for word in txt.tolist() for elt in word.split()]
                                     # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                        -            word_boxes = word_boxes.transpose(2, 1, 0) if word_boxes.ndim == 3 else np.expand_dims(
                        -                word_boxes.transpose(1, 0), axis=0)
                        +            word_boxes = (
                        +                word_boxes.transpose(2, 1, 0)
                        +                if word_boxes.ndim == 3
                        +                else np.expand_dims(word_boxes.transpose(1, 0), axis=0)
                        +            )
                         
                                     if not use_polygons:
                                         # xmin, ymin, xmax, ymax
                                         word_boxes = np.concatenate((word_boxes.min(axis=1), word_boxes.max(axis=1)), axis=1)
                         
                        -            self.data.append((img_path[0], dict(boxes=np.asarray(word_boxes, dtype=np_dtype), labels=labels)))
                        +            if recognition_task:
                        +                crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path[0]), geoms=word_boxes)
                        +                for crop, label in zip(crops, labels):
                        +                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                        +                        # write data to disk
                        +                        with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f:
                        +                            f.write(label)
                        +                            tmp_img = Image.fromarray(crop)
                        +                            tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
                        +                            reco_images_counter += 1
                        +            elif detection_task:
                        +                self.data.append((img_path[0], np.asarray(word_boxes, dtype=np_dtype)))
                        +            else:
                        +                self.data.append((img_path[0], dict(boxes=np.asarray(word_boxes, dtype=np_dtype), labels=labels)))
                        +
                        +        if recognition_task:
                        +            self._read_from_folder(reco_folder_path)
                         
                                 self.root = tmp_root
                         
                             def extra_repr(self) -> str:
                        -        return f"train={self.train}"
                        + return f"train={self.train}" + + def _read_from_folder(self, path: str) -> None: + for img_path in glob.glob(os.path.join(path, "*.png")): + with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f: + self.data.append((img_path, f.read()))
                        @@ -412,7 +466,7 @@

                        Source code for doctr.datasets.synthtext

                               
                             
                           
                        -
                        +
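For reference, a hedged sketch of the on-disk cache written by the recognition branch above: numbered PNG crops with matching TXT transcriptions, re-read by `_read_from_folder`; the folder path is illustrative:

# Hedged sketch: reads back the crop/transcription pairs cached by the code above.
import glob
import os

reco_folder_path = "SynthText/SynthText_recognition_train"   # illustrative location
for img_path in glob.glob(os.path.join(reco_folder_path, "*.png")):
    txt_path = os.path.join(reco_folder_path, f"{os.path.basename(img_path)[:-4]}.txt")
    with open(txt_path, "r") as f:
        label = f.read()   # each (crop image path, transcription) pair feeds self.data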
diff --git a/v0.5.1/_modules/doctr/datasets/utils.html b/v0.5.1/_modules/doctr/datasets/utils.html
index 6d30dea25c..499d3fff84 100644
--- a/v0.5.1/_modules/doctr/datasets/utils.html
+++ b/v0.5.1/_modules/doctr/datasets/utils.html
@@ -226,32 +226,20 @@

                        Source code for doctr.datasets.utils

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021, Mindee.
                         
                         # This program is licensed under the Apache License version 2.
                         # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                         
                         import string
                         import unicodedata
                        -from collections.abc import Sequence
                        -from functools import partial
                        -from typing import Any, Dict, List, Optional
                        -from typing import Sequence as SequenceType
                        -from typing import Tuple, TypeVar, Union
                        -
                         import numpy as np
                        -
                        -from doctr.io.image import get_img_shape
                        -from doctr.utils.geometry import convert_to_relative_coords
                        +from typing import List, Optional, Any
                         
                         from .vocabs import VOCABS
                         
                        -__all__ = ['translate', 'encode_string', 'decode_sequence', 'encode_sequences']
                        -
                        -ImageTensor = TypeVar('ImageTensor')
                        +__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
                         
                         
                         def translate(
                        @@ -346,7 +324,7 @@ 

                        Source code for doctr.datasets.utils

                             return translated
                         
                         
                        -def encode_string(
                        +def encode_sequence(
                             input_string: str,
                             vocab: str,
                         ) -> List[int]:
                        @@ -363,29 +341,26 @@ 

                        Source code for doctr.datasets.utils

                         
                         
                         def decode_sequence(
                        -    input_seq: Union[np.array, SequenceType[int]],
                        +    input_array: np.array,
                             mapping: str,
                         ) -> str:
                             """Given a predefined mapping, decode the sequence of numbers to a string
                         
                             Args:
                        -        input_seq: array to decode
                        +        input_array: array to decode
                                 mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
                         
                             Returns:
                        -        A string, decoded from input_seq
                        -    """
                        +        A string, decoded from input_array"""
                         
                        -    if not isinstance(input_seq, (Sequence, np.ndarray)):
                        -        raise TypeError("Invalid sequence type")
                        -    if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
                        +    if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
                                 raise AssertionError("Input must be an array of int, with max less than mapping size")
                        -
                        -    return ''.join(map(mapping.__getitem__, input_seq))
                        +    decoded = ''.join(mapping[idx] for idx in input_array)
                        +    return decoded
                         
                         
                         
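For reference, a hedged sketch of the decoding behaviour shown above: integer indices into the vocab string are mapped back to characters; the vocab and the array values are illustrative:

# Hedged sketch: reproduces the decode_sequence logic from the hunk above.
import numpy as np

mapping = "abc"                                          # illustrative vocab
input_array = np.array([2, 0, 1], dtype=np.int_)
decoded = ''.join(mapping[idx] for idx in input_array)   # -> "cab"
assert decoded == "cab"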
-[docs]
+[docs]
 def encode_sequences(
     sequences: List[str],
     vocab: str,
@@ -393,7 +368,6 @@

                             eos: int = -1,
                             sos: Optional[int] = None,
                             pad: Optional[int] = None,
                        -    dynamic_seq_length: bool = False,
                             **kwargs: Any,
                         ) -> np.ndarray:
                             """Encode character sequences using a given vocab as mapping
                        @@ -405,7 +379,6 @@ 

                                 eos: encoding of End Of String
                                 sos: optional encoding of Start Of String
                                 pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
                        -        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size
                         
                             Returns:
                                 the padded encoded data as a tensor
                        @@ -414,32 +387,29 @@ 

                             if 0 <= eos < len(vocab):
                                 raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
                         
                        -    if not isinstance(target_size, int) or dynamic_seq_length:
                        -        # Maximum string length + EOS
                        -        max_length = max(len(w) for w in sequences) + 1
                        -        if isinstance(sos, int):
                        -            max_length += 1
                        -        if isinstance(pad, int):
                        -            max_length += 1
                        -        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)
                        +    if not isinstance(target_size, int):
                        +        target_size = max(len(w) for w in sequences)
                        +        if sos:
                        +            target_size += 1
                        +        if pad:
                        +            target_size += 1
                         
                             # Pad all sequences
                        -    if isinstance(pad, int):  # pad with padding symbol
                        +    if pad:  # pad with padding symbol
                                 if 0 <= pad < len(vocab):
                                     raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
                                 # In that case, add EOS at the end of the word before padding
                        -        default_symbol = pad
                        +        encoded_data = np.full([len(sequences), target_size], pad, dtype=np.int32)
                             else:  # pad with eos symbol
                        -        default_symbol = eos
                        -    encoded_data = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)
                        +        encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32)
                         
                        -    # Encode the strings
                        -    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
                        -        if isinstance(pad, int):  # add eos at the end of the sequence
                        -            seq.append(eos)
                        -        encoded_data[idx, :min(len(seq), target_size)] = seq[:min(len(seq), target_size)]
                        +    for idx, seq in enumerate(sequences):
                        +        encoded_seq = encode_sequence(seq, vocab)
                        +        if pad:  # add eos at the end of the sequence
                        +            encoded_seq.append(eos)
                        +        encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)]
                         
                        -    if isinstance(sos, int):  # place sos symbol at the beginning of each sequence
+    if sos:  # place sos symbol at the beginning of each sequence
                                 if 0 <= sos < len(vocab):
                                     raise ValueError("argument 'sos' needs to be outside of vocab possible indices")
                                 encoded_data = np.roll(encoded_data, 1)
                        @@ -447,12 +417,6 @@ 

                         
                             return encoded_data
-
-
-def convert_target_to_relative(img: ImageTensor, target: Dict[str, Any]) -> Tuple[ImageTensor, Dict[str, Any]]:
-
-    target['boxes'] = convert_to_relative_coords(target['boxes'], get_img_shape(img))
-    return img, target
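As a rough illustration of the padding logic in the added lines (toy vocab; eos keeps its default value of -1, and no sos/pad/target_size is passed):

    import numpy as np

    from doctr.datasets.utils import encode_sequences

    vocab = "abcde"                                   # illustrative vocab
    encoded = encode_sequences(["ab", "d"], vocab)    # target_size falls back to the longest word (2)
    assert np.array_equal(encoded, np.array([[0, 1], [3, -1]], dtype=np.int32))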
diff --git a/v0.5.1/_modules/doctr/io/elements.html b/v0.5.1/_modules/doctr/io/elements.html
index dcc3fd432a..73dbca5877 100644
--- a/v0.5.1/_modules/doctr/io/elements.html
+++ b/v0.5.1/_modules/doctr/io/elements.html
@@ -234,10 +234,16 @@

                        -# Copyright (C) 2021-2022, Mindee.
                        +# Copyright (C) 2021-2024, Mindee.
                         
                        -# This program is licensed under the Apache License version 2.
                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                        +# This program is licensed under the Apache License 2.0.
                        +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                         
                         from typing import Any, Dict, List, Optional, Tuple, Union
                         
                        @@ -301,16 +307,21 @@ 

                         from xml.etree.ElementTree import Element as ETElement
                         from xml.etree.ElementTree import SubElement
                         
                        -import matplotlib.pyplot as plt
                         import numpy as np
                         
                         import doctr
                        +from doctr.file_utils import requires_package
                         from doctr.utils.common_types import BoundingBox
                         from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
                        +from doctr.utils.reconstitution import synthesize_kie_page, synthesize_page
                         from doctr.utils.repr import NestedObject
                        -from doctr.utils.visualization import synthesize_page, visualize_page
                         
                        -__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
                        +try:  # optional dependency for visualization
                        +    from doctr.utils.visualization import visualize_kie_page, visualize_page
                        +except ModuleNotFoundError:
                        +    pass
                        +
                        +__all__ = ["Element", "Word", "Artefact", "Line", "Prediction", "Block", "Page", "KIEPage", "Document"]
                         
                         
                         class Element(NestedObject):
                        @@ -328,10 +339,14 @@ 

                         
                             def export(self) -> Dict[str, Any]:
                                 """Exports the object into a nested dict format"""
                        -
                                 export_dict = {k: getattr(self, k) for k in self._exported_keys}
                                 for children_name in self._children_names:
                        -            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
                        +            if children_name in ["predictions"]:
                        +                export_dict[children_name] = {
                        +                    k: [item.export() for item in c] for k, c in getattr(self, children_name).items()
                        +                }
                        +            else:
                        +                export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
                         
                                 return export_dict
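The special case above exists because a KIE page stores its children as a dict keyed by detection class rather than a flat list; a hedged sketch of the resulting export structure (class names and values are made up):

    exported = {
        "page_idx": 0,
        "dimensions": (842, 595),
        "predictions": {
            # one list of exported Prediction dicts per detection class
            "dates": [{"value": "2024-01-01", "confidence": 0.91}],
            "words": [{"value": "invoice", "confidence": 0.87}],
        },
    }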
                         
                        @@ -349,20 +364,32 @@ 

                             """Implements a word element
                         
                             Args:
                        +    ----
                                 value: the text string of the word
                                 confidence: the confidence associated with the text prediction
                                 geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
                                 the page's size
                        +        objectness_score: the objectness score of the detection
                        +        crop_orientation: the general orientation of the crop in degrees and its confidence
                             """
                         
                        -    _exported_keys: List[str] = ["value", "confidence", "geometry"]
                        +    _exported_keys: List[str] = ["value", "confidence", "geometry", "objectness_score", "crop_orientation"]
                             _children_names: List[str] = []
                         
                        -    def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, np.ndarray]) -> None:
                        +    def __init__(
                        +        self,
                        +        value: str,
                        +        confidence: float,
                        +        geometry: Union[BoundingBox, np.ndarray],
                        +        objectness_score: float,
                        +        crop_orientation: Dict[str, Any],
                        +    ) -> None:
                                 super().__init__()
                                 self.value = value
                                 self.confidence = confidence
                                 self.geometry = geometry
                        +        self.objectness_score = objectness_score
                        +        self.crop_orientation = crop_orientation
                         
                             def render(self) -> str:
                                 """Renders the full text of the element"""
                        @@ -384,6 +411,7 @@ 

                             """Implements a non-textual element
                         
                             Args:
                        +    ----
                                 artefact_type: the type of artefact
                                 confidence: the confidence of the type prediction
                                 geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
                        @@ -419,29 +447,35 @@ 

                             """Implements a line element as a collection of words
                         
                             Args:
                        +    ----
                                 words: list of word elements
                                 geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
                                     the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
                                     all words in it.
                             """
                         
                        -    _exported_keys: List[str] = ["geometry"]
                        -    _children_names: List[str] = ['words']
                        +    _exported_keys: List[str] = ["geometry", "objectness_score"]
                        +    _children_names: List[str] = ["words"]
                             words: List[Word] = []
                         
                             def __init__(
                                 self,
                                 words: List[Word],
                                 geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
                        +        objectness_score: Optional[float] = None,
                             ) -> None:
                        +        # Compute the objectness score of the line
                        +        if objectness_score is None:
                        +            objectness_score = float(np.mean([w.objectness_score for w in words]))
                                 # Resolve the geometry using the smallest enclosing bounding box
                                 if geometry is None:
                                     # Check whether this is a rotated or straight box
                                     box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 4 else resolve_enclosing_bbox
                        -            geometry = box_resolution_fn([w.geometry for w in words])  # type: ignore[operator, misc]
                        +            geometry = box_resolution_fn([w.geometry for w in words])  # type: ignore[operator]
                         
                                 super().__init__(words=words)
                                 self.geometry = geometry
                        +        self.objectness_score = objectness_score
                         
                             def render(self) -> str:
                                 """Renders the full text of the element"""
                        @@ -451,18 +485,30 @@ 

                             def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
                                 kwargs = {k: save_dict[k] for k in cls._exported_keys}
                                 kwargs.update({
                        -            'words': [Word.from_dict(_dict) for _dict in save_dict['words']],
                        +            "words": [Word.from_dict(_dict) for _dict in save_dict["words"]],
                                 })
                                 return cls(**kwargs)
+class Prediction(Word):
+    """Implements a prediction element"""
+
+    def render(self) -> str:
+        """Renders the full text of the element"""
+        return self.value
+
+    def extra_repr(self) -> str:
+        return f"value='{self.value}', confidence={self.confidence:.2}, bounding_box={self.geometry}"
+
+
[docs]
 class Block(Element):
     """Implements a block element as a collection of lines and artefacts

     Args:
+    ----
         lines: list of line elements
         artefacts: list of artefacts
         geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
@@ -470,8 +516,8 @@

                                     all lines and artefacts in it.
                             """
                         
                        -    _exported_keys: List[str] = ["geometry"]
                        -    _children_names: List[str] = ['lines', 'artefacts']
                        +    _exported_keys: List[str] = ["geometry", "objectness_score"]
                        +    _children_names: List[str] = ["lines", "artefacts"]
                             lines: List[Line] = []
                             artefacts: List[Artefact] = []
                         
                        @@ -480,20 +526,25 @@ 

                                 lines: List[Line] = [],
                                 artefacts: List[Artefact] = [],
                                 geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
                        +        objectness_score: Optional[float] = None,
                             ) -> None:
                        +        # Compute the objectness score of the line
                        +        if objectness_score is None:
                        +            objectness_score = float(np.mean([w.objectness_score for line in lines for w in line.words]))
                                 # Resolve the geometry using the smallest enclosing bounding box
                                 if geometry is None:
                                     line_boxes = [word.geometry for line in lines for word in line.words]
                                     artefact_boxes = [artefact.geometry for artefact in artefacts]
                        -            box_resolution_fn = resolve_enclosing_rbbox if isinstance(
                        -                lines[0].geometry, np.ndarray
                        -            ) else resolve_enclosing_bbox
                        -            geometry = box_resolution_fn(line_boxes + artefact_boxes)  # type: ignore[operator, arg-type]
                        +            box_resolution_fn = (
                        +                resolve_enclosing_rbbox if isinstance(lines[0].geometry, np.ndarray) else resolve_enclosing_bbox
                        +            )
                        +            geometry = box_resolution_fn(line_boxes + artefact_boxes)  # type: ignore[operator]
                         
                                 super().__init__(lines=lines, artefacts=artefacts)
                                 self.geometry = geometry
                        +        self.objectness_score = objectness_score
                         
                        -    def render(self, line_break: str = '\n') -> str:
                        +    def render(self, line_break: str = "\n") -> str:
                                 """Renders the full text of the element"""
                                 return line_break.join(line.render() for line in self.lines)
                         
                        @@ -501,8 +552,8 @@ 

                             def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
                                 kwargs = {k: save_dict[k] for k in cls._exported_keys}
                                 kwargs.update({
                        -            'lines': [Line.from_dict(_dict) for _dict in save_dict['lines']],
                        -            'artefacts': [Artefact.from_dict(_dict) for _dict in save_dict['artefacts']],
                        +            "lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
                        +            "artefacts": [Artefact.from_dict(_dict) for _dict in save_dict["artefacts"]],
                                 })
                                 return cls(**kwargs)
                        @@ -514,6 +565,8 @@

                             """Implements a page element as a collection of blocks
                         
                             Args:
                        +    ----
                        +        page: image encoded as a numpy array in uint8
                                 blocks: list of block elements
                                 page_idx: the index of the page in the input raw document
                                 dimensions: the page size in pixels in format (height, width)
                        @@ -522,11 +575,12 @@ 

                             """
                         
                             _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"]
                        -    _children_names: List[str] = ['blocks']
                        +    _children_names: List[str] = ["blocks"]
                             blocks: List[Block] = []
                         
                             def __init__(
                                 self,
                        +        page: np.ndarray,
                                 blocks: List[Block],
                                 page_idx: int,
                                 dimensions: Tuple[int, int],
                        @@ -534,12 +588,13 @@ 

                                 language: Optional[Dict[str, Any]] = None,
                             ) -> None:
                                 super().__init__(blocks=blocks)
                        +        self.page = page
                                 self.page_idx = page_idx
                                 self.dimensions = dimensions
                                 self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
                                 self.language = language if isinstance(language, dict) else dict(value=None, confidence=None)
                         
                        -    def render(self, block_break: str = '\n\n') -> str:
                        +    def render(self, block_break: str = "\n\n") -> str:
                                 """Renders the full text of the element"""
                                 return block_break.join(b.render() for b in self.blocks)
                         
                        @@ -548,37 +603,41 @@ 

                         
                         
[docs]
-    def show(
-        self, page: np.ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs
-    ) -> None:
+    def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
         """Overlay the result on a given image

         Args:
-            page: image encoded as a numpy array in uint8
             interactive: whether the display should be interactive
             preserve_aspect_ratio: pass True if you passed True to the predictor
+            **kwargs: additional keyword arguments passed to the matplotlib.pyplot.show method
         """
-        visualize_page(self.export(), page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
+        requires_package("matplotlib", "`.show()` requires matplotlib & mplcursors installed")
+        requires_package("mplcursors", "`.show()` requires matplotlib & mplcursors installed")
+        import matplotlib.pyplot as plt
+
+        visualize_page(self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
         plt.show(**kwargs)
     def synthesize(self, **kwargs) -> np.ndarray:
         """Synthesize the page from the predictions

-        Returns:
+        Returns
+        -------
             synthesized page
         """
-
         return synthesize_page(self.export(), **kwargs)

-    def export_as_xml(self, file_title: str = 'docTR - XML export (hOCR)') -> Tuple[bytes, ET.ElementTree]:
+    def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
         """Export the page as XML (hOCR-format)
         convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md

         Args:
+        ----
             file_title: the title of the XML file

         Returns:
+        -------
             a tuple of the XML byte string, and its ElementTree
         """
         p_idx = self.page_idx
@@ -586,86 +645,259 @@

                                 line_count: int = 1
                                 word_count: int = 1
                                 height, width = self.dimensions
                        -        language = self.language if 'language' in self.language.keys() else 'en'
                        +        language = self.language if "language" in self.language.keys() else "en"
                                 # Create the XML root element
                        -        page_hocr = ETElement('html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': str(language)})
                        +        page_hocr = ETElement("html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": str(language)})
                                 # Create the header / SubElements of the root element
                        -        head = SubElement(page_hocr, 'head')
                        -        SubElement(head, 'title').text = file_title
                        -        SubElement(head, 'meta', attrib={'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
                        -        SubElement(head, 'meta', attrib={'name': 'ocr-system', 'content': f"python-doctr {doctr.__version__}"})
                        -        SubElement(head, 'meta', attrib={'name': 'ocr-capabilities',
                        -                                         'content': 'ocr_page ocr_carea ocr_par ocr_line ocrx_word'})
                        +        head = SubElement(page_hocr, "head")
                        +        SubElement(head, "title").text = file_title
                        +        SubElement(head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html; charset=utf-8"})
                        +        SubElement(
                        +            head,
                        +            "meta",
                        +            attrib={"name": "ocr-system", "content": f"python-doctr {doctr.__version__}"},  # type: ignore[attr-defined]
                        +        )
                        +        SubElement(
                        +            head,
                        +            "meta",
                        +            attrib={"name": "ocr-capabilities", "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word"},
                        +        )
                                 # Create the body
                        -        body = SubElement(page_hocr, 'body')
                        -        SubElement(body, 'div', attrib={
                        -            'class': 'ocr_page',
                        -            'id': f'page_{p_idx + 1}',
                        -            'title': f'image; bbox 0 0 {width} {height}; ppageno 0'
                        -        })
                        +        body = SubElement(page_hocr, "body")
                        +        SubElement(
                        +            body,
                        +            "div",
                        +            attrib={
                        +                "class": "ocr_page",
                        +                "id": f"page_{p_idx + 1}",
                        +                "title": f"image; bbox 0 0 {width} {height}; ppageno 0",
                        +            },
                        +        )
                                 # iterate over the blocks / lines / words and create the XML elements in body line by line with the attributes
                                 for block in self.blocks:
                                     if len(block.geometry) != 2:
                                         raise TypeError("XML export is only available for straight bounding boxes for now.")
                        -            (xmin, ymin), (xmax, ymax) = block.geometry  # type: ignore[misc]
                        -            block_div = SubElement(body, 'div', attrib={
                        -                'class': 'ocr_carea',
                        -                'id': f'block_{block_count}',
                        -                'title': f'bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                        -                    {int(round(xmax * width))} {int(round(ymax * height))}'
                        -            })
                        -            paragraph = SubElement(block_div, 'p', attrib={
                        -                'class': 'ocr_par',
                        -                'id': f'par_{block_count}',
                        -                'title': f'bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                        -                    {int(round(xmax * width))} {int(round(ymax * height))}'
                        -            })
                        +            (xmin, ymin), (xmax, ymax) = block.geometry
                        +            block_div = SubElement(
                        +                body,
                        +                "div",
                        +                attrib={
                        +                    "class": "ocr_carea",
                        +                    "id": f"block_{block_count}",
                        +                    "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                        +                    {int(round(xmax * width))} {int(round(ymax * height))}",
                        +                },
                        +            )
                        +            paragraph = SubElement(
                        +                block_div,
                        +                "p",
                        +                attrib={
                        +                    "class": "ocr_par",
                        +                    "id": f"par_{block_count}",
                        +                    "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                        +                    {int(round(xmax * width))} {int(round(ymax * height))}",
                        +                },
                        +            )
                                     block_count += 1
                                     for line in block.lines:
                        -                (xmin, ymin), (xmax, ymax) = line.geometry  # type: ignore[misc]
                        +                (xmin, ymin), (xmax, ymax) = line.geometry
                # NOTE: baseline, x_size, x_descenders, x_ascenders are currently initialized to 0
                        -                line_span = SubElement(paragraph, 'span', attrib={
                        -                    'class': 'ocr_line',
                        -                    'id': f'line_{line_count}',
                        -                    'title': f'bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                        -                        {int(round(xmax * width))} {int(round(ymax * height))}; \
                        -                        baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0'
                        -                })
                        +                line_span = SubElement(
                        +                    paragraph,
                        +                    "span",
                        +                    attrib={
                        +                        "class": "ocr_line",
                        +                        "id": f"line_{line_count}",
                        +                        "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                        +                        {int(round(xmax * width))} {int(round(ymax * height))}; \
                        +                        baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0",
                        +                    },
                        +                )
                                         line_count += 1
                                         for word in line.words:
                        -                    (xmin, ymin), (xmax, ymax) = word.geometry  # type: ignore[misc]
                        +                    (xmin, ymin), (xmax, ymax) = word.geometry
                                             conf = word.confidence
                        -                    word_div = SubElement(line_span, 'span', attrib={
                        -                        'class': 'ocrx_word',
                        -                        'id': f'word_{word_count}',
                        -                        'title': f'bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                        -                            {int(round(xmax * width))} {int(round(ymax * height))}; \
                        -                            x_wconf {int(round(conf * 100))}'
                        -                    })
                        +                    word_div = SubElement(
                        +                        line_span,
                        +                        "span",
                        +                        attrib={
                        +                            "class": "ocrx_word",
                        +                            "id": f"word_{word_count}",
                        +                            "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                        +                            {int(round(xmax * width))} {int(round(ymax * height))}; \
                        +                            x_wconf {int(round(conf * 100))}",
                        +                        },
                        +                    )
                                             # set the text
                                             word_div.text = word.value
                                             word_count += 1
                         
                        -        return (ET.tostring(page_hocr, encoding='utf-8', method='xml'), ET.ElementTree(page_hocr))
                        +        return (ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr))
                         
                             @classmethod
                             def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
                                 kwargs = {k: save_dict[k] for k in cls._exported_keys}
                        -        kwargs.update({'blocks': [Block.from_dict(block_dict) for block_dict in save_dict['blocks']]})
                        +        kwargs.update({"blocks": [Block.from_dict(block_dict) for block_dict in save_dict["blocks"]]})
                                 return cls(**kwargs)
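Taken together, the new Page API keeps the rasterized image on the element itself, so visualisation and hOCR export need no extra arguments; a hedged end-to-end sketch (model choice, file paths and output names are placeholders):

    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor

    model = ocr_predictor(pretrained=True)
    doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
    result = model(doc)

    page = result.pages[0]
    page.show()                                  # previously: page.show(doc[0])
    xml_bytes, xml_tree = page.export_as_xml()   # hOCR output
    with open("page_0_hocr.xml", "wb") as f:
        f.write(xml_bytes)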
+class KIEPage(Element):
+    """Implements a KIE page element as a collection of predictions
+
+    Args:
+    ----
+        predictions: Dictionary with list of block elements for each detection class
+        page: image encoded as a numpy array in uint8
+        page_idx: the index of the page in the input raw document
+        dimensions: the page size in pixels in format (height, width)
+        orientation: a dictionary with the value of the rotation angle in degrees and confidence of the prediction
+        language: a dictionary with the language value and confidence of the prediction
+    """
+
+    _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"]
+    _children_names: List[str] = ["predictions"]
+    predictions: Dict[str, List[Prediction]] = {}
+
+    def __init__(
+        self,
+        page: np.ndarray,
+        predictions: Dict[str, List[Prediction]],
+        page_idx: int,
+        dimensions: Tuple[int, int],
+        orientation: Optional[Dict[str, Any]] = None,
+        language: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        super().__init__(predictions=predictions)
+        self.page = page
+        self.page_idx = page_idx
+        self.dimensions = dimensions
+        self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
+        self.language = language if isinstance(language, dict) else dict(value=None, confidence=None)
+
+    def render(self, prediction_break: str = "\n\n") -> str:
+        """Renders the full text of the element"""
+        return prediction_break.join(
+            f"{class_name}: {p.render()}" for class_name, predictions in self.predictions.items() for p in predictions
+        )
+
+    def extra_repr(self) -> str:
+        return f"dimensions={self.dimensions}"
+
+    def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
+        """Overlay the result on a given image
+
+        Args:
+            interactive: whether the display should be interactive
+            preserve_aspect_ratio: pass True if you passed True to the predictor
+            **kwargs: keyword arguments passed to the matplotlib.pyplot.show method
+        """
+        requires_package("matplotlib", "`.show()` requires matplotlib & mplcursors installed")
+        requires_package("mplcursors", "`.show()` requires matplotlib & mplcursors installed")
+        import matplotlib.pyplot as plt
+
+        visualize_kie_page(
+            self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio
+        )
+        plt.show(**kwargs)
+
+    def synthesize(self, **kwargs) -> np.ndarray:
+        """Synthesize the page from the predictions
+
+        Args:
+        ----
+            **kwargs: keyword arguments passed to the matplotlib.pyplot.show method
+
+        Returns:
+        -------
+            synthesized page
+        """
+        return synthesize_kie_page(self.export(), **kwargs)
+
+    def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
+        """Export the page as XML (hOCR-format)
+        convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
+
+        Args:
+        ----
+            file_title: the title of the XML file
+
+        Returns:
+        -------
+            a tuple of the XML byte string, and its ElementTree
+        """
+        p_idx = self.page_idx
+        prediction_count: int = 1
+        height, width = self.dimensions
+        language = self.language if "language" in self.language.keys() else "en"
+        # Create the XML root element
+        page_hocr = ETElement("html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": str(language)})
+        # Create the header / SubElements of the root element
+        head = SubElement(page_hocr, "head")
+        SubElement(head, "title").text = file_title
+        SubElement(head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html; charset=utf-8"})
+        SubElement(
+            head,
+            "meta",
+            attrib={"name": "ocr-system", "content": f"python-doctr {doctr.__version__}"},  # type: ignore[attr-defined]
+        )
+        SubElement(
+            head,
+            "meta",
+            attrib={"name": "ocr-capabilities", "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word"},
+        )
+        # Create the body
+        body = SubElement(page_hocr, "body")
+        SubElement(
+            body,
+            "div",
+            attrib={
+                "class": "ocr_page",
+                "id": f"page_{p_idx + 1}",
+                "title": f"image; bbox 0 0 {width} {height}; ppageno 0",
+            },
+        )
+        # iterate over the blocks / lines / words and create the XML elements in body line by line with the attributes
+        for class_name, predictions in self.predictions.items():
+            for prediction in predictions:
+                if len(prediction.geometry) != 2:
+                    raise TypeError("XML export is only available for straight bounding boxes for now.")
+                (xmin, ymin), (xmax, ymax) = prediction.geometry
+                prediction_div = SubElement(
+                    body,
+                    "div",
+                    attrib={
+                        "class": "ocr_carea",
+                        "id": f"{class_name}_prediction_{prediction_count}",
+                        "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
+                        {int(round(xmax * width))} {int(round(ymax * height))}",
+                    },
+                )
+                prediction_div.text = prediction.value
+                prediction_count += 1
+
+        return ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr)
+
+    @classmethod
+    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
+        kwargs = {k: save_dict[k] for k in cls._exported_keys}
+        kwargs.update({
+            "predictions": [Prediction.from_dict(predictions_dict) for predictions_dict in save_dict["predictions"]]
+        })
+        return cls(**kwargs)
+
+
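For context, KIE pages are what the KIE predictor returns; a hedged end-to-end sketch (the file path is a placeholder):

    from doctr.io import DocumentFile
    from doctr.models import kie_predictor

    model = kie_predictor(pretrained=True)
    doc = DocumentFile.from_images(["path/to/your/img.jpg"])
    result = model(doc)

    kie_page = result.pages[0]
    print(kie_page.render())       # one "<class_name>: <value>" entry per prediction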
[docs]
 class Document(Element):
     """Implements a document element as a collection of pages

     Args:
+    ----
         pages: list of page elements
     """

-    _children_names: List[str] = ['pages']
+    _children_names: List[str] = ["pages"]
     pages: List[Page] = []

     def __init__(
@@ -674,38 +906,36 @@

                             ) -> None:
                                 super().__init__(pages=pages)
                         
                        -    def render(self, page_break: str = '\n\n\n\n') -> str:
                        +    def render(self, page_break: str = "\n\n\n\n") -> str:
                                 """Renders the full text of the element"""
                                 return page_break.join(p.render() for p in self.pages)
                         
                         
[docs]
-    def show(self, pages: List[np.ndarray], **kwargs) -> None:
-        """Overlay the result on a given image
-
-        Args:
-            pages: list of images encoded as numpy arrays in uint8
-        """
-        for img, result in zip(pages, self.pages):
-            result.show(img, **kwargs)
+    def show(self, **kwargs) -> None:
+        """Overlay the result on a given image"""
+        for result in self.pages:
+            result.show(**kwargs)

     def synthesize(self, **kwargs) -> List[np.ndarray]:
         """Synthesize all pages from their predictions

-        Returns:
+        Returns
+        -------
             list of synthesized pages
         """
-        return [page.synthesize() for page in self.pages]

     def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
         """Export the document as XML (hOCR-format)

         Args:
+        ----
             **kwargs: additional keyword arguments passed to the Page.export_as_xml method

         Returns:
+        -------
             list of tuple of (bytes, ElementTree)
         """
         return [page.export_as_xml(**kwargs) for page in self.pages]
@@ -713,9 +943,27 @@

                             @classmethod
                             def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
                                 kwargs = {k: save_dict[k] for k in cls._exported_keys}
                        -        kwargs.update({'pages': [Page.from_dict(page_dict) for page_dict in save_dict['pages']]})
                        +        kwargs.update({"pages": [Page.from_dict(page_dict) for page_dict in save_dict["pages"]]})
                                 return cls(**kwargs)
+
+
+class KIEDocument(Document):
+    """Implements a document element as a collection of pages
+
+    Args:
+    ----
+        pages: list of page elements
+    """
+
+    _children_names: List[str] = ["pages"]
+    pages: List[KIEPage] = []  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        pages: List[KIEPage],
+    ) -> None:
+        super().__init__(pages=pages)  # type: ignore[arg-type]
diff --git a/v0.5.1/_modules/doctr/io/html.html b/v0.5.1/_modules/doctr/io/html.html
index 11a322542e..d5495fcd8a 100644
--- a/v0.5.1/_modules/doctr/io/html.html
+++ b/v0.5.1/_modules/doctr/io/html.html
@@ -234,10 +234,16 @@

@@ -287,16 +293,14 @@
                          -# Copyright (C) 2021-2022, Mindee.
                          +# Copyright (C) 2021-2024, Mindee.
                           
                          -# This program is licensed under the Apache License version 2.
                          -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                          +# This program is licensed under the Apache License 2.0.
                          +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                           
                           from typing import Any
                           
                          -from weasyprint import HTML
                          -
                          -__all__ = ['read_html']
                          +__all__ = ["read_html"]
                           
                           
                           
                          @@ -304,15 +308,19 @@

                           def read_html(url: str, **kwargs: Any) -> bytes:
                               """Read a PDF file and convert it into an image in numpy format
                           
                          -    >>> from doctr.documents import read_html
                          +    >>> from doctr.io import read_html
                               >>> doc = read_html("https://www.yoursite.com")
                           
                               Args:
                          +    ----
                                   url: URL of the target web page
                          +        **kwargs: keyword arguments from `weasyprint.HTML`
                           
                               Returns:
                          +    -------
                                   decoded PDF file as a bytes stream
                               """
                          +    from weasyprint import HTML
                           
                               return HTML(url, **kwargs).write_pdf()
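A short usage sketch (the URL is a placeholder); with the lazy import above, weasyprint only needs to be installed when the function is actually called:

    from doctr.io import DocumentFile, read_html

    pdf_bytes = read_html("https://www.yoursite.com")   # renders the page to PDF via weasyprint
    pages = DocumentFile.from_pdf(pdf_bytes)            # then decode it like any other PDF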
diff --git a/v0.5.1/_modules/doctr/io/image/base.html b/v0.5.1/_modules/doctr/io/image/base.html
index 3642b3294a..1ba249a68a 100644
--- a/v0.5.1/_modules/doctr/io/image/base.html
+++ b/v0.5.1/_modules/doctr/io/image/base.html
@@ -234,10 +234,16 @@

@@ -287,10 +293,10 @@
                            -# Copyright (C) 2021-2022, Mindee.
                            +# Copyright (C) 2021-2024, Mindee.
                             
                            -# This program is licensed under the Apache License version 2.
                            -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                            +# This program is licensed under the Apache License 2.0.
                            +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                             
                             from pathlib import Path
                             from typing import Optional, Tuple
                            @@ -300,7 +306,7 @@ 

                             
                             from doctr.utils.common_types import AbstractFile
                             
                            -__all__ = ['read_img_as_numpy']
                            +__all__ = ["read_img_as_numpy"]
                             
                             
                             
                            @@ -312,25 +318,26 @@

                             ) -> np.ndarray:
                                 """Read an image file into numpy format
                             
                            -    >>> from doctr.documents import read_img
                            -    >>> page = read_img("path/to/your/doc.jpg")
                            +    >>> from doctr.io import read_img_as_numpy
                            +    >>> page = read_img_as_numpy("path/to/your/doc.jpg")
                             
                                 Args:
                            +    ----
                                     file: the path to the image file
                                     output_size: the expected output size of each page in format H x W
                                     rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
                             
                                 Returns:
                            +    -------
                                     the page decoded as numpy ndarray of shape H x W x 3
                                 """
                            -
                                 if isinstance(file, (str, Path)):
                                     if not Path(file).is_file():
                                         raise FileNotFoundError(f"unable to access {file}")
                                     img = cv2.imread(str(file), cv2.IMREAD_COLOR)
                                 elif isinstance(file, bytes):
                            -        file = np.frombuffer(file, np.uint8)
                            -        img = cv2.imdecode(file, cv2.IMREAD_COLOR)
                            +        _file: np.ndarray = np.frombuffer(file, np.uint8)
                            +        img = cv2.imdecode(_file, cv2.IMREAD_COLOR)
                                 else:
                                     raise TypeError("unsupported object type for argument 'file'")
                             
diff --git a/v0.5.1/_modules/doctr/io/image/tensorflow.html b/v0.5.1/_modules/doctr/io/image/tensorflow.html
index c87ff73752..f9faeeab1c 100644
--- a/v0.5.1/_modules/doctr/io/image/tensorflow.html
+++ b/v0.5.1/_modules/doctr/io/image/tensorflow.html
@@ -234,10 +234,16 @@


                            -# Copyright (C) 2021-2022, Mindee.
                            +# Copyright (C) 2021-2024, Mindee.
                             
                            -# This program is licensed under the Apache License version 2.
                            -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                            +# This program is licensed under the Apache License 2.0.
                            +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                             
                             from typing import Tuple
                             
                             import numpy as np
                             import tensorflow as tf
                             from PIL import Image
                            -
                            -if tf.__version__ >= '2.6.0':
                            -    from tensorflow.keras.utils import img_to_array
                            -else:
                            -    from tensorflow.keras.preprocessing.image import img_to_array
                            +from tensorflow.keras.utils import img_to_array
                             
                             from doctr.utils.common_types import AbstractPath
                             
                            -__all__ = ['tensor_from_pil', 'read_img_as_tensor', 'decode_img_as_tensor', 'tensor_from_numpy', 'get_img_shape']
                            +__all__ = ["tensor_from_pil", "read_img_as_tensor", "decode_img_as_tensor", "tensor_from_numpy", "get_img_shape"]
                             
                             
                            -def tensor_from_pil(pil_img: Image, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
                            +def tensor_from_pil(pil_img: Image.Image, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
                                 """Convert a PIL Image to a TensorFlow tensor
                             
                                 Args:
                            +    ----
                                     pil_img: a PIL image
                                     dtype: the output tensor data type
                             
                                 Returns:
                            +    -------
                                     decoded image as tensor
                                 """
                            -
                                 npy_img = img_to_array(pil_img)
                             
                                 return tensor_from_numpy(npy_img, dtype)
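A small sketch of the PIL-to-tensor path, assuming the TensorFlow backend is installed (the dummy image is for illustration only):

    import tensorflow as tf
    from PIL import Image

    from doctr.io.image.tensorflow import tensor_from_pil

    pil_img = Image.new("RGB", (64, 32))                  # width x height
    tensor = tensor_from_pil(pil_img, dtype=tf.float32)   # float dtypes are scaled to [0, 1]
    print(tensor.shape)                                    # (32, 64, 3)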
                            @@ -330,13 +333,14 @@ 

                                 """Read an image file as a TensorFlow tensor
                             
                                 Args:
                            +    ----
                                     img_path: location of the image file
                                     dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
                             
                                 Returns:
                            +    -------
                                     decoded image as a tensor
                                 """
                            -
                                 if dtype not in (tf.uint8, tf.float16, tf.float32):
                                     raise ValueError("insupported value for dtype")
                             
                            @@ -357,13 +361,14 @@ 

                                 """Read a byte stream as a TensorFlow tensor
                             
                                 Args:
                            +    ----
                                     img_content: bytes of a decoded image
                                     dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
                             
                                 Returns:
                            +    -------
                                     decoded image as a tensor
                                 """
                            -
                                 if dtype not in (tf.uint8, tf.float16, tf.float32):
                                     raise ValueError("insupported value for dtype")
                             
                            @@ -381,13 +386,14 @@ 

                                 """Read an image file as a TensorFlow tensor
                             
                                 Args:
                            -        img: image encoded as a numpy array of shape (H, W, C) in np.uint8
                            +    ----
                            +        npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
                                     dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
                             
                                 Returns:
                            +    -------
                                     same image as a tensor of shape (H, W, C)
                                 """
                            -
                                 if dtype not in (tf.uint8, tf.float16, tf.float32):
                                     raise ValueError("insupported value for dtype")
                             
                            @@ -401,6 +407,7 @@ 

                            Source code for doctr.io.image.tensorflow

                             
                             
                             def get_img_shape(img: tf.Tensor) -> Tuple[int, int]:
                            +    """Get the shape of an image"""
                                 return img.shape[:2]
                             
                            @@ -434,7 +441,7 @@

                            Source code for doctr.io.image.tensorflow

                                   
                                 
                               
                            -
                            + diff --git a/v0.5.1/_modules/doctr/io/pdf.html b/v0.5.1/_modules/doctr/io/pdf.html index 9af473fe64..91baf96f7b 100644 --- a/v0.5.1/_modules/doctr/io/pdf.html +++ b/v0.5.1/_modules/doctr/io/pdf.html @@ -234,10 +234,16 @@

                            Using docTR

                            Package Reference

                              +
                            • doctr.contrib
                            • doctr.datasets
                            • doctr.io
                            • doctr.models
                            • @@ -287,50 +293,53 @@

                              Source code for doctr.io.pdf

                              -# Copyright (C) 2021-2022, Mindee.
                              +# Copyright (C) 2021-2024, Mindee.
                               
                              -# This program is licensed under the Apache License version 2.
                              -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                              +# This program is licensed under the Apache License 2.0.
                              +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                               
                              -import os.path
                              -from pathlib import Path
                              -from typing import Any, List
                              +from typing import Any, List, Optional
                               
                               import numpy as np
                               import pypdfium2 as pdfium
                               
                               from doctr.utils.common_types import AbstractFile
                               
                              -__all__ = ['read_pdf']
                              +__all__ = ["read_pdf"]
                               
                               
                               
[docs]
-def read_pdf(file: AbstractFile, scale: float = 2, **kwargs: Any) -> List[np.ndarray]:
+def read_pdf(
+    file: AbstractFile,
+    scale: float = 2,
+    rgb_mode: bool = True,
+    password: Optional[str] = None,
+    **kwargs: Any,
+) -> List[np.ndarray]:
     """Read a PDF file and convert it into an image in numpy format
 
-    >>> from doctr.documents import read_pdf
+    >>> from doctr.io import read_pdf
     >>> doc = read_pdf("path/to/your/doc.pdf")
 
     Args:
+    ----
         file: the path to the PDF file
         scale: rendering scale (1 corresponds to 72dpi)
-        kwargs: additional parameters to :func:`pypdfium2._helpers.pdf_renderer.render_pdf_topil`
+        rgb_mode: if True, the output will be RGB, otherwise BGR
+        password: a password to unlock the document, if encrypted
+        **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
 
     Returns:
+    -------
         the list of pages decoded as numpy ndarray of shape H x W x C
     """
-
-    if isinstance(file, Path):
-        file = str(file)
-    if not isinstance(file, (str, bytes)):
-        raise TypeError("unsupported object type for argument 'file'")
-
-    if isinstance(file, str) and not os.path.isfile(file):
-        raise FileNotFoundError(f"unable to access {file}")
-
-    # Rasterise pages to PIL images with pypdfium2 and convert to numpy ndarrays
-    return [np.asarray(img) for img, _ in pdfium.render_pdf_topil(file, scale=scale, **kwargs)]
+    # Rasterise pages to numpy ndarrays with pypdfium2
+    pdf = pdfium.PdfDocument(file, password=password)
+    try:
+        return [page.render(scale=scale, rev_byteorder=rgb_mode, **kwargs).to_numpy() for page in pdf]
+    finally:
+        pdf.close()
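A usage sketch for the updated signature (the file path is a placeholder; password is only needed for encrypted documents):

from doctr.io import read_pdf

# Render each page at 2x 72 dpi as an RGB uint8 array
pages = read_pdf("path/to/your/doc.pdf", scale=2, rgb_mode=True, password=None)
print(len(pages), pages[0].shape, pages[0].dtype)  # e.g. 2 (1684, 1190, 3) uint8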
                              @@ -364,7 +373,7 @@

                              Source code for doctr.io.pdf

                                     
                                   
                                 
                              - + diff --git a/v0.5.1/_modules/doctr/io/reader.html b/v0.5.1/_modules/doctr/io/reader.html index 9a3c3a9e38..49cdc7d152 100644 --- a/v0.5.1/_modules/doctr/io/reader.html +++ b/v0.5.1/_modules/doctr/io/reader.html @@ -234,10 +234,16 @@

                              Using docTR

                              Package Reference

                                +
                              • doctr.contrib
                              • doctr.datasets
                              • doctr.io
                              • doctr.models
                              • @@ -287,23 +293,24 @@

                                Source code for doctr.io.reader

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021-2024, Mindee.
                                 
                                -# This program is licensed under the Apache License version 2.
                                -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                +# This program is licensed under the Apache License 2.0.
                                +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                 
                                 from pathlib import Path
                                 from typing import List, Sequence, Union
                                 
                                 import numpy as np
                                 
                                +from doctr.file_utils import requires_package
                                 from doctr.utils.common_types import AbstractFile
                                 
                                 from .html import read_html
                                 from .image import read_img_as_numpy
                                 from .pdf import read_pdf
                                 
                                -__all__ = ['DocumentFile']
                                +__all__ = ["DocumentFile"]
                                 
                                 
                                 
                                @@ -317,16 +324,18 @@

                                Source code for doctr.io.reader

                                     def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]:
                                         """Read a PDF file
                                 
                                -        >>> from doctr.documents import DocumentFile
                                +        >>> from doctr.io import DocumentFile
                                         >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
                                 
                                         Args:
                                +        ----
                                             file: the path to the PDF file or a binary stream
                                +            **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
                                 
                                         Returns:
                                +        -------
                                             the list of pages decoded as numpy ndarray of shape H x W x 3
                                         """
                                -
                                         return read_pdf(file, **kwargs)
                                @@ -336,15 +345,23 @@

                                Source code for doctr.io.reader

                                     def from_url(cls, url: str, **kwargs) -> List[np.ndarray]:
                                         """Interpret a web page as a PDF document
                                 
                                -        >>> from doctr.documents import DocumentFile
                                +        >>> from doctr.io import DocumentFile
                                         >>> doc = DocumentFile.from_url("https://www.yoursite.com")
                                 
                                         Args:
                                +        ----
                                             url: the URL of the target web page
                                +            **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
                                 
                                         Returns:
                                +        -------
                                             the list of pages decoded as numpy ndarray of shape H x W x 3
                                         """
                                +        requires_package(
                                +            "weasyprint",
                                +            "`.from_url` requires weasyprint installed.\n"
                                +            + "Installation instructions: https://doc.courtbouillon.org/weasyprint/stable/first_steps.html#installation",
                                +        )
                                         pdf_stream = read_html(url)
                                         return cls.from_pdf(pdf_stream, **kwargs)
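A hedged usage sketch of the guarded method: it only works once weasyprint is installed as described in the linked instructions, and the URL below is a placeholder:

from doctr.io import DocumentFile

# The page is rendered to a PDF stream with weasyprint, then rasterised with pypdfium2
pages = DocumentFile.from_url("https://www.yoursite.com", scale=2)
print(len(pages), pages[0].shape)  # one H x W x 3 uint8 array per rendered page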
                                @@ -355,13 +372,16 @@

                                Source code for doctr.io.reader

                                     def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]:
                                         """Read an image file (or a collection of image files) and convert it into an image in numpy format
                                 
                                -        >>> from doctr.documents import DocumentFile
                                +        >>> from doctr.io import DocumentFile
                                         >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
                                 
                                         Args:
                                +        ----
                                             files: the path to the image file or a binary stream, or a collection of those
                                +            **kwargs: additional parameters to :meth:`doctr.io.image.read_img_as_numpy`
                                 
                                         Returns:
                                +        -------
                                             the list of pages decoded as numpy ndarray of shape H x W x 3
                                         """
                                         if isinstance(files, (str, Path, bytes)):
                                @@ -402,7 +422,7 @@ 

                                Source code for doctr.io.reader

                                       
                                     
                                   
                                -
+
diff --git a/v0.5.1/_modules/doctr/models/backbones/mobilenet/tensorflow.html b/v0.5.1/_modules/doctr/models/backbones/mobilenet/tensorflow.html deleted file mode 100644 index a0f857205e..0000000000 --- a/v0.5.1/_modules/doctr/models/backbones/mobilenet/tensorflow.html +++ /dev/null @@ -1,688 +0,0 @@

                                Source code for doctr.models.backbones.mobilenet.tensorflow

                                -# Copyright (C) 2021, Mindee.
                                -
                                -# This program is licensed under the Apache License version 2.
                                -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                -
                                -# Greatly inspired by https://github.com/pytorch/vision/blob/master/torchvision/models/mobilenetv3.py
                                -
                                -from typing import Any, Dict, List, Optional, Tuple, Union
                                -
                                -import tensorflow as tf
                                -from tensorflow.keras import layers
                                -from tensorflow.keras.models import Sequential
                                -
                                -from ....datasets import VOCABS
                                -from ...utils import conv_sequence, load_pretrained_params
                                -
                                -__all__ = ["MobileNetV3", "mobilenet_v3_small", "mobilenet_v3_small_r", "mobilenet_v3_large",
                                -           "mobilenet_v3_large_r"]
                                -
                                -
                                -default_cfgs: Dict[str, Dict[str, Any]] = {
                                -    'mobilenet_v3_large': {
                                -        'mean': (0.694, 0.695, 0.693),
                                -        'std': (0.299, 0.296, 0.301),
                                -        'input_shape': (32, 32, 3),
                                -        'vocab': VOCABS['legacy_french'],
                                -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/mobilenet_v3_large-d27d66f2.zip'
                                -    },
                                -    'mobilenet_v3_large_r': {
                                -        'mean': (0.694, 0.695, 0.693),
                                -        'std': (0.299, 0.296, 0.301),
                                -        'input_shape': (32, 32, 3),
                                -        'vocab': VOCABS['french'],
                                -        'url': None,
                                -    },
                                -    'mobilenet_v3_small': {
                                -        'mean': (0.694, 0.695, 0.693),
                                -        'std': (0.299, 0.296, 0.301),
                                -        'input_shape': (32, 32, 3),
                                -        'vocab': VOCABS['legacy_french'],
                                -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/mobilenet_v3_small-d624c4de.zip'
                                -    },
                                -    'mobilenet_v3_small_r': {
                                -        'mean': (0.694, 0.695, 0.693),
                                -        'std': (0.299, 0.296, 0.301),
                                -        'input_shape': (32, 32, 3),
                                -        'vocab': VOCABS['french'],
                                -        'url': None,
                                -    }
                                -}
                                -
                                -
                                -def hard_swish(x: tf.Tensor) -> tf.Tensor:
                                -    return x * tf.nn.relu6(x + 3.) / 6.0
                                -
                                -
                                -def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int:
                                -    if min_value is None:
                                -        min_value = divisor
                                -    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
                                -    # Make sure that round down does not go down by more than 10%.
                                -    if new_v < 0.9 * v:
                                -        new_v += divisor
                                -    return new_v
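Both helpers are easy to check by hand; the snippet below re-implements them so it runs on its own (an illustration only, with the divisor fixed at the value of 8 used by adjust_channels, not an import from doctr):

import tensorflow as tf

def hard_swish(x: tf.Tensor) -> tf.Tensor:
    # x * ReLU6(x + 3) / 6, the cheap piecewise approximation of swish used by MobileNetV3
    return x * tf.nn.relu6(x + 3.0) / 6.0

def make_divisible(v: float, divisor: int = 8) -> int:
    # Round a channel count to the nearest multiple of `divisor`,
    # never ending up more than 10% below the requested value
    new_v = max(divisor, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

print(make_divisible(16))   # 16: already a multiple of 8
print(make_divisible(91))   # 88: rounds down, still within 10% of 91
print(make_divisible(10))   # 16: 8 would lose more than 10%, so the guard bumps it up
print(hard_swish(tf.constant([-3.0, 0.0, 3.0])).numpy())  # ~[0., 0., 3.]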
                                -
                                -
                                -class SqueezeExcitation(Sequential):
                                -    """Squeeze and Excitation.
                                -    """
                                -    def __init__(self, chan: int, squeeze_factor: int = 4) -> None:
                                -        super().__init__(
                                -            [
                                -                layers.GlobalAveragePooling2D(),
                                -                layers.Dense(chan // squeeze_factor, activation='relu'),
                                -                layers.Dense(chan, activation='hard_sigmoid'),
                                -                layers.Reshape((1, 1, chan))
                                -            ]
                                -        )
                                -
                                -    def call(self, inputs: tf.Tensor, **kwargs: Any) -> tf.Tensor:
                                -        x = super().call(inputs, **kwargs)
                                -        x = tf.math.multiply(inputs, x)
                                -        return x
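A standalone sketch of the squeeze-and-excitation gating above: pool each channel to a single value, squeeze, expand back, and rescale the input channel-wise. It uses the same layer stack, built directly with Keras rather than imported from doctr:

import tensorflow as tf
from tensorflow.keras import Sequential, layers

chan, squeeze_factor = 32, 4
se = Sequential([
    layers.GlobalAveragePooling2D(),                          # (N, H, W, C) -> (N, C)
    layers.Dense(chan // squeeze_factor, activation="relu"),  # squeeze
    layers.Dense(chan, activation="hard_sigmoid"),            # excite: one weight per channel
    layers.Reshape((1, 1, chan)),
])

feature_map = tf.random.uniform((1, 16, 16, chan))
attention = se(feature_map)                  # (1, 1, 1, 32)
recalibrated = feature_map * attention       # broadcast back to (1, 16, 16, 32)
print(attention.shape, recalibrated.shape)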
                                -
                                -
                                -class InvertedResidualConfig:
                                -    def __init__(
                                -        self,
                                -        input_channels: int,
                                -        kernel: int,
                                -        expanded_channels: int,
                                -        out_channels: int,
                                -        use_se: bool,
                                -        activation: str,
                                -        stride: Union[int, Tuple[int, int]],
                                -        width_mult: float = 1,
                                -    ) -> None:
                                -        self.input_channels = self.adjust_channels(input_channels, width_mult)
                                -        self.kernel = kernel
                                -        self.expanded_channels = self.adjust_channels(expanded_channels, width_mult)
                                -        self.out_channels = self.adjust_channels(out_channels, width_mult)
                                -        self.use_se = use_se
                                -        self.use_hs = activation == "HS"
                                -        self.stride = stride
                                -
                                -    @staticmethod
                                -    def adjust_channels(channels: int, width_mult: float):
                                -        return _make_divisible(channels * width_mult, 8)
                                -
                                -
                                -class InvertedResidual(layers.Layer):
                                -    """InvertedResidual for mobilenet
                                -
                                -    Args:
                                -        conf: configuration object for inverted residual
                                -    """
                                -    def __init__(
                                -        self,
                                -        conf: InvertedResidualConfig,
                                -        **kwargs: Any,
                                -    ) -> None:
                                -        _kwargs = {'input_shape': kwargs.pop('input_shape')} if isinstance(kwargs.get('input_shape'), tuple) else {}
                                -        super().__init__(**kwargs)
                                -
                                -        act_fn = hard_swish if conf.use_hs else tf.nn.relu
                                -
                                -        _is_s1 = (isinstance(conf.stride, tuple) and conf.stride == (1, 1)) or conf.stride == 1
                                -        self.use_res_connect = _is_s1 and conf.input_channels == conf.out_channels
                                -
                                -        _layers = []
                                -        # expand
                                -        if conf.expanded_channels != conf.input_channels:
                                -            _layers.extend(conv_sequence(conf.expanded_channels, act_fn, kernel_size=1, bn=True, **_kwargs))
                                -
                                -        # depth-wise
                                -        _layers.extend(conv_sequence(
                                -            conf.expanded_channels, act_fn, kernel_size=conf.kernel, strides=conf.stride, bn=True,
                                -            groups=conf.expanded_channels,
                                -        ))
                                -
                                -        if conf.use_se:
                                -            _layers.append(SqueezeExcitation(conf.expanded_channels))
                                -
                                -        # project
                                -        _layers.extend(conv_sequence(
                                -            conf.out_channels, None, kernel_size=1, bn=True,
                                -        ))
                                -
                                -        self.block = Sequential(_layers)
                                -
                                -    def call(
                                -        self,
                                -        inputs: tf.Tensor,
                                -        **kwargs: Any,
                                -    ) -> tf.Tensor:
                                -
                                -        out = self.block(inputs, **kwargs)
                                -        if self.use_res_connect:
                                -            out = tf.add(out, inputs)
                                -
                                -        return out
                                -
                                -
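The residual shortcut above is only kept when the block changes neither the spatial stride nor the channel count; a tiny check of that rule (a hypothetical helper, for illustration only):

def use_res_connect(stride, input_channels, out_channels) -> bool:
    # Mirrors the condition computed in InvertedResidual.__init__
    is_s1 = stride == 1 or stride == (1, 1)
    return is_s1 and input_channels == out_channels

print(use_res_connect(1, 40, 40))        # True : identity shortcut is added
print(use_res_connect((2, 1), 40, 40))   # False: the asymmetric stride changes the height
print(use_res_connect(1, 40, 48))        # False: the channel count changes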
                                -class MobileNetV3(Sequential):
                                -    """Implements MobileNetV3, inspired from both:
                                -    <https://github.com/xiaochus/MobileNetV3/tree/master/model>`_.
                                -    and <https://pytorch.org/vision/stable/_modules/torchvision/models/mobilenetv3.html>`_.
                                -    """
                                -
                                -    def __init__(
                                -        self,
                                -        layout: List[InvertedResidualConfig],
                                -        input_shape: Tuple[int, int, int],
                                -        include_top: bool = False,
                                -        head_chans: int = 1024,
                                -        num_classes: int = 1000,
                                -    ) -> None:
                                -
                                -        _layers = [
                                -            Sequential(conv_sequence(layout[0].input_channels, hard_swish, True, kernel_size=3, strides=2,
                                -                       input_shape=input_shape), name="stem")
                                -        ]
                                -
                                -        for idx, conf in enumerate(layout):
                                -            _layers.append(
                                -                InvertedResidual(conf, name=f"inverted_{idx}"),
                                -            )
                                -
                                -        _layers.append(
                                -            Sequential(
                                -                conv_sequence(6 * layout[-1].out_channels, hard_swish, True, kernel_size=1),
                                -                name="final_block"
                                -            )
                                -        )
                                -
                                -        if include_top:
                                -            _layers.extend([
                                -                layers.GlobalAveragePooling2D(),
                                -                layers.Dense(head_chans, activation=hard_swish),
                                -                layers.Dropout(0.2),
                                -                layers.Dense(num_classes),
                                -            ])
                                -
                                -        super().__init__(_layers)
                                -
                                -
                                -def _mobilenet_v3(
                                -    arch: str,
                                -    pretrained: bool,
                                -    input_shape: Optional[Tuple[int, int, int]] = None,
                                -    **kwargs: Any
                                -) -> MobileNetV3:
                                -    input_shape = input_shape or default_cfgs[arch]['input_shape']
                                -
                                -    # cf. Table 1 & 2 of the paper
                                -    if arch.startswith("mobilenet_v3_small"):
                                -        inverted_residual_setting = [
                                -            InvertedResidualConfig(16, 3, 16, 16, True, "RE", 2),  # C1
                                -            InvertedResidualConfig(16, 3, 72, 24, False, "RE", (2, 1) if arch.endswith("_r") else 2),  # C2
                                -            InvertedResidualConfig(24, 3, 88, 24, False, "RE", 1),
                                -            InvertedResidualConfig(24, 5, 96, 40, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C3
                                -            InvertedResidualConfig(40, 5, 240, 40, True, "HS", 1),
                                -            InvertedResidualConfig(40, 5, 240, 40, True, "HS", 1),
                                -            InvertedResidualConfig(40, 5, 120, 48, True, "HS", 1),
                                -            InvertedResidualConfig(48, 5, 144, 48, True, "HS", 1),
                                -            InvertedResidualConfig(48, 5, 288, 96, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C4
                                -            InvertedResidualConfig(96, 5, 576, 96, True, "HS", 1),
                                -            InvertedResidualConfig(96, 5, 576, 96, True, "HS", 1),
                                -        ]
                                -        head_chans = 1024
                                -    else:
                                -        inverted_residual_setting = [
                                -            InvertedResidualConfig(16, 3, 16, 16, False, "RE", 1),
                                -            InvertedResidualConfig(16, 3, 64, 24, False, "RE", 2),  # C1
                                -            InvertedResidualConfig(24, 3, 72, 24, False, "RE", 1),
                                -            InvertedResidualConfig(24, 5, 72, 40, True, "RE", (2, 1) if arch.endswith("_r") else 2),  # C2
                                -            InvertedResidualConfig(40, 5, 120, 40, True, "RE", 1),
                                -            InvertedResidualConfig(40, 5, 120, 40, True, "RE", 1),
                                -            InvertedResidualConfig(40, 3, 240, 80, False, "HS", (2, 1) if arch.endswith("_r") else 2),  # C3
                                -            InvertedResidualConfig(80, 3, 200, 80, False, "HS", 1),
                                -            InvertedResidualConfig(80, 3, 184, 80, False, "HS", 1),
                                -            InvertedResidualConfig(80, 3, 184, 80, False, "HS", 1),
                                -            InvertedResidualConfig(80, 3, 480, 112, True, "HS", 1),
                                -            InvertedResidualConfig(112, 3, 672, 112, True, "HS", 1),
                                -            InvertedResidualConfig(112, 5, 672, 160, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C4
                                -            InvertedResidualConfig(160, 5, 960, 160, True, "HS", 1),
                                -            InvertedResidualConfig(160, 5, 960, 160, True, "HS", 1),
                                -        ]
                                -        head_chans = 1280
                                -
                                -    kwargs['num_classes'] = kwargs.get('num_classes', len(default_cfgs[arch]['vocab']))
                                -
                                -    # Build the model
                                -    model = MobileNetV3(
                                -        inverted_residual_setting,
                                -        input_shape,
                                -        head_chans=head_chans,
                                -        **kwargs,
                                -    )
                                -    # Load pretrained parameters
                                -    if pretrained:
                                -        load_pretrained_params(model, default_cfgs[arch]['url'])
                                -
                                -    return model
                                -
                                -
                                -
                                -[docs] -def mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> MobileNetV3: - """MobileNetV3-Small architecture as described in - `"Searching for MobileNetV3", - <https://arxiv.org/pdf/1905.02244.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import mobilenetv3_large - >>> model = mobilenetv3_small(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained: boolean, True if model is pretrained - - Returns: - a keras.Model - """ - - return _mobilenet_v3('mobilenet_v3_small', pretrained, **kwargs)
                                - - - -
                                -[docs] -def mobilenet_v3_small_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3: - """MobileNetV3-Small architecture as described in - `"Searching for MobileNetV3", - <https://arxiv.org/pdf/1905.02244.pdf>`_, with rectangular pooling. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import mobilenet_v3_small_r - >>> model = mobilenet_v3_small_r(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained: boolean, True if model is pretrained - - Returns: - a keras.Model - """ - - return _mobilenet_v3('mobilenet_v3_small_r', pretrained, **kwargs)
                                - - - -
                                -[docs] -def mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> MobileNetV3: - """MobileNetV3-Large architecture as described in - `"Searching for MobileNetV3", - <https://arxiv.org/pdf/1905.02244.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import mobilenetv3_large - >>> model = mobilenetv3_large(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained: boolean, True if model is pretrained - - Returns: - a keras.Model - """ - return _mobilenet_v3('mobilenet_v3_large', pretrained, **kwargs)
                                - - - -
                                -[docs] -def mobilenet_v3_large_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3: - """MobileNetV3-Large architecture as described in - `"Searching for MobileNetV3", - <https://arxiv.org/pdf/1905.02244.pdf>`_. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import mobilenet_v3_large_r - >>> model = mobilenet_v3_large_r(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained: boolean, True if model is pretrained - - Returns: - a keras.Model - """ - return _mobilenet_v3('mobilenet_v3_large_r', pretrained, **kwargs)
- \ No newline at end of file
diff --git a/v0.5.1/_modules/doctr/models/backbones/resnet/tensorflow.html b/v0.5.1/_modules/doctr/models/backbones/resnet/tensorflow.html deleted file mode 100644 index d959be9a0f..0000000000 --- a/v0.5.1/_modules/doctr/models/backbones/resnet/tensorflow.html +++ /dev/null @@ -1,522 +0,0 @@

                                Source code for doctr.models.backbones.resnet.tensorflow

                                -# Copyright (C) 2021, Mindee.
                                -
                                -# This program is licensed under the Apache License version 2.
                                -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                -
                                -from typing import Any, Dict, List, Optional, Tuple
                                -
                                -import tensorflow as tf
                                -from tensorflow.keras import layers
                                -from tensorflow.keras.models import Sequential
                                -
                                -from ...utils import conv_sequence, load_pretrained_params
                                -
                                -__all__ = ['ResNet', 'resnet31', 'ResnetStage']
                                -
                                -
                                -default_cfgs: Dict[str, Dict[str, Any]] = {
                                -    'resnet31': {'num_blocks': (1, 2, 5, 3), 'output_channels': (256, 256, 512, 512),
                                -                 'conv_seq': (True, True, True, True), 'pooling': ((2, 2), (2, 1), None, None),
                                -                 'url': None},
                                -}
                                -
                                -
                                -class ResnetBlock(layers.Layer):
                                -
                                -    """Implements a resnet31 block with shortcut
                                -
                                -    Args:
                                -        conv_shortcut: Use of shortcut
                                -        output_channels: number of channels to use in Conv2D
                                -        kernel_size: size of square kernels
                                -        strides: strides to use in the first convolution of the block
                                -    """
                                -    def __init__(
                                -        self,
                                -        output_channels: int,
                                -        conv_shortcut: bool,
                                -        strides: int = 1,
                                -        **kwargs
                                -    ) -> None:
                                -
                                -        super().__init__(**kwargs)
                                -        if conv_shortcut:
                                -            self.shortcut = Sequential(
                                -                [
                                -                    layers.Conv2D(
                                -                        filters=output_channels,
                                -                        strides=strides,
                                -                        padding='same',
                                -                        kernel_size=1,
                                -                        use_bias=False,
                                -                        kernel_initializer='he_normal'
                                -                    ),
                                -                    layers.BatchNormalization()
                                -                ]
                                -            )
                                -        else:
                                -            self.shortcut = layers.Lambda(lambda x: x)
                                -        self.conv_block = Sequential(
                                -            self.conv_resnetblock(output_channels, 3, strides)
                                -        )
                                -        self.act = layers.Activation('relu')
                                -
                                -    @staticmethod
                                -    def conv_resnetblock(
                                -        output_channels: int,
                                -        kernel_size: int,
                                -        strides: int = 1,
                                -    ) -> List[layers.Layer]:
                                -        return [
                                -            *conv_sequence(output_channels, activation='relu', bn=True, strides=strides, kernel_size=kernel_size),
                                -            layers.Conv2D(output_channels, kernel_size, padding='same', use_bias=False, kernel_initializer='he_normal'),
                                -            layers.BatchNormalization(),
                                -        ]
                                -
                                -    def call(
                                -        self,
                                -        inputs: tf.Tensor
                                -    ) -> tf.Tensor:
                                -        clone = self.shortcut(inputs)
                                -        conv_out = self.conv_block(inputs)
                                -        out = self.act(clone + conv_out)
                                -
                                -        return out
                                -
                                -
                                -class ResnetStage(Sequential):
                                -
                                -    """Implements a resnet31 stage
                                -
                                -    Args:
                                -        num_blocks: number of blocks inside the stage
                                -        output_channels: number of channels to use in Conv2D
                                -        downsample: if true, performs a /2 downsampling at the first block of the stage
                                -    """
                                -    def __init__(
                                -        self,
                                -        num_blocks: int,
                                -        output_channels: int,
                                -        downsample: bool = False,
                                -    ) -> None:
                                -
                                -        super().__init__()
                                -        final_blocks = [
                                -            ResnetBlock(output_channels, conv_shortcut=False) for _ in range(1, num_blocks)
                                -        ]
                                -        if downsample is True:
                                -            self.add(ResnetBlock(output_channels, conv_shortcut=True, strides=2))
                                -        else:
                                -            self.add(ResnetBlock(output_channels, conv_shortcut=True))
                                -        for final_block in final_blocks:
                                -            self.add(final_block)
                                -
                                -
                                -class ResNet(Sequential):
                                -
                                -    """Resnet class with two convolutions and a maxpooling before the first stage
                                -
                                -    Args:
                                -        num_blocks: number of resnet block in each stage
                                -        output_channels: number of channels in each stage
                                -        conv_seq: wether to add a conv_sequence after each stage
                                -        pooling: pooling to add after each stage (if None, no pooling)
                                -        input_shape: shape of inputs
                                -        include_top: whether the classifier head should be instantiated
                                -    """
                                -
                                -    def __init__(
                                -        self,
                                -        num_blocks: Tuple[int, int, int, int],
                                -        output_channels: Tuple[int, int, int, int],
                                -        conv_seq: Tuple[bool, bool, bool, bool],
                                -        pooling: Tuple[
                                -            Optional[Tuple[int, int]],
                                -            Optional[Tuple[int, int]],
                                -            Optional[Tuple[int, int]],
                                -            Optional[Tuple[int, int]]
                                -        ],
                                -        input_shape: Tuple[int, int, int] = (640, 640, 3),
                                -        include_top: bool = False,
                                -    ) -> None:
                                -
                                -        _layers = [
                                -            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
                                -            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
                                -            layers.MaxPool2D(pool_size=2, strides=2, padding='valid'),
                                -        ]
                                -        for n_blocks, out_channels, conv, pool in zip(num_blocks, output_channels, conv_seq, pooling):
                                -            _layers.append(ResnetStage(n_blocks, out_channels))
                                -            if conv:
                                -                _layers.extend(conv_sequence(out_channels, activation='relu', bn=True, kernel_size=3))
                                -            if pool:
                                -                _layers.append(layers.MaxPool2D(pool_size=pool, strides=pool, padding='valid'))
                                -        super().__init__(_layers)
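A quick arithmetic check of the resnet31 downsizing claim ((H, W) -> (H/8, W/4)), using the stem max-pool plus the per-stage pooling listed in default_cfgs; pure Python, no model needs to be built:

h, w = 640, 640
pools = [(2, 2), (2, 2), (2, 1), None, None]   # stem pool followed by the four stage pools
for pool in pools:
    if pool is not None:
        h, w = h // pool[0], w // pool[1]
print(h, w)  # 80 160, i.e. 640/8 x 640/4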
                                -
                                -
                                -def _resnet(arch: str, pretrained: bool, **kwargs: Any) -> ResNet:
                                -
                                -    # Build the model
                                -    model = ResNet(
                                -        default_cfgs[arch]['num_blocks'],
                                -        default_cfgs[arch]['output_channels'],
                                -        default_cfgs[arch]['conv_seq'],
                                -        default_cfgs[arch]['pooling'],
                                -        **kwargs
                                -    )
                                -    # Load pretrained parameters
                                -    if pretrained:
                                -        load_pretrained_params(model, default_cfgs[arch]['url'])
                                -
                                -    return model
                                -
                                -
                                -
                                -[docs] -def resnet31(pretrained: bool = False, **kwargs: Any) -> ResNet: - """Resnet31 architecture with rectangular pooling windows as described in - `"Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition", - <https://arxiv.org/pdf/1811.00751.pdf>`_. Downsizing: (H, W) --> (H/8, W/4) - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import resnet31 - >>> model = resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained: boolean, True if model is pretrained - - Returns: - A resnet31 model - """ - - return _resnet('resnet31', pretrained, **kwargs)
- \ No newline at end of file
diff --git a/v0.5.1/_modules/doctr/models/backbones/vgg/tensorflow.html b/v0.5.1/_modules/doctr/models/backbones/vgg/tensorflow.html deleted file mode 100644 index 48c285257a..0000000000 --- a/v0.5.1/_modules/doctr/models/backbones/vgg/tensorflow.html +++ /dev/null @@ -1,413 +0,0 @@

                                Source code for doctr.models.backbones.vgg.tensorflow

                                -# Copyright (C) 2021, Mindee.
                                -
                                -# This program is licensed under the Apache License version 2.
                                -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                -
                                -from typing import Any, Dict, Tuple
                                -
                                -from tensorflow.keras import layers
                                -from tensorflow.keras.models import Sequential
                                -
                                -from ...utils import conv_sequence, load_pretrained_params
                                -
                                -__all__ = ['VGG', 'vgg16_bn']
                                -
                                -
                                -default_cfgs: Dict[str, Dict[str, Any]] = {
                                -    'vgg16_bn': {'num_blocks': (2, 2, 3, 3, 3), 'planes': (64, 128, 256, 512, 512),
                                -                 'rect_pools': (False, False, True, True, True),
                                -                 'url': None},
                                -}
                                -
                                -
                                -class VGG(Sequential):
                                -    """Implements the VGG architecture from `"Very Deep Convolutional Networks for Large-Scale Image Recognition"
                                -    <https://arxiv.org/pdf/1409.1556.pdf>`_.
                                -
                                -    Args:
                                -        num_blocks: number of convolutional block in each stage
                                -        planes: number of output channels in each stage
                                -        rect_pools: whether pooling square kernels should be replace with rectangular ones
                                -        input_shape: shapes of the input tensor
                                -        include_top: whether the classifier head should be instantiated
                                -    """
                                -    def __init__(
                                -        self,
                                -        num_blocks: Tuple[int, int, int, int, int],
                                -        planes: Tuple[int, int, int, int, int],
                                -        rect_pools: Tuple[bool, bool, bool, bool, bool],
                                -        input_shape: Tuple[int, int, int] = (512, 512, 3),
                                -        include_top: bool = False,
                                -    ) -> None:
                                -
                                -        _layers = []
                                -        # Specify input_shape only for the first layer
                                -        kwargs = {"input_shape": input_shape}
                                -        for nb_blocks, out_chan, rect_pool in zip(num_blocks, planes, rect_pools):
                                -            for _ in range(nb_blocks):
                                -                _layers.extend(conv_sequence(out_chan, 'relu', True, kernel_size=3, **kwargs))  # type: ignore[arg-type]
                                -                kwargs = {}
                                -            _layers.append(layers.MaxPooling2D((2, 1 if rect_pool else 2)))
                                -        super().__init__(_layers)
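The rect_pools flags trade the usual (2, 2) pooling for (2, 1) in the last three stages, so width shrinks far less than height, which suits wide text lines; a small spatial-size check for the default vgg16_bn layout:

h, w = 512, 512
rect_pools = (False, False, True, True, True)   # from default_cfgs['vgg16_bn']
for rect in rect_pools:
    pool = (2, 1) if rect else (2, 2)
    h, w = h // pool[0], w // pool[1]
print(h, w)  # 16 128: height divided by 32, width only by 4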
                                -
                                -
                                -def _vgg(arch: str, pretrained: bool, **kwargs: Any) -> VGG:
                                -
                                -    # Build the model
                                -    model = VGG(default_cfgs[arch]['num_blocks'], default_cfgs[arch]['planes'],
                                -                default_cfgs[arch]['rect_pools'], **kwargs)
                                -    # Load pretrained parameters
                                -    if pretrained:
                                -        load_pretrained_params(model, default_cfgs[arch]['url'])
                                -
                                -    return model
                                -
                                -
                                -
                                -[docs] -def vgg16_bn(pretrained: bool = False, **kwargs: Any) -> VGG: - """VGG-16 architecture as described in `"Very Deep Convolutional Networks for Large-Scale Image Recognition" - <https://arxiv.org/pdf/1409.1556.pdf>`_, modified by adding batch normalization. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import vgg16_bn - >>> model = vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - - Returns: - VGG feature extractor - """ - - return _vgg('vgg16_bn', pretrained, **kwargs)
                                - - - - - - - - - \ No newline at end of file diff --git a/v0.5.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.5.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html index e9c8bed57b..e181ef6a1f 100644 --- a/v0.5.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/classification/magc_resnet/tensorflow.html @@ -234,10 +234,16 @@

                                Using docTR

                                Package Reference

                                Source code for doctr.models.classification.magc_resnet.tensorflow

                                -# Copyright (C) 2021-2022, Mindee.
                                -
                                -# This program is licensed under the Apache License version 2.
                                -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                +# Copyright (C) 2021-2024, Mindee.
                                 
                                +# This program is licensed under the Apache License 2.0.
                                +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                 
                                 import math
                                +from copy import deepcopy
                                 from functools import partial
                                 from typing import Any, Dict, List, Optional, Tuple
                                 
                                 import tensorflow as tf
                                -from tensorflow.keras import layers
                                +from tensorflow.keras import activations, layers
                                 from tensorflow.keras.models import Sequential
                                 
                                 from doctr.datasets import VOCABS
                                @@ -306,16 +312,16 @@ 


                                from ...utils import load_pretrained_params from ..resnet.tensorflow import ResNet -__all__ = ['magc_resnet31'] +__all__ = ["magc_resnet31"] default_cfgs: Dict[str, Dict[str, Any]] = { - 'magc_resnet31': { - 'mean': (0.5, 0.5, 0.5), - 'std': (1., 1., 1.), - 'input_shape': (32, 32, 3), - 'classes': list(VOCABS['french']), - 'url': None, + "magc_resnet31": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (32, 32, 3), + "classes": list(VOCABS["french"]), + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/magc_resnet31-16aa7d71.weights.h5&src=0", }, } @@ -325,6 +331,7 @@


                                <https://arxiv.org/pdf/1910.02562.pdf>`_. Args: + ---- inplanes: input channels headers: number of headers to split channels attn_scale: if True, re-scale attention to counteract the variance distibutions @@ -338,39 +345,28 @@


                                headers: int = 8, attn_scale: bool = False, ratio: float = 0.0625, # bottleneck ratio of 1/16 as described in paper - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) self.headers = headers # h self.inplanes = inplanes # C self.attn_scale = attn_scale + self.ratio = ratio self.planes = int(inplanes * ratio) self.single_header_inplanes = int(inplanes / headers) # C / h - self.conv_mask = layers.Conv2D( - filters=1, - kernel_size=1, - kernel_initializer=tf.initializers.he_normal() - ) + self.conv_mask = layers.Conv2D(filters=1, kernel_size=1, kernel_initializer=tf.initializers.he_normal()) self.transform = Sequential( [ - layers.Conv2D( - filters=self.planes, - kernel_size=1, - kernel_initializer=tf.initializers.he_normal() - ), + layers.Conv2D(filters=self.planes, kernel_size=1, kernel_initializer=tf.initializers.he_normal()), layers.LayerNormalization([1, 2, 3]), layers.ReLU(), - layers.Conv2D( - filters=self.inplanes, - kernel_size=1, - kernel_initializer=tf.initializers.he_normal() - ), + layers.Conv2D(filters=self.inplanes, kernel_size=1, kernel_initializer=tf.initializers.he_normal()), ], - name='transform' + name="transform", ) def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor: @@ -397,7 +393,7 @@


                                if self.attn_scale and self.headers > 1: context_mask = context_mask / math.sqrt(self.single_header_inplanes) # B*h, 1, H*W, 1 - context_mask = tf.keras.activations.softmax(context_mask, axis=2) + context_mask = activations.softmax(context_mask, axis=2) # Compute context # B*h, 1, C/h, 1 @@ -429,9 +425,15 @@


                                origin_stem: bool = True, **kwargs: Any, ) -> ResNet: + kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"])) + kwargs["input_shape"] = kwargs.get("input_shape", default_cfgs[arch]["input_shape"]) + kwargs["classes"] = kwargs.get("classes", default_cfgs[arch]["classes"]) - kwargs['num_classes'] = kwargs.get('num_classes', len(default_cfgs[arch]['classes'])) - kwargs['input_shape'] = kwargs.get('input_shape', default_cfgs[arch]['input_shape']) + _cfg = deepcopy(default_cfgs[arch]) + _cfg["num_classes"] = kwargs["num_classes"] + _cfg["classes"] = kwargs["classes"] + _cfg["input_shape"] = kwargs["input_shape"] + kwargs.pop("classes") # Build the model model = ResNet( @@ -442,11 +444,16 @@


                                stage_pooling, origin_stem, attn_module=partial(MAGC, headers=8, attn_scale=True), + cfg=_cfg, **kwargs, ) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The number of classes is not the same as the number of classes in the pretrained model => + # skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"]) + ) return model @@ -465,14 +472,16 @@
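Since the loader above only sets skip_mismatch when the requested num_classes differs from the checkpoint's class list, a custom head can be attached while the backbone weights are still restored. A minimal fine-tuning sketch, assuming the public magc_resnet31 wrapper (shown just below) forwards num_classes to this builder:
>>> from doctr.models.classification import magc_resnet31
>>> # Same head as the checkpoint: every layer is restored
>>> model = magc_resnet31(pretrained=True)
>>> # 10-class head: the mismatching classification layers are skipped, the backbone is kept
>>> finetune_model = magc_resnet31(pretrained=True, num_classes=10)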


                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the ResNet architecture Returns: + ------- A feature extractor model """ - return _magc_resnet( - 'magc_resnet31', + "magc_resnet31", pretrained, [1, 2, 5, 3], [256, 256, 512, 512], @@ -516,7 +525,7 @@

                                diff --git a/v0.5.1/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.5.1/_modules/doctr/models/classification/mobilenet/tensorflow.html index cf26cab65d..c9545166e7 100644 --- a/v0.5.1/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -234,10 +234,16 @@

                                Source code for doctr.models.classification.mobilenet.tensorflow

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021-2024, Mindee.
                                 
                                -# This program is licensed under the Apache License version 2.
                                -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                +# This program is licensed under the Apache License 2.0.
                                +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                 
                                 # Greatly inspired by https://github.com/pytorch/vision/blob/master/torchvision/models/mobilenetv3.py
                                 
                                @@ -304,51 +310,65 @@ 


                                from ....datasets import VOCABS from ...utils import conv_sequence, load_pretrained_params -__all__ = ["MobileNetV3", "mobilenet_v3_small", "mobilenet_v3_small_r", "mobilenet_v3_large", - "mobilenet_v3_large_r", "mobilenet_v3_small_orientation"] +__all__ = [ + "MobileNetV3", + "mobilenet_v3_small", + "mobilenet_v3_small_r", + "mobilenet_v3_large", + "mobilenet_v3_large_r", + "mobilenet_v3_small_crop_orientation", + "mobilenet_v3_small_page_orientation", +] default_cfgs: Dict[str, Dict[str, Any]] = { - 'mobilenet_v3_large': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'input_shape': (32, 32, 3), - 'classes': list(VOCABS['french']), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.4.1/mobilenet_v3_large-47d25d7e.zip', + "mobilenet_v3_large": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (32, 32, 3), + "classes": list(VOCABS["french"]), + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_large-d857506e.weights.h5&src=0", }, - 'mobilenet_v3_large_r': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'input_shape': (32, 32, 3), - 'classes': list(VOCABS['french']), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.4.1/mobilenet_v3_large_r-a108e192.zip', + "mobilenet_v3_large_r": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (32, 32, 3), + "classes": list(VOCABS["french"]), + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_large_r-eef2e3c6.weights.h5&src=0", }, - 'mobilenet_v3_small': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'input_shape': (32, 32, 3), - 'classes': list(VOCABS['french']), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.4.1/mobilenet_v3_small-8a32c32c.zip', + "mobilenet_v3_small": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (32, 32, 3), + "classes": list(VOCABS["french"]), + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small-3fcebad7.weights.h5&src=0", }, - 'mobilenet_v3_small_r': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'input_shape': (32, 32, 3), - 'classes': list(VOCABS['french']), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.4.1/mobilenet_v3_small_r-3d61452e.zip', + "mobilenet_v3_small_r": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (32, 32, 3), + "classes": list(VOCABS["french"]), + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small_r-dd50218d.weights.h5&src=0", }, - 'mobilenet_v3_small_orientation': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'input_shape': (128, 128, 3), - 'classes': [0, 90, 180, 270], - 'url': 'https://github.com/mindee/doctr/releases/download/v0.4.1/classif_mobilenet_v3_small-1ea8db03.zip', + "mobilenet_v3_small_crop_orientation": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (128, 128, 3), + "classes": [0, -90, 180, 90], + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small_crop_orientation-ef019b6b.weights.h5&src=0", + }, + "mobilenet_v3_small_page_orientation": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (512, 512, 3), + "classes": [0, -90, 180, 90], + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small_page_orientation-0071d55d.weights.h5&src=0", }, } def hard_swish(x: 
tf.Tensor) -> tf.Tensor: - return x * tf.nn.relu6(x + 3.) / 6.0 + return x * tf.nn.relu6(x + 3.0) / 6.0 def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int: @@ -362,17 +382,15 @@
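The hard_swish activation above is the cheap piecewise approximation of swish: hard_swish(x) = x * relu6(x + 3) / 6, so it is exactly zero for x <= -3 and behaves like the identity once x >= 3. A small verification sketch, independent of doctr:
>>> import tensorflow as tf
>>> def hard_swish(x):
...     # x * relu6(x + 3) / 6: zero below -3, identity-like above +3
...     return x * tf.nn.relu6(x + 3.0) / 6.0
>>> [round(float(hard_swish(tf.constant(v))), 4) for v in (0.0, 1.0, 3.0, 6.0)]
[0.0, 0.6667, 3.0, 6.0]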


                                class SqueezeExcitation(Sequential): - """Squeeze and Excitation. - """ + """Squeeze and Excitation.""" + def __init__(self, chan: int, squeeze_factor: int = 4) -> None: - super().__init__( - [ - layers.GlobalAveragePooling2D(), - layers.Dense(chan // squeeze_factor, activation='relu'), - layers.Dense(chan, activation='hard_sigmoid'), - layers.Reshape((1, 1, chan)) - ] - ) + super().__init__([ + layers.GlobalAveragePooling2D(), + layers.Dense(chan // squeeze_factor, activation="relu"), + layers.Dense(chan, activation="hard_sigmoid"), + layers.Reshape((1, 1, chan)), + ]) def call(self, inputs: tf.Tensor, **kwargs: Any) -> tf.Tensor: x = super().call(inputs, **kwargs) @@ -409,14 +427,16 @@


                                """InvertedResidual for mobilenet Args: + ---- conf: configuration object for inverted residual """ + def __init__( self, conf: InvertedResidualConfig, **kwargs: Any, ) -> None: - _kwargs = {'input_shape': kwargs.pop('input_shape')} if isinstance(kwargs.get('input_shape'), tuple) else {} + _kwargs = {"input_shape": kwargs.pop("input_shape")} if isinstance(kwargs.get("input_shape"), tuple) else {} super().__init__(**kwargs) act_fn = hard_swish if conf.use_hs else tf.nn.relu @@ -430,18 +450,29 @@


                                _layers.extend(conv_sequence(conf.expanded_channels, act_fn, kernel_size=1, bn=True, **_kwargs)) # depth-wise - _layers.extend(conv_sequence( - conf.expanded_channels, act_fn, kernel_size=conf.kernel, strides=conf.stride, bn=True, - groups=conf.expanded_channels, - )) + _layers.extend( + conv_sequence( + conf.expanded_channels, + act_fn, + kernel_size=conf.kernel, + strides=conf.stride, + bn=True, + groups=conf.expanded_channels, + ) + ) if conf.use_se: _layers.append(SqueezeExcitation(conf.expanded_channels)) # project - _layers.extend(conv_sequence( - conf.out_channels, None, kernel_size=1, bn=True, - )) + _layers.extend( + conv_sequence( + conf.out_channels, + None, + kernel_size=1, + bn=True, + ) + ) self.block = Sequential(_layers) @@ -450,7 +481,6 @@


                                inputs: tf.Tensor, **kwargs: Any, ) -> tf.Tensor: - out = self.block(inputs, **kwargs) if self.use_res_connect: out = tf.add(out, inputs) @@ -473,10 +503,13 @@


                                cfg: Optional[Dict[str, Any]] = None, input_shape: Optional[Tuple[int, int, int]] = None, ) -> None: - _layers = [ - Sequential(conv_sequence(layout[0].input_channels, hard_swish, True, kernel_size=3, strides=2, - input_shape=input_shape), name="stem") + Sequential( + conv_sequence( + layout[0].input_channels, hard_swish, True, kernel_size=3, strides=2, input_shape=input_shape + ), + name="stem", + ) ] for idx, conf in enumerate(layout): @@ -485,10 +518,7 @@


                                ) _layers.append( - Sequential( - conv_sequence(6 * layout[-1].out_channels, hard_swish, True, kernel_size=1), - name="final_block" - ) + Sequential(conv_sequence(6 * layout[-1].out_channels, hard_swish, True, kernel_size=1), name="final_block") ) if include_top: @@ -503,15 +533,16 @@


                                self.cfg = cfg -def _mobilenet_v3( - arch: str, - pretrained: bool, - rect_strides: bool = False, - **kwargs: Any -) -> MobileNetV3: +def _mobilenet_v3(arch: str, pretrained: bool, rect_strides: bool = False, **kwargs: Any) -> MobileNetV3: + kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"])) + kwargs["input_shape"] = kwargs.get("input_shape", default_cfgs[arch]["input_shape"]) + kwargs["classes"] = kwargs.get("classes", default_cfgs[arch]["classes"]) + _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = kwargs.get('input_shape', default_cfgs[arch]['input_shape']) - _cfg['num_classes'] = kwargs.get('num_classes', len(default_cfgs[arch]['classes'])) + _cfg["num_classes"] = kwargs["num_classes"] + _cfg["classes"] = kwargs["classes"] + _cfg["input_shape"] = kwargs["input_shape"] + kwargs.pop("classes") # cf. Table 1 & 2 of the paper if arch.startswith("mobilenet_v3_small"): @@ -549,8 +580,8 @@


                                ] head_chans = 1280 - kwargs['num_classes'] = _cfg['num_classes'] - kwargs['input_shape'] = _cfg['input_shape'] + kwargs["num_classes"] = _cfg["num_classes"] + kwargs["input_shape"] = _cfg["input_shape"] # Build the model model = MobileNetV3( @@ -561,7 +592,11 @@


                                ) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The number of classes is not the same as the number of classes in the pretrained model => + # skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"]) + ) return model @@ -580,13 +615,15 @@


                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the MobileNetV3 architecture Returns: + ------- a keras.Model """ - - return _mobilenet_v3('mobilenet_v3_small', pretrained, False, **kwargs)
                                + return _mobilenet_v3("mobilenet_v3_small", pretrained, False, **kwargs)
                                @@ -604,13 +641,15 @@


                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the MobileNetV3 architecture Returns: + ------- a keras.Model """ - - return _mobilenet_v3('mobilenet_v3_small_r', pretrained, True, **kwargs)
                                + return _mobilenet_v3("mobilenet_v3_small_r", pretrained, True, **kwargs)
                                @@ -628,12 +667,15 @@


                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the MobileNetV3 architecture Returns: + ------- a keras.Model """ - return _mobilenet_v3('mobilenet_v3_large', pretrained, False, **kwargs)
                                + return _mobilenet_v3("mobilenet_v3_large", pretrained, False, **kwargs)
                                @@ -651,36 +693,67 @@


                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the MobileNetV3 architecture Returns: + ------- a keras.Model """ - return _mobilenet_v3('mobilenet_v3_large_r', pretrained, True, **kwargs)
                                + return _mobilenet_v3("mobilenet_v3_large_r", pretrained, True, **kwargs)
                                -
                                -[docs] -def mobilenet_v3_small_orientation(pretrained: bool = False, **kwargs: Any) -> MobileNetV3: +
                                +[docs] +def mobilenet_v3_small_crop_orientation(pretrained: bool = False, **kwargs: Any) -> MobileNetV3: """MobileNetV3-Small architecture as described in `"Searching for MobileNetV3", <https://arxiv.org/pdf/1905.02244.pdf>`_. >>> import tensorflow as tf - >>> from doctr.models import mobilenet_v3_small_orientation - >>> model = mobilenet_v3_small_orientation(pretrained=False) + >>> from doctr.models import mobilenet_v3_small_crop_orientation + >>> model = mobilenet_v3_small_crop_orientation(pretrained=False) >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32) >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the MobileNetV3 architecture Returns: + ------- a keras.Model """ + return _mobilenet_v3("mobilenet_v3_small_crop_orientation", pretrained, include_top=True, **kwargs)
                                + + + +
                                +[docs] +def mobilenet_v3_small_page_orientation(pretrained: bool = False, **kwargs: Any) -> MobileNetV3: + """MobileNetV3-Small architecture as described in + `"Searching for MobileNetV3", + <https://arxiv.org/pdf/1905.02244.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import mobilenet_v3_small_page_orientation + >>> model = mobilenet_v3_small_page_orientation(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) - return _mobilenet_v3('mobilenet_v3_small_orientation', pretrained, include_top=True, **kwargs)
                                + Args: + ---- + pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the MobileNetV3 architecture + + Returns: + ------- + a keras.Model + """ + return _mobilenet_v3("mobilenet_v3_small_page_orientation", pretrained, include_top=True, **kwargs)
                                @@ -714,7 +787,7 @@

                                diff --git a/v0.5.1/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.5.1/_modules/doctr/models/classification/resnet/tensorflow.html index a524553ec7..620d4f0635 100644 --- a/v0.5.1/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/classification/resnet/tensorflow.html @@ -234,10 +234,16 @@

                                Source code for doctr.models.classification.resnet.tensorflow

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021-2024, Mindee.
                                 
                                -# This program is licensed under the Apache License version 2.
                                -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                +# This program is licensed under the Apache License 2.0.
                                +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                 
                                +from copy import deepcopy
                                 from typing import Any, Callable, Dict, List, Optional, Tuple
                                 
                                 import tensorflow as tf
                                @@ -303,87 +310,77 @@ 


                                from ...utils import conv_sequence, load_pretrained_params -__all__ = ['ResNet', 'resnet18', 'resnet31', 'resnet34', 'resnet50', 'resnet34_wide'] +__all__ = ["ResNet", "resnet18", "resnet31", "resnet34", "resnet50", "resnet34_wide"] default_cfgs: Dict[str, Dict[str, Any]] = { - 'resnet18': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'input_shape': (32, 32, 3), - 'classes': list(VOCABS['french']), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.4.1/resnet18-d4634669.zip', + "resnet18": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (32, 32, 3), + "classes": list(VOCABS["french"]), + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet18-f42d3854.weights.h5&src=0", }, - 'resnet31': { - 'mean': (0.5, 0.5, 0.5), - 'std': (1., 1., 1.), - 'input_shape': (32, 32, 3), - 'classes': list(VOCABS['french']), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.5.0/resnet31-5a47a60b.zip', + "resnet31": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (32, 32, 3), + "classes": list(VOCABS["french"]), + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet31-ab75f78c.weights.h5&src=0", }, - 'resnet34': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'input_shape': (32, 32, 3), - 'classes': list(VOCABS['french']), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.5.0/resnet34-5dcc97ca.zip', + "resnet34": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (32, 32, 3), + "classes": list(VOCABS["french"]), + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet34-03967df9.weights.h5&src=0", }, - 'resnet50': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'input_shape': (32, 32, 3), - 'classes': list(VOCABS['french']), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.5.0/resnet50-e75e4cdf.zip', + "resnet50": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (32, 32, 3), + "classes": list(VOCABS["french"]), + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet50-82358f34.weights.h5&src=0", }, - 'resnet34_wide': { - 'mean': (0.694, 0.695, 0.693), - 'std': (0.299, 0.296, 0.301), - 'input_shape': (32, 32, 3), - 'classes': list(VOCABS['french']), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.5.0/resnet34_wide-c1271816.zip', + "resnet34_wide": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (32, 32, 3), + "classes": list(VOCABS["french"]), + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet34_wide-b18fdf79.weights.h5&src=0", }, } class ResnetBlock(layers.Layer): - """Implements a resnet31 block with shortcut Args: + ---- conv_shortcut: Use of shortcut output_channels: number of channels to use in Conv2D kernel_size: size of square kernels strides: strides to use in the first convolution of the block """ - def __init__( - self, - output_channels: int, - conv_shortcut: bool, - strides: int = 1, - **kwargs - ) -> None: + def __init__(self, output_channels: int, conv_shortcut: bool, strides: int = 1, **kwargs) -> None: super().__init__(**kwargs) if conv_shortcut: - self.shortcut = Sequential( - [ - layers.Conv2D( - filters=output_channels, - strides=strides, - padding='same', - kernel_size=1, - use_bias=False, - kernel_initializer='he_normal' - ), - layers.BatchNormalization() - ] - ) + self.shortcut = Sequential([ + 
layers.Conv2D( + filters=output_channels, + strides=strides, + padding="same", + kernel_size=1, + use_bias=False, + kernel_initializer="he_normal", + ), + layers.BatchNormalization(), + ]) else: self.shortcut = layers.Lambda(lambda x: x) - self.conv_block = Sequential( - self.conv_resnetblock(output_channels, 3, strides) - ) - self.act = layers.Activation('relu') + self.conv_block = Sequential(self.conv_resnetblock(output_channels, 3, strides)) + self.act = layers.Activation("relu") @staticmethod def conv_resnetblock( @@ -392,14 +389,11 @@


                                strides: int = 1, ) -> List[layers.Layer]: return [ - *conv_sequence(output_channels, 'relu', bn=True, strides=strides, kernel_size=kernel_size), + *conv_sequence(output_channels, "relu", bn=True, strides=strides, kernel_size=kernel_size), *conv_sequence(output_channels, None, bn=True, kernel_size=kernel_size), ] - def call( - self, - inputs: tf.Tensor - ) -> tf.Tensor: + def call(self, inputs: tf.Tensor) -> tf.Tensor: clone = self.shortcut(inputs) conv_out = self.conv_block(inputs) out = self.act(clone + conv_out) @@ -408,14 +402,9 @@


                                def resnet_stage( - num_blocks: int, - out_channels: int, - shortcut: bool = False, - downsample: bool = False + num_blocks: int, out_channels: int, shortcut: bool = False, downsample: bool = False ) -> List[layers.Layer]: - _layers: List[layers.Layer] = [ - ResnetBlock(out_channels, conv_shortcut=shortcut, strides=2 if downsample else 1) - ] + _layers: List[layers.Layer] = [ResnetBlock(out_channels, conv_shortcut=shortcut, strides=2 if downsample else 1)] for _ in range(1, num_blocks): _layers.append(ResnetBlock(out_channels, conv_shortcut=False)) @@ -427,6 +416,7 @@


                                """Implements a ResNet architecture Args: + ---- num_blocks: number of resnet block in each stage output_channels: number of channels in each stage stage_downsample: whether the first residual block of a stage should downsample @@ -452,31 +442,32 @@


                                attn_module: Optional[Callable[[int], layers.Layer]] = None, include_top: bool = True, num_classes: int = 1000, + cfg: Optional[Dict[str, Any]] = None, input_shape: Optional[Tuple[int, int, int]] = None, ) -> None: - inplanes = stem_channels if origin_stem: _layers = [ - *conv_sequence(inplanes, 'relu', True, kernel_size=7, strides=2, input_shape=input_shape), - layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'), + *conv_sequence(inplanes, "relu", True, kernel_size=7, strides=2, input_shape=input_shape), + layers.MaxPool2D(pool_size=(3, 3), strides=2, padding="same"), ] else: _layers = [ - *conv_sequence(inplanes // 2, 'relu', True, kernel_size=3, input_shape=input_shape), - *conv_sequence(inplanes, 'relu', True, kernel_size=3), - layers.MaxPool2D(pool_size=2, strides=2, padding='valid'), + *conv_sequence(inplanes // 2, "relu", True, kernel_size=3, input_shape=input_shape), + *conv_sequence(inplanes, "relu", True, kernel_size=3), + layers.MaxPool2D(pool_size=2, strides=2, padding="valid"), ] - for n_blocks, out_chan, down, conv, pool in zip(num_blocks, output_channels, stage_downsample, stage_conv, - stage_pooling): + for n_blocks, out_chan, down, conv, pool in zip( + num_blocks, output_channels, stage_downsample, stage_conv, stage_pooling + ): _layers.extend(resnet_stage(n_blocks, out_chan, out_chan != inplanes, down)) if attn_module is not None: _layers.append(attn_module(out_chan)) if conv: - _layers.extend(conv_sequence(out_chan, activation='relu', bn=True, kernel_size=3)) + _layers.extend(conv_sequence(out_chan, activation="relu", bn=True, kernel_size=3)) if pool: - _layers.append(layers.MaxPool2D(pool_size=pool, strides=pool, padding='valid')) + _layers.append(layers.MaxPool2D(pool_size=pool, strides=pool, padding="valid")) inplanes = out_chan if include_top: @@ -486,6 +477,7 @@


                                ]) super().__init__(_layers) + self.cfg = cfg def _resnet( @@ -497,17 +489,29 @@


                                stage_conv: List[bool], stage_pooling: List[Optional[Tuple[int, int]]], origin_stem: bool = True, - **kwargs: Any + **kwargs: Any, ) -> ResNet: + kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"])) + kwargs["input_shape"] = kwargs.get("input_shape", default_cfgs[arch]["input_shape"]) + kwargs["classes"] = kwargs.get("classes", default_cfgs[arch]["classes"]) - kwargs['num_classes'] = kwargs.get('num_classes', len(default_cfgs[arch]['classes'])) - kwargs['input_shape'] = kwargs.get('input_shape', default_cfgs[arch]['input_shape']) + _cfg = deepcopy(default_cfgs[arch]) + _cfg["num_classes"] = kwargs["num_classes"] + _cfg["classes"] = kwargs["classes"] + _cfg["input_shape"] = kwargs["input_shape"] + kwargs.pop("classes") # Build the model - model = ResNet(num_blocks, output_channels, stage_downsample, stage_conv, stage_pooling, origin_stem, **kwargs) + model = ResNet( + num_blocks, output_channels, stage_downsample, stage_conv, stage_pooling, origin_stem, cfg=_cfg, **kwargs + ) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The number of classes is not the same as the number of classes in the pretrained model => + # skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"]) + ) return model @@ -525,14 +529,16 @@


                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the ResNet architecture Returns: + ------- A classification model """ - return _resnet( - 'resnet18', + "resnet18", pretrained, [2, 2, 2, 2], [64, 128, 256, 512], @@ -559,14 +565,16 @@


                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the ResNet architecture Returns: + ------- A classification model """ - return _resnet( - 'resnet31', + "resnet31", pretrained, [1, 2, 5, 3], [256, 256, 512, 512], @@ -593,14 +601,16 @@


                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the ResNet architecture Returns: + ------- A classification model """ - return _resnet( - 'resnet34', + "resnet34", pretrained, [3, 4, 6, 3], [64, 128, 256, 512], @@ -626,27 +636,44 @@


                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the ResNet architecture Returns: + ------- A classification model """ + kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs["resnet50"]["classes"])) + kwargs["input_shape"] = kwargs.get("input_shape", default_cfgs["resnet50"]["input_shape"]) + kwargs["classes"] = kwargs.get("classes", default_cfgs["resnet50"]["classes"]) - kwargs['num_classes'] = kwargs.get('num_classes', len(default_cfgs['resnet50']['classes'])) - kwargs['input_shape'] = kwargs.get('input_shape', default_cfgs['resnet50']['input_shape']) + _cfg = deepcopy(default_cfgs["resnet50"]) + _cfg["num_classes"] = kwargs["num_classes"] + _cfg["classes"] = kwargs["classes"] + _cfg["input_shape"] = kwargs["input_shape"] + kwargs.pop("classes") model = ResNet50( weights=None, include_top=True, pooling=True, - input_shape=kwargs['input_shape'], - classes=kwargs['num_classes'], + input_shape=kwargs["input_shape"], + classes=kwargs["num_classes"], classifier_activation=None, ) + model.cfg = _cfg + # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs['resnet50']['url']) + # The number of classes is not the same as the number of classes in the pretrained model => + # skip the mismatching layers for fine tuning + load_pretrained_params( + model, + default_cfgs["resnet50"]["url"], + skip_mismatch=kwargs["num_classes"] != len(default_cfgs["resnet50"]["classes"]), + ) return model
                                @@ -663,14 +690,16 @@


                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the ResNet architecture Returns: + ------- A classification model """ - return _resnet( - 'resnet34_wide', + "resnet34_wide", pretrained, [3, 4, 6, 3], [128, 256, 512, 1024], @@ -713,7 +742,7 @@

                                diff --git a/v0.5.1/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.5.1/_modules/doctr/models/classification/textnet/tensorflow.html index 8f38b3470e..407e480818 100644 --- a/v0.5.1/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/classification/textnet/tensorflow.html @@ -302,7 +302,7 @@

                                Source code for doctr.models.classification.textnet.tensorflow

                                from copy import deepcopy from typing import Any, Dict, List, Optional, Tuple -from keras import Sequential, layers +from tensorflow.keras import Sequential, layers from doctr.datasets import VOCABS diff --git a/v0.5.1/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.5.1/_modules/doctr/models/classification/vgg/tensorflow.html index e791732174..66ee6dcdd8 100644 --- a/v0.5.1/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/classification/vgg/tensorflow.html @@ -234,10 +234,16 @@

                                Source code for doctr.models.classification.vgg.tensorflow

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021-2024, Mindee.
                                 
                                -# This program is licensed under the Apache License version 2.
                                -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                +# This program is licensed under the Apache License 2.0.
                                +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                 
                                +from copy import deepcopy
                                 from typing import Any, Dict, List, Optional, Tuple
                                 
                                 from tensorflow.keras import layers
                                @@ -301,16 +308,16 @@ 


                                from ...utils import conv_sequence, load_pretrained_params -__all__ = ['VGG', 'vgg16_bn_r'] +__all__ = ["VGG", "vgg16_bn_r"] default_cfgs: Dict[str, Dict[str, Any]] = { - 'vgg16_bn_r': { - 'mean': (0.5, 0.5, 0.5), - 'std': (1., 1., 1.), - 'input_shape': (32, 32, 3), - 'classes': list(VOCABS['french']), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.4.1/vgg16_bn_r-c5836cea.zip', + "vgg16_bn_r": { + "mean": (0.5, 0.5, 0.5), + "std": (1.0, 1.0, 1.0), + "input_shape": (32, 32, 3), + "classes": list(VOCABS["french"]), + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vgg16_bn_r-b4d69212.weights.h5&src=0", }, } @@ -320,6 +327,7 @@


                                <https://arxiv.org/pdf/1409.1556.pdf>`_. Args: + ---- num_blocks: number of convolutional block in each stage planes: number of output channels in each stage rect_pools: whether pooling square kernels should be replace with rectangular ones @@ -327,6 +335,7 @@


                                num_classes: number of output classes input_shape: shapes of the input tensor """ + def __init__( self, num_blocks: List[int], @@ -335,42 +344,45 @@


                                include_top: bool = False, num_classes: int = 1000, input_shape: Optional[Tuple[int, int, int]] = None, + cfg: Optional[Dict[str, Any]] = None, ) -> None: - _layers = [] # Specify input_shape only for the first layer kwargs = {"input_shape": input_shape} for nb_blocks, out_chan, rect_pool in zip(num_blocks, planes, rect_pools): for _ in range(nb_blocks): - _layers.extend(conv_sequence(out_chan, 'relu', True, kernel_size=3, **kwargs)) # type: ignore[arg-type] + _layers.extend(conv_sequence(out_chan, "relu", True, kernel_size=3, **kwargs)) # type: ignore[arg-type] kwargs = {} _layers.append(layers.MaxPooling2D((2, 1 if rect_pool else 2))) if include_top: - _layers.extend([ - layers.GlobalAveragePooling2D(), - layers.Dense(num_classes) - ]) + _layers.extend([layers.GlobalAveragePooling2D(), layers.Dense(num_classes)]) super().__init__(_layers) + self.cfg = cfg def _vgg( - arch: str, - pretrained: bool, - num_blocks: List[int], - planes: List[int], - rect_pools: List[bool], - **kwargs: Any + arch: str, pretrained: bool, num_blocks: List[int], planes: List[int], rect_pools: List[bool], **kwargs: Any ) -> VGG: + kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"])) + kwargs["input_shape"] = kwargs.get("input_shape", default_cfgs[arch]["input_shape"]) + kwargs["classes"] = kwargs.get("classes", default_cfgs[arch]["classes"]) - kwargs['num_classes'] = kwargs.get("num_classes", len(default_cfgs[arch]['classes'])) - kwargs['input_shape'] = kwargs.get("input_shape", default_cfgs[arch]['input_shape']) + _cfg = deepcopy(default_cfgs[arch]) + _cfg["num_classes"] = kwargs["num_classes"] + _cfg["classes"] = kwargs["classes"] + _cfg["input_shape"] = kwargs["input_shape"] + kwargs.pop("classes") # Build the model - model = VGG(num_blocks, planes, rect_pools, **kwargs) + model = VGG(num_blocks, planes, rect_pools, cfg=_cfg, **kwargs) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]['url']) + # The number of classes is not the same as the number of classes in the pretrained model => + # skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"]) + ) return model @@ -389,19 +401,16 @@
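The rectangular pooling used in the block above (MaxPooling2D((2, 1)) whenever rect_pool is set) halves the feature-map height while preserving its width, which keeps more horizontal resolution for wide text crops. A small shape check, independent of doctr:
>>> import tensorflow as tf
>>> from tensorflow.keras import layers
>>> x = tf.zeros((1, 32, 128, 16))
>>> tuple(layers.MaxPooling2D((2, 1))(x).shape)   # rectangular pool: height halved only
(1, 16, 128, 16)
>>> tuple(layers.MaxPooling2D((2, 2))(x).shape)   # square pool: both spatial dims halved
(1, 16, 64, 16)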


                                >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on ImageNet + **kwargs: keyword arguments of the VGG architecture Returns: + ------- VGG feature extractor """ - return _vgg( - 'vgg16_bn_r', - pretrained, - [2, 2, 3, 3, 3], - [64, 128, 256, 512, 512], - [False, False, True, True, True], - **kwargs + "vgg16_bn_r", pretrained, [2, 2, 3, 3, 3], [64, 128, 256, 512, 512], [False, False, True, True, True], **kwargs )
                                @@ -436,7 +445,7 @@

                                diff --git a/v0.5.1/_modules/doctr/models/classification/vit/tensorflow.html b/v0.5.1/_modules/doctr/models/classification/vit/tensorflow.html index 81ef3d9dcf..7059d1f1d8 100644 --- a/v0.5.1/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/classification/vit/tensorflow.html @@ -302,7 +302,7 @@

                                Source code for doctr.models.classification.vit.tensorflow

                                from typing import Any, Dict, Optional, Tuple import tensorflow as tf -from keras import Sequential, layers +from tensorflow.keras import Sequential, layers from doctr.datasets import VOCABS from doctr.models.modules.transformer import EncoderBlock diff --git a/v0.5.1/_modules/doctr/models/classification/zoo.html b/v0.5.1/_modules/doctr/models/classification/zoo.html index 6308df729b..9ecb9674f6 100644 --- a/v0.5.1/_modules/doctr/models/classification/zoo.html +++ b/v0.5.1/_modules/doctr/models/classification/zoo.html @@ -234,10 +234,16 @@

                                Source code for doctr.models.classification.zoo

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021-2024, Mindee.
                                 
                                -# This program is licensed under the Apache License version 2.
                                -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                +# This program is licensed under the Apache License 2.0.
                                +# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                 
                                -from typing import Any
                                +from typing import Any, List
                                 
                                -from doctr.file_utils import is_tf_available, is_torch_available
                                +from doctr.file_utils import is_tf_available
                                 
                                 from .. import classification
                                 from ..preprocessor import PreProcessor
                                -from .predictor import CropOrientationPredictor
                                +from .predictor import OrientationPredictor
                                 
                                -__all__ = ["crop_orientation_predictor"]
                                +__all__ = ["crop_orientation_predictor", "page_orientation_predictor"]
                                 
                                +ARCHS: List[str] = [
                                +    "magc_resnet31",
                                +    "mobilenet_v3_small",
                                +    "mobilenet_v3_small_r",
                                +    "mobilenet_v3_large",
                                +    "mobilenet_v3_large_r",
                                +    "resnet18",
                                +    "resnet31",
                                +    "resnet34",
                                +    "resnet50",
                                +    "resnet34_wide",
                                +    "textnet_tiny",
                                +    "textnet_small",
                                +    "textnet_base",
                                +    "vgg16_bn_r",
                                +    "vit_s",
                                +    "vit_b",
                                +]
                                +ORIENTATION_ARCHS: List[str] = ["mobilenet_v3_small_crop_orientation", "mobilenet_v3_small_page_orientation"]
                                 
                                -if is_tf_available():
                                -    ARCHS = ['mobilenet_v3_small_orientation']
                                -elif is_torch_available():
                                -    ARCHS = ['mobilenet_v3_small_orientation']
                                 
                                +def _orientation_predictor(
                                +    arch: Any, pretrained: bool, model_type: str, disabled: bool = False, **kwargs: Any
                                +) -> OrientationPredictor:
                                +    if disabled:
                                +        # Case where the orientation predictor is disabled
                                +        return OrientationPredictor(None, None)
                                 
                                -def _crop_orientation_predictor(
                                -    arch: str,
                                -    pretrained: bool,
                                -    **kwargs: Any
                                -) -> CropOrientationPredictor:
                                +    if isinstance(arch, str):
                                +        if arch not in ORIENTATION_ARCHS:
                                +            raise ValueError(f"unknown architecture '{arch}'")
                                 
                                -    if arch not in ARCHS:
                                -        raise ValueError(f"unknown architecture '{arch}'")
                                +        # Load directly classifier from backbone
                                +        _model = classification.__dict__[arch](pretrained=pretrained)
                                +    else:
                                +        if not isinstance(arch, classification.MobileNetV3):
                                +            raise ValueError(f"unknown architecture: {type(arch)}")
                                +        _model = arch
                                 
                                -    # Load directly classifier from backbone
                                -    _model = classification.__dict__[arch](pretrained=pretrained)
                                -    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
                                -    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
                                -    kwargs['batch_size'] = kwargs.get('batch_size', 64)
                                -    input_shape = _model.cfg['input_shape'][:-1] if is_tf_available() else _model.cfg['input_shape'][1:]
                                -    predictor = CropOrientationPredictor(
                                -        PreProcessor(input_shape, preserve_aspect_ratio=True, symmetric_pad=True, **kwargs),
                                -        _model
                                +    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
                                +    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
                                +    kwargs["batch_size"] = kwargs.get("batch_size", 128 if model_type == "crop" else 4)
                                +    input_shape = _model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:]
                                +    predictor = OrientationPredictor(
                                +        PreProcessor(input_shape, preserve_aspect_ratio=True, symmetric_pad=True, **kwargs), _model
                                     )
                                     return predictor
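Besides an architecture name from ORIENTATION_ARCHS, the helper above also accepts an already-instantiated classifier, as long as it is a classification.MobileNetV3 (see the isinstance check). A hedged sketch, assuming the crop_orientation_predictor factory defined just below forwards arch unchanged:
>>> from doctr.models import classification
>>> from doctr.models.classification.zoo import crop_orientation_predictor
>>> # Reuse a backbone built (and possibly fine-tuned) separately
>>> backbone = classification.mobilenet_v3_small_crop_orientation(pretrained=True)
>>> predictor = crop_orientation_predictor(arch=backbone)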
                                 
                                @@ -334,27 +360,54 @@ 

                                [docs] def crop_orientation_predictor( - arch: str = 'mobilenet_v3_small_orientation', - pretrained: bool = False, - **kwargs: Any -) -> CropOrientationPredictor: - """Orientation classification architecture. + arch: Any = "mobilenet_v3_small_crop_orientation", pretrained: bool = False, **kwargs: Any +) -> OrientationPredictor: + """Crop orientation classification architecture. >>> import numpy as np >>> from doctr.models import crop_orientation_predictor - >>> model = crop_orientation_predictor(arch='classif_mobilenet_v3_small', pretrained=True) - >>> input_crop = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> model = crop_orientation_predictor(arch='mobilenet_v3_small_crop_orientation', pretrained=True) + >>> input_crop = (255 * np.random.rand(256, 256, 3)).astype(np.uint8) >>> out = model([input_crop]) Args: - arch: name of the architecture to use (e.g. 'mobilenet_v3_small') + ---- + arch: name of the architecture to use (e.g. 'mobilenet_v3_small_crop_orientation') pretrained: If True, returns a model pre-trained on our recognition crops dataset + **kwargs: keyword arguments to be passed to the OrientationPredictor Returns: - CropOrientationPredictor + ------- + OrientationPredictor """ + return _orientation_predictor(arch, pretrained, model_type="crop", **kwargs)
                                - return _crop_orientation_predictor(arch, pretrained, **kwargs)
                                + + +
                                +[docs] +def page_orientation_predictor( + arch: Any = "mobilenet_v3_small_page_orientation", pretrained: bool = False, **kwargs: Any +) -> OrientationPredictor: + """Page orientation classification architecture. + + >>> import numpy as np + >>> from doctr.models import page_orientation_predictor + >>> model = page_orientation_predictor(arch='mobilenet_v3_small_page_orientation', pretrained=True) + >>> input_page = (255 * np.random.rand(512, 512, 3)).astype(np.uint8) + >>> out = model([input_page]) + + Args: + ---- + arch: name of the architecture to use (e.g. 'mobilenet_v3_small_page_orientation') + pretrained: If True, returns a model pre-trained on our recognition crops dataset + **kwargs: keyword arguments to be passed to the OrientationPredictor + + Returns: + ------- + OrientationPredictor + """ + return _orientation_predictor(arch, pretrained, model_type="page", **kwargs)
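Taken together, the two factories mirror each other; a short combined sketch based on the docstrings above (the exact structure of the returned predictions is not shown in this diff):
>>> import numpy as np
>>> from doctr.models import crop_orientation_predictor, page_orientation_predictor
>>> crop_classifier = crop_orientation_predictor(pretrained=True)
>>> page_classifier = page_orientation_predictor(pretrained=True)
>>> crop = (255 * np.random.rand(256, 256, 3)).astype(np.uint8)
>>> page = (255 * np.random.rand(512, 512, 3)).astype(np.uint8)
>>> crop_out = crop_classifier([crop])   # orientation among [0, -90, 180, 90]
>>> page_out = page_classifier([page])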
                                @@ -388,7 +441,7 @@

                                diff --git a/v0.5.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.5.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index 87d4733137..dc65e2ed03 100644 --- a/v0.5.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -226,32 +226,20 @@

                                Source code for doctr.models.detection.differentiable_binarization.tensorflow

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021, Mindee.
                                 
                                 # This program is licensed under the Apache License version 2.
                                 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                @@ -295,36 +283,29 @@ 

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization from copy import deepcopy -from typing import Any, Dict, List, Optional, Tuple - -import numpy as np import tensorflow as tf from tensorflow import keras from tensorflow.keras import layers -from tensorflow.keras.applications import ResNet50 +from typing import List, Tuple, Optional, Any, Dict -from doctr.models.utils import IntermediateLayerGetter, conv_sequence, load_pretrained_params from doctr.utils.repr import NestedObject - -from ...classification import mobilenet_v3_large +from doctr.models.utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence from .base import DBPostProcessor, _DBNet -__all__ = ['DBNet', 'db_resnet50', 'db_mobilenet_v3_large'] +__all__ = ['DBNet', 'db_resnet50'] default_cfgs: Dict[str, Dict[str, Any]] = { 'db_resnet50': { 'mean': (0.798, 0.785, 0.772), 'std': (0.264, 0.2749, 0.287), + 'backbone': 'ResNet50', + 'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], + 'fpn_channels': 128, 'input_shape': (1024, 1024, 3), + 'rotated_bbox': False, 'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip', }, - 'db_mobilenet_v3_large': { - 'mean': (0.798, 0.785, 0.772), - 'std': (0.264, 0.2749, 0.287), - 'input_shape': (1024, 1024, 3), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.3.1/db_mobilenet_v3_large-8c16d5bf.zip', - }, } @@ -398,9 +379,6 @@

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo Args: feature extractor: the backbone serving as feature extractor fpn_channels: number of channels each extracted feature maps is mapped to - num_classes: number of output channels in the segmentation map - assume_straight_pages: if True, fit straight bounding boxes only - cfg: the configuration dict of the model """ _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor'] @@ -408,9 +386,8 @@

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo def __init__( self, feature_extractor: IntermediateLayerGetter, - fpn_channels: int = 128, # to be set to 256 to represent the author's initial idea - num_classes: int = 1, - assume_straight_pages: bool = True, + fpn_channels: int = 128, + rotated_bbox: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: @@ -418,7 +395,7 @@

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo self.cfg = cfg self.feat_extractor = feature_extractor - self.assume_straight_pages = assume_straight_pages + self.rotated_bbox = rotated_bbox self.fpn = FeaturePyramidNetwork(channels=fpn_channels) # Initialize kernels @@ -431,7 +408,7 @@

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), layers.BatchNormalization(), layers.Activation('relu'), - layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer='he_normal'), + layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), ] ) self.threshold_head = keras.Sequential( @@ -440,17 +417,17 @@

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), layers.BatchNormalization(), layers.Activation('relu'), - layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer='he_normal'), + layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), ] ) - self.postprocessor = DBPostProcessor(assume_straight_pages=assume_straight_pages) + self.postprocessor = DBPostProcessor(rotated_bbox=rotated_bbox) def compute_loss( self, out_map: tf.Tensor, thresh_map: tf.Tensor, - target: List[np.ndarray] + target: List[Dict[str, Any]] ) -> tf.Tensor: """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes and a list of masks for each image. From there it computes the loss with the model output @@ -467,10 +444,10 @@

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1])) thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1])) - seg_target, seg_mask, thresh_target, thresh_mask = self.build_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) + seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3]) + seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - thresh_target = tf.convert_to_tensor(thresh_target, dtype=out_map.dtype) + thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32) thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool) # Compute balanced BCE loss for proba_map @@ -506,9 +483,9 @@
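The hunk above stops right where the balanced BCE term for the probability map is computed. Purely as an illustration of that idea (not the repository's implementation), a hard-negative-mined BCE in the spirit of the DB paper, keeping all positive pixels and only the hardest negatives at an assumed 3:1 negative/positive ratio, could be sketched as follows; the helper name and its argument shapes are assumptions (float probability/target maps and a boolean mask of the same spatial shape):

    import tensorflow as tf

    def balanced_bce(prob_map, seg_target, seg_mask, neg_ratio=3.0):
        # Element-wise BCE between the sigmoid probability map and the binary target
        bce = tf.keras.losses.binary_crossentropy(seg_target[..., None], prob_map[..., None])
        mask = tf.cast(seg_mask, prob_map.dtype)
        pos_mask = mask * seg_target
        neg_mask = mask * (1.0 - seg_target)
        num_pos = tf.reduce_sum(pos_mask)
        num_neg = tf.minimum(tf.reduce_sum(neg_mask), neg_ratio * num_pos)
        # Keep only the hardest negatives (largest per-pixel losses)
        hard_neg, _ = tf.math.top_k(tf.reshape(bce * neg_mask, [-1]), k=tf.cast(num_neg, tf.int32))
        return (tf.reduce_sum(bce * pos_mask) + tf.reduce_sum(hard_neg)) / (num_pos + num_neg + 1e-6)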

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo def call( self, x: tf.Tensor, - target: Optional[List[np.ndarray]] = None, + target: Optional[List[Dict[str, Any]]] = None, return_model_output: bool = False, - return_preds: bool = False, + return_boxes: bool = False, **kwargs: Any, ) -> Dict[str, Any]: @@ -517,15 +494,15 @@

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo logits = self.probability_head(feat_concat, **kwargs) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_preds: + if return_model_output or target is None or return_boxes: prob_map = tf.math.sigmoid(logits) if return_model_output: out["out_map"] = prob_map - if target is None or return_preds: - # Post-process boxes (keep only text predictions) - out["preds"] = [preds[0] for preds in self.postprocessor(prob_map.numpy())] + if target is None or return_boxes: + # Post-process boxes + out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) if target is not None: thresh_map = self.threshold_head(feat_concat, **kwargs) @@ -535,68 +512,30 @@

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo return out -def _db_resnet( - arch: str, - pretrained: bool, - backbone_fn, - fpn_layers: List[str], - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any, -) -> DBNet: - - pretrained_backbone = pretrained_backbone and not pretrained +def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet: # Patch the config _cfg = deepcopy(default_cfgs[arch]) _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels']) + _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) # Feature extractor - feat_extractor = IntermediateLayerGetter( - backbone_fn( - weights='imagenet' if pretrained_backbone else None, - include_top=False, - pooling=None, - input_shape=_cfg['input_shape'], - ), - fpn_layers, + resnet = tf.keras.applications.__dict__[_cfg['backbone']]( + include_top=False, + weights=None, + input_shape=_cfg['input_shape'], + pooling=None, ) - # Build the model - model = DBNet(feat_extractor, cfg=_cfg, **kwargs) - # Load pretrained parameters - if pretrained: - load_pretrained_params(model, _cfg['url']) - - return model - - -def _db_mobilenet( - arch: str, - pretrained: bool, - backbone_fn, - fpn_layers: List[str], - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any, -) -> DBNet: - - pretrained_backbone = pretrained_backbone and not pretrained - - # Patch the config - _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or _cfg['input_shape'] - - # Feature extractor feat_extractor = IntermediateLayerGetter( - backbone_fn( - input_shape=_cfg['input_shape'], - include_top=False, - pretrained=pretrained_backbone, - ), - fpn_layers, + resnet, + _cfg['fpn_layers'], ) + kwargs['fpn_channels'] = _cfg['fpn_channels'] + kwargs['rotated_bbox'] = _cfg['rotated_bbox'] + # Build the model model = DBNet(feat_extractor, cfg=_cfg, **kwargs) # Load pretrained parameters @@ -607,45 +546,17 @@
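For readers unfamiliar with the IntermediateLayerGetter used above: in plain Keras it amounts to re-wiring the backbone so that it outputs the listed intermediate activations, which then feed the FPN. A minimal hand-rolled equivalent, illustrative only (the layer names are the ones listed in the db_resnet50 config above):

    import tensorflow as tf

    backbone = tf.keras.applications.ResNet50(
        include_top=False, weights=None, input_shape=(1024, 1024, 3), pooling=None,
    )
    fpn_layers = ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"]
    # A Model whose outputs are the four intermediate feature maps used by the FPN
    feat_extractor = tf.keras.Model(
        inputs=backbone.input,
        outputs=[backbone.get_layer(name).output for name in fpn_layers],
    )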

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo
                                -[docs] +[docs] def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet( - 'db_resnet50', - pretrained, - ResNet50, - ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], - **kwargs, - )
                                - - - -
                                -[docs] -def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a mobilenet v3 large backbone. - - >>> import tensorflow as tf - >>> from doctr.models import db_mobilenet_v3_large - >>> model = db_mobilenet_v3_large(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + Example:: + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: pretrained (bool): If True, returns a model pre-trained on our text detection dataset @@ -654,13 +565,7 @@

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo text detection architecture """ - return _db_mobilenet( - 'db_mobilenet_v3_large', - pretrained, - mobilenet_v3_large, - ["inverted_2", "inverted_5", "inverted_11", "final_block"], - **kwargs, - )

                                + return _db_resnet('db_resnet50', pretrained, **kwargs)

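Both heads built above (probability and threshold) exist to feed the approximate binarization introduced in the DB paper. As a reminder of the formula only (taken from the paper, not from this repository), with P the probability map, T the threshold map and k an amplification factor of 50:

    import tensorflow as tf

    def approx_binarization(prob_map, thresh_map, k=50.0):
        # B_hat = 1 / (1 + exp(-k * (P - T)))
        return tf.math.sigmoid(k * (prob_map - thresh_map))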
                                @@ -694,7 +599,7 @@

                                Source code for doctr.models.detection.differentiable_binarization.tensorflo

                                + diff --git a/v0.5.1/_modules/doctr/models/detection/fast/tensorflow.html b/v0.5.1/_modules/doctr/models/detection/fast/tensorflow.html index dc7d8f50f2..af51a9abeb 100644 --- a/v0.5.1/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/detection/fast/tensorflow.html @@ -305,7 +305,7 @@

                                Source code for doctr.models.detection.fast.tensorflow

                                import numpy as np import tensorflow as tf -from keras import Model, Sequential, layers +from tensorflow.keras import Model, Sequential, layers from doctr.file_utils import CLASS_NAME from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params diff --git a/v0.5.1/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.5.1/_modules/doctr/models/detection/linknet/tensorflow.html index cc9316950c..9f836ce462 100644 --- a/v0.5.1/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/detection/linknet/tensorflow.html @@ -226,32 +226,20 @@

                                Source code for doctr.models.detection.linknet.tensorflow

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021, Mindee.
                                 
                                 # This program is licensed under the Apache License version 2.
                                 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                @@ -295,59 +283,40 @@ 

                                Source code for doctr.models.detection.linknet.tensorflow

                                # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization from copy import deepcopy -from typing import Any, Dict, List, Optional, Tuple - -import numpy as np import tensorflow as tf from tensorflow import keras -from tensorflow.keras import Model, Sequential, layers +from tensorflow.keras import layers, Sequential +from typing import Dict, Any, Tuple, Optional, List -from doctr.models.classification import resnet18, resnet34, resnet50 -from doctr.models.utils import IntermediateLayerGetter, conv_sequence, load_pretrained_params from doctr.utils.repr import NestedObject - +from doctr.models.backbones import ResnetStage +from doctr.models.utils import conv_sequence, load_pretrained_params from .base import LinkNetPostProcessor, _LinkNet -__all__ = ['LinkNet', 'linknet_resnet18', 'linknet_resnet34', 'linknet_resnet50', 'linknet_resnet18_rotation'] +__all__ = ['LinkNet', 'linknet16'] default_cfgs: Dict[str, Dict[str, Any]] = { - 'linknet_resnet18': { - 'mean': (0.798, 0.785, 0.772), - 'std': (0.264, 0.2749, 0.287), - 'input_shape': (1024, 1024, 3), - 'url': None, - }, - 'linknet_resnet18_rotation': { - 'mean': (0.798, 0.785, 0.772), - 'std': (0.264, 0.2749, 0.287), - 'input_shape': (1024, 1024, 3), - 'url': 'https://github.com/mindee/doctr/releases/download/v0.5.0/linknet_resnet18-a48e6ed3.zip', - }, - 'linknet_resnet34': { - 'mean': (0.798, 0.785, 0.772), - 'std': (0.264, 0.2749, 0.287), - 'input_shape': (1024, 1024, 3), - 'url': None, - }, - 'linknet_resnet50': { + 'linknet16': { 'mean': (0.798, 0.785, 0.772), 'std': (0.264, 0.2749, 0.287), + 'num_classes': 1, 'input_shape': (1024, 1024, 3), + 'rotated_bbox': False, 'url': None, }, } -def decoder_block(in_chan: int, out_chan: int, stride: int, **kwargs: Any) -> Sequential: +def decoder_block(in_chan: int, out_chan: int) -> Sequential: """Creates a LinkNet decoder block""" return Sequential([ - *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1, **kwargs), + *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1), layers.Conv2DTranspose( filters=in_chan // 4, kernel_size=3, - strides=stride, + strides=2, padding="same", use_bias=False, kernel_initializer='he_normal' @@ -358,36 +327,36 @@

                                Source code for doctr.models.detection.linknet.tensorflow

                                ]) -class LinkNetFPN(Model, NestedObject): - """LinkNet Decoder module""" +class LinkNetFPN(layers.Layer, NestedObject): + """LinkNet Encoder-Decoder module""" def __init__( self, - out_chans: int, - in_shapes: List[Tuple[int, ...]], ) -> None: super().__init__() - self.out_chans = out_chans - strides = [2] * (len(in_shapes) - 1) + [1] - i_chans = [s[-1] for s in in_shapes[::-1]] - o_chans = i_chans[1:] + [out_chans] - self.decoders = [ - decoder_block(in_chan, out_chan, s, input_shape=in_shape) - for in_chan, out_chan, s, in_shape in zip(i_chans, o_chans, strides, in_shapes[::-1]) - ] + self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True) + self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True) + self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True) + self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True) + self.decoder_1 = decoder_block(in_chan=64, out_chan=64) + self.decoder_2 = decoder_block(in_chan=128, out_chan=64) + self.decoder_3 = decoder_block(in_chan=256, out_chan=128) + self.decoder_4 = decoder_block(in_chan=512, out_chan=256) def call( self, - x: List[tf.Tensor] + x: tf.Tensor ) -> tf.Tensor: - out = 0 - for decoder, fmap in zip(self.decoders, x[::-1]): - out = decoder(out + fmap) - return out - - def extra_repr(self) -> str: - return f"out_chans={self.out_chans}" + x_1 = self.encoder_1(x) + x_2 = self.encoder_2(x_1) + x_3 = self.encoder_3(x_2) + x_4 = self.encoder_4(x_3) + y_4 = self.decoder_4(x_4) + y_3 = self.decoder_3(y_4 + x_3) + y_2 = self.decoder_2(y_3 + x_2) + y_1 = self.decoder_1(y_2 + x_1) + return y_1 class LinkNet(_LinkNet, keras.Model): @@ -398,24 +367,25 @@

                                Source code for doctr.models.detection.linknet.tensorflow

                                num_classes: number of channels for the output """ - _children_names: List[str] = ['feat_extractor', 'fpn', 'classifier', 'postprocessor'] + _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor'] def __init__( self, - feat_extractor: IntermediateLayerGetter, - fpn_channels: int = 64, num_classes: int = 1, - assume_straight_pages: bool = True, + input_shape: Tuple[int, int, int] = (512, 512, 3), + rotated_bbox: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: super().__init__(cfg=cfg) - self.assume_straight_pages = assume_straight_pages + self.rotated_bbox = rotated_bbox - self.feat_extractor = feat_extractor + self.stem = Sequential([ + *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape), + layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'), + ]) - self.fpn = LinkNetFPN(fpn_channels, [_shape[1:] for _shape in self.feat_extractor.output_shape]) - self.fpn.build(self.feat_extractor.output_shape) + self.fpn = LinkNetFPN() self.classifier = Sequential([ layers.Conv2DTranspose( @@ -424,31 +394,31 @@

                                Source code for doctr.models.detection.linknet.tensorflow

                                strides=2, padding="same", use_bias=False, - kernel_initializer='he_normal', - input_shape=self.fpn.decoders[-1].output_shape[1:], + kernel_initializer='he_normal' ), layers.BatchNormalization(), layers.Activation('relu'), - *conv_sequence(32, 'relu', True, kernel_size=3, strides=1), + *conv_sequence(32, 'relu', True, strides=1, kernel_size=3), layers.Conv2DTranspose( filters=num_classes, kernel_size=2, strides=2, padding="same", - use_bias=True, + use_bias=False, kernel_initializer='he_normal' ), ]) - self.postprocessor = LinkNetPostProcessor(assume_straight_pages=assume_straight_pages) + self.postprocessor = LinkNetPostProcessor(rotated_bbox=rotated_bbox) def compute_loss( self, out_map: tf.Tensor, - target: List[np.ndarray], - gamma: float = 2., + target: List[Dict[str, Any]], + focal_loss: bool = False, alpha: float = .5, - eps: float = 1e-8, + gamma: float = 2., + edge_factor: float = 2., ) -> tf.Tensor: """Compute linknet loss, BCE with boosted box edges or focal loss. Focal loss implementation based on <https://github.com/tensorflow/addons/>`_. @@ -456,97 +426,94 @@

                                Source code for doctr.models.detection.linknet.tensorflow

                                Args: out_map: output feature map of the model of shape N x H x W x 1 target: list of dictionary where each dict has a `boxes` and a `flags` entry - gamma: modulating factor in the focal loss formula + focal_loss: if True, use focal loss instead of BCE + edge_factor: boost factor for box edges (in case of BCE) alpha: balancing factor in the focal loss formula + gammma: modulating factor in the focal loss formula Returns: A loss tensor """ - seg_target, seg_mask = self.build_target(target, out_map.shape[1:3]) - - seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) + seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) + seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) + edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - seg_mask = tf.cast(seg_mask, tf.float32) - bce_loss = tf.keras.losses.binary_crossentropy(seg_target, out_map, from_logits=True)[..., None] - proba_map = tf.sigmoid(out_map) + # Get the cross_entropy for each entry + bce = tf.keras.losses.binary_crossentropy( + seg_target[seg_mask], + tf.squeeze(out_map, axis=[-1])[seg_mask], + from_logits=True) + + if focal_loss: + if gamma and gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") + + # Convert logits to prob, compute gamma factor + pred_prob = tf.sigmoid(tf.squeeze(out_map, axis=[-1])[seg_mask]) + p_t = (seg_target[seg_mask] * pred_prob) + ((1 - seg_target[seg_mask]) * (1 - pred_prob)) + modulating_factor = tf.pow((1.0 - p_t), gamma) - # Focal loss - if gamma < 0: - raise ValueError("Value of gamma should be greater than or equal to zero.") - # Convert logits to prob, compute gamma factor - p_t = (seg_target * proba_map) + ((1 - seg_target) * (1 - proba_map)) - alpha_t = seg_target * alpha + (1 - seg_target) * (1 - alpha) - # Unreduced loss - focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss - # Class reduced - focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2)) / tf.reduce_sum(seg_mask, (0, 1, 2)) + # Compute alpha factor + alpha_factor = seg_target[seg_mask] * alpha + (1 - seg_target[seg_mask]) * (1 - alpha) - # Dice loss - inter = tf.math.reduce_sum(seg_mask * proba_map * seg_target, (0, 1, 2)) - cardinality = tf.math.reduce_sum(seg_mask * (proba_map + seg_target), (0, 1, 2)) - dice_loss = 1 - 2 * (inter + eps) / (cardinality + eps) + # compute the final loss + loss = tf.reduce_mean(alpha_factor * modulating_factor * bce) - return tf.reduce_mean(focal_loss) + tf.reduce_mean(dice_loss) + else: + # Compute BCE loss with highlighted edges + loss = tf.math.multiply( + 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), + bce + ) + loss = tf.reduce_mean(loss) + + return loss def call( self, x: tf.Tensor, - target: Optional[List[np.ndarray]] = None, + target: Optional[List[Dict[str, Any]]] = None, return_model_output: bool = False, - return_preds: bool = False, + return_boxes: bool = False, + focal_loss: bool = True, **kwargs: Any, ) -> Dict[str, Any]: - feat_maps = self.feat_extractor(x, **kwargs) - logits = self.fpn(feat_maps, **kwargs) - logits = self.classifier(logits, **kwargs) + logits = self.stem(x) + logits = self.fpn(logits) + logits = self.classifier(logits) out: Dict[str, tf.Tensor] = {} - if return_model_output or target is None or return_preds: + if return_model_output or target is None or return_boxes: prob_map = tf.math.sigmoid(logits) if return_model_output: out["out_map"] = prob_map - if target is None or 
return_preds: + if target is None or return_boxes: # Post-process boxes - out["preds"] = [preds[0] for preds in self.postprocessor(prob_map.numpy())] + out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) if target is not None: - loss = self.compute_loss(logits, target) + loss = self.compute_loss(logits, target, focal_loss) out['loss'] = loss return out -def _linknet( - arch: str, - pretrained: bool, - backbone_fn, - fpn_layers: List[str], - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any -) -> LinkNet: - - pretrained_backbone = pretrained_backbone and not pretrained +def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet: # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['input_shape'] = input_shape or default_cfgs[arch]['input_shape'] - - # Feature extractor - feat_extractor = IntermediateLayerGetter( - backbone_fn( - pretrained=pretrained_backbone, - include_top=False, - input_shape=_cfg['input_shape'], - ), - fpn_layers, - ) + _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) + _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + kwargs['num_classes'] = _cfg['num_classes'] + kwargs['input_shape'] = _cfg['input_shape'] + kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model - model = LinkNet(feat_extractor, cfg=_cfg, **kwargs) + model = LinkNet(cfg=_cfg, **kwargs) # Load pretrained parameters if pretrained: load_pretrained_params(model, _cfg['url']) @@ -554,101 +521,18 @@

                                Source code for doctr.models.detection.linknet.tensorflow

                                return model -
                                -[docs] -def linknet_resnet18(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - >>> import tensorflow as tf - >>> from doctr.models import linknet_resnet18 - >>> model = linknet_resnet18(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet( - 'linknet_resnet18', - pretrained, - resnet18, - ['resnet_block_1', 'resnet_block_3', 'resnet_block_5', 'resnet_block_7'], - **kwargs, - )
                                - - - -def linknet_resnet18_rotation(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - >>> import tensorflow as tf - >>> from doctr.models import linknet_resnet18_rotation - >>> model = linknet_resnet18_rotation(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet( - 'linknet_resnet18_rotation', - pretrained, - resnet18, - ['resnet_block_1', 'resnet_block_3', 'resnet_block_5', 'resnet_block_7'], - **kwargs, - ) - - -
                                -[docs] -def linknet_resnet34(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - >>> import tensorflow as tf - >>> from doctr.models import linknet_resnet34 - >>> model = linknet_resnet34(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet( - 'linknet_resnet34', - pretrained, - resnet34, - ['resnet_block_2', 'resnet_block_6', 'resnet_block_12', 'resnet_block_15'], - **kwargs, - )
                                - - - -
                                -[docs] -def linknet_resnet50(pretrained: bool = False, **kwargs: Any) -> LinkNet: +
                                +[docs] +def linknet16(pretrained: bool = False, **kwargs: Any) -> LinkNet: """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" <https://arxiv.org/pdf/1707.03718.pdf>`_. - >>> import tensorflow as tf - >>> from doctr.models import linknet_resnet50 - >>> model = linknet_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + Example:: + >>> import tensorflow as tf + >>> from doctr.models import linknet16 + >>> model = linknet16(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: pretrained (bool): If True, returns a model pre-trained on our text detection dataset @@ -657,13 +541,7 @@

                                Source code for doctr.models.detection.linknet.tensorflow

                                text detection architecture """ - return _linknet( - 'linknet_resnet50', - pretrained, - resnet50, - ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], - **kwargs, - )
                                + return _linknet('linknet16', pretrained, **kwargs)
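The focal-loss branch of LinkNet's compute_loss above scales the per-pixel BCE by a balancing factor alpha and a modulating factor (1 - p_t)^gamma. A compact, self-contained sketch of that weighting on float tensors (illustrative only, not the exact code shown in the hunk):

    import tensorflow as tf

    def focal_loss(logits, targets, alpha=0.5, gamma=2.0):
        # targets are expected as float 0/1 values with the same shape as logits
        bce = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits)
        prob = tf.sigmoid(logits)
        p_t = targets * prob + (1.0 - targets) * (1.0 - prob)
        alpha_t = targets * alpha + (1.0 - targets) * (1.0 - alpha)
        return tf.reduce_mean(alpha_t * tf.pow(1.0 - p_t, gamma) * bce)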
                                @@ -697,7 +575,7 @@

                                Source code for doctr.models.detection.linknet.tensorflow

                                diff --git a/v0.5.1/_modules/doctr/models/detection/zoo.html b/v0.5.1/_modules/doctr/models/detection/zoo.html index 87b6288a06..23a2f451e3 100644 --- a/v0.5.1/_modules/doctr/models/detection/zoo.html +++ b/v0.5.1/_modules/doctr/models/detection/zoo.html @@ -226,32 +226,20 @@

                                Source code for doctr.models.detection.zoo

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021, Mindee.
                                 
                                 # This program is licensed under the Apache License version 2.
                                 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                @@ -295,75 +283,58 @@ 

                                Source code for doctr.models.detection.zoo

                                 from typing import Any
                                 
                                 from doctr.file_utils import is_tf_available, is_torch_available
                                -
                                -from .. import detection
                                +from .core import DetectionPredictor
                                 from ..preprocessor import PreProcessor
                                -from .predictor import DetectionPredictor
                                +from .. import detection
                                +
                                 
                                 __all__ = ["detection_predictor"]
                                 
                                 
                                 if is_tf_available():
                                -    ARCHS = ['db_resnet50', 'db_mobilenet_v3_large', 'linknet_resnet18', 'linknet_resnet18_rotation']
                                -    ROT_ARCHS = ['linknet_resnet18_rotation']
                                +    ARCHS = ['db_resnet50', 'linknet16']
                                 elif is_torch_available():
                                -    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3_large', 'linknet_resnet18', 'db_resnet50_rotation']
                                -    ROT_ARCHS = ['db_resnet50_rotation']
                                +    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
                                 
                                 
                                -def _predictor(
                                -    arch: str,
                                -    pretrained: bool,
                                -    assume_straight_pages: bool = True,
                                -    **kwargs: Any
                                -) -> DetectionPredictor:
                                +def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
                                 
                                     if arch not in ARCHS:
                                         raise ValueError(f"unknown architecture '{arch}'")
                                 
                                -    if arch not in ROT_ARCHS and not assume_straight_pages:
                                -        raise AssertionError("You are trying to use a model trained on straight pages while not assuming"
                                -                             " your pages are straight. If you have only straight documents, don't pass"
                                -                             f" assume_straight_pages=False, otherwise you should use one of these archs: {ROT_ARCHS}")
                                -
                                     # Detection
                                -    _model = detection.__dict__[arch](pretrained=pretrained, assume_straight_pages=assume_straight_pages)
                                +    _model = detection.__dict__[arch](pretrained=pretrained)
                                     kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
                                     kwargs['std'] = kwargs.get('std', _model.cfg['std'])
                                     kwargs['batch_size'] = kwargs.get('batch_size', 1)
                                     predictor = DetectionPredictor(
                                -        PreProcessor(_model.cfg['input_shape'][:-1] if is_tf_available() else _model.cfg['input_shape'][1:], **kwargs),
                                +        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
                                         _model
                                     )
                                     return predictor
                                 
                                 
                                 
                                -[docs] -def detection_predictor( - arch: str = 'db_resnet50', - pretrained: bool = False, - assume_straight_pages: bool = True, - **kwargs: Any -) -> DetectionPredictor: +[docs] +def detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) -> DetectionPredictor: """Text detection architecture. - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> model = detection_predictor(arch='db_resnet50', pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + Example:: + >>> import numpy as np + >>> from doctr.models import detection_predictor + >>> model = detection_predictor(pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture to use (e.g. 'db_resnet50') + arch: name of the architecture to use ('db_resnet50') pretrained: If True, returns a model pre-trained on our text detection dataset - assume_straight_pages: If True, fit straight boxes to the page Returns: Detection predictor """ - return _predictor(arch, pretrained, assume_straight_pages, **kwargs)
                                + return _predictor(arch, pretrained, **kwargs)
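Since _predictor above forwards mean, std and batch_size straight to the PreProcessor, those preprocessing defaults can be overridden directly from detection_predictor. A small usage sketch based on the code shown in this version of the docs (the values are arbitrary, and with pretrained=False the weights are random, so this only exercises the pipeline):

    import numpy as np
    from doctr.models import detection_predictor

    # batch_size (and mean/std) default to the model config but can be overridden here
    predictor = detection_predictor(arch='db_resnet50', pretrained=False, batch_size=2)
    pages = [(255 * np.random.rand(600, 800, 3)).astype(np.uint8) for _ in range(4)]
    out = predictor(pages)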
                                @@ -397,7 +368,7 @@

                                Source code for doctr.models.detection.zoo

                                       
                                     
                                   
                                + diff --git a/v0.5.1/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.5.1/_modules/doctr/models/recognition/crnn/tensorflow.html index 35cdd9910a..7b8529c26d 100644 --- a/v0.5.1/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -226,32 +226,20 @@

                                Source code for doctr.models.recognition.crnn.tensorflow

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021, Mindee.
                                 
                                 # This program is licensed under the Apache License version 2.
                                 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                 
                                 from copy import deepcopy
                                -from typing import Any, Dict, List, Optional, Tuple
                                -
                                 import tensorflow as tf
                                 from tensorflow.keras import layers
                                -from tensorflow.keras.models import Model, Sequential
                                -
                                -from doctr.datasets import VOCABS
                                +from tensorflow.keras.models import Sequential, Model
                                +from typing import Tuple, Dict, Any, Optional, List
                                 
                                -from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r
                                -from ...utils.tensorflow import load_pretrained_params
                                +from ... import backbones
                                +from ...utils import load_pretrained_params
                                 from ..core import RecognitionModel, RecognitionPostProcessor
                                 
                                -__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_mobilenet_v3_small',
                                -           'crnn_mobilenet_v3_large']
                                +__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
                                 
                                 default_cfgs: Dict[str, Dict[str, Any]] = {
                                     'crnn_vgg16_bn': {
                                -        'mean': (0.694, 0.695, 0.693),
                                -        'std': (0.299, 0.296, 0.301),
                                -        'input_shape': (32, 128, 3),
                                -        'vocab': VOCABS['legacy_french'],
                                -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/crnn_vgg16_bn-76b7f2c6.zip',
                                -    },
                                -    'crnn_mobilenet_v3_small': {
                                -        'mean': (0.694, 0.695, 0.693),
                                -        'std': (0.299, 0.296, 0.301),
                                +        'mean': (.5, .5, .5),
                                +        'std': (1., 1., 1.),
                                +        'backbone': 'vgg16_bn', 'rnn_units': 128,
                                         'input_shape': (32, 128, 3),
                                -        'vocab': VOCABS['french'],
                                -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.1/crnn_mobilenet_v3_small-7f36edec.zip',
                                +        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
                                +                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
                                +        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
                                     },
                                -    'crnn_mobilenet_v3_large': {
                                +    'crnn_resnet31': {
                                         'mean': (0.694, 0.695, 0.693),
                                         'std': (0.299, 0.296, 0.301),
                                +        'backbone': 'resnet31', 'rnn_units': 128,
                                         'input_shape': (32, 128, 3),
                                -        'vocab': VOCABS['french'],
                                -        'url': None,
                                +        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
                                +                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
                                +        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
                                     },
                                 }
                                 
                                @@ -427,15 +408,16 @@ 

                                Source code for doctr.models.recognition.crnn.tensorflow

                                """Compute CTC loss for the model. Args: + gt: the encoded tensor with gt labels model_output: predicted logits of the model - target: lengths of each gt word inside the batch + seq_len: lengths of each gt word inside the batch Returns: The loss of the model on the batch """ - gt, seq_len = self.build_target(target) + gt, seq_len = self.compute_target(target) batch_len = model_output.shape[0] - input_length = tf.fill((batch_len,), model_output.shape[1]) + input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -472,29 +454,23 @@

                                Source code for doctr.models.recognition.crnn.tensorflow

                                return out -def _crnn( - arch: str, - pretrained: bool, - backbone_fn, - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any -) -> CRNN: - - pretrained_backbone = pretrained_backbone and not pretrained - - kwargs['vocab'] = kwargs.get('vocab', default_cfgs[arch]['vocab']) +def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN: + # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg['vocab'] = kwargs['vocab'] - _cfg['input_shape'] = input_shape or default_cfgs[arch]['input_shape'] + _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) + _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) - feat_extractor = backbone_fn( + # Feature extractor + feat_extractor = backbones.__dict__[_cfg['backbone']]( input_shape=_cfg['input_shape'], include_top=False, - pretrained=pretrained_backbone, ) + kwargs['vocab'] = _cfg['vocab'] + kwargs['rnn_units'] = _cfg['rnn_units'] + # Build the model model = CRNN(feat_extractor, cfg=_cfg, **kwargs) # Load pretrained parameters @@ -505,16 +481,17 @@

                                Source code for doctr.models.recognition.crnn.tensorflow

                                -[docs] +[docs] def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + Example:: + >>> import tensorflow as tf + >>> from doctr.models import crnn_vgg16_bn + >>> model = crnn_vgg16_bn(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: pretrained (bool): If True, returns a model pre-trained on our text recognition dataset @@ -523,21 +500,20 @@

                                Source code for doctr.models.recognition.crnn.tensorflow

                                text recognition architecture """ - return _crnn('crnn_vgg16_bn', pretrained, vgg16_bn_r, **kwargs)
                                + return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
                                -
                                -[docs] -def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based +def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - >>> import tensorflow as tf - >>> from doctr.models import crnn_mobilenet_v3_small - >>> model = crnn_mobilenet_v3_small(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + Example:: + >>> import tensorflow as tf + >>> from doctr.models import crnn_resnet31 + >>> model = crnn_resnet31(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: pretrained (bool): If True, returns a model pre-trained on our text recognition dataset @@ -546,31 +522,7 @@

                                Source code for doctr.models.recognition.crnn.tensorflow

                                text recognition architecture """ - return _crnn('crnn_mobilenet_v3_small', pretrained, mobilenet_v3_small_r, **kwargs)
                                - - - -
                                -[docs] -def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - >>> import tensorflow as tf - >>> from doctr.models import crnn_mobilenet_v3_large - >>> model = crnn_mobilenet_v3_large(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn('crnn_mobilenet_v3_large', pretrained, mobilenet_v3_large_r, **kwargs)
                                - + return _crnn('crnn_resnet31', pretrained, **kwargs)
                                @@ -603,7 +555,7 @@

                                Source code for doctr.models.recognition.crnn.tensorflow

                                diff --git a/v0.5.1/_modules/doctr/models/recognition/master/tensorflow.html b/v0.5.1/_modules/doctr/models/recognition/master/tensorflow.html index 188c528b0c..6d9bff4577 100644 --- a/v0.5.1/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/recognition/master/tensorflow.html @@ -226,32 +226,20 @@

                                Source code for doctr.models.recognition.master.tensorflow

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021, Mindee.
                                 
                                 # This program is licensed under the Apache License version 2.
                                 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                 
                                -from copy import deepcopy
                                -from typing import Any, Dict, List, Optional, Tuple
                                -
                                 import tensorflow as tf
                                -from tensorflow.keras import Model, layers
                                -
                                -from doctr.datasets import VOCABS
                                -from doctr.models.classification import magc_resnet31
                                +from tensorflow.keras import layers, Sequential, Model
                                +from typing import Tuple, List, Dict, Any, Optional
                                +from copy import deepcopy
                                 
                                -from ...utils.tensorflow import load_pretrained_params
                                -from ..transformer.tensorflow import Decoder, create_look_ahead_mask, create_padding_mask, positional_encoding
                                +from ..core import RecognitionPostProcessor
                                +from ...backbones.resnet import ResnetStage
                                +from ...utils import conv_sequence, load_pretrained_params
                                +from ..transformer import Decoder, positional_encoding, create_look_ahead_mask, create_padding_mask
                                +from ....datasets import VOCABS
                                 from .base import _MASTER, _MASTERPostProcessor
                                 
                                -__all__ = ['MASTER', 'master']
                                +
                                +__all__ = ['MASTER', 'master', 'MASTERPostProcessor']
                                 
                                 
                                 default_cfgs: Dict[str, Dict[str, Any]] = {
                                     'master': {
                                -        'mean': (0.694, 0.695, 0.693),
                                -        'std': (0.299, 0.296, 0.301),
                                -        'input_shape': (32, 128, 3),
                                -        'vocab': VOCABS['legacy_french'],
                                -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/master-bade6eae.zip',
                                +        'mean': (.5, .5, .5),
                                +        'std': (1., 1., 1.),
                                +        'input_shape': (48, 160, 3),
                                +        'vocab': VOCABS['french'],
                                +        'url': None,
                                     },
                                 }
                                 
                                 
                                +class MAGC(layers.Layer):
                                +
                                +    """Implements the Multi-Aspect Global Context Attention, as described in
                                +    <https://arxiv.org/pdf/1910.02562.pdf>`_.
                                +
                                +    Args:
                                +        inplanes: input channels
                                +        headers: number of headers to split channels
+        att_scale: if True, re-scale attention to counteract the variance distributions
                                +        **kwargs
                                +    """
                                +
                                +    def __init__(
                                +        self,
                                +        inplanes: int,
                                +        headers: int = 1,
                                +        att_scale: bool = False,
                                +        **kwargs
                                +    ) -> None:
                                +        super().__init__(**kwargs)
                                +
                                +        self.headers = headers  # h
                                +        self.inplanes = inplanes  # C
                                +        self.att_scale = att_scale
                                +
                                +        self.single_header_inplanes = int(inplanes / headers)  # C / h
                                +
                                +        self.conv_mask = tf.keras.layers.Conv2D(
                                +            filters=1,
                                +            kernel_size=1,
                                +            kernel_initializer=tf.initializers.he_normal()
                                +        )
                                +
                                +        self.transform = tf.keras.Sequential(
                                +            [
                                +                tf.keras.layers.Conv2D(
                                +                    filters=self.inplanes,
                                +                    kernel_size=1,
                                +                    kernel_initializer=tf.initializers.he_normal()
                                +                ),
                                +                tf.keras.layers.LayerNormalization([1, 2, 3]),
                                +                tf.keras.layers.ReLU(),
                                +                tf.keras.layers.Conv2D(
                                +                    filters=self.inplanes,
                                +                    kernel_size=1,
                                +                    kernel_initializer=tf.initializers.he_normal()
                                +                ),
                                +            ],
                                +            name='transform'
                                +        )
                                +
                                +    @tf.function
                                +    def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor:
                                +        b, h, w, c = (tf.shape(inputs)[i] for i in range(4))
                                +
                                +        # B, H, W, C -->> B*h, H, W, C/h
                                +        x = tf.reshape(inputs, shape=(b, h, w, self.headers, self.single_header_inplanes))
                                +        x = tf.transpose(x, perm=(0, 3, 1, 2, 4))
                                +        x = tf.reshape(x, shape=(b * self.headers, h, w, self.single_header_inplanes))
                                +
+        # Compute shortcut
                                +        shortcut = x
                                +        # B*h, 1, H*W, C/h
                                +        shortcut = tf.reshape(shortcut, shape=(b * self.headers, 1, h * w, self.single_header_inplanes))
                                +        # B*h, 1, C/h, H*W
                                +        shortcut = tf.transpose(shortcut, perm=[0, 1, 3, 2])
                                +
                                +        # Compute context mask
                                +        # B*h, H, W, 1,
                                +        context_mask = self.conv_mask(x)
                                +        # B*h, 1, H*W, 1
                                +        context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1))
                                +        # scale variance
                                +        if self.att_scale and self.headers > 1:
                                +            context_mask = context_mask / tf.sqrt(self.single_header_inplanes)
                                +        # B*h, 1, H*W, 1
                                +        context_mask = tf.keras.activations.softmax(context_mask, axis=2)
                                +
                                +        # Compute context
                                +        # B*h, 1, C/h, 1
                                +        context = tf.matmul(shortcut, context_mask)
                                +        context = tf.reshape(context, shape=(b, 1, c, 1))
                                +        # B, 1, 1, C
                                +        context = tf.transpose(context, perm=(0, 1, 3, 2))
+        # Set the static shape so it can be resolved when this module is called inside the Sequential MAGCResnet
                                +        batch, chan = inputs.get_shape().as_list()[0], inputs.get_shape().as_list()[-1]
                                +        context.set_shape([batch, 1, 1, chan])
                                +        return context
                                +
                                +    def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
                                +        # Context modeling: B, H, W, C  ->  B, 1, 1, C
                                +        context = self.context_modeling(inputs)
                                +        # Transform: B, 1, 1, C  ->  B, 1, 1, C
                                +        transformed = self.transform(context)
                                +        return inputs + transformed
                                +
                                +
                                +class MAGCResnet(Sequential):
                                +
                                +    """Implements the modified resnet with MAGC layers, as described in paper.
                                +
                                +    Args:
+        headers: number of headers used to split channels in MAGC layers
                                +        input_shape: shape of the model input (without batch dim)
                                +    """
                                +
                                +    def __init__(
                                +        self,
                                +        headers: int = 1,
                                +        input_shape: Tuple[int, int, int] = (48, 160, 3),
                                +    ) -> None:
                                +        _layers = [
                                +            # conv_1x
                                +            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
                                +            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
                                +            layers.MaxPooling2D((2, 2), (2, 2)),
                                +            # conv_2x
                                +            ResnetStage(num_blocks=1, output_channels=256),
                                +            MAGC(inplanes=256, headers=headers, att_scale=True),
                                +            *conv_sequence(out_channels=256, activation='relu', bn=True, kernel_size=3),
                                +            layers.MaxPooling2D((2, 2), (2, 2)),
                                +            # conv_3x
                                +            ResnetStage(num_blocks=2, output_channels=512),
                                +            MAGC(inplanes=512, headers=headers, att_scale=True),
                                +            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
                                +            layers.MaxPooling2D((2, 1), (2, 1)),
                                +            # conv_4x
                                +            ResnetStage(num_blocks=5, output_channels=512),
                                +            MAGC(inplanes=512, headers=headers, att_scale=True),
                                +            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
                                +            # conv_5x
                                +            ResnetStage(num_blocks=3, output_channels=512),
                                +            MAGC(inplanes=512, headers=headers, att_scale=True),
                                +            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
                                +        ]
                                +        super().__init__(_layers)
                                +
                                +
                                 class MASTER(_MASTER, Model):
                                 
                                     """Implements MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
                                     Implementation based on the official TF implementation: <https://github.com/jiangxiluning/MASTER-TF>`_.
                                 
                                     Args:
                                -        feature_extractor: the backbone serving as feature extractor
                                         vocab: vocabulary, (without EOS, SOS, PAD)
                                         d_model: d parameter for the transformer decoder
                                 +        headers: number of headers for the MAGC modules
                                         dff: depth of the pointwise feed-forward layer
                                          num_heads: number of heads for the multi-head attention module
                                         num_layers: number of decoder layers to stack
                                         max_length: maximum length of character sequence handled by the model
                                -        dropout: dropout probability of the decoder
                                -        input_shape: size of the image inputs
                                -        cfg: dictionary containing information about the model
                                 +        input_shape: size of the image inputs
                                     """
                                 
                                     def __init__(
                                         self,
                                -        feature_extractor: tf.keras.Model,
                                         vocab: str,
                                         d_model: int = 512,
                                +        headers: int = 1,
                                         dff: int = 2048,
                                -        num_heads: int = 8,  # number of heads in the transformer decoder
                                +        num_heads: int = 8,
                                         num_layers: int = 3,
                                         max_length: int = 50,
                                -        dropout: float = 0.2,
                                -        input_shape: Tuple[int, int, int] = (32, 128, 3),
                                +        input_shape: Tuple[int, int, int] = (48, 160, 3),
                                         cfg: Optional[Dict[str, Any]] = None,
                                     ) -> None:
                                         super().__init__()
                                @@ -357,7 +480,7 @@ 

                                          self.cfg = cfg
                                          self.vocab_size = len(vocab)

                                 -        self.feat_extractor = feature_extractor
                                 +        self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape)
                                          self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model)  # 3 more classes: EOS/PAD/SOS

                                          self.decoder = Decoder(
                                 @@ -367,21 +490,21 @@

                                              dff=dff,
                                              vocab_size=self.vocab_size,
                                              maximum_position_encoding=max_length,
                                 -            dropout=dropout,
                                          )
                                          self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model)
                                          self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())

                                          self.postprocessor = MASTERPostProcessor(vocab=self.vocab)

                                 +    @tf.function
                                      def make_mask(self, target: tf.Tensor) -> tf.Tensor:
                                          look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1])
                                          target_padding_mask = create_padding_mask(target, self.vocab_size + 2)  # Pad symbol
                                          combined_mask = tf.maximum(target_padding_mask, look_ahead_mask)
                                          return combined_mask

                                 -    @staticmethod
                                      def compute_loss(
                                 +        self,
                                          model_output: tf.Tensor,
                                          gt: tf.Tensor,
                                          seq_len: List[int],
                                 @@ -410,7 +533,7 @@

                                          mask_values = tf.zeros_like(cce)
                                          mask_2d = tf.sequence_mask(seq_len, input_len - 1)  # delete the last mask timestep as well
                                          masked_loss = tf.where(mask_2d, cce, mask_values)
                                 -        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype))
                                 +        ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))

                                          return tf.expand_dims(ce_loss, axis=1)
                                 @@ -435,16 +558,16 @@

                                          """

                                          # Encode
                                 -        feature = self.feat_extractor(x, **kwargs)
                                 +        feature = self.feature_extractor(x, **kwargs)
                                          b, h, w, c = (tf.shape(feature)[i] for i in range(4))
                                          feature = tf.reshape(feature, shape=(b, h * w, c))
                                 -        encoded = feature + tf.cast(self.feature_pe[:, :h * w, :], dtype=feature.dtype)
                                 +        encoded = feature + self.feature_pe[:, :h * w, :]

                                          out: Dict[str, tf.Tensor] = {}

                                          if target is not None:
                                              # Compute target: tensor of gts and sequence lengths
                                 -            gt, seq_len = self.build_target(target)
                                 +            gt, seq_len = self.compute_target(target)

                                          if kwargs.get('training', False):
                                              if target is None:
                                 @@ -489,7 +612,7 @@

                                          start_vector = tf.fill(dims=(b, 1), value=start_symbol)
                                          ys = tf.concat([start_vector, ys], axis=-1)
                                 -        logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=encoded.dtype)  # 3 symbols
                                 +        logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32)  # 3 symbols

                                          # max_len = len + 2 (sos + eos)
                                          for i in range(self.max_length - 1):
                                              ys_mask = self.make_mask(ys)
                                 @@ -509,7 +632,6 @@

                                  class MASTERPostProcessor(_MASTERPostProcessor):
                                      """Post processor for MASTER architectures
                                 -
                                      Args:
                                          vocab: string containing the ordered sequence of supported characters
                                          ignore_case: if True, ignore case of letters
                                 @@ -538,30 +660,17 @@

                                          return list(zip(word_values, probs.numpy().tolist()))


                                 -def _master(
                                 -    arch: str,
                                 -    pretrained: bool,
                                 -    backbone_fn,
                                 -    pretrained_backbone: bool = True,
                                 -    **kwargs: Any
                                 -) -> MASTER:
                                 -
                                 -    pretrained_backbone = pretrained_backbone and not pretrained
                                 +def _master(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> MASTER:

                                      # Patch the config
                                      _cfg = deepcopy(default_cfgs[arch])
                                 -    _cfg['input_shape'] = kwargs.get('input_shape', _cfg['input_shape'])
                                 +    _cfg['input_shape'] = input_shape or _cfg['input_shape']
                                      _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab'])

                                      kwargs['vocab'] = _cfg['vocab']
                                 -    kwargs['input_shape'] = _cfg['input_shape']

                                      # Build the model
                                 -    model = MASTER(
                                 -        backbone_fn(pretrained=pretrained_backbone, input_shape=_cfg['input_shape'], include_top=False),
                                 -        cfg=_cfg,
                                 -        **kwargs,
                                 -    )
                                 +    model = MASTER(cfg=_cfg, **kwargs)
                                      # Load pretrained parameters
                                      if pretrained:
                                          load_pretrained_params(model, default_cfgs[arch]['url'])
                                 @@ -570,24 +679,22 @@

                                 -[docs]
                                 +[docs]
                                  def master(pretrained: bool = False, **kwargs: Any) -> MASTER:
                                      """MASTER as described in the `paper <https://arxiv.org/pdf/1910.02562.pdf>`_.
                                 -
                                 -    >>> import tensorflow as tf
                                 -    >>> from doctr.models import master
                                 -    >>> model = master(pretrained=False)
                                 -    >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
                                 -    >>> out = model(input_tensor)
                                 -
                                 +    Example::
                                 +        >>> import tensorflow as tf
                                 +        >>> from doctr.models import master
                                 +        >>> model = master(pretrained=False)
                                 +        >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
                                 +        >>> out = model(input_tensor)

                                      Args:
                                          pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
                                 -
                                      Returns:
                                          text recognition architecture
                                      """

                                 -    return _master('master', pretrained, magc_resnet31, **kwargs)
                                 +    return _master('master', pretrained, **kwargs)
                                @@ -621,7 +728,7 @@

                                Source code for doctr.models.recognition.master.tensorflow

                                +
                                 diff --git a/v0.5.1/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.5.1/_modules/doctr/models/recognition/parseq/tensorflow.html
                                 index 1bbbf829b1..93a3b2ea81 100644
                                 --- a/v0.5.1/_modules/doctr/models/recognition/parseq/tensorflow.html
                                 +++ b/v0.5.1/_modules/doctr/models/recognition/parseq/tensorflow.html
                                 @@ -305,7 +305,7 @@

                                  import numpy as np
                                  import tensorflow as tf
                                 -from keras import Model, layers
                                 +from tensorflow.keras import Model, layers

                                  from doctr.datasets import VOCABS
                                  from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
                                 @@ -462,7 +462,6 @@

                                          self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)

                                 -    @tf.function
                                      def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor:
                                          # Generates permutations of the target sequence.
                                          # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
                                 @@ -509,7 +508,6 @@

                                          )
                                          return combined

                                 -    @tf.function
                                      def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
                                          # Generate source and target mask for the decoder attention.
                                          sz = permutation.shape[0]
                                 @@ -529,7 +527,6 @@

                                          target_mask = mask[1:, :-1]
                                          return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool)

                                 -    @tf.function
                                      def decode(
                                          self,
                                          target: tf.Tensor,
                                 diff --git a/v0.5.1/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.5.1/_modules/doctr/models/recognition/sar/tensorflow.html
                                 index 4446dfd22e..3a9989ef30 100644
                                 --- a/v0.5.1/_modules/doctr/models/recognition/sar/tensorflow.html
                                 +++ b/v0.5.1/_modules/doctr/models/recognition/sar/tensorflow.html
                                 @@ -226,32 +226,20 @@

                                Source code for doctr.models.recognition.sar.tensorflow

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021, Mindee.
                                 
                                 # This program is licensed under the Apache License version 2.
                                 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                 
                                 from copy import deepcopy
                                -from typing import Any, Dict, List, Optional, Tuple
                                -
                                 import tensorflow as tf
                                -from tensorflow.keras import Model, Sequential, layers
                                -
                                -from doctr.datasets import VOCABS
                                -from doctr.utils.repr import NestedObject
                                +from tensorflow.keras import Sequential, layers, Model
                                +from typing import Tuple, Dict, List, Any, Optional
                                 
                                -from ...classification import resnet31
                                -from ...utils.tensorflow import load_pretrained_params
                                +from ... import backbones
                                +from ...utils import load_pretrained_params
                                 from ..core import RecognitionModel, RecognitionPostProcessor
                                +from doctr.utils.repr import NestedObject
                                 
                                -__all__ = ['SAR', 'sar_resnet31']
                                +__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
                                 
                                 default_cfgs: Dict[str, Dict[str, Any]] = {
                                +    'sar_vgg16_bn': {
                                +        'mean': (.5, .5, .5),
                                +        'std': (1., 1., 1.),
                                +        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
                                +        'input_shape': (32, 128, 3),
                                +        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
                                +                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
                                +        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
                                +    },
                                     'sar_resnet31': {
                                -        'mean': (0.694, 0.695, 0.693),
                                -        'std': (0.299, 0.296, 0.301),
                                +        'mean': (.5, .5, .5),
                                +        'std': (1., 1., 1.),
                                +        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
                                         'input_shape': (32, 128, 3),
                                -        'vocab': VOCABS['legacy_french'],
                                -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/sar_resnet31-9ee49970.zip',
                                +        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
                                +                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
                                +        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
                                     },
                                 }
                                 
                                @@ -394,7 +390,7 @@ 

                                Source code for doctr.models.recognition.sar.tensorflow

                                super().__init__() self.vocab_size = vocab_size self.lstm_decoder = layers.StackedRNNCells( - [layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_layers)] + [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)] ) self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1)) self.attention_module = AttentionModule(attention_units) @@ -415,12 +411,12 @@

                                Source code for doctr.models.recognition.sar.tensorflow

                                # initialize states (each of shape (N, rnn_units)) states = self.lstm_decoder.get_initial_state( - inputs=None, batch_size=features.shape[0], dtype=features.dtype + inputs=None, batch_size=features.shape[0], dtype=tf.float32 ) # run first step of lstm # holistic: shape (N, rnn_units) _, states = self.lstm_decoder(holistic, states, **kwargs) - # Initialize with the index of virtual START symbol (placed after <eos> so that the one-hot is only zeros) + # Initialize with the index of virtual START symbol (placed after <eos>) symbol = tf.fill(features.shape[0], self.vocab_size + 1) logits_list = [] if kwargs.get('training') and gt is None: @@ -501,8 +497,8 @@

                                Source code for doctr.models.recognition.sar.tensorflow

                                self.postprocessor = SARPostProcessor(vocab=vocab) - @staticmethod def compute_loss( + self, model_output: tf.Tensor, gt: tf.Tensor, seq_len: tf.Tensor, @@ -530,7 +526,7 @@

                                Source code for doctr.models.recognition.sar.tensorflow

                                mask_values = tf.zeros_like(cce) mask_2d = tf.sequence_mask(seq_len, input_len) masked_loss = tf.where(mask_2d, cce, mask_values) - ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) + ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32)) return tf.expand_dims(ce_loss, axis=1) def call( @@ -546,7 +542,7 @@

                                Source code for doctr.models.recognition.sar.tensorflow

                                pooled_features = tf.reduce_max(features, axis=1) # vertical max pooling encoded = self.encoder(pooled_features, **kwargs) if target is not None: - gt, seq_len = self.build_target(target) + gt, seq_len = self.compute_target(target) seq_len = tf.cast(seq_len, tf.int32) decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs) @@ -595,30 +591,30 @@

                                Source code for doctr.models.recognition.sar.tensorflow

                                return list(zip(word_values, probs.numpy().tolist())) -def _sar( - arch: str, - pretrained: bool, - backbone_fn, - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any -) -> SAR: - - pretrained_backbone = pretrained_backbone and not pretrained +def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR: # Patch the config _cfg = deepcopy(default_cfgs[arch]) _cfg['input_shape'] = input_shape or _cfg['input_shape'] _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) + _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) + _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units']) + _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units']) + _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length']) + _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders']) # Feature extractor - feat_extractor = backbone_fn( - pretrained=pretrained_backbone, + feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']]( input_shape=_cfg['input_shape'], include_top=False, ) kwargs['vocab'] = _cfg['vocab'] + kwargs['rnn_units'] = _cfg['rnn_units'] + kwargs['embedding_units'] = _cfg['embedding_units'] + kwargs['attention_units'] = _cfg['attention_units'] + kwargs['max_length'] = _cfg['max_length'] + kwargs['num_decoders'] = _cfg['num_decoders'] # Build the model model = SAR(feat_extractor, cfg=_cfg, **kwargs) @@ -629,17 +625,42 @@

                                Source code for doctr.models.recognition.sar.tensorflow

                                return model +
                                +[docs] +def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: + """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong + Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. + + Example:: + >>> import tensorflow as tf + >>> from doctr.models import sar_vgg16_bn + >>> model = sar_vgg16_bn(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + + Returns: + text recognition architecture + """ + + return _sar('sar_vgg16_bn', pretrained, **kwargs)
                                + + +
                                -[docs] +[docs] def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + Example: + >>> import tensorflow as tf + >>> from doctr.models import sar_resnet31 + >>> model = sar_resnet31(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: pretrained (bool): If True, returns a model pre-trained on our text recognition dataset @@ -648,7 +669,7 @@

                                Source code for doctr.models.recognition.sar.tensorflow

                                text recognition architecture """ - return _sar('sar_resnet31', pretrained, resnet31, **kwargs)
                                + return _sar('sar_resnet31', pretrained, **kwargs)
                                @@ -682,7 +703,7 @@

                                Source code for doctr.models.recognition.sar.tensorflow

                                +
                                diff --git a/v0.5.1/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.5.1/_modules/doctr/models/recognition/vitstr/tensorflow.html index 23730f6227..aecde3662a 100644 --- a/v0.5.1/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.5.1/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -302,7 +302,7 @@

                                Source code for doctr.models.recognition.vitstr.tensorflow

                                from typing import Any, Dict, List, Optional, Tuple import tensorflow as tf -from keras import Model, layers +from tensorflow.keras import Model, layers from doctr.datasets import VOCABS diff --git a/v0.5.1/_modules/doctr/models/recognition/zoo.html b/v0.5.1/_modules/doctr/models/recognition/zoo.html index 603b30b84d..0f1bff8861 100644 --- a/v0.5.1/_modules/doctr/models/recognition/zoo.html +++ b/v0.5.1/_modules/doctr/models/recognition/zoo.html @@ -226,32 +226,20 @@

                                Source code for doctr.models.recognition.zoo

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021, Mindee.
                                 
                                 # This program is licensed under the Apache License version 2.
                                 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                 
                                 from typing import Any
                                 
                                -from doctr.file_utils import is_tf_available
                                -from doctr.models.preprocessor import PreProcessor
                                -
                                +from doctr.file_utils import is_tf_available, is_torch_available
                                +from .core import RecognitionPredictor
                                +from ..preprocessor import PreProcessor
                                 from .. import recognition
                                -from .predictor import RecognitionPredictor
                                +
                                 
                                 __all__ = ["recognition_predictor"]
                                 
                                 
                                -ARCHS = ['crnn_vgg16_bn', 'crnn_mobilenet_v3_small', 'crnn_mobilenet_v3_large', 'sar_resnet31', 'master']
                                +if is_tf_available():
                                +    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
                                +elif is_torch_available():
                                +    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
                                 
                                 
                                 def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
                                @@ -315,9 +306,8 @@ 

                                Source code for doctr.models.recognition.zoo

                                kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
                                     kwargs['std'] = kwargs.get('std', _model.cfg['std'])
                                     kwargs['batch_size'] = kwargs.get('batch_size', 32)
                                -    input_shape = _model.cfg['input_shape'][:2] if is_tf_available() else _model.cfg['input_shape'][-2:]
                                     predictor = RecognitionPredictor(
                                -        PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs),
                                +        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
                                         _model
                                     )
                                 
                                @@ -325,7 +315,7 @@ 

                                Source code for doctr.models.recognition.zoo

                                -[docs]
                                +[docs]
                                 def recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) -> RecognitionPredictor:
                                     """Text recognition architecture.
                                 
                                @@ -337,7 +327,7 @@ 

                                Source code for doctr.models.recognition.zoo

                                        >>> out = model([input_page])
                                 
                                     Args:
                                -        arch: name of the architecture to use (e.g. 'crnn_vgg16_bn')
                                +        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
                                         pretrained: If True, returns a model pre-trained on our text recognition dataset
                                 
                                     Returns:
                                @@ -378,7 +368,7 @@ 

                                Source code for doctr.models.recognition.zoo

                                   
                                -
                                +
                                diff --git a/v0.5.1/_modules/doctr/models/zoo.html b/v0.5.1/_modules/doctr/models/zoo.html index 253cd75279..bfa5a6fdf4 100644 --- a/v0.5.1/_modules/doctr/models/zoo.html +++ b/v0.5.1/_modules/doctr/models/zoo.html @@ -226,32 +226,15 @@

                                Source code for doctr.models.zoo

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021, Mindee.
                                 
                                 # This program is licensed under the Apache License version 2.
                                 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                 
                                 from typing import Any
                                -
                                +from .core import OCRPredictor
                                 from .detection.zoo import detection_predictor
                                -from .predictor import OCRPredictor
                                 from .recognition.zoo import recognition_predictor
                                 
                                +
                                 __all__ = ["ocr_predictor"]
                                 
                                 
                                -def _predictor(
                                -    det_arch: str,
                                -    reco_arch: str,
                                -    pretrained: bool,
                                -    assume_straight_pages: bool = True,
                                -    preserve_aspect_ratio: bool = False,
                                -    symmetric_pad: bool = True,
                                -    det_bs: int = 2,
                                -    reco_bs: int = 128,
                                -    **kwargs,
                                -) -> OCRPredictor:
                                +def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
                                 
                                     # Detection
                                -    det_predictor = detection_predictor(
                                -        det_arch,
                                -        pretrained=pretrained,
                                -        batch_size=det_bs,
                                -        assume_straight_pages=assume_straight_pages,
                                -        preserve_aspect_ratio=preserve_aspect_ratio,
                                -        symmetric_pad=symmetric_pad,
                                -    )
                                +    det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
                                 
                                     # Recognition
                                     reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
                                 
                                -    return OCRPredictor(
                                -        det_predictor,
                                -        reco_predictor,
                                -        assume_straight_pages=assume_straight_pages,
                                -        preserve_aspect_ratio=preserve_aspect_ratio,
                                -        symmetric_pad=symmetric_pad,
                                -        **kwargs
                                -    )
                                +    return OCRPredictor(det_predictor, reco_predictor)
                                 
                                 
                                 
                                -[docs] +[docs] def ocr_predictor( det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, - assume_straight_pages: bool = True, - preserve_aspect_ratio: bool = False, - symmetric_pad: bool = True, - export_as_straight_boxes: bool = False, **kwargs: Any ) -> OCRPredictor: """End-to-end OCR architecture using one model for localization, and another for text recognition. - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + Example:: + >>> import numpy as np + >>> from doctr.models import ocr_predictor + >>> model = ocr_predictor(pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - det_arch: name of the detection architecture to use (e.g. 'db_resnet50', 'db_mobilenet_v3_large') - reco_arch: name of the recognition architecture to use (e.g. 'crnn_vgg16_bn', 'sar_resnet31') + arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet') pretrained: If True, returns a model pre-trained on our OCR dataset - assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages - without rotated textual elements. - preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before - running the detection model on it. - symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right. - export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions - (potentially rotated) as straight bounding boxes. - kwargs: keyword args of `OCRPredictor` Returns: OCR predictor """ - return _predictor( - det_arch, - reco_arch, - pretrained, - assume_straight_pages=assume_straight_pages, - preserve_aspect_ratio=preserve_aspect_ratio, - symmetric_pad=symmetric_pad, - export_as_straight_boxes=export_as_straight_boxes, - **kwargs, - )
                                + return _predictor(det_arch, reco_arch, pretrained, **kwargs)
                                @@ -416,7 +354,7 @@

                                Source code for doctr.models.zoo

                                       
                                     
                                   
                                -
                                +
                                diff --git a/v0.5.1/_modules/doctr/transforms/modules/base.html b/v0.5.1/_modules/doctr/transforms/modules/base.html index a56732c9c5..e7b5ea10d9 100644 --- a/v0.5.1/_modules/doctr/transforms/modules/base.html +++ b/v0.5.1/_modules/doctr/transforms/modules/base.html @@ -226,32 +226,20 @@

                                Source code for doctr.transforms.modules.base

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021, Mindee.
                                 
                                 # This program is licensed under the Apache License version 2.
                                 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                 
                                -import math
                                 import random
                                -from typing import Any, Callable, Dict, List, Tuple
                                -
                                -import numpy as np
                                +from typing import List, Any, Callable
                                 
                                 from doctr.utils.repr import NestedObject
                                -
                                 from .. import functional as F
                                 
                                -__all__ = ['SampleCompose', 'ImageTransform', 'ColorInversion', 'OneOf', 'RandomApply', 'RandomRotate', 'RandomCrop']
                                -
                                -
                                -class SampleCompose(NestedObject):
                                -    """Implements a wrapper that will apply transformations sequentially on both image and target
                                -
                                -    .. tabs::
                                -
                                -        .. tab:: TensorFlow
                                -
                                -            .. code:: python
                                -
                                -                >>> import numpy as np
                                -                >>> import tensorflow as tf
                                -                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
                                -                >>> transfo = SampleCompose([ImageTransform(ColorInversion((32, 32))), RandomRotate(30)])
                                -                >>> out, out_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.zeros((2, 4)))
                                -
                                -        .. tab:: PyTorch
                                -
                                -            .. code:: python
                                -
                                -                >>> import numpy as np
                                -                >>> import torch
                                -                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
                                -                >>> transfos = SampleCompose([ImageTransform(ColorInversion((32, 32))), RandomRotate(30)])
                                -                >>> out, out_boxes = transfos(torch.rand(8, 64, 64, 3), np.zeros((2, 4)))
                                -
                                -    Args:
                                -        transforms: list of transformation modules
                                -    """
                                -
                                -    _children_names: List[str] = ['sample_transforms']
                                -
                                -    def __init__(self, transforms: List[Callable[[Any, Any], Tuple[Any, Any]]]) -> None:
                                -        self.sample_transforms = transforms
                                -
                                -    def __call__(self, x: Any, target: Any) -> Tuple[Any, Any]:
                                -        for t in self.sample_transforms:
                                -            x, target = t(x, target)
                                -
                                -        return x, target
                                -
                                -
                                -class ImageTransform(NestedObject):
                                -    """Implements a transform wrapper to turn an image-only transformation into an image+target transform
                                -
                                -    .. tabs::
                                -
                                -        .. tab:: TensorFlow
                                -
                                -            .. code:: python
                                -
                                -                >>> import tensorflow as tf
                                -                >>> from doctr.transforms import ImageTransform, ColorInversion
                                -                >>> transfo = ImageTransform(ColorInversion((32, 32)))
                                -                >>> out, _ = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), None)
                                -
                                -        .. tab:: PyTorch
                                -
                                -            .. code:: python
                                -
                                -                >>> import torch
                                -                >>> from doctr.transforms import ImageTransform, ColorInversion
                                -                >>> transfo = ImageTransform(ColorInversion((32, 32)))
                                -                >>> out, _ = transfo(torch.rand(8, 64, 64, 3), None)
                                -
                                -    Args:
                                -        transform: the image transformation module to wrap
                                -    """
                                 
                                -    _children_names: List[str] = ['img_transform']
                                -
                                -    def __init__(self, transform: Callable[[Any], Any]) -> None:
                                -        self.img_transform = transform
                                -
                                -    def __call__(self, img: Any, target: Any) -> Tuple[Any, Any]:
                                -        img = self.img_transform(img)
                                -        return img, target
                                +__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
                                 
                                 
                                 
                                -[docs] +[docs] class ColorInversion(NestedObject): """Applies the following tranformation to a tensor (image or batch of images): convert to grayscale, colorize (shift 0-values randomly), and then invert colors - .. tabs:: - - .. tab:: TensorFlow - - .. code:: python - - >>> import tensorflow as tf - >>> from doctr.transforms import ColorInversion - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) - - .. tab:: PyTorch - - .. code:: python - - >>> import torch - >>> from doctr.transforms import ColorInversion - >>> transfo = ColorInversion(min_val=0.6) - >>> out = transfo(torch.rand(8, 64, 64, 3)) + Example:: + >>> from doctr.transforms import Normalize + >>> import tensorflow as tf + >>> transfo = ColorInversion(min_val=0.6) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: min_val: range [min_val, 1] to colorize RGB pixels @@ -424,29 +317,15 @@

                                Source code for doctr.transforms.modules.base

                                -[docs] +[docs] class OneOf(NestedObject): """Randomly apply one of the input transformations - .. tabs:: - - .. tab:: TensorFlow - - .. code:: python - - >>> import tensorflow as tf - >>> from doctr.transforms import OneOf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - .. tab:: PyTorch - - .. code:: python - - >>> import torch - >>> from doctr.transforms import OneOf - >>> transfo = OneOf([JpegQuality(), Gamma()]) - >>> out = transfo(torch.rand(1, 64, 64, 3)) + Example:: + >>> from doctr.transforms import Normalize + >>> import tensorflow as tf + >>> transfo = OneOf([JpegQuality(), Gamma()]) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: transforms: list of transformations, one only will be picked @@ -466,29 +345,15 @@

                                Source code for doctr.transforms.modules.base

                                -[docs] +[docs] class RandomApply(NestedObject): """Apply with a probability p the input transformation - .. tabs:: - - .. tab:: TensorFlow - - .. code:: python - - >>> import tensorflow as tf - >>> from doctr.transforms import RandomApply - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - .. tab:: PyTorch - - .. code:: python - - >>> import torch - >>> from doctr.transforms import RandomApply - >>> transfo = RandomApply(Gamma(), p=.5) - >>> out = transfo(torch.rand(1, 64, 64, 3)) + Example:: + >>> from doctr.transforms import Normalize + >>> import tensorflow as tf + >>> transfo = RandomApply(Gamma(), p=.5) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: transform: transformation to apply @@ -506,68 +371,6 @@

                                Source code for doctr.transforms.modules.base

                                return self.transform(img) return img
                                - - -
                                -[docs] -class RandomRotate(NestedObject): - """Randomly rotate a tensor image and its boxes - - .. image:: https://github.com/mindee/doctr/releases/download/v0.4.0/rotation_illustration.png - :align: center - - Args: - max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in - [-max_angle, max_angle] - expand: whether the image should be padded before the rotation - """ - def __init__(self, max_angle: float = 5., expand: bool = False) -> None: - self.max_angle = max_angle - self.expand = expand - - def extra_repr(self) -> str: - return f"max_angle={self.max_angle}, expand={self.expand}" - - def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]: - angle = random.uniform(-self.max_angle, self.max_angle) - r_img, r_polys = F.rotate_sample(img, target, angle, self.expand) - # Removes deleted boxes - is_kept = (r_polys.max(1) > r_polys.min(1)).sum(1) == 2 - return r_img, r_polys[is_kept]
                                - - - -
                                -[docs] -class RandomCrop(NestedObject): - """Randomly crop a tensor image and its boxes - - Args: - scale: tuple of floats, relative (min_area, max_area) of the crop - ratio: tuple of float, relative (min_ratio, max_ratio) where ratio = h/w - """ - def __init__(self, scale: Tuple[float, float] = (0.08, 1.), ratio: Tuple[float, float] = (0.75, 1.33)) -> None: - self.scale = scale - self.ratio = ratio - - def extra_repr(self) -> str: - return f"scale={self.scale}, ratio={self.ratio}" - - def __call__(self, img: Any, target: Dict[str, np.ndarray]) -> Tuple[Any, Dict[str, np.ndarray]]: - scale = random.uniform(self.scale[0], self.scale[1]) - ratio = random.uniform(self.ratio[0], self.ratio[1]) - # Those might overflow - crop_h = math.sqrt(scale * ratio) - crop_w = math.sqrt(scale / ratio) - xmin, ymin = random.uniform(0, 1 - crop_w), random.uniform(0, 1 - crop_h) - xmax, ymax = xmin + crop_w, ymin + crop_h - # Clip them - xmin, ymin = max(xmin, 0), max(ymin, 0) - xmax, ymax = min(xmax, 1), min(ymax, 1) - - croped_img, crop_boxes = F.crop_detection(img, target["boxes"], (xmin, ymin, xmax, ymax)) - return croped_img, dict(boxes=crop_boxes)
                                -
                                @@ -600,7 +403,7 @@

                                Source code for doctr.transforms.modules.base

                                -
                                +
                                diff --git a/v0.5.1/_modules/doctr/transforms/modules/tensorflow.html b/v0.5.1/_modules/doctr/transforms/modules/tensorflow.html index 59eaa1efaa..51b31b4fc4 100644 --- a/v0.5.1/_modules/doctr/transforms/modules/tensorflow.html +++ b/v0.5.1/_modules/doctr/transforms/modules/tensorflow.html @@ -226,32 +226,20 @@

                                Source code for doctr.transforms.modules.tensorflow

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021, Mindee.
                                 
                                 # This program is licensed under the Apache License version 2.
                                 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                 
                                 import random
                                -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
                                -
                                -import numpy as np
                                 import tensorflow as tf
                                -import tensorflow_addons as tfa
                                +from typing import List, Any, Tuple, Callable
                                 
                                 from doctr.utils.repr import NestedObject
                                 
                                -from ..functional.tensorflow import random_shadow
                                 
                                 __all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'RandomBrightness',
                                -           'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality', 'GaussianBlur',
                                -           'ChannelShuffle', 'GaussianNoise', 'RandomHorizontalFlip', 'RandomShadow']
                                +           'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality']
                                 
                                 
                                 
                                -[docs] +[docs] class Compose(NestedObject): """Implements a wrapper that will apply transformations sequentially - >>> import tensorflow as tf - >>> from doctr.transforms import Compose, Resize - >>> transfos = Compose([Resize((32, 32))]) - >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + Example:: + >>> from doctr.transforms import Compose, Resize + >>> import tensorflow as tf + >>> transfos = Compose([Resize((32, 32))]) + >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: transforms: list of transformation modules @@ -336,14 +320,15 @@

                                Source code for doctr.transforms.modules.tensorflow

                                -[docs] +[docs] class Resize(NestedObject): """Resizes a tensor to a target size - >>> import tensorflow as tf - >>> from doctr.transforms import Resize - >>> transfo = Resize((32, 32)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) + Example:: + >>> from doctr.transforms import Resize + >>> import tensorflow as tf + >>> transfo = Resize((32, 32)) + >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) Args: output_size: expected output size @@ -353,7 +338,7 @@

                                Source code for doctr.transforms.modules.tensorflow

                                """ def __init__( self, - output_size: Union[int, Tuple[int, int]], + output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False, @@ -363,108 +348,66 @@

                                Source code for doctr.transforms.modules.tensorflow

                                self.preserve_aspect_ratio = preserve_aspect_ratio self.symmetric_pad = symmetric_pad - if isinstance(self.output_size, int): - self.wanted_size = (self.output_size, self.output_size) - elif isinstance(self.output_size, (tuple, list)): - self.wanted_size = self.output_size - else: - raise AssertionError("Output size should be either a list, a tuple or an int") - def extra_repr(self) -> str: _repr = f"output_size={self.output_size}, method='{self.method}'" if self.preserve_aspect_ratio: _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}" return _repr - def __call__( - self, - img: tf.Tensor, - target: Optional[np.ndarray] = None, - ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]: - - input_dtype = img.dtype - - img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio) - # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio - raw_shape = img.shape[:2] + def __call__(self, img: tf.Tensor) -> tf.Tensor: + img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio) if self.preserve_aspect_ratio: - if isinstance(self.output_size, (tuple, list)): - # In that case we need to pad because we want to enforce both width and height - if not self.symmetric_pad: - offset = (0, 0) - elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - - # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) - if target is not None: - if self.preserve_aspect_ratio: - # Get absolute coords - if target.shape[1:] == (4,): - if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: - if np.max(target) <= 1: - offset = offset[0] / img.shape[0], offset[1] / img.shape[1] - target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1] - target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0] - else: - target[:, [0, 2]] *= raw_shape[1] / img.shape[1] - target[:, [1, 3]] *= raw_shape[0] / img.shape[0] - elif target.shape[1:] == (4, 2): - if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: - if np.max(target) <= 1: - offset = offset[0] / img.shape[0], offset[1] / img.shape[1] - target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / img.shape[1] - target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0] - else: - target[..., 0] *= raw_shape[1] / img.shape[1] - target[..., 1] *= raw_shape[0] / img.shape[0] - else: - raise AssertionError - return tf.cast(img, dtype=input_dtype), target - - return tf.cast(img, dtype=input_dtype)
                                + # pad width + if not self.symmetric_pad: + offset = (0, 0) + elif self.output_size[0] == img.shape[0]: + offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) + else: + offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) + img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) + return img
                                -[docs] +[docs] class Normalize(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - >>> import tensorflow as tf - >>> from doctr.transforms import Normalize - >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + Example:: + >>> from doctr.transforms import Normalize + >>> import tensorflow as tf + >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: mean: average value per channel std: standard deviation per channel """ def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None: - self.mean = tf.constant(mean) - self.std = tf.constant(std) + self.mean = tf.constant(mean, dtype=tf.float32) + self.std = tf.constant(std, dtype=tf.float32) def extra_repr(self) -> str: return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - img -= tf.cast(self.mean, dtype=img.dtype) - img /= tf.cast(self.std, dtype=img.dtype) + img -= self.mean + img /= self.std return img
                                -[docs] +[docs] class LambdaTransformation(NestedObject): """Normalize a tensor to a Gaussian distribution for each channel - >>> import tensorflow as tf - >>> from doctr.transforms import LambdaTransformation - >>> transfo = LambdaTransformation(lambda x: x/ 255.) - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + Example:: + >>> from doctr.transforms import LambdaTransformation + >>> import tensorflow as tf + >>> transfo = LambdaTransformation(lambda x: x/ 255.) + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: fn: the function to be applied to the input tensor @@ -478,34 +421,32 @@

                                Source code for doctr.transforms.modules.tensorflow

                                -[docs] +[docs] class ToGray(NestedObject): """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor - >>> import tensorflow as tf - >>> from doctr.transforms import ToGray - >>> transfo = ToGray() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + Example:: + >>> from doctr.transforms import Normalize + >>> import tensorflow as tf + >>> transfo = ToGray() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) """ - def __init__(self, num_output_channels: int = 1): - self.num_output_channels = num_output_channels - def __call__(self, img: tf.Tensor) -> tf.Tensor: - img = tf.image.rgb_to_grayscale(img) - return img if self.num_output_channels == 1 else tf.repeat(img, self.num_output_channels, axis=-1)
                                + return tf.image.rgb_to_grayscale(img)
                                -[docs] +[docs] class RandomBrightness(NestedObject): """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels - >>> import tensorflow as tf - >>> from doctr.transforms import RandomBrightness - >>> transfo = RandomBrightness() - >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) + Example: + >>> from doctr.transforms import Normalize + >>> import tensorflow as tf + >>> transfo = Brightness() + >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1)) Args: max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta] @@ -523,15 +464,16 @@

                                Source code for doctr.transforms.modules.tensorflow

-[docs]
+[docs]
 class RandomContrast(NestedObject):
     """Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel:
     (img - mean) * contrast_factor + mean.

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomContrast
-    >>> transfo = RandomContrast()
-    >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
+    Example::
+        >>> from doctr.transforms import RandomContrast
+        >>> import tensorflow as tf
+        >>> transfo = RandomContrast()
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))

     Args:
         delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)
@@ -548,15 +490,16 @@

                                Source code for doctr.transforms.modules.tensorflow

-[docs]
+[docs]
 class RandomSaturation(NestedObject):
     """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and
     increasing saturation by a factor.

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomSaturation
-    >>> transfo = RandomSaturation()
-    >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
+    Example::
+        >>> from doctr.transforms import RandomSaturation
+        >>> import tensorflow as tf
+        >>> transfo = RandomSaturation()
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))

     Args:
         delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)
@@ -573,14 +516,15 @@

                                Source code for doctr.transforms.modules.tensorflow

-[docs]
+[docs]
 class RandomHue(NestedObject):
     """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomHue
-    >>> transfo = RandomHue()
-    >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
+    Example::
+        >>> from doctr.transforms import RandomHue
+        >>> import tensorflow as tf
+        >>> transfo = RandomHue()
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))

     Args:
         max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta]
@@ -597,14 +541,15 @@

                                Source code for doctr.transforms.modules.tensorflow

-[docs]
+[docs]
 class RandomGamma(NestedObject):
     """Randomly performs gamma correction for a tensor (batch of images or image)

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomGamma
-    >>> transfo = RandomGamma()
-    >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
+    Example::
+        >>> from doctr.transforms import RandomGamma
+        >>> import tensorflow as tf
+        >>> transfo = RandomGamma()
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))

     Args:
         min_gamma: non-negative real number, lower bound for gamma param
@@ -636,14 +581,15 @@

                                Source code for doctr.transforms.modules.tensorflow

-[docs]
+[docs]
 class RandomJpegQuality(NestedObject):
     """Randomly adjust jpeg quality of a 3 dimensional RGB image

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomJpegQuality
-    >>> transfo = RandomJpegQuality()
-    >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
+    Example::
+        >>> from doctr.transforms import RandomJpegQuality
+        >>> import tensorflow as tf
+        >>> transfo = RandomJpegQuality()
+        >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))

     Args:
         min_quality: int between [0, 100]
@@ -661,164 +607,6 @@
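These photometric transforms are typically chained before training; a usage sketch assuming the Compose wrapper exported by doctr.transforms (parameter values here are purely illustrative):

>>> import tensorflow as tf
>>> from doctr.transforms import Compose, RandomBrightness, RandomContrast, RandomJpegQuality
>>> augment = Compose([
...     RandomBrightness(max_delta=0.3),
...     RandomContrast(delta=0.2),
...     RandomJpegQuality(min_quality=60),
... ])
>>> out = augment(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))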

                                Source code for doctr.transforms.modules.tensorflow

                                img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality )
                                - - -
                                -[docs] -class GaussianBlur(NestedObject): - """Randomly adjust jpeg quality of a 3 dimensional RGB image - - >>> import tensorflow as tf - >>> from doctr.transforms import GaussianBlur - >>> transfo = GaussianBlur(3, (.1, 5)) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - kernel_shape: size of the blurring kernel - std: min and max value of the standard deviation - """ - def __init__(self, kernel_shape: Union[int, Iterable[int]], std: Tuple[float, float]) -> None: - self.kernel_shape = kernel_shape - self.std = std - - def extra_repr(self) -> str: - return f"kernel_shape={self.kernel_shape}, std={self.std}" - - @tf.function - def __call__(self, img: tf.Tensor) -> tf.Tensor: - sigma = random.uniform(self.std[0], self.std[1]) - return tfa.image.gaussian_filter2d( - img, filter_shape=self.kernel_shape, sigma=sigma, - )
                                - - - -
                                -[docs] -class ChannelShuffle(NestedObject): - """Randomly shuffle channel order of a given image""" - - def __init__(self): - pass - - def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.transpose(tf.random.shuffle(tf.transpose(img, perm=[2, 0, 1])), perm=[1, 2, 0])
                                - - - -
                                -[docs] -class GaussianNoise(NestedObject): - """Adds Gaussian Noise to the input tensor - - >>> import tensorflow as tf - >>> from doctr.transforms import GaussianNoise - >>> transfo = GaussianNoise(0., 1.) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - mean : mean of the gaussian distribution - std : std of the gaussian distribution - """ - def __init__(self, mean: float = 0., std: float = 1.) -> None: - super().__init__() - self.std = std - self.mean = mean - - def __call__(self, x: tf.Tensor) -> tf.Tensor: - # Reshape the distribution - noise = self.mean + 2 * self.std * tf.random.uniform(x.shape) - self.std - if x.dtype == tf.uint8: - return tf.cast( - tf.clip_by_value(tf.math.round(tf.cast(x, dtype=tf.float32) + 255 * noise), 0, 255), - dtype=tf.uint8 - ) - else: - return tf.cast(tf.clip_by_value(x + noise, 0, 1), dtype=x.dtype) - - def extra_repr(self) -> str: - return f"mean={self.mean}, std={self.std}"
                                - - - -
                                -[docs] -class RandomHorizontalFlip(NestedObject): - """Adds random horizontal flip to the input tensor/np.ndarray - - >>> import tensorflow as tf - >>> from doctr.transforms import RandomHorizontalFlip - >>> transfo = RandomHorizontalFlip(p=0.5) - >>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1) - >>> target = { - >>> "boxes": np.array([[0.1, 0.1, 0.4, 0.5] ], dtype= np.float32), - >>> "labels": np.ones(1, dtype= np.int64) - >>> } - >>> out = transfo(image, target) - - Args: - p : probability of Horizontal Flip - """ - def __init__(self, p: float) -> None: - super().__init__() - self.p = p - - def __call__( - self, - img: Union[tf.Tensor, np.ndarray], - target: Dict[str, Any] - ) -> Tuple[tf.Tensor, Dict[str, Any]]: - """ - Args: - img: Image to be flipped. - target: Dictionary with boxes (in relative coordinates of shape (N, 4)) and labels as keys - Returns: - Tuple of numpy nd-array or Tensor and target - """ - if np.random.rand(1) <= self.p: - _img = tf.image.flip_left_right(img) - _target = target.copy() - # Changing the relative bbox coordinates - _target["boxes"][:, ::2] = 1 - target["boxes"][:, [2, 0]] - return _img, _target - return img, target
                                - - - -
                                -[docs] -class RandomShadow(NestedObject): - """Adds random shade to the input image - - >>> import tensorflow as tf - >>> from doctr.transforms import RandomShadow - >>> transfo = RandomShadow(0., 1.) - >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)) - - Args: - opacity_range : minimum and maximum opacity of the shade - """ - def __init__(self, opacity_range: Tuple[float, float] = None) -> None: - super().__init__() - self.opacity_range = opacity_range if isinstance(opacity_range, tuple) else (.2, .8) - - def __call__(self, x: tf.Tensor) -> tf.Tensor: - # Reshape the distribution - if x.dtype == tf.uint8: - return tf.cast( - tf.clip_by_value( - tf.math.round(255 * random_shadow(tf.cast(x, dtype=tf.float32) / 255, self.opacity_range)), - 0, - 255, - ), - dtype=tf.uint8 - ) - else: - return tf.clip_by_value(random_shadow(x, self.opacity_range), 0, 1) - - def extra_repr(self) -> str: - return f"opacity_range={self.opacity_range}"
                                -
                                @@ -851,7 +639,7 @@

                                Source code for doctr.transforms.modules.tensorflow

                                +
diff --git a/v0.5.1/_modules/doctr/utils/metrics.html b/v0.5.1/_modules/doctr/utils/metrics.html
index 2fc3ce92e0..20af9416ea 100644
--- a/v0.5.1/_modules/doctr/utils/metrics.html
+++ b/v0.5.1/_modules/doctr/utils/metrics.html
@@ -226,32 +226,20 @@

                                Source code for doctr.utils.metrics

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021, Mindee.
                                 
                                 # This program is licensed under the Apache License version 2.
                                 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                 
                                -from typing import Dict, List, Optional, Tuple
                                -
                                -import cv2
                                 import numpy as np
                                -from scipy.optimize import linear_sum_assignment
                                +import cv2
                                +from typing import List, Tuple, Dict, Optional
                                 from unidecode import unidecode
                                +from scipy.optimize import linear_sum_assignment
                                +from doctr.utils.geometry import rbbox_to_polygon
                                 
                                -__all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'polygon_iou',
                                -           'nms', 'LocalizationConfusion', 'OCRMetric', 'DetectionMetric']
                                +__all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'rbox_to_mask',
                                +           'nms', 'LocalizationConfusion', 'OCRMetric']
                                 
                                 
                                 def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
                                -    """Performs string comparison with multiple levels of tolerance
                                +    """Perform string comparison with multiple levels of tolerance
                                 
                                     Args:
                                         word1: a string
                                @@ -325,41 +313,40 @@ 

                                Source code for doctr.utils.metrics

                                 
                                 
                                 
                                -[docs] +[docs] class TextMatch: - r"""Implements text match metric (word-level accuracy) for recognition task. + """Implements text match metric (word-level accuracy) for recognition task. The raw aggregated metric is computed as follows: .. math:: - \forall X, Y \in \mathcal{W}^N, - TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i) + \\forall X, Y \\in \\mathcal{W}^N, + TextMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i) with the indicator function :math:`f_{a}` defined as: .. math:: - \forall a, x \in \mathcal{W}, - f_a(x) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } x = a \\ - 0 & \mbox{otherwise.} - \end{array} - \right. - - where :math:`\mathcal{W}` is the set of all possible character sequences, + \\forall a, x \\in \\mathcal{W}, + f_a(x) = \\left\\{ + \\begin{array}{ll} + 1 & \\mbox{if } x = a \\\\ + 0 & \\mbox{otherwise.} + \\end{array} + \\right. + + where :math:`\\mathcal{W}` is the set of all possible character sequences, :math:`N` is a strictly positive integer. - >>> from doctr.utils import TextMatch - >>> metric = TextMatch() - >>> metric.update(['Hello', 'world'], ['hello', 'world']) - >>> metric.summary() + Example:: + >>> from doctr.utils import TextMatch + >>> metric = TextMatch() + >>> metric.update(['Hello', 'world'], ['hello', 'world']) + >>> metric.summary() """ def __init__(self) -> None: self.reset() -
-[docs]
     def update(
         self,
         gt: List[str],
@@ -369,8 +356,7 @@

                                Source code for doctr.utils.metrics

                                 
                                         Args:
             gt: list of ground-truth character sequences
                                -            pred: list of predicted character sequences
                                -        """
                                +            pred: list of predicted character sequences"""
                                 
                                         if len(gt) != len(pred):
                                             raise AssertionError("prediction size does not match with ground-truth labels size")
                                @@ -382,11 +368,10 @@ 

                                Source code for doctr.utils.metrics

                                             self.unidecode += int(_unidecode)
                                             self.unicase += int(_unicase)
                                 
                                -        self.total += len(gt)
-
+        self.total += len(gt)
-[docs]
+[docs]
     def summary(self) -> Dict[str, float]:
         """Computes the aggregated metrics

@@ -415,12 +400,11 @@
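A worked example of the word-level accuracies TextMatch reports (the exact key names of the summary dict are assumptions here):

>>> from doctr.utils import TextMatch
>>> metric = TextMatch()
>>> metric.update(['Hello', 'world'], ['hello', 'world'])
>>> res = metric.summary()
>>> res['raw'], res['caseless']  # exact match fails on 'Hello' vs 'hello', case-insensitive match succeeds
(0.5, 1.0)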

                                Source code for doctr.utils.metrics

                                 
                                 
                                 def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
                                -    """Computes the IoU between two sets of bounding boxes
                                +    """Compute the IoU between two sets of bounding boxes
                                 
                                     Args:
                                         boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
                                         boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
                                -
                                     Returns:
                                         the IoU matrix of shape (N, M)
                                     """
                                @@ -444,13 +428,12 @@ 

                                Source code for doctr.utils.metrics

                                 
                                 
                                 def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
                                -    """Computes the IoA (intersection over area) between two sets of bounding boxes:
                                +    """Compute the IoA (intersection over area) between two sets of bounding boxes:
                                     ioa(i, j) = inter(i, j) / area(i)
                                 
                                     Args:
                                         boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
                                         boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
                                -
                                     Returns:
                                         the IoA matrix of shape (N, M)
                                     """
                                @@ -474,7 +457,7 @@ 

                                Source code for doctr.utils.metrics

                                 
                                 
                                 def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
                                -    """Computes the IoU between two sets of boolean masks
                                +    """Compute the IoU between two sets of boolean masks
                                 
                                     Args:
                                         masks_1: boolean masks of shape (N, H, W)
                                @@ -490,84 +473,19 @@ 

                                Source code for doctr.utils.metrics

                                     iou_mat = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
                                 
                                     if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
                                +        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...])
                                +        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...])
                                         axes = tuple(range(2, masks_1.ndim + 1))
                                -        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
                                -        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
                                -        iou_mat = intersection / union
                                -
                                -    return iou_mat
                                -
                                -
                                -def polygon_iou(
                                -    polys_1: np.ndarray,
                                -    polys_2: np.ndarray,
                                -    mask_shape: Tuple[int, int],
                                -    use_broadcasting: bool = False
                                -) -> np.ndarray:
                                -    """Computes the IoU between two sets of rotated bounding boxes
                                -
                                -    Args:
                                -        polys_1: rotated bounding boxes of shape (N, 4, 2)
                                -        polys_2: rotated bounding boxes of shape (M, 4, 2)
                                -        mask_shape: spatial shape of the intermediate masks
                                -        use_broadcasting: if set to True, leverage broadcasting speedup by consuming more memory
                                -
                                -    Returns:
                                -        the IoU matrix of shape (N, M)
                                -    """
                                -
                                -    if polys_1.ndim != 3 or polys_2.ndim != 3:
                                -        raise AssertionError("expects boxes to be in format (N, 4, 2)")
                                -
                                -    iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
                                -
                                -    if polys_1.shape[0] > 0 and polys_2.shape[0] > 0:
                                -        if use_broadcasting:
                                -            masks_1 = rbox_to_mask(polys_1, shape=mask_shape)
                                -            masks_2 = rbox_to_mask(polys_2, shape=mask_shape)
                                -            iou_mat = mask_iou(masks_1, masks_2)
                                -        else:
                                -            # Save memory by doing the computation for each pair
                                -            for idx, b1 in enumerate(polys_1):
                                -                m1 = _rbox_to_mask(b1, mask_shape)
                                -                for _idx, b2 in enumerate(polys_2):
                                -                    m2 = _rbox_to_mask(b2, mask_shape)
                                -                    iou_mat[idx, _idx] = np.logical_and(m1, m2).sum() / np.logical_or(m1, m2).sum()
                                +        iou_mat = intersection.sum(axis=axes) / union.sum(axis=axes)
                                 
                                     return iou_mat
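A quick sanity check of mask_iou on two overlapping boolean masks (NumPy only, values illustrative):

>>> import numpy as np
>>> from doctr.utils.metrics import mask_iou
>>> m1 = np.zeros((1, 4, 4), dtype=bool); m1[0, :2, :2] = True
>>> m2 = np.zeros((1, 4, 4), dtype=bool); m2[0, 1:3, 1:3] = True
>>> round(float(mask_iou(m1, m2)[0, 0]), 3)  # 1 shared pixel over a 7-pixel union
0.143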
                                 
                                 
                                -def _rbox_to_mask(box: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
                                -    """Converts a rotated bounding box to a boolean mask
                                -
                                -    Args:
                                -        box: rotated bounding box of shape (4, 2)
                                -        shape: spatial shapes of the output masks
                                -
                                -    Returns:
                                -        the boolean mask of the specified shape
                                -    """
                                -
                                -    mask = np.zeros(shape, dtype=np.uint8)
                                -    # Get absolute coords
                                -    if box.dtype != int:
                                -        abs_box = box.copy()
                                -        abs_box[:, 0] = abs_box[:, 0] * shape[1]
                                -        abs_box[:, 1] = abs_box[:, 1] * shape[0]
                                -        abs_box = abs_box.round().astype(int)
                                -    else:
                                -        abs_box = box
                                -        abs_box[2:] = abs_box[2:] + 1
                                -    cv2.fillPoly(mask, [abs_box - 1], 1)
                                -
                                -    return mask.astype(bool)
                                -
                                -
                                 def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
                                -    """Converts rotated bounding boxes to boolean masks
                                +    """Convert boxes to masks
                                 
                                     Args:
                                -        boxes: rotated bounding boxes of shape (N, 4, 2)
                                +        boxes: rotated bounding boxes of shape (N, 5) in format (x, y, w, h, alpha)
                                         shape: spatial shapes of the output masks
                                 
                                     Returns:
                                @@ -580,8 +498,8 @@ 

                                Source code for doctr.utils.metrics

                                         # Get absolute coordinates
                                         if boxes.dtype != np.int:
                                             abs_boxes = boxes.copy()
                                -            abs_boxes[:, :, 0] = abs_boxes[:, :, 0] * shape[1]
                                -            abs_boxes[:, :, 1] = abs_boxes[:, :, 1] * shape[0]
                                +            abs_boxes[:, [0, 2]] = abs_boxes[:, [0, 2]] * shape[1]
                                +            abs_boxes[:, [1, 3]] = abs_boxes[:, [1, 3]] * shape[0]
                                             abs_boxes = abs_boxes.round().astype(np.int)
                                         else:
                                             abs_boxes = boxes
                                @@ -589,7 +507,9 @@ 

                                Source code for doctr.utils.metrics

                                 
                                         # TODO: optimize slicing to improve vectorization
                                         for idx, _box in enumerate(abs_boxes):
                                -            cv2.fillPoly(masks[idx], [_box - 1], 1)
                                +            box = rbbox_to_polygon(_box)
                                +            cv2.fillPoly(masks[idx], [np.array(box, np.int32)], 1)
                                +
                                     return masks.astype(bool)
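A usage sketch for this (x, y, w, h, alpha) variant of rbox_to_mask, assuming relative coordinates and a NumPy version that still exposes np.int:

>>> import numpy as np
>>> from doctr.utils.metrics import rbox_to_mask
>>> boxes = np.array([[0.5, 0.5, 0.5, 0.25, 0.]], dtype=np.float32)  # (x, y, w, h, alpha)
>>> masks = rbox_to_mask(boxes, shape=(64, 64))
>>> masks.shape, masks.dtype
((1, 64, 64), dtype('bool'))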
                                 
                                 
                                @@ -632,76 +552,66 @@ 

                                Source code for doctr.utils.metrics

                                 
                                 
                                 
                                -[docs] +[docs] class LocalizationConfusion: - r"""Implements common confusion metrics and mean IoU for localization evaluation. + """Implements common confusion metrics and mean IoU for localization evaluation. The aggregated metrics are computed as follows: .. math:: - \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ - Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ - Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ - meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j) + \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\ + Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ + Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\ + meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`g_{X}` defined as: .. math:: - \forall y \in \mathcal{B}, - g_X(y) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\ - 0 & \mbox{otherwise.} - \end{array} - \right. - - where :math:`\mathcal{B}` is the set of possible bounding boxes, + \\forall y \\in \\mathcal{B}, + g_X(y) = \\left\\{ + \\begin{array}{ll} + 1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\ + 0 & \\mbox{otherwise.} + \\end{array} + \\right. + + where :math:`\\mathcal{B}` is the set of possible bounding boxes, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - >>> import numpy as np - >>> from doctr.utils import LocalizationConfusion - >>> metric = LocalizationConfusion(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) - >>> metric.summary() + Example:: + >>> import numpy as np + >>> from doctr.utils import LocalizationConfusion + >>> metric = LocalizationConfusion(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])) + >>> metric.summary() Args: iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match - use_polygons: if set to True, predictions and targets will be expected to have rotated format - mask_shape: if use_polygons is True, describes the spatial shape of the image used - use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory """ def __init__( self, iou_thresh: float = 0.5, - use_polygons: bool = False, + rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), - use_broadcasting: bool = True, ) -> None: self.iou_thresh = iou_thresh - self.use_polygons = use_polygons + self.rotated_bbox = rotated_bbox self.mask_shape = mask_shape - self.use_broadcasting = use_broadcasting self.reset() -
                                -[docs] def update(self, gts: np.ndarray, preds: np.ndarray) -> None: - """Updates the metric - - Args: - gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones - preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones - """ if preds.shape[0] > 0: # Compute IoU - if self.use_polygons: - iou_mat = polygon_iou(gts, preds, self.mask_shape, self.use_broadcasting) + if self.rotated_bbox: + mask_gts = rbox_to_mask(gts, shape=self.mask_shape) + mask_preds = rbox_to_mask(preds, shape=self.mask_shape) + iou_mat = mask_iou(mask_gts, mask_preds) else: iou_mat = box_iou(gts, preds) - self.tot_iou += float(iou_mat.max(axis=0).sum()) + self.tot_iou += float(iou_mat.max(axis=1).sum()) # Assign pairs gt_indices, pred_indices = linear_sum_assignment(-iou_mat) @@ -709,11 +619,10 @@

                                Source code for doctr.utils.metrics

                                 
                                         # Update counts
                                         self.num_gts += gts.shape[0]
                                -        self.num_preds += preds.shape[0]
-
+        self.num_preds += preds.shape[0]
-[docs]
+[docs]
     def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]:
         """Computes the aggregated metrics

@@ -742,65 +651,59 @@
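A short end-to-end sketch of the recall/precision bookkeeping above (box coordinates are illustrative, straight boxes in (xmin, ymin, xmax, ymax) format):

>>> import numpy as np
>>> from doctr.utils import LocalizationConfusion
>>> metric = LocalizationConfusion(iou_thresh=0.5)
>>> metric.update(np.array([[0., 0., .5, .5]]), np.array([[0., 0., .5, .5], [.6, .6, .7, .7]]))
>>> recall, precision, mean_iou = metric.summary()
>>> recall, precision  # the single ground-truth box is matched by one of the two predictions
(1.0, 0.5)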

                                Source code for doctr.utils.metrics

                                 
                                 
                                 
                                -[docs] +[docs] class OCRMetric: - r"""Implements an end-to-end OCR metric. + """Implements end-to-end OCR metric. The aggregated metrics are computed as follows: .. math:: - \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, - \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ - Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ - Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ - meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) + \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N, + \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\ + Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ + Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\ + meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j) with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and :math:`y`, and the function :math:`h_{B, L}` defined as: .. math:: - \forall (b, l) \in \mathcal{B} \times \mathcal{L}, - h_{B,L}(b, l) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\ - 0 & \mbox{otherwise.} - \end{array} - \right. - - where :math:`\mathcal{B}` is the set of possible bounding boxes, - :math:`\mathcal{L}` is the set of possible character sequences, + \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L}, + h_{B,L}(b, l) = \\left\\{ + \\begin{array}{ll} + 1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\ + & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\ + 0 & \\mbox{otherwise.} + \\end{array} + \\right. + + where :math:`\\mathcal{B}` is the set of possible bounding boxes, + :math:`\\mathcal{L}` is the set of possible character sequences, :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. 
- >>> import numpy as np - >>> from doctr.utils import OCRMetric - >>> metric = OCRMetric(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), - >>> ['hello'], ['hello', 'world']) - >>> metric.summary() + Example:: + >>> import numpy as np + >>> from doctr.utils import OCRMetric + >>> metric = OCRMetric(iou_thresh=0.5) + >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), + ['hello'], ['hello', 'world']) + >>> metric.summary() Args: iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match - use_polygons: if set to True, predictions and targets will be expected to have rotated format - mask_shape: if use_polygons is True, describes the spatial shape of the image used - use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory """ def __init__( self, iou_thresh: float = 0.5, - use_polygons: bool = False, + rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), - use_broadcasting: bool = True, ) -> None: self.iou_thresh = iou_thresh - self.use_polygons = use_polygons + self.rotated_bbox = rotated_bbox self.mask_shape = mask_shape - self.use_broadcasting = use_broadcasting self.reset() -
-[docs]
     def update(
         self,
         gt_boxes: np.ndarray,
@@ -808,14 +711,6 @@

                                Source code for doctr.utils.metrics

                                         gt_labels: List[str],
                                         pred_labels: List[str],
                                     ) -> None:
                                -        """Updates the metric
                                -
                                -        Args:
                                -            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
                                -            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
                                -            gt_labels: a list of N string labels
                                -            pred_labels: a list of M string labels
                                -        """
                                 
                                         if gt_boxes.shape[0] != len(gt_labels) or pred_boxes.shape[0] != len(pred_labels):
                                             raise AssertionError("there should be the same number of boxes and string both for the ground truth "
                                @@ -823,12 +718,14 @@ 

                                Source code for doctr.utils.metrics

                                 
                                         # Compute IoU
                                         if pred_boxes.shape[0] > 0:
                                -            if self.use_polygons:
                                -                iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
                                +            if self.rotated_bbox:
                                +                mask_gts = rbox_to_mask(gt_boxes, shape=self.mask_shape)
                                +                mask_preds = rbox_to_mask(pred_boxes, shape=self.mask_shape)
                                +                iou_mat = mask_iou(mask_gts, mask_preds)
                                             else:
                                                 iou_mat = box_iou(gt_boxes, pred_boxes)
                                 
                                -            self.tot_iou += float(iou_mat.max(axis=0).sum())
                                +            self.tot_iou += float(iou_mat.max(axis=1).sum())
                                 
                                             # Assign pairs
                                             gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
                                @@ -842,16 +739,15 @@ 

                                Source code for doctr.utils.metrics

                                                 self.unicase_matches += int(_unicase)
                                 
                                         self.num_gts += gt_boxes.shape[0]
                                -        self.num_preds += pred_boxes.shape[0]
-
+        self.num_preds += pred_boxes.shape[0]
-[docs]
+[docs]
     def summary(self) -> Tuple[Dict[str, Optional[float]], Dict[str, Optional[float]], Optional[float]]:
         """Computes the aggregated metrics

         Returns:
-            a tuple with the recall & precision for each string comparison and the mean IoU
+            a tuple with the recall & precision for each string comparison flexibility and the mean IoU
         """

         # Recall
@@ -885,134 +781,6 @@
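And the OCR counterpart, combining localization and string comparison (boxes and labels taken from the docstring example above; the unpacking of summary() into two dicts and a float is assumed from its signature):

>>> import numpy as np
>>> from doctr.utils import OCRMetric
>>> metric = OCRMetric(iou_thresh=0.5)
>>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
...               ['hello'], ['hello', 'world'])
>>> recalls, precisions, mean_iou = metric.summary()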

                                Source code for doctr.utils.metrics

                                         self.unidecode_matches = 0
                                         self.unicase_matches = 0
                                - - -
                                -[docs] -class DetectionMetric: - r"""Implements an object detection metric. - - The aggregated metrics are computed as follows: - - .. math:: - \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, - \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ - Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ - Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ - meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j) - - with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and - :math:`y`, and the function :math:`h_{B, C}` defined as: - - .. math:: - \forall (b, c) \in \mathcal{B} \times \mathcal{C}, - h_{B,C}(b, c) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ - 0 & \mbox{otherwise.} - \end{array} - \right. - - where :math:`\mathcal{B}` is the set of possible bounding boxes, - :math:`\mathcal{C}` is the set of possible class indices, - :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers. - - >>> import numpy as np - >>> from doctr.utils import DetectionMetric - >>> metric = DetectionMetric(iou_thresh=0.5) - >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), - >>> np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64)) - >>> metric.summary() - - Args: - iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match - use_polygons: if set to True, predictions and targets will be expected to have rotated format - mask_shape: if use_polygons is True, describes the spatial shape of the image used - use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory - """ - - def __init__( - self, - iou_thresh: float = 0.5, - use_polygons: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), - use_broadcasting: bool = True, - ) -> None: - self.iou_thresh = iou_thresh - self.use_polygons = use_polygons - self.mask_shape = mask_shape - self.use_broadcasting = use_broadcasting - self.reset() - -
                                -[docs] - def update( - self, - gt_boxes: np.ndarray, - pred_boxes: np.ndarray, - gt_labels: np.ndarray, - pred_labels: np.ndarray, - ) -> None: - """Updates the metric - - Args: - gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones - pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones - gt_labels: an array of class indices of shape (N,) - pred_labels: an array of class indices of shape (M,) - """ - - if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]: - raise AssertionError("there should be the same number of boxes and string both for the ground truth " - "and the predictions") - - # Compute IoU - if pred_boxes.shape[0] > 0: - if self.use_polygons: - iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting) - else: - iou_mat = box_iou(gt_boxes, pred_boxes) - - self.tot_iou += float(iou_mat.max(axis=0).sum()) - - # Assign pairs - gt_indices, pred_indices = linear_sum_assignment(-iou_mat) - is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh - # Category comparison - self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum()) - - self.num_gts += gt_boxes.shape[0] - self.num_preds += pred_boxes.shape[0]
                                - - -
                                -[docs] - def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: - """Computes the aggregated metrics - - Returns: - a tuple with the recall & precision for each class prediction and the mean IoU - """ - - # Recall - recall = self.num_matches / self.num_gts if self.num_gts > 0 else None - - # Precision - precision = self.num_matches / self.num_preds if self.num_preds > 0 else None - - # mean IoU (overall detected boxes) - mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None - - return recall, precision, mean_iou
                                - - - def reset(self) -> None: - self.num_gts = 0 - self.num_preds = 0 - self.tot_iou = 0. - self.num_matches = 0
                                -
                                @@ -1045,7 +813,7 @@

                                Source code for doctr.utils.metrics

                                       
                                     
                                   
-
+
diff --git a/v0.5.1/_modules/doctr/utils/visualization.html b/v0.5.1/_modules/doctr/utils/visualization.html
index 8e1fbb5732..21743f6182 100644
--- a/v0.5.1/_modules/doctr/utils/visualization.html
+++ b/v0.5.1/_modules/doctr/utils/visualization.html
@@ -226,32 +226,20 @@

                                Source code for doctr.utils.visualization

                                -# Copyright (C) 2021-2022, Mindee.
                                +# Copyright (C) 2021, Mindee.
                                 
                                 # This program is licensed under the Apache License version 2.
                                 # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                 
                                -from copy import deepcopy
                                -from typing import Any, Dict, List, Optional, Tuple, Union
                                -
                                -import cv2
                                -import matplotlib.patches as patches
                                 import matplotlib.pyplot as plt
                                +from matplotlib.figure import Figure
                                +import matplotlib.patches as patches
                                 import mplcursors
                                +from PIL import ImageFont, ImageDraw, Image
                                 import numpy as np
                                -from matplotlib.figure import Figure
                                -from PIL import Image, ImageDraw
                                -from unidecode import unidecode
                                +import cv2
                                +from typing import Tuple, List, Dict, Any, Union
                                 
                                -from .common_types import BoundingBox, Polygon4P
                                -from .fonts import get_font
                                +from .common_types import BoundingBox, RotatedBbox
                                 
                                -__all__ = ['visualize_page', 'synthesize_page', 'draw_boxes']
                                +__all__ = ['visualize_page', 'synthetize_page']
                                 
                                 
                                -def rect_patch(
                                -    geometry: BoundingBox,
                                +def create_rect_patch(
                                +    geometry: Union[BoundingBox, RotatedBbox],
                                +    label: str,
                                     page_dimensions: Tuple[int, int],
                                -    label: Optional[str] = None,
                                -    color: Tuple[float, float, float] = (0, 0, 0),
                                +    color: Tuple[int, int, int],
                                     alpha: float = 0.3,
                                     linewidth: int = 2,
                                     fill: bool = True,
                                -    preserve_aspect_ratio: bool = False
                                -) -> patches.Rectangle:
                                -    """Create a matplotlib rectangular patch for the element
                                +) -> patches.Patch:
                                +    """Create a matplotlib patch (rectangle) bounding the element
                                 
                                     Args:
                                         geometry: bounding box of the element
                                -        page_dimensions: dimensions of the Page in format (height, width)
                                         label: label to display when hovered
                                +        page_dimensions: dimensions of the Page
                                         color: color to draw box
                                         alpha: opacity parameter to fill the boxes, 0 = transparent
                                         linewidth: line width
                                -        fill: whether the patch should be filled
                                -        preserve_aspect_ratio: pass True if you passed True to the predictor
                                 
                                     Returns:
                                         a rectangular Patch
                                     """
                                -
                                -    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
                                -        raise ValueError("invalid geometry format")
                                -
                                -    # Unpack
                                     height, width = page_dimensions
                                -    (xmin, ymin), (xmax, ymax) = geometry
                                -    # Switch to absolute coords
                                -    if preserve_aspect_ratio:
                                -        width = height = max(height, width)
                                -    xmin, w = xmin * width, (xmax - xmin) * width
                                -    ymin, h = ymin * height, (ymax - ymin) * height
                                -
                                -    return patches.Rectangle(
                                -        (xmin, ymin),
                                -        w,
                                -        h,
                                -        fill=fill,
                                -        linewidth=linewidth,
                                -        edgecolor=(*color, alpha),
                                -        facecolor=(*color, alpha),
                                -        label=label,
                                -    )
                                -
                                -
                                -def polygon_patch(
                                -    geometry: np.ndarray,
                                -    page_dimensions: Tuple[int, int],
                                -    label: Optional[str] = None,
                                -    color: Tuple[float, float, float] = (0, 0, 0),
                                -    alpha: float = 0.3,
                                -    linewidth: int = 2,
                                -    fill: bool = True,
                                -    preserve_aspect_ratio: bool = False
                                -) -> patches.Polygon:
                                -    """Create a matplotlib polygon patch for the element
                                -
                                -    Args:
                                -        geometry: bounding box of the element
                                -        page_dimensions: dimensions of the Page in format (height, width)
                                -        label: label to display when hovered
                                -        color: color to draw box
                                -        alpha: opacity parameter to fill the boxes, 0 = transparent
                                -        linewidth: line width
                                -        fill: whether the patch should be filled
                                -        preserve_aspect_ratio: pass True if you passed True to the predictor
                                -
                                -    Returns:
                                -        a polygon Patch
                                -    """
                                -
                                -    if not geometry.shape == (4, 2):
                                -        raise ValueError("invalid geometry format")
                                -
                                -    # Unpack
                                -    height, width = page_dimensions
                                -    geometry[:, 0] = geometry[:, 0] * (max(width, height) if preserve_aspect_ratio else width)
                                -    geometry[:, 1] = geometry[:, 1] * (max(width, height) if preserve_aspect_ratio else height)
                                -
                                -    return patches.Polygon(
                                -        geometry,
                                -        fill=fill,
                                -        linewidth=linewidth,
                                -        edgecolor=(*color, alpha),
                                -        facecolor=(*color, alpha),
                                -        label=label,
                                -    )
                                -
                                -
                                -def create_obj_patch(
                                -    geometry: Union[BoundingBox, Polygon4P, np.ndarray],
                                -    page_dimensions: Tuple[int, int],
                                -    **kwargs: Any,
                                -) -> patches.Patch:
                                -    """Create a matplotlib patch for the element
                                -
                                -    Args:
                                -        geometry: bounding box (straight or rotated) of the element
                                -        page_dimensions: dimensions of the page in format (height, width)
                                -
                                -    Returns:
                                -        a matplotlib Patch
                                -    """
                                -    if isinstance(geometry, tuple):
                                -        if len(geometry) == 2:  # straight word BB (2 pts)
                                -            return rect_patch(geometry, page_dimensions, **kwargs)  # type: ignore[arg-type]
                                -        elif len(geometry) == 4:  # rotated word BB (4 pts)
                                -            return polygon_patch(np.asarray(geometry), page_dimensions, **kwargs)  # type: ignore[arg-type]
                                -    elif isinstance(geometry, np.ndarray) and geometry.shape == (4, 2):  # rotated line
                                -        return polygon_patch(geometry, page_dimensions, **kwargs)  # type: ignore[arg-type]
                                -    raise ValueError("invalid geometry format")
                                +    if len(geometry) == 5:
                                +        x, y, w, h, a = geometry  # type: ignore[misc]
                                +        x, w = x * width, w * width
                                +        y, h = y * height, h * height
                                +        points = cv2.boxPoints(((x, y), (w, h), a))
                                +        return patches.Polygon(
                                +            points,
                                +            fill=fill,
                                +            linewidth=linewidth,
                                +            edgecolor=(*color, alpha),
                                +            facecolor=(*color, alpha),
                                +            label=label
                                +        )
                                +    else:
                                +        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
                                +        xmin, xmax = xmin * width, xmax * width
                                +        ymin, ymax = ymin * height, ymax * height
                                +        return patches.Rectangle(
                                +            (xmin, ymin),
                                +            xmax - xmin,
                                +            ymax - ymin,
                                +            fill=fill,
                                +            linewidth=linewidth,
                                +            edgecolor=(*color, alpha),
                                +            facecolor=(*color, alpha),
                                +            label=label
                                +        )
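A minimal sketch of how this helper is called for a straight (non-rotated) word box, assuming relative coordinates and (height, width) page dimensions:

>>> from doctr.utils.visualization import create_rect_patch
>>> patch = create_rect_patch(((0.1, 0.1), (0.4, 0.3)), 'word', (600, 800), (0, 0, 1))
>>> # the relative box is turned into an absolute matplotlib Rectangle on the 800x600 page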
                                 
                                 
                                 
-[docs]
+[docs]
 def visualize_page(
     page: Dict[str, Any],
     image: np.ndarray,
@@ -442,15 +360,16 @@

                                Source code for doctr.utils.visualization

                                 ) -> Figure:
                                     """Visualize a full page with predicted blocks, lines and words
                                 
                                -    >>> import numpy as np
                                -    >>> import matplotlib.pyplot as plt
                                -    >>> from doctr.utils.visualization import visualize_page
                                -    >>> from doctr.models import ocr_db_crnn
                                -    >>> model = ocr_db_crnn(pretrained=True)
                                -    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
                                -    >>> out = model([[input_page]])
                                -    >>> visualize_page(out[0].pages[0].export(), input_page)
                                -    >>> plt.show()
                                +    Example::
                                +        >>> import numpy as np
                                +        >>> import matplotlib.pyplot as plt
                                +        >>> from doctr.utils.visualization import visualize_page
                                +        >>> from doctr.models import ocr_db_crnn
                                +        >>> model = ocr_db_crnn(pretrained=True)
                                +        >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
                                +        >>> out = model([[input_page]])
                                +        >>> visualize_page(out[0].pages[0].export(), input_page)
                                +        >>> plt.show()
                                 
                                     Args:
                                         page: the exported Page of a Document
                                @@ -475,8 +394,7 @@ 

                                Source code for doctr.utils.visualization

                                 
                                     for block in page['blocks']:
                                         if not words_only:
                                -            rect = create_obj_patch(block['geometry'], page['dimensions'],
                                -                                    label='block', color=(0, 1, 0), linewidth=1, **kwargs)
                                +            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
                                             # add patch on figure
                                             ax.add_patch(rect)
                                             if interactive:
@@ -485,16 +403,14 @@ Source code for doctr.utils.visualization
                                 
                                         for line in block['lines']:
                                             if not words_only:
                                -                rect = create_obj_patch(line['geometry'], page['dimensions'],
                                -                                        label='line', color=(1, 0, 0), linewidth=1, **kwargs)
                                +                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
                                                 ax.add_patch(rect)
                                                 if interactive:
                                                     artists.append(rect)
                                 
                                             for word in line['words']:
                                -                rect = create_obj_patch(word['geometry'], page['dimensions'],
                                -                                        label=f"{word['value']} (confidence: {word['confidence']:.2%})",
                                -                                        color=(0, 0, 1), **kwargs)
                                +                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
                                +                                         page['dimensions'], (0, 0, 1), **kwargs)
                                                 ax.add_patch(rect)
                                                 if interactive:
                                                     artists.append(rect)
@@ -509,24 +425,21 @@ Source code for doctr.utils.visualization
                                                             int(page['dimensions'][1] * word['geometry'][0][0]),
                                                             int(page['dimensions'][0] * word['geometry'][0][1])
                                                         )
                                -
                                -                    if len(word['geometry']) == 2:
                                -                        # We draw only if boxes are in straight format
                                -                        ax.text(
                                -                            *text_loc,
                                -                            word['value'],
                                -                            size=10,
                                -                            alpha=0.5,
                                -                            color=(0, 0, 1),
                                -                        )
                                +                    ax.text(
                                +                        *text_loc,
                                +                        word['value'],
                                +                        size=10,
                                +                        alpha=0.5,
                                +                        color=(0, 0, 1),
                                +                    )
                                 
                                         if display_artefacts:
                                             for artefact in block['artefacts']:
                                -                rect = create_obj_patch(
                                +                rect = create_rect_patch(
                                                     artefact['geometry'],
                                +                    'artefact',
                                                     page['dimensions'],
                                -                    label='artefact',
                                -                    color=(0.5, 0.5, 0.5),
                                +                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
                                                     linewidth=1,
                                                     **kwargs
                                                 )
@@ -543,12 +456,10 @@ Source code for doctr.utils.visualization
                                 
                                 
                                 
                                -
-[docs]
-def synthesize_page(
+def synthetize_page(
     page: Dict[str, Any],
     draw_proba: bool = False,
-    font_family: Optional[str] = None,
+    font_size: int = 13,
 ) -> np.ndarray:
     """Draw the content of the element page (OCR response) on a blank page.
@@ -556,12 +467,10 @@ Source code for doctr.utils.visualization
                                         page: exported Page object to represent
                                         draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
                                         font_size: size of the font, default font = 13
                                -        font_family: family of the font
                                 
                                     Return:
                                -        the synthesized page
                                +        A np array (drawn page)
                                     """
                                -
                                     # Draw template
                                     h, w = page["dimensions"]
                                     response = 255 * np.ones((h, w, 3), dtype=np.int32)
@@ -572,19 +481,20 @@ Source code for doctr.utils.visualization
                                             for word in line["words"]:
                # Get absolute word geometry
                                                 (xmin, ymin), (xmax, ymax) = word["geometry"]
                                -                xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
                                -                ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
                                +                xmin, xmax = int(w * xmin), int(w * xmax)
                                +                ymin, ymax = int(h * ymin), int(h * ymax)
                                 
                                                 # White drawing context adapted to font size, 0.75 factor to convert pts --> pix
                                -                font = get_font(font_family, int(0.75 * (ymax - ymin)))
                                -                img = Image.new('RGB', (xmax - xmin, ymax - ymin), color=(255, 255, 255))
                                +                h_box, w_box = ymax - ymin, xmax - xmin
                                +                h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75))
                                +                img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255))
                                                 d = ImageDraw.Draw(img)
                                +
                                                 # Draw in black the value of the word
                                -                try:
                                -                    d.text((0, 0), word["value"], font=font, fill=(0, 0, 0))
                                -                except UnicodeEncodeError:
                                -                    # When character cannot be encoded, use its unidecode version
                                -                    d.text((0, 0), unidecode(word["value"]), font=font, fill=(0, 0, 0))
                                +                d.text((0, 0), word["value"], font=ImageFont.load_default(), fill=(0, 0, 0))
                                +
                                +                # Resize back to box size
                                +                img = img.resize((w_box, h_box), Image.NEAREST)
                                 
                                                 # Colorize if draw_proba
                                                 if draw_proba:
@@ -598,40 +508,7 @@ Source code for doctr.utils.visualization
                                                 # Write to response page
                                                 response[ymin:ymax, xmin:xmax, :] = np.array(img)
                                 
                                -    return response
-
-
-
-def draw_boxes(
-    boxes: np.ndarray,
-    image: np.ndarray,
-    color: Optional[Tuple[int, int, int]] = None,
-    **kwargs
-) -> None:
-    """Draw an array of relative straight boxes on an image
-
-    Args:
-        boxes: array of relative boxes, of shape (*, 4)
-        image: np array, float32 or uint8
-        color: color to use for bounding box edges
-    """
-    h, w = image.shape[:2]
-    # Convert boxes to absolute coords
-    _boxes = deepcopy(boxes)
-    _boxes[:, [0, 2]] *= w
-    _boxes[:, [1, 3]] *= h
-    _boxes = _boxes.astype(np.int32)
-    for box in _boxes.tolist():
-        xmin, ymin, xmax, ymax = box
-        image = cv2.rectangle(
-            image,
-            (xmin, ymin),
-            (xmax, ymax),
-            color=color if isinstance(color, tuple) else (0, 0, 255),
-            thickness=2
-        )
-    plt.imshow(image)
-    plt.plot(**kwargs)
+    return response
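The colorization step gated by `draw_proba` (blue for p=1, red for p=0) falls outside the hunks shown here, so the snippet below is only an illustrative assumption of how such a mapping could be applied to a rendered word crop, not the library's exact implementation:

```python
import numpy as np
from PIL import Image

def colorize_word_crop(img: Image.Image, p: float) -> np.ndarray:
    """Tint the dark (text) pixels of a rendered word according to a confidence p in [0, 1].

    Illustrative assumption: p=1 maps to pure blue, p=0 to pure red.
    """
    arr = np.array(img)
    color = np.array([int(255 * (1 - p)), 0, int(255 * p)], dtype=np.uint8)  # (R, G, B)
    mask = arr.mean(axis=-1) < 128  # pixels darker than mid-grey are treated as text
    arr[mask] = color
    return arr

# Example: a blank white crop tinted as if it held a word read with 0.9 confidence
crop = Image.new("RGB", (64, 16), color=(255, 255, 255))
tinted = colorize_word_crop(crop, p=0.9)
```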
@@ -664,7 +541,7 @@ Source code for doctr.utils.visualization
                                       
                                     
                                   
                                - + diff --git a/v0.5.1/_modules/index.html b/v0.5.1/_modules/index.html index 9baab3420f..c887b618c2 100644 --- a/v0.5.1/_modules/index.html +++ b/v0.5.1/_modules/index.html @@ -226,32 +226,20 @@ - + diff --git a/v0.5.1/_sources/changelog.rst.txt b/v0.5.1/_sources/changelog.rst.txt index 55a0cbd0d1..430097d6c8 100644 --- a/v0.5.1/_sources/changelog.rst.txt +++ b/v0.5.1/_sources/changelog.rst.txt @@ -1,26 +1,6 @@ Changelog ========= -v0.5.0 (2021-12-31) -------------------- -Release note: `v0.5.0 `_ - -v0.4.1 (2021-11-22) -------------------- -Release note: `v0.4.1 `_ - -v0.4.0 (2021-10-01) -------------------- -Release note: `v0.4.0 `_ - -v0.3.1 (2021-08-27) -------------------- -Release note: `v0.3.1 `_ - -v0.3.0 (2021-07-02) -------------------- -Release note: `v0.3.0 `_ - v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.5.1/_sources/contributing/contributing.md.txt b/v0.5.1/_sources/contributing/contributing.md.txt index 485e9c68d4..7e2a849de3 100644 --- a/v0.5.1/_sources/contributing/contributing.md.txt +++ b/v0.5.1/_sources/contributing/contributing.md.txt @@ -2,8 +2,6 @@ Everything you need to know to contribute efficiently to the project. - - ## Codebase structure - [doctr](https://github.com/mindee/doctr/blob/main/doctr) - The package codebase @@ -11,10 +9,9 @@ Everything you need to know to contribute efficiently to the project. - [docs](https://github.com/mindee/doctr/blob/main/docs) - Library documentation building - [scripts](https://github.com/mindee/doctr/blob/main/scripts) - Example scripts - [references](https://github.com/mindee/doctr/blob/main/references) - Reference training scripts -- [demo](https://github.com/mindee/doctr/blob/main/demo) - Small demo app to showcase docTR capabilities +- [demo](https://github.com/mindee/doctr/blob/main/demo) - Small demo app to showcase docTR capabilities - [api](https://github.com/mindee/doctr/blob/main/api) - A minimal template to deploy a REST API with docTR - ## Continuous Integration This project uses the following integrations to ensure proper codebase maintenance: @@ -24,13 +21,11 @@ This project uses the following integrations to ensure proper codebase maintenan As a contributor, you will only have to ensure coverage of your code by adding appropriate unit testing of your code. - - ## Feedback ### Feature requests & bug report -Whether you encountered a problem, or you have a feature suggestion, your input has value and can be used by contributors to reference it in their developments. For this purpose, we advise you to use Github [issues](https://github.com/mindee/doctr/issues). +Whether you encountered a problem, or you have a feature suggestion, your input has value and can be used by contributors to reference it in their developments. For this purpose, we advise you to use Github [issues](https://github.com/mindee/doctr/issues). First, check whether the topic wasn't already covered in an open / closed issue. If not, feel free to open a new one! When doing so, use issue templates whenever possible and provide enough information for other contributors to jump in. @@ -38,7 +33,6 @@ First, check whether the topic wasn't already covered in an open / closed issue. If you are wondering how to do something with docTR, or a more general question, you should consider checking out Github [discussions](https://github.com/mindee/doctr/discussions). See it as a Q&A forum, or the docTR-specific StackOverflow! 
- ## Developing docTR ### Developer mode installation @@ -46,7 +40,9 @@ If you are wondering how to do something with docTR, or a more general question, Install all additional dependencies with the following command: ```shell +python -m pip install --upgrade pip pip install -e .[dev] +pre-commit install ``` ### Commits @@ -54,7 +50,6 @@ pip install -e .[dev] - **Code**: ensure to provide docstrings to your Python code. In doing so, please follow [Google-style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) so it can ease the process of documentation later. - **Commit message**: please follow [Udacity guide](http://udacity.github.io/git-styleguide/) - ### Unit tests In order to run the same unit tests as the CI workflows, you can run unittests locally: @@ -71,52 +66,27 @@ To run all quality checks together make quality ``` -#### Lint verification +#### Code style verification -To ensure that your incoming PR complies with the lint settings, you need to install [flake8](https://flake8.pycqa.org/en/latest/) and run the following command from the repository's root folder: +To run all style checks together ```shell -flake8 ./ +make style ``` -This will read the `.flake8` setting file and let you know whether your commits need some adjustments. - -#### Import order - -In order to ensure there is a common import order convention, run [isort](https://github.com/PyCQA/isort) as follows: - -```shell -isort **/*.py -``` -This will reorder the imports of your local files. - -#### Annotation typing -Additionally, to catch type-related issues and have a cleaner codebase, annotation typing are expected. After installing [mypy](https://github.com/python/mypy), you can run the verifications as follows: - -```shell -mypy --config-file mypy.ini doctr/ -``` -The `mypy.ini` file will be read to check your typing. - -#### Docstring format +### Modifying the documentation -To keep a sane docstring structure, if you install [pydocstyle](https://github.com/PyCQA/pydocstyle), you can verify your docstrings as follows: +The current documentation is built using `sphinx` thanks to our CI. +You can build the documentation locally: ```shell -pydocstyle doctr/ +make docs-single-version ``` -The `.pydocstyle` file will be read to configure this operation. +Please note that files that have not been modified will not be rebuilt. If you want to force a complete rebuild, you can delete the `_build` directory. Additionally, you may need to clear your web browser's cache to see the modifications. -### Modifying the documentation - -In order to check locally your modifications to the documentation: -```shell -make docs-single-version -``` You can now open your local version of the documentation located at `docs/_build/index.html` in your browser - ## Let's connect Should you wish to connect somewhere else than on GitHub, feel free to join us on [Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-uzgmljfl-MotFVfH~IdEZxjp~0zldww), where you will find a `#doctr` channel! diff --git a/v0.5.1/_sources/datasets.rst.txt b/v0.5.1/_sources/datasets.rst.txt index 8a00eeaedd..354122f1e5 100644 --- a/v0.5.1/_sources/datasets.rst.txt +++ b/v0.5.1/_sources/datasets.rst.txt @@ -11,42 +11,22 @@ can be a significant save of time. Available Datasets ------------------ -Here are all datasets that are available through docTR: +The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. +.. 
autoclass:: doctr.datasets.datasets.VisionDataset -Public datasets -^^^^^^^^^^^^^^^ + +Here are all datasets that are available through DocTR: .. autoclass:: FUNSD .. autoclass:: SROIE .. autoclass:: CORD -.. autoclass:: IIIT5K -.. autoclass:: SVT -.. autoclass:: SVHN -.. autoclass:: SynthText -.. autoclass:: IC03 -.. autoclass:: IC13 - -docTR synthetic datasets -^^^^^^^^^^^^^^^^^^^^^^^^ - -.. autoclass:: DocArtefacts -.. autoclass:: CharacterGenerator -.. autoclass:: WordGenerator - -docTR private datasets -^^^^^^^^^^^^^^^^^^^^^^ - -Since many documents include sensitive / personal information, we are not able to share all the data that has been used for this project. However, we provide some guidance on how to format your own dataset into the same format so that you can use all docTR tools all the same. - -.. autoclass:: DetectionDataset -.. autoclass:: RecognitionDataset .. autoclass:: OCRDataset Data Loading ------------ -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR. +Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. .. autoclass:: doctr.datasets.loader.DataLoader @@ -56,10 +36,10 @@ Each dataset has its specific way to load a sample, but handling batch aggregati Supported Vocabs ---------------- -Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets of vocabs. -.. list-table:: docTR Vocabs +.. list-table:: DocTR Vocabs :widths: 20 5 50 :header-rows: 1 @@ -79,25 +59,10 @@ of vocabs. - 5 - £€¥¢฿ * - latin - - 94 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ - * - english - - 100 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ - * - legacy_french - - 123 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ + - 96 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° * - french - - 126 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ - * - portuguese - - 131 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿ - * - spanish - - 116 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿ - * - german - - 108 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ + - 154 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ .. autofunction:: encode_sequences diff --git a/v0.5.1/_sources/getting_started/installing.rst.txt b/v0.5.1/_sources/getting_started/installing.rst.txt index 539c0dc1c5..46d4177b30 100644 --- a/v0.5.1/_sources/getting_started/installing.rst.txt +++ b/v0.5.1/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.6 or higher. +This library requires `Python `_ 3.9 or higher. 
Prerequisites @@ -14,16 +14,10 @@ Whichever OS you are running, you will need to install at least TensorFlow or Py * `TensorFlow 2 `_ * `PyTorch `_ -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them using `Homebrew `_ as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. +For MacBooks with M1 chip, you will need some additional packages or specific versions: +* `TensorFlow 2 Metal Plugin `_ +* `PyTorch >= 1.12.0 `_ Via Python Package ================== @@ -43,19 +37,36 @@ We strive towards reducing framework-specific dependencies to a minimum, but som .. code:: bash - pip install python-doctr[tensorflow] + pip install "python-doctr[tf]" + # or with preinstalled packages for visualization & html & contrib module support + pip install "python-doctr[tf,viz,html,contib]" .. tab:: PyTorch .. code:: bash - pip install python-doctr[pytorch] + pip install "python-doctr[torch]" + # or with preinstalled packages for visualization & html & contrib module support + pip install "python-doctr[torch,viz,html,contrib]" + + + + +Via Conda (Only for Linux) +========================== + +Install the last stable release of the package using `conda `_: + +.. code:: bash + + conda config --set channel_priority strict + conda install -c techMindee -c pypdfium2-team -c bblanchon -c defaults -c conda-forge python-doctr Via Git ======= -Install the library in developper mode: +Install the library in developer mode: .. tabs:: diff --git a/v0.5.1/_sources/index.rst.txt b/v0.5.1/_sources/index.rst.txt index 980aa2e3a8..fc3ff89fdf 100644 --- a/v0.5.1/_sources/index.rst.txt +++ b/v0.5.1/_sources/index.rst.txt @@ -1,8 +1,7 @@ -******************************** -docTR: Document Text Recognition -******************************** +DocTR: Document Text Recognition +================================ -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta) .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -13,6 +12,9 @@ DocTR provides an easy and powerful way to extract valuable information from you * |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. +Welcome to the documentation of `DocTR `_! + + Main Features ------------- @@ -21,18 +23,24 @@ Main Features * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor * |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, minimal dependencies -* |:tools:| Actively maintained by Mindee -* |:factory:| Easy integration (available templates for browser demo & API deployment) +* |:bird:| Light package, small dependencies +* |:tools:| Daily maintained +* |:factory:| Easy integration + +Getting Started +--------------- .. 
toctree:: :maxdepth: 2 - :caption: Getting started - :hidden: - getting_started/installing - notebooks + installing + + +Build & train your predictor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) +* Fine-tune or train from scratch any detection or recognition model to specialize on your data Model zoo @@ -40,63 +48,36 @@ Model zoo Text detection models """"""""""""""""""""" -* DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ -* LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ + * `DBNet `_ (Differentiable Binarization) + * `LinkNet `_ Text recognition models """"""""""""""""""""""" -* SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ -* CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ -* MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ + * `SAR `_ (Show, Attend and Read) + * `CRNN `_ (Convolutional Recurrent Neural Network) + * `MASTER `_ (Multi-Aspect Non-local Network for Scene Text Recognition) Supported datasets ^^^^^^^^^^^^^^^^^^ -* FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. -* CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. -* SROIE from `ICDAR 2019 `_. -* IIIT-5k from `CVIT `_. -* Street View Text from `"End-to-End Scene Text Recognition" `_. -* SynthText from `Visual Geometry Group `_. -* SVHN from `"Reading Digits in Natural Images with Unsupervised Feature Learning" `_. -* IC03 from `ICDAR 2003 `_. -* IC13 from `ICDAR 2013 `_. -* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. + * FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. + * CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. + * SROIE from `ICDAR 2019 `_. .. toctree:: :maxdepth: 2 - :caption: Using docTR - :hidden: + :caption: Notes - using_doctr/using_models - using_doctr/using_model_export + changelog .. toctree:: :maxdepth: 2 :caption: Package Reference - :hidden: - - modules/datasets - modules/io - modules/models - modules/transforms - modules/utils - -.. toctree:: - :maxdepth: 2 - :caption: Contributing - :hidden: - - contributing/code_of_conduct - contributing/contributing - - -.. toctree:: - :maxdepth: 2 - :caption: Notes - :hidden: - - changelog + datasets + documents + models + transforms + utils diff --git a/v0.5.1/_sources/installing.rst.txt b/v0.5.1/_sources/installing.rst.txt index 8197df660d..5c8779dc1c 100644 --- a/v0.5.1/_sources/installing.rst.txt +++ b/v0.5.1/_sources/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.6 or higher. +This library requires Python 3.6 or higher. Prerequisites @@ -11,12 +11,12 @@ Prerequisites Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: -* `TensorFlow 2 `_ -* `PyTorch `_ +* TensorFlow: `installation page `_. +* PyTorch: `installation page `_. If you are running another OS than Linux, you will need a few extra dependencies. -For MacOS users, you can install them using `Homebrew `_ as follows: +For MacOS users, you can install them as follows: .. 
code:: shell @@ -28,23 +28,13 @@ For Windows users, those dependencies are included in GTK. You can find the late Via Python Package ================== -Install the last stable release of the package using `pip `_: +Install the last stable release of the package using pip: .. code:: bash pip install python-doctr -We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows: - -.. code:: bash - - # for TensorFlow - pip install "python-doctr[tf]" - # for PyTorch - pip install "python-doctr[torch]" - - Via Git ======= @@ -54,13 +44,3 @@ Install the library in developper mode: git clone https://github.com/mindee/doctr.git pip install -e doctr/. - -Again, for framework-specific builds: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - # for TensorFlow - pip install -e doctr/.[tf] - # for PyTorch - pip install -e doctr/.[torch] diff --git a/v0.5.1/_sources/io.rst.txt b/v0.5.1/_sources/io.rst.txt deleted file mode 100644 index 8fa887e9f9..0000000000 --- a/v0.5.1/_sources/io.rst.txt +++ /dev/null @@ -1,94 +0,0 @@ -doctr.io -======== - - -.. currentmodule:: doctr.io - -The io module enables users to easily access content from documents and export analysis -results to structured formats. - -.. _document_structure: - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img_as_numpy - -.. autofunction:: read_img_as_tensor - -.. autofunction:: decode_img_as_tensor - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_lines - - .. automethod:: get_artefacts diff --git a/v0.5.1/_sources/models.rst.txt b/v0.5.1/_sources/models.rst.txt index d4f36df9bb..9830c6c153 100644 --- a/v0.5.1/_sources/models.rst.txt +++ b/v0.5.1/_sources/models.rst.txt @@ -1,62 +1,215 @@ doctr.models ============ -.. currentmodule:: doctr.models - - -doctr.models.classification ----------------------- +The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. +Either performed at once or separately, to each task corresponds a type of deep learning architecture. -.. 
autofunction:: doctr.models.classification.vgg16_bn_r +.. currentmodule:: doctr.models -.. autofunction:: doctr.models.classification.resnet18 +For a given task, DocTR provides a Predictor, which is composed of 2 components: -.. autofunction:: doctr.models.classification.resnet31 +* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. +* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. -.. autofunction:: doctr.models.classification.mobilenet_v3_small -.. autofunction:: doctr.models.classification.mobilenet_v3_large +Text Detection +-------------- +Localizing text elements in images -.. autofunction:: doctr.models.classification.mobilenet_v3_small_r ++---------------------------------------------------+----------------------------+----------------------------+---------+ +| | FUNSD | CORD | | ++==================+=================+==============+============+===============+============+===============+=========+ +| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ +| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -.. autofunction:: doctr.models.classification.mobilenet_v3_large_r +All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. -.. autofunction:: doctr.models.classification.mobilenet_v3_small_orientation +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* -.. autofunction:: doctr.models.classification.magc_resnet31 +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. -.. autofunction:: doctr.models.classification.crop_orientation_predictor +Pre-processing for detection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for detection is the following: +1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. +2. batch images together +3. normalize the batch using the training data statistics -doctr.models.detection ----------------------- -.. autofunction:: doctr.models.detection.linknet_resnet18 +Detection models +^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: .. autofunction:: doctr.models.detection.db_resnet50 +.. autofunction:: doctr.models.detection.linknet16 -.. autofunction:: doctr.models.detection.db_mobilenet_v3_large +Detection predictors +^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. .. 
autofunction:: doctr.models.detection.detection_predictor -doctr.models.recognition ------------------------- +Text Recognition +---------------- +Identifying strings in images + +.. list-table:: Text recognition model zoo + :widths: 20 20 15 10 10 10 + :header-rows: 1 + + * - Architecture + - Input shape + - # params + - FUNSD + - CORD + - FPS + * - crnn_vgg16_bn + - (32, 128, 3) + - 15.8M + - 86.02 + - 91.3 + - 12.8 + * - sar_vgg16_bn + - (32, 128, 3) + - 21.5M + - 86.2 + - 91.7 + - 3.3 + * - sar_resnet31 + - (32, 128, 3) + - 53.1M + - **86.3** + - **92.1** + - 2.7 + +All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. + +All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Pre-processing for recognition +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for recognition is the following: + +1. resize each input image to the target size (bilinear interpolation by default) without deformation. +2. pad the image to the target size (with zeros by default) +3. batch images together +4. normalize the batch using the training data statistics + +Recognition models +^^^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: + .. autofunction:: doctr.models.recognition.crnn_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_small -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_large +Recognition predictors +^^^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage. -.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.recognition_predictor -.. autofunction:: doctr.models.recognition.master -.. 
autofunction:: doctr.models.recognition.recognition_predictor +End-to-End OCR +-------------- +Predictors that localize and identify text elements in images ++-----------------------------+--------------------------------------+--------------------------------------+ +| | FUNSD | CORD | ++=============================+============+===============+=========+============+===============+=========+ +| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ + +All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. + +All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Results on private ocr datasets + ++------------------------------------+----------------------------+----------------------------+----------------------------+ +| | Receipts | Invoices | IDs | ++====================================+============+===============+============+===============+============+===============+ +| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| Gvision doc. 
text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ + + +Two-stage approaches +^^^^^^^^^^^^^^^^^^^^ +Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. + +.. autofunction:: doctr.models.zoo.ocr_predictor + + +Model export +------------ +Utility functions to make the most of document analysis models. + +.. currentmodule:: doctr.models.export + +Model compression +^^^^^^^^^^^^^^^^^ + +.. autofunction:: convert_to_tflite + +.. autofunction:: convert_to_fp16 + +.. autofunction:: quantize_model + +Using SavedModel +^^^^^^^^^^^^^^^^ + +Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to +`SavedModel `_ format as follows: + + + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> _ = model(input_t, training=False) + >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') + +And loaded just as easily: -doctr.models.zoo ----------------- -.. autofunction:: doctr.models.ocr_predictor + >>> import tensorflow as tf + >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.5.1/_sources/modules/datasets.rst.txt b/v0.5.1/_sources/modules/datasets.rst.txt index e40b1c506a..872212a121 100644 --- a/v0.5.1/_sources/modules/datasets.rst.txt +++ b/v0.5.1/_sources/modules/datasets.rst.txt @@ -3,51 +3,57 @@ doctr.datasets .. currentmodule:: doctr.datasets -Whether it is for training or for evaluation, having predefined objects to access datasets in your prefered framework -can be a significant save of time. - - .. _datasets: -Available Datasets ------------------- -Here are all datasets that are available through docTR: - - -Public datasets -^^^^^^^^^^^^^^^ +doctr.datasets +-------------- .. autoclass:: FUNSD + .. autoclass:: SROIE + .. autoclass:: CORD + .. autoclass:: IIIT5K + .. autoclass:: SVT + .. autoclass:: SVHN + .. autoclass:: SynthText + .. autoclass:: IC03 + .. autoclass:: IC13 + .. autoclass:: IMGUR5K -docTR synthetic datasets -^^^^^^^^^^^^^^^^^^^^^^^^ +.. autoclass:: MJSynth + +.. autoclass:: IIITHWS .. autoclass:: DocArtefacts + +.. autoclass:: WILDRECEIPT + +Synthetic dataset generator +--------------------------- + .. autoclass:: CharacterGenerator -.. autoclass:: WordGenerator -docTR private datasets -^^^^^^^^^^^^^^^^^^^^^^ +.. autoclass:: WordGenerator -Since many documents include sensitive / personal information, we are not able to share all the data that has been used for this project. However, we provide some guidance on how to format your own dataset into the same format so that you can use all docTR tools all the same. +Custom dataset loader +--------------------- .. autoclass:: DetectionDataset + .. autoclass:: RecognitionDataset -.. autoclass:: OCRDataset +.. autoclass:: OCRDataset -Data Loading ------------- -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR. 
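As a minimal sketch of that division of labour (assuming the TensorFlow backend and that the FUNSD archive can be downloaded; double-check argument names against the installed version):

```python
from doctr.datasets import FUNSD
from doctr.datasets.loader import DataLoader

# The dataset object knows how to load and decode one sample...
train_set = FUNSD(train=True, download=True)

# ...while the loader handles shuffling, batching and iteration
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)

for images, targets in train_loader:
    # images: batched image tensor, targets: the matching annotations
    break
```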
+Dataloader +--------------------- .. autoclass:: doctr.datasets.loader.DataLoader @@ -70,6 +76,9 @@ of vocabs. * - digits - 10 - 0123456789 + * - hindi_digits + - 10 + - ٠١٢٣٤٥٦٧٨٩ * - ascii_letters - 52 - abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ @@ -79,6 +88,24 @@ of vocabs. * - currency - 5 - £€¥¢฿ + * - ancient_greek + - 48 + - αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ + * - arabic_letters + - 37 + - ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي + * - generic_cyrillic_letters + - 58 + - абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ + * - persian_letters + - 5 + - پچڢڤگ + * - arabic_diacritics + - 2 + - 'ًٌٍَُِّْ' + * - arabic_punctuation + - 5 + - ؟؛«»— * - latin - 94 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ @@ -97,8 +124,53 @@ of vocabs. * - spanish - 116 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿ + * - italian + - 120 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ * - german - 108 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ + * - arabic + - 101 + - ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيپچڢڤگ؟؛«»—0123456789٠١٢٣٤٥٦٧٨٩'ًٌٍَُِّْ'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ + * - czech + - 130 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ + * - polish + - 118 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ąćęłńóśźżĄĆĘŁŃÓŚŹŻ + * - dutch + - 114 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ + * - norwegian + - 106 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿æøåÆØÅ + * - danish + - 106 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°£€¥¢฿æøåÆØÅ + * - finnish + - 104 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöÄÖ + * - swedish + - 106 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿åäöÅÄÖ + * - ukrainian + - 115 + - абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ0123456789!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ґіїєҐІЇЄ₴ + * - vietnamese + - 236 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ + * - hebrew + - 123 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿אבגדהוזחטיכלמנסעפצקרשת₪ + * - hindi + - 71 + - अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह०१२३४५६७८९।,?!:्ॐ॰॥॰ + * - bangla + - 70 + - অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ০১২৩৪৫৬৭৮৯ + * - multilingual + - 195 + - english & french & german & italian & spanish & portuguese & czech & polish & dutch & norwegian & danish & finnish & swedish & § .. 
autofunction:: encode_sequences diff --git a/v0.5.1/_sources/modules/models.rst.txt b/v0.5.1/_sources/modules/models.rst.txt index 485b36ebd4..2baf095eed 100644 --- a/v0.5.1/_sources/modules/models.rst.txt +++ b/v0.5.1/_sources/modules/models.rst.txt @@ -25,12 +25,26 @@ doctr.models.classification .. autofunction:: doctr.models.classification.mobilenet_v3_large_r -.. autofunction:: doctr.models.classification.mobilenet_v3_small_orientation +.. autofunction:: doctr.models.classification.mobilenet_v3_small_crop_orientation + +.. autofunction:: doctr.models.classification.mobilenet_v3_small_page_orientation .. autofunction:: doctr.models.classification.magc_resnet31 +.. autofunction:: doctr.models.classification.vit_s + +.. autofunction:: doctr.models.classification.vit_b + +.. autofunction:: doctr.models.classification.textnet_tiny + +.. autofunction:: doctr.models.classification.textnet_small + +.. autofunction:: doctr.models.classification.textnet_base + .. autofunction:: doctr.models.classification.crop_orientation_predictor +.. autofunction:: doctr.models.classification.page_orientation_predictor + doctr.models.detection ---------------------- @@ -45,6 +59,12 @@ doctr.models.detection .. autofunction:: doctr.models.detection.db_mobilenet_v3_large +.. autofunction:: doctr.models.detection.fast_tiny + +.. autofunction:: doctr.models.detection.fast_small + +.. autofunction:: doctr.models.detection.fast_base + .. autofunction:: doctr.models.detection.detection_predictor @@ -61,6 +81,12 @@ doctr.models.recognition .. autofunction:: doctr.models.recognition.master +.. autofunction:: doctr.models.recognition.vitstr_small + +.. autofunction:: doctr.models.recognition.vitstr_base + +.. autofunction:: doctr.models.recognition.parseq + .. autofunction:: doctr.models.recognition.recognition_predictor @@ -68,3 +94,15 @@ doctr.models.zoo ---------------- .. autofunction:: doctr.models.ocr_predictor + +.. autofunction:: doctr.models.kie_predictor + + +doctr.models.factory +-------------------- + +.. autofunction:: doctr.models.factory.login_to_hub + +.. autofunction:: doctr.models.factory.from_hub + +.. autofunction:: doctr.models.factory.push_to_hf_hub diff --git a/v0.5.1/_sources/modules/transforms.rst.txt b/v0.5.1/_sources/modules/transforms.rst.txt index 7f90325e4d..7fc02f4cc4 100644 --- a/v0.5.1/_sources/modules/transforms.rst.txt +++ b/v0.5.1/_sources/modules/transforms.rst.txt @@ -28,6 +28,7 @@ Here are all transformations that are available through docTR: .. autoclass:: GaussianNoise .. autoclass:: RandomHorizontalFlip .. autoclass:: RandomShadow +.. 
autoclass:: RandomResize Composing transformations diff --git a/v0.5.1/_sources/notebooks.md.txt b/v0.5.1/_sources/notebooks.md.txt deleted file mode 100644 index ea43ac0f39..0000000000 --- a/v0.5.1/_sources/notebooks.md.txt +++ /dev/null @@ -1,9 +0,0 @@ -# docTR Notebooks - -Here are some notebooks compiled for users to better leverage the library capabilities: - -| Notebook | Description | | -|:----------|:-------------|------:| -| [Quicktour](https://github.com/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | A presentation of the main features of docTR | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | -| [Export as PDF/A](https://github.com/mindee/notebooks/blob/main/doctr/export_as_pdfa.ipynb) | Produce searchable PDFs from docTR results | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/export_as_pdfa.ipynb) | -[Artefact detection](https://github.com/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb) | Object detection for artefacts in documents | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb) | diff --git a/v0.5.1/_sources/notebooks.rst.txt b/v0.5.1/_sources/notebooks.rst.txt index e8971fceee..96f9e80edb 100644 --- a/v0.5.1/_sources/notebooks.rst.txt +++ b/v0.5.1/_sources/notebooks.rst.txt @@ -14,4 +14,4 @@ Here are some notebooks compiled for users to better leverage the library capabi +--------------------------------------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------------------------+ | `[Artefact detection] `_ | Object detection for artefacts in documents | .. image:: https://colab.research.google.com/assets/colab-badge.svg | | | | :target: https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb | -+--------------------------------------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file ++--------------------------------------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------------------------+ diff --git a/v0.5.1/_sources/transforms.rst.txt b/v0.5.1/_sources/transforms.rst.txt index ff11a3a38e..0230fe75f5 100644 --- a/v0.5.1/_sources/transforms.rst.txt +++ b/v0.5.1/_sources/transforms.rst.txt @@ -8,7 +8,7 @@ Data transformations are part of both training and inference procedure. Drawing Supported transformations ------------------------- -Here are all transformations that are available through docTR: +Here are all transformations that are available through DocTR: .. autoclass:: Resize .. autoclass:: Normalize @@ -21,11 +21,6 @@ Here are all transformations that are available through docTR: .. autoclass:: RandomHue .. autoclass:: RandomGamma .. autoclass:: RandomJpegQuality -.. autoclass:: RandomRotate -.. autoclass:: RandomCrop -.. 
autoclass:: GaussianBlur -.. autoclass:: ChannelShuffle -.. autoclass:: GaussianNoise Composing transformations diff --git a/v0.5.1/_sources/using_doctr/using_model_export.rst.txt b/v0.5.1/_sources/using_doctr/using_model_export.rst.txt index 992f4e9866..c62c36169b 100644 --- a/v0.5.1/_sources/using_doctr/using_model_export.rst.txt +++ b/v0.5.1/_sources/using_doctr/using_model_export.rst.txt @@ -3,69 +3,112 @@ Preparing your model for inference A well-trained model is a good achievement but you might want to tune a few things to make it production-ready! -.. currentmodule:: doctr.models.export +.. currentmodule:: doctr.models.utils -Model compression ------------------ +Model optimization +------------------ -This section is meant to help you perform inference with compressed versions of your model. +This section is meant to help you perform inference with optimized versions of your model. -TensorFlow Lite -^^^^^^^^^^^^^^^ +Half-precision +^^^^^^^^^^^^^^ -TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows: +**NOTE:** We support half-precision inference for PyTorch and TensorFlow models only on **GPU devices**. - >>> import tensorflow as tf - >>> from tensorflow.keras import Sequential - >>> from doctr.models import conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - >>> serialized_model = converter.convert() +Half-precision (or FP16) is a binary floating-point format that occupies 16 bits in computer memory. -Half-precision +Advantages: + +- Faster inference +- Less memory usage + +.. tabs:: + + .. tab:: TensorFlow + + .. code:: python3 + + import tensorflow as tf + from tensorflow.keras import mixed_precision + mixed_precision.set_global_policy('mixed_float16') + predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True) + + .. tab:: PyTorch + + .. code:: python3 + + import torch + predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True).cuda().half() + res = predictor(doc) + + +Export to ONNX ^^^^^^^^^^^^^^ -If you want to convert it to half-precision using your TFLite converter +ONNX (Open Neural Network Exchange) is an open and interoperable format for representing and exchanging machine learning models. +It defines a common format for representing models, including the network structure, layer types, parameters, and metadata. + +.. tabs:: + + .. tab:: TensorFlow + + .. code:: python3 + + import tensorflow as tf + from doctr.models import vitstr_small + from doctr.models.utils import export_model_to_onnx + + batch_size = 16 + input_shape = (3, 32, 128) + model = vitstr_small(pretrained=True, exportable=True) + dummy_input = [tf.TensorSpec([batch_size, input_shape], tf.float32, name="input")] + model_path, output = export_model_to_onnx(model, model_name="vitstr.onnx", dummy_input=dummy_input) + + + .. tab:: PyTorch - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> converter.target_spec.supported_types = [tf.float16] - >>> serialized_model = converter.convert() + .. 
code:: python3 + import torch + from doctr.models import vitstr_small + from doctr.models.utils import export_model_to_onnx -Post-training quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^ + batch_size = 16 + input_shape = (32, 128, 3) + model = vitstr_small(pretrained=True, exportable=True) + dummy_input = torch.rand((batch_size, input_shape), dtype=torch.float32) + model_path = export_model_to_onnx(model, model_name="vitstr.onnx, dummy_input=dummy_input) -Finally if you wish to quantize the model with your TFLite converter - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> # Float fallback for operators that do not have an integer implementation - >>> def representative_dataset(): - >>> for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)] - >>> converter.representative_dataset = representative_dataset - >>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - >>> converter.inference_input_type = tf.int8 - >>> converter.inference_output_type = tf.int8 - >>> serialized_model = converter.convert() +Using your ONNX exported model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +To use your exported model, we have build a dedicated lightweight package called `OnnxTR `_. +The package doesn't require PyTorch or TensorFlow to be installed - build on top of ONNXRuntime. +It is simple and easy-to-use (with the same interface you know already from docTR), that allows you to perform inference with your exported model. -Using SavedModel ----------------- +- `Installation `_ +- `Loading custom exported model `_ -Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: +.. code:: shell + pip install onnxtr[cpu] - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') +.. code:: python3 -And loaded just as easily: + from onnxtr.io import DocumentFile + from onnxtr.models import ocr_predictor, parseq, linknet_resnet18 + # Load your documents + single_img_doc = DocumentFile.from_images("path/to/your/img.jpg") + # Load your exported model/s + reco_model = parseq("path_to_custom_model.onnx", vocab="ABC") + det_model = linknet_resnet18("path_to_custom_model.onnx") + predictor = ocr_predictor(det_arch=det_model, reco_arch=reco_model) + # Or use any of the pre-trained models + predictor = ocr_predictor(det_arch="linknet_resnet18", reco_arch="parseq") - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') + # Get your results + res = predictor(single_img_doc) diff --git a/v0.5.1/_sources/using_doctr/using_models.rst.txt b/v0.5.1/_sources/using_doctr/using_models.rst.txt index dcdf118b66..e6e5006f2e 100644 --- a/v0.5.1/_sources/using_doctr/using_models.rst.txt +++ b/v0.5.1/_sources/using_doctr/using_models.rst.txt @@ -4,8 +4,6 @@ Choosing the right model The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. Either performed at once or separately, to each task corresponds a type of deep learning architecture. -.. currentmodule:: doctr.models - For a given task, docTR provides a Predictor, which is composed of 2 components: * PreProcessor: a module in charge of making inputs directly usable by the deep learning model. 
@@ -24,27 +22,57 @@ Available architectures The following architectures are currently supported: -* `linknet_resnet18 `_ -* `db_resnet50 `_ -* `db_mobilenet_v3_large `_ - -We also provide 2 models working with any kind of rotated documents: - -* `linknet_resnet18_rotation `_ -* `db_resnet50_rotation `_ +* :py:meth:`linknet_resnet18 ` +* :py:meth:`linknet_resnet34 ` +* :py:meth:`linknet_resnet50 ` +* :py:meth:`db_resnet50 ` +* :py:meth:`db_mobilenet_v3_large ` +* :py:meth:`fast_tiny ` +* :py:meth:`fast_small ` +* :py:meth:`fast_base ` For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: -+------------------------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+=================================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 79.35 | 84.03 | 81.14 | 66.85 | | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ ++------------------------------------------------------------------------------------+----------------------------+----------------------------+--------------------+ +| | FUNSD | CORD | | ++================+=================================+=================+===============+============+===============+============+===============+====================+ +| **Backend** | **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **sec/it (B: 1)** | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | db_resnet50 | (1024, 1024, 3) | 25.2 M | 84.39 | 85.86 | 93.70 | 83.24 | 1.2 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 80.29 | 70.90 | 84.70 | 67.76 | 0.5 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | linknet_resnet18 | (1024, 1024, 3) | 11.5 M | 81.37 | 84.08 | 85.71 | 83.70 | 0.7 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | linknet_resnet34 | (1024, 1024, 3) | 21.6 M | 82.20 | 85.49 | 87.63 | 87.17 | 0.8 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | linknet_resnet50 | (1024, 1024, 3) | 28.8 M | 80.70 | 83.51 | 86.46 | 84.94 | 1.1 | 
++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | fast_tiny | (1024, 1024, 3) | 13.5 M (8.5M) | 85.29 | 85.34 | 93.46 | 75.99 | 0.7 (0.4) | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | fast_small | (1024, 1024, 3) | 14.7 M (9.7M) | 85.50 | 86.89 | 94.05 | 78.33 | 0.7 (0.5) | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | fast_base | (1024, 1024, 3) | 16.3 M (10.6M)| 85.22 | 86.97 | 94.18 | 84.74 | 0.8 (0.5) | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | db_resnet34 | (1024, 1024, 3) | 22.4 M | 82.76 | 76.75 | 89.20 | 71.74 | 0.8 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | db_resnet50 | (1024, 1024, 3) | 25.4 M | 83.56 | 86.68 | 92.61 | 86.39 | 1.1 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 82.69 | 84.63 | 94.51 | 70.28 | 0.5 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | linknet_resnet18 | (1024, 1024, 3) | 11.5 M | 81.64 | 85.52 | 88.92 | 82.74 | 0.6 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | linknet_resnet34 | (1024, 1024, 3) | 21.6 M | 81.62 | 82.95 | 86.26 | 81.06 | 0.7 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | linknet_resnet50 | (1024, 1024, 3) | 28.8 M | 81.78 | 82.47 | 87.29 | 85.54 | 1.0 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | fast_tiny | (1024, 1024, 3) | 13.5 M (8.5M) | 84.90 | 85.04 | 93.73 | 76.26 | 0.7 (0.4) | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | fast_small | (1024, 1024, 3) | 14.7 M (9.7M) | 85.36 | 86.68 | 94.09 | 78.53 | 0.7 (0.5) | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | fast_base | (1024, 1024, 3) | 16.3 M (10.6M)| 84.95 | 86.73 | 94.39 | 85.36 | 0.8 (0.5) | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). 
@@ -52,19 +80,21 @@ Explanations about the metrics being used are available in :ref:`metrics`.
 
 *Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities*
 
-FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L).
+Seconds per iteration (with a batch size of 1) is computed after a warmup phase of 100 tensors, by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on an `11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz`.
 
 
 Detection predictors
 ^^^^^^^^^^^^^^^^^^^^
 
-`detection_predictor `_ wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly.
+:py:meth:`detection_predictor ` wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly.
+
+.. code:: python3
 
-    >>> import numpy as np
-    >>> from doctr.models import detection_predictor
-    >>> predictor = detection_predictor('db_resnet50')
-    >>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
-    >>> out = model([dummy_img])
+    import numpy as np
+    from doctr.models import detection_predictor
+    predictor = detection_predictor('db_resnet50')
+    dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
+    out = predictor([dummy_img])
 
 
 You can pass specific boolean arguments to the predictor:
@@ -74,10 +104,10 @@ You can pass specific boolean arguments to the predictor:
 
 For instance, this snippet will instantiate a detection predictor able to detect text on rotated documents while preserving the aspect ratio:
 
-    >>> from doctr.models import detection_predictor
-    >>> predictor = detection_predictor('db_resnet50_rotation', pretrained=True, assume_straight_pages=False, preserve_aspect_ratio=True)
+.. code:: python3
 
-NB: for the moment, `db_resnet50_rotation` is pretrained in Pytorch only and `linknet_resnet18_rotation` in Tensorflow only.
+    from doctr.models import detection_predictor
+    predictor = detection_predictor('db_resnet50', pretrained=True, assume_straight_pages=False, preserve_aspect_ratio=True)
 
 
 Text Recognition
@@ -91,80 +121,86 @@ Available architectures
 ^^^^^^^^^^^^^^^^^^^^^^^
 
 The following architectures are currently supported:
 
-* `crnn_vgg16_bn `_
-* `crnn_mobilenet_v3_small `_
-* `crnn_mobilenet_v3_large `_
-* `sar_resnet31 `_
-* `master `_
+* :py:meth:`crnn_vgg16_bn `
+* :py:meth:`crnn_mobilenet_v3_small `
+* :py:meth:`crnn_mobilenet_v3_large `
+* :py:meth:`sar_resnet31 `
+* :py:meth:`master `
+* :py:meth:`vitstr_small `
+* :py:meth:`vitstr_base `
+* :py:meth:`parseq `
 
 For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:
 
-.. 
list-table:: Text recognition model zoo - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 87.18 - - 92.93 - - 12.8 - * - crnn_mobilenet_v3_small - - (32, 128, 3) - - 2.1M - - 86.21 - - 90.56 - - - * - crnn_mobilenet_v3_large - - (32, 128, 3) - - 4.5M - - 86.95 - - 92.03 - - - * - sar_resnet31 - - (32, 128, 3) - - 56.2M - - **87.70** - - **93.41** - - 2.7 - * - master - - (32, 128, 3) - - 67.7M - - 87.62 - - 93.27 - - ++-----------------------------------------------------------------------------------+----------------------------+----------------------------+--------------------+ +| | FUNSD | CORD | | ++================+=================================+=================+==============+============+===============+============+===============+====================+ +| **Backend** | **Architecture** | **Input shape** | **# params** | **Exact** | **Partial** | **Exact** | **Partial** | **sec/it (B: 64)** | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | crnn_vgg16_bn | (32, 128, 3) | 15.8 M | 88.12 | 88.85 | 94.68 | 95.10 | 0.9 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | crnn_mobilenet_v3_small | (32, 128, 3) | 2.1 M | 86.88 | 87.61 | 92.28 | 92.73 | 0.25 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | crnn_mobilenet_v3_large | (32, 128, 3) | 4.5 M | 87.44 | 88.12 | 94.14 | 94.55 | 0.34 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | master | (32, 128, 3) | 58.8 M | 87.44 | 88.21 | 93.83 | 94.25 | 22.3 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | sar_resnet31 | (32, 128, 3) | 57.2 M | 87.67 | 88.48 | 94.21 | 94.66 | 7.1 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | vitstr_small | (32, 128, 3) | 21.4 M | 83.01 | 83.84 | 86.57 | 87.00 | 2.0 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | vitstr_base | (32, 128, 3) | 85.2 M | 85.98 | 86.70 | 90.47 | 90.95 | 5.8 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | parseq | (32, 128, 3) | 23.8 M | 81.62 | 82.29 | 79.13 | 79.52 | 3.6 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | crnn_vgg16_bn | (32, 128, 3) | 15.8 M | 86.54 | 87.41 | 94.29 | 94.69 | 0.6 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | 
crnn_mobilenet_v3_small | (32, 128, 3) | 2.1 M | 87.25 | 87.99 | 93.91 | 94.34 | 0.05 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | crnn_mobilenet_v3_large | (32, 128, 3) | 4.5 M | 87.38 | 88.09 | 94.46 | 94.92 | 0.08 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | master | (32, 128, 3) | 58.7 M | 88.57 | 89.39 | 95.73 | 96.21 | 17.6 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | sar_resnet31 | (32, 128, 3) | 55.4 M | 88.10 | 88.88 | 94.83 | 95.29 | 4.9 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | vitstr_small | (32, 128, 3) | 21.4 M | 88.00 | 88.82 | 95.40 | 95.78 | 1.5 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | vitstr_base | (32, 128, 3) | 85.2 M | 88.33 | 89.09 | 95.32 | 95.71 | 4.1 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | parseq | (32, 128, 3) | 23.8 M | 88.53 | 89.24 | 95.56 | 95.91 | 2.2 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ + All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). Explanations about the metric being used (exact match) are available in :ref:`metrics`. While most of our recognition models were trained on our french vocab (cf. :ref:`vocabs`), you can easily access the vocab of any model as follows: - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> print(predictor.model.cfg['vocab']) +.. code:: python3 + + from doctr.models import recognition_predictor + predictor = recognition_predictor('crnn_vgg16_bn') + print(predictor.model.cfg['vocab']) *Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). +Seconds per iteration (with a batch size of 64) is computed after a warmup phase of 100 tensors, by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz`. Recognition predictors ^^^^^^^^^^^^^^^^^^^^^^ -`recognition_predictor `_ wraps your recognition model to make it easily useable with your favorite deep learning framework seamlessly. +:py:meth:`recognition_predictor ` wraps your recognition model to make it easily useable with your favorite deep learning framework seamlessly. 
- >>> import numpy as np - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8) - >>> out = model([dummy_img]) +.. code:: python3 + + import numpy as np + from doctr.models import recognition_predictor + predictor = recognition_predictor('crnn_vgg16_bn') + dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8) + out = model([dummy_img]) End-to-End OCR @@ -175,92 +211,119 @@ The task consists of both localizing and transcribing textual elements in a give Available architectures ^^^^^^^^^^^^^^^^^^^^^^^ -You can use any combination of detection and recognition models supporte by docTR. +You can use any combination of detection and recognition models supported by docTR. For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: -+----------------------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+========================================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 71.25 | 76.02 | 0.85 | 84.00 | 81.42 | 1.6 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + master | 71.03 | 76.06 | | 84.49 | 81.94 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | 71.25 | 76.29 | 0.27 | 84.50 | **81.96** | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_small | 69.85 | 74.80 | | 80.85 | 78.42 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_large | 70.57 | 75.57 | | 82.57 | 80.08 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_mobilenet_v3_large + crnn_vgg16_bn | 67.73 | 71.73 | | 71.65 | 59.03 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. 
text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ ++---------------------------------------------------------------------------+----------------------------+----------------------------+ +| | FUNSD | CORD | ++================+==========================================================+============================+============+===============+ +| **Backend** | **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + crnn_vgg16_bn | 73.45 | 74.73 | 85.79 | 76.21 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + crnn_mobilenet_v3_small | 72.66 | 73.93 | 83.43 | 74.11 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + crnn_mobilenet_v3_large | 72.86 | 74.13 | 85.16 | 75.65 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + master | 72.73 | 74.00 | 84.13 | 75.05 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + sar_resnet31 | 73.23 | 74.51 | 85.34 | 76.03 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + vitstr_small | 68.57 | 69.77 | 78.24 | 69.51 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + vitstr_base | 70.96 | 72.20 | 82.10 | 72.94 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + parseq | 68.85 | 70.05 | 72.38 | 64.30 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + crnn_vgg16_bn | 72.43 | 75.13 | 85.05 | 79.33 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + crnn_mobilenet_v3_small | 73.06 | 75.79 | 84.64 | 78.94 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + crnn_mobilenet_v3_large | 73.17 | 75.90 | 84.96 | 79.25 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + master | 73.90 | 76.66 | 85.84 | 80.07 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + sar_resnet31 | 73.58 | 76.33 | 85.64 | 79.88 | 
++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + vitstr_small | 73.06 | 75.79 | 85.95 | 80.17 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + vitstr_base | 73.70 | 76.46 | 85.76 | 79.99 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + parseq | 73.52 | 76.27 | 85.91 | 80.13 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| None | Gvision text detection | 59.50 | 62.50 | 75.30 | 59.03 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| None | Gvision doc. text detection | 64.00 | 53.30 | 68.90 | 61.10 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| None | AWS textract | 78.10 | 83.00 | 87.50 | 66.00 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| None | Azure Form Recognizer (v3.2) | 79.42 | 85.89 | 89.62 | 88.93 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ + All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). Explanations about the metrics being used are available in :ref:`metrics`. *Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - -Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information. 
- - -+----------------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | US Tax Forms | Resumes | Road Fines | -+==============================================+============+===============+============+===============+============+===============+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | 78.70 | 81.12 | 65.80 | 70.70 | 50.25 | 51.78 | 79.08 | 92.83 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + master (ours) | **79.00** | **81.42** | 65.57 | 69.86 | 51.34 | 52.90 | 78.86 | 92.57 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + sar_resnet31 (ours) | 78.94 | 81.37 | 65.89 | **70.79** | **51.78** | **53.35** | 79.04 | 92.78 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_small (ours) | 76.81 | 79.15 | 64.89 | 69.61 | 45.03 | 46.38 | 78.96 | 92.11 | 85.91 | 87.20 | 84.85 | 85.86 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_large (ours) | 78.01 | 80.39 | 65.36 | 70.11 | 48.00 | 49.43 | 79.39 | 92.62 | 87.68 | 89.00 | 85.65 | 86.67 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 | 74.93 | 63.04 | 68.41 | 39.36 | 41.75 | 72.14 | 89.97 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. 
text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | | | | |
-+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+
-| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | | | | |
-+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+
-
 Two-stage approaches
 ^^^^^^^^^^^^^^^^^^^^
-Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. Everything is wrapped up with `ocr_predictor `_.
+Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produce cropped images that will be passed into the text recognition block. Everything is wrapped up with :py:meth:`ocr_predictor `.
+
+.. code:: python3
 
-    >>> import numpy as np
-    >>> from doctr.models import ocr_predictor
-    >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
-    >>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
-    >>> out = model([input_page])
+    import numpy as np
+    from doctr.models import ocr_predictor
+    model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
+    input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
+    out = model([input_page])
 
 
 You can pass specific boolean arguments to the predictor:
 
-* `assume_straight_pages`
-* `preserve_aspect_ratio`
-* `symmetric_pad`
+* `assume_straight_pages`: if you work with straight documents only, it will fit straight bounding boxes to the text areas.
+* `preserve_aspect_ratio`: if you want to preserve the aspect ratio of your documents while resizing before sending them to the model.
+* `symmetric_pad`: if you choose to preserve the aspect ratio, it will pad the image symmetrically and not from the bottom-right.
 
 Those 3 are going straight to the detection predictor, as mentioned above (in the detection part).
 
+Additional arguments which can be passed to the `ocr_predictor` are:
+
+* `export_as_straight_boxes`: If you work with rotated and skewed documents but you still want to export straight bounding boxes and not polygons, set it to True.
+* `straighten_pages`: If you want to straighten the pages before sending them to the detection model, set it to True.
 
 For instance, this snippet instantiates an end-to-end ocr_predictor working with rotated documents, which preserves the aspect ratio of the documents, and returns polygons:
 
-    >>> from doctr.model import ocr_predictor
-    >>> model = ocr_predictor('linknet_resnet18_rotation', pretrained=True, assume_straight_pages=False, preserve_aspect_ratio=True)
+.. code:: python3
+
+    from doctr.models import ocr_predictor
+    model = ocr_predictor('linknet_resnet18', pretrained=True, assume_straight_pages=False, preserve_aspect_ratio=True)
+
+
+Additionally, you can change the batch size of the underlying detection and recognition predictors to optimize the performance depending on your hardware:
+
+* `det_bs`: batch size for the detection model (default: 2)
+* `reco_bs`: batch size for the recognition model (default: 128)
+
+.. code:: python3
+
+    from doctr.models import ocr_predictor
+    model = ocr_predictor(pretrained=True, det_bs=4, reco_bs=1024)
+
+To modify the output structure you can pass the following arguments to the predictor which will be handled by the underlying `DocumentBuilder`:
+
+* `resolve_lines`: whether words should be automatically grouped into lines (default: True)
+* `resolve_blocks`: whether lines should be automatically grouped into blocks (default: False)
+* `paragraph_break`: relative length of the minimum space separating paragraphs (default: 0.035)
+
+For example, to disable the automatic grouping of lines into blocks:
+
+.. code:: python3
+
+    from doctr.models import ocr_predictor
+    model = ocr_predictor(pretrained=True, resolve_blocks=False)
 
 What should I do with the output?
@@ -287,11 +350,19 @@ Here is a typical `Document` layout::
         )]
     )
 
+To get only the text content of the `Document`, you can use the `render` method::
+
+    text_output = result.render()
+
+For reference, here is the output for the `Document` above::
+
+    No. RECEIPT DATE
+
 You can also export them as a nested dict, more appropriate for JSON format::
 
     json_output = result.export()
 
-For reference, here is the JSON export for the same `Document` as above::
+For reference, here is the export for the same `Document` as above::
 
     {
       'pages': [
@@ -310,17 +381,23 @@ For reference, here is the JSON export for the same `Document` as above::
                         {
                           'value': 'No.',
                           'confidence': 0.914085328578949,
-                          'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875))
+                          'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)),
+                          'objectness_score': 0.96,
+                          'crop_orientation': {'value': 0, 'confidence': None},
                         },
                         {
                           'value': 'RECEIPT',
                           'confidence': 0.9949972033500671,
-                          'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375))
+                          'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)),
+                          'objectness_score': 0.99,
+                          'crop_orientation': {'value': 0, 'confidence': None},
                         },
                         {
                           'value': 'DATE',
                           'confidence': 0.9578408598899841,
-                          'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625))
+                          'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)),
+                          'objectness_score': 0.99,
+                          'crop_orientation': {'value': 0, 'confidence': None},
                         }
                     ]
                 }
@@ -332,14 +409,18 @@ For reference, here is the JSON export for the same `Document` as above::
         ]
     }
 
-To export the outpout as XML (hocr-format) you can use the `export_as_xml` method::
+To export the output as XML (hocr-format) you can use the `export_as_xml` method:
+
+.. code-block:: python
 
     xml_output = result.export_as_xml()
     for output in xml_output:
-        xml_bytes_string = output[0]
-        xml_element = output[1]
+        xml_bytes_string = output[0]
+        xml_element = output[1]
 
-For reference, here is a sample XML byte string output::
+For reference, here is a sample XML byte string output:
+
+.. code-block:: xml
 
@@ -362,3 +443,74 @@ For reference, here is a sample XML byte string output::
 
+
+
+Advanced options
+^^^^^^^^^^^^^^^^
+We provide a few advanced options to customize the behavior of the predictor to your needs:
+
+* Modify the binarization threshold for the detection model.
+* Modify the box threshold for the detection model.
+
+This is useful to detect (possibly fewer) text regions more accurately with a higher threshold, or to detect more text regions with a lower threshold.
+
+
+.. code:: python3
+
+    import numpy as np
+    from doctr.models import ocr_predictor
+    predictor = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
+
+    # Modify the binarization threshold and the box threshold
+    predictor.det_predictor.model.postprocessor.bin_thresh = 0.5
+    predictor.det_predictor.model.postprocessor.box_thresh = 0.2
+
+    input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
+    out = predictor([input_page])
+
+
+* Disable page orientation classification
+
+If you deal with documents which contain only small rotations (~ -45 to 45 degrees), you can disable the page orientation classification to speed up the inference.
+
+This will only have an effect with `assume_straight_pages=False` and/or `straighten_pages=True` and/or `detect_orientation=True`.
+
+.. code:: python3
+
+    from doctr.models import ocr_predictor
+    model = ocr_predictor(pretrained=True, assume_straight_pages=False, disable_page_orientation=True)
+
+
+* Disable crop orientation classification
+
+If you deal with documents which contain only horizontal text, you can disable the crop orientation classification to speed up the inference.
+
+This will only have an effect with `assume_straight_pages=False` and/or `straighten_pages=True`.
+
+.. code:: python3
+
+    from doctr.models import ocr_predictor
+    model = ocr_predictor(pretrained=True, assume_straight_pages=False, disable_crop_orientation=True)
+
+
+* Add a hook to the `ocr_predictor` to manipulate the location predictions before the crops are passed to the recognition model.
+
+.. code:: python3
+
+    from doctr.models import ocr_predictor
+
+    class CustomHook:
+        def __call__(self, loc_preds):
+            # Manipulate the location predictions here
+            # 1. The output structure needs to be the same as the input location predictions
+            # 2. Be aware that the coordinates are relative and need to be between 0 and 1
+            return loc_preds
+
+    my_hook = CustomHook()
+
+    predictor = ocr_predictor(pretrained=True)
+    # Add a hook in the middle of the pipeline
+    predictor.add_hook(my_hook)
+    # You can also add multiple hooks which will be executed sequentially
+    for hook in [my_hook, my_hook, my_hook]:
+        predictor.add_hook(hook)
diff --git a/v0.5.1/_sources/using_model_export.rst.txt b/v0.5.1/_sources/using_model_export.rst.txt
deleted file mode 100644
index 992f4e9866..0000000000
--- a/v0.5.1/_sources/using_model_export.rst.txt
+++ /dev/null
@@ -1,71 +0,0 @@
-Preparing your model for inference
-==================================
-
-A well-trained model is a good achievement but you might want to tune a few things to make it production-ready!
-
-.. currentmodule:: doctr.models.export
-
-
-Model compression
------------------
-
-This section is meant to help you perform inference with compressed versions of your model.
-
-
-TensorFlow Lite
-^^^^^^^^^^^^^^^
-
-TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. 
You can easily convert any Keras model into a serialized TFLite version as follows: - - >>> import tensorflow as tf - >>> from tensorflow.keras import Sequential - >>> from doctr.models import conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - >>> serialized_model = converter.convert() - -Half-precision -^^^^^^^^^^^^^^ - -If you want to convert it to half-precision using your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> converter.target_spec.supported_types = [tf.float16] - >>> serialized_model = converter.convert() - - -Post-training quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Finally if you wish to quantize the model with your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> # Float fallback for operators that do not have an integer implementation - >>> def representative_dataset(): - >>> for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)] - >>> converter.representative_dataset = representative_dataset - >>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - >>> converter.inference_input_type = tf.int8 - >>> converter.inference_output_type = tf.int8 - >>> serialized_model = converter.convert() - - -Using SavedModel ----------------- - -Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.5.1/_sources/using_models.rst.txt b/v0.5.1/_sources/using_models.rst.txt deleted file mode 100644 index 1c0752463f..0000000000 --- a/v0.5.1/_sources/using_models.rst.txt +++ /dev/null @@ -1,329 +0,0 @@ -Choosing the right model -======================== - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, docTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the deep learning model. -* Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable. - - -Text Detection --------------- - -The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don't). 
- -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `linknet_resnet18 `_ -* `db_resnet50 `_ -* `db_mobilenet_v3_large `_ - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -+------------------------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+=================================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 79.35 | 84.03 | 81.14 | 66.85 | | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ - -`detection_predictor `_ wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> predictor = detection_predictor('db_resnet50') - >>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -Text Recognition ----------------- - -The task consists of transcribing the character sequence in a given image. - - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `crnn_vgg16_bn `_ -* `crnn_mobilenet_v3_small `_ -* `crnn_mobilenet_v3_large `_ -* `sar_resnet31 `_ -* `master `_ - - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -.. list-table:: Text recognition model zoo - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 87.18 - - 92.93 - - 12.8 - * - crnn_mobilenet_v3_small - - (32, 128, 3) - - 2.1M - - 86.21 - - 90.56 - - - * - crnn_mobilenet_v3_large - - (32, 128, 3) - - 4.5M - - 86.95 - - 92.03 - - - * - sar_resnet31 - - (32, 128, 3) - - 56.2M - - **87.70** - - **93.41** - - 2.7 - * - master - - (32, 128, 3) - - 67.7M - - 87.62 - - 93.27 - - - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metric being used (exact match) are available in :ref:`metrics`. 
- -While most of our recognition models were trained on our french vocab (cf. :ref:`vocabs`), you can easily access the vocab of any model as follows: - - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> print(predictor.model.cfg['vocab']) - - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -`recognition_predictor `_ wraps your recognition model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -End-to-End OCR --------------- - -The task consists of both localizing and transcribing textual elements in a given image. - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -You can use any combination of detection and recognition models supporte by docTR. - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - -+----------------------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+========================================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 71.25 | 76.02 | 0.85 | 84.00 | 81.42 | 1.6 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + master | 71.03 | 76.06 | | 84.49 | 81.94 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | 71.25 | 76.29 | 0.27 | 84.50 | **81.96** | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_small | 69.85 | 74.80 | | 80.85 | 78.42 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_large | 70.57 | 75.57 | | 82.57 | 80.08 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_mobilenet_v3_large + crnn_vgg16_bn | 67.73 | 71.73 | | 71.65 | 59.03 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. 
text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - -Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information. - - -+----------------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | US Tax Forms | Resumes | Road Fines | -+==============================================+============+===============+============+===============+============+===============+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | 78.70 | 81.12 | 65.80 | 70.70 | 50.25 | 51.78 | 79.08 | 92.83 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + master (ours) | **79.00** | **81.42** | 65.57 | 69.86 | 51.34 | 52.90 | 78.86 | 92.57 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + sar_resnet31 (ours) | 78.94 | 81.37 | 65.89 | **70.79** | **51.78** | **53.35** | 79.04 | 92.78 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_small (ours) | 76.81 | 79.15 | 64.89 | 69.61 | 45.03 | 46.38 | 78.96 | 92.11 | 85.91 | 87.20 | 84.85 | 85.86 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| 
db_resnet50 + crnn_mobilenet_v3_large (ours) | 78.01 | 80.39 | 65.36 | 70.11 | 48.00 | 49.43 | 79.39 | 92.62 | 87.68 | 89.00 | 85.65 | 86.67 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 | 74.93 | 63.04 | 68.41 | 39.36 | 41.75 | 72.14 | 89.97 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. Everything is wrapped up with `ocr_predictor `_. - - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) - >>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([input_page]) - - -What should I do with the output? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ocr_predictor returns a `Document` object with a nested structure (with `Page`, `Block`, `Line`, `Word`, `Artefact`). 
-To get a better understanding of our document model, check our :ref:`document_structure` section - -Here is a typical `Document` layout:: - - Document( - (pages): [Page( - dimensions=(340, 600) - (blocks): [Block( - (lines): [Line( - (words): [ - Word(value='No.', confidence=0.91), - Word(value='RECEIPT', confidence=0.99), - Word(value='DATE', confidence=0.96), - ] - )] - (artefacts): [] - )] - )] - ) - -You can also export them as a nested dict, more appropriate for JSON format:: - - json_output = result.export() - -For reference, here is the JSON export for the same `Document` as above:: - - { - 'pages': [ - { - 'page_idx': 0, - 'dimensions': (340, 600), - 'orientation': {'value': None, 'confidence': None}, - 'language': {'value': None, 'confidence': None}, - 'blocks': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'lines': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'words': [ - { - 'value': 'No.', - 'confidence': 0.914085328578949, - 'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)) - }, - { - 'value': 'RECEIPT', - 'confidence': 0.9949972033500671, - 'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)) - }, - { - 'value': 'DATE', - 'confidence': 0.9578408598899841, - 'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)) - } - ] - } - ], - 'artefacts': [] - } - ] - } - ] - } - -To export the outpout as XML (hocr-format) you can use the `export_as_xml` method:: - - xml_output = result.export_as_xml() - for output in xml_output: - xml_bytes_string = output[0] - xml_element = output[1] - -For reference, here is a sample XML byte string output:: - - - - - docTR - hOCR - - - - - -
                                -
                                -

                                - - Hello - XML - World - -

                                -
                                - - \ No newline at end of file diff --git a/v0.5.1/_sources/utils.rst.txt b/v0.5.1/_sources/utils.rst.txt index ac0b13d9df..69c1abe0eb 100644 --- a/v0.5.1/_sources/utils.rst.txt +++ b/v0.5.1/_sources/utils.rst.txt @@ -14,8 +14,6 @@ Easy-to-use functions to make sense of your model's predictions. .. autofunction:: visualize_page -.. autofunction:: synthesize_page - .. _metrics: @@ -27,20 +25,12 @@ Implementations of task-specific metrics to easily assess your model performance .. autoclass:: TextMatch - .. automethod:: update .. automethod:: summary .. autoclass:: LocalizationConfusion - .. automethod:: update .. automethod:: summary .. autoclass:: OCRMetric - .. automethod:: update - .. automethod:: summary - -.. autoclass:: DetectionMetric - - .. automethod:: update .. automethod:: summary diff --git a/v0.5.1/_static/documentation_options.js b/v0.5.1/_static/documentation_options.js index aeb8991ab7..a7b5cbe04a 100644 --- a/v0.5.1/_static/documentation_options.js +++ b/v0.5.1/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.5.1a0-git', + VERSION: '0.3.0a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.5.1/changelog.html b/v0.5.1/changelog.html index 736a41f814..6ed2620fb7 100644 --- a/v0.5.1/changelog.html +++ b/v0.5.1/changelog.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + Changelog - docTR documentation @@ -227,33 +227,21 @@ @@ -295,26 +283,6 @@

                                Changelog

                                -
                                -

                                v0.5.0 (2021-12-31)

                                -

                                Release note: v0.5.0

                                -
                                -
                                -

                                v0.4.1 (2021-11-22)

                                -

                                Release note: v0.4.1

                                -
                                -
                                -

                                v0.4.0 (2021-10-01)

                                -

                                Release note: v0.4.0

                                -
                                -
                                -

                                v0.3.1 (2021-08-27)

                                -

                                Release note: v0.3.1

                                -
                                -
                                -

                                v0.3.0 (2021-07-02)

                                -

                                Release note: v0.3.0

                                -

                                v0.2.1 (2021-05-28)

                                Release note: v0.2.1

                                @@ -338,15 +306,23 @@

                                v0.1.0 (2021-03-05) - - + +
                                @@ -381,11 +357,6 @@

                                v0.1.0 (2021-03-05)

                                diff --git a/v0.5.1/contributing/code_of_conduct.html b/v0.5.1/contributing/code_of_conduct.html index 46e3f0cda8..7aa6177698 100644 --- a/v0.5.1/contributing/code_of_conduct.html +++ b/v0.5.1/contributing/code_of_conduct.html @@ -235,10 +235,16 @@

                                Using docTR

                                Package Reference

                      @@ -639,61 +735,61 @@

                      docTR private datasets>>> img, target = train_set[0] -
                      -
                      Parameters:
                      -
                        -
                      • img_folder – path to the images folder

                      • -
• labels_path – path to the json file containing all labels (character sequences)

                      • -
                      • **kwargs – keyword arguments from AbstractDataset.

                      • -
                      -
                      -
                      +
                      +

                      Args:

                      +
                      +

img_folder: path to the images folder +labels_path: path to the json file containing all labels (character sequences) +**kwargs: keyword arguments from AbstractDataset.

                      +
                      +

                      class doctr.datasets.OCRDataset(img_folder: str, label_file: str, use_polygons: bool = False, **kwargs: Any)[source]

                      Implements an OCR dataset

                      -
                      -
                      Parameters:
                      -
                        -
                      • img_folder – local path to image folder (all jpg at the root)

                      • -
                      • label_file – local path to the label file

                      • -
                      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                      • -
                      • **kwargs – keyword arguments from AbstractDataset.

                      • -
                      -
                      -
                      +
                      >>> from doctr.datasets import OCRDataset
                      +>>> train_set = OCRDataset(img_folder="/path/to/images",
                      +>>>                        label_file="/path/to/labels.json")
                      +>>> img, target = train_set[0]
                      +
                      +
                      +
                      +

                      Args:

                      +
                      +

                      img_folder: local path to image folder (all jpg at the root) +label_file: local path to the label file +use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) +**kwargs: keyword arguments from AbstractDataset.

                      +
                      +
                      - -
                      -

                      Data Loading

                      -

                      Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR.

                      +
                      +

                      Dataloader

                      -class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, num_workers: int | None = None, collate_fn: Callable | None = None)[source]
                      +class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, collate_fn: Callable | None = None)[source]

                      Implements a dataset wrapper for fast data loading

                      -
                      >>> from doctr.datasets import FUNSD, DataLoader
                      +
                      >>> from doctr.datasets import CORD, DataLoader
                       >>> train_set = CORD(train=True, download=True)
                       >>> train_loader = DataLoader(train_set, batch_size=32)
                       >>> train_iter = iter(train_loader)
                       >>> images, targets = next(train_iter)
                       
                      -
                      -
                      Parameters:
                      -
                        -
                      • dataset – the dataset

                      • -
                      • shuffle – whether the samples should be shuffled before passing it to the iterator

                      • -
                      • batch_size – number of elements in each batch

                      • -
                      • drop_last – if True, drops the last batch if it isn’t full

                      • -
                      • num_workers – number of workers to use for data loading

                      • -
                      • collate_fn – function to merge samples into a batch

                      • -
                      -
                      -
                      +
                      +

                      Args:

                      +
                      +

                      dataset: the dataset +shuffle: whether the samples should be shuffled before passing it to the iterator +batch_size: number of elements in each batch +drop_last: if True, drops the last batch if it isn’t full +collate_fn: function to merge samples into a batch

                      +
                      +
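A full pass over the dataset then reduces to iterating the loader; a small sketch, assuming the train_loader built above:
>>> # one epoch: the loader keeps yielding (images, targets) batches until the dataset is exhausted
>>> for images, targets in train_loader:
>>>     pass  # forward/backward pass would go here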
                      @@ -701,9 +797,9 @@

                      Data Loading

                      Supported Vocabs

                      Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets of vocabs.
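Any entry of the table below can also be retrieved programmatically; a small sketch, assuming the VOCABS mapping exposed by doctr.datasets:
>>> from doctr.datasets import VOCABS
>>> vocab = VOCABS["english"]  # the 100-character English vocab listed below
>>> len(vocab)
100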

                      -
                      - - +
                      +
                      docTR Vocabs
                      +@@ -720,69 +816,159 @@

                      Data Loading

                      + + + + + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      docTR Vocabs

                      ascii_letters

                      hindi_digits

                      10

                      ٠١٢٣٤٥٦٧٨٩

                      ascii_letters

                      52

                      abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

                      punctuation

                      punctuation

                      32

                      !”#$%&'()*+,-./:;<=>?@[\]^_`{|}~

                      currency

                      currency

                      5

                      £€¥¢฿

                      latin

                      ancient_greek

                      48

                      αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ

                      arabic_letters

                      37

                      ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي

                      generic_cyrillic_letters

                      58

                      абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ

                      persian_letters

                      5

                      پچڢڤگ

                      arabic_diacritics

                      2

                      ‘ًٌٍَُِّْ’

                      arabic_punctuation

                      5

                      ؟؛«»—

                      latin

                      94

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~

                      english

                      english

                      100

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿

                      legacy_french

                      legacy_french

                      123

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

                      french

                      french

                      126

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ

                      portuguese

                      portuguese

                      131

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿

                      spanish

                      spanish

                      116

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿

                      italian

                      120

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ

                      german

                      108

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ

                      arabic

                      101

                      ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيپچڢڤگ؟؛«»—0123456789٠١٢٣٤٥٦٧٨٩’ًٌٍَُِّْ’!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~

                      czech

                      130

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ

                      polish

                      118

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿ąćęłńóśźżĄĆĘŁŃÓŚŹŻ

                      dutch

                      114

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ

                      norwegian

                      106

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿æøåÆØÅ

                      danish

                      106

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿æøåÆØÅ

                      finnish

                      104

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿äöÄÖ

                      swedish

                      106

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿åäöÅÄÖ

                      ukrainian

                      115

                      абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ0123456789!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿ґіїєҐІЇЄ₴

                      vietnamese

                      236

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ

                      hebrew

                      123

                      0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿אבגדהוזחטיכלמנסעפצקרשת₪

                      hindi

                      71

                      अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह०१२३४५६७८९।,?!:्ॐ॰॥॰

                      bangla

                      70

                      অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ০১২৩৪৫৬৭৮৯

                      multilingual

                      195

                      english & french & german & italian & spanish & portuguese & czech & polish & dutch & norwegian & danish & finnish & swedish & §

                      -doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, dynamic_seq_length: bool = False, **kwargs: Any) ndarray[source]
                      +doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, dynamic_seq_length: bool = False) ndarray[source]

                      Encode character sequences using a given vocab as mapping

                      -
                      -
                      Parameters:
                      -
                        -
                      • sequences – the list of character sequences of size N

                      • -
                      • vocab – the ordered vocab to use for encoding

                      • -
                      • target_size – maximum length of the encoded data

                      • -
                      • eos – encoding of End Of String

                      • -
                      • sos – optional encoding of Start Of String

                      • -
                      • pad – optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD

                      • -
                      • dynamic_seq_length – if target_size is specified, uses it as upper bound and enables dynamic sequence size

                      • -
                      -
                      -
                      Returns:
                      -

                      the padded encoded data as a tensor

                      -
                      -
                      +
                      +

                      Args:

                      +
                      +

                      sequences: the list of character sequences of size N +vocab: the ordered vocab to use for encoding +target_size: maximum length of the encoded data +eos: encoding of End Of String +sos: optional encoding of Start Of String +pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD +dynamic_seq_length: if target_size is specified, uses it as upper bound and enables dynamic sequence size

                      +
                      +
                      +
                      +

                      Returns:

                      +
                      +

                      the padded encoded data as a tensor

                      +
                      +
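A short usage sketch, reusing the English vocab from the table above:
>>> from doctr.datasets import VOCABS, encode_sequences
>>> # map two transcriptions to integer indices, padded to 16 positions (unused positions are filled with the EOS encoding by default)
>>> encoded = encode_sequences(["hello", "world"], vocab=VOCABS["english"], target_size=16)
>>> encoded.shape
(2, 16)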

                      @@ -802,14 +988,14 @@

                      Data Loading - +
                      @@ -844,8 +1030,7 @@

                      Data Loading
                      • doctr.datasets
                          -
                        • Available Datasets
                            -
                          • Public datasets
                              +
                            • doctr.datasets
                            • -
                            • docTR synthetic datasets
                                -
                              • DocArtefacts
                              • +
                              • Synthetic dataset generator
                              • -
                              • docTR private datasets -
                              • -
                              • Data Loading
                                  +
                                • Dataloader
                                • @@ -891,7 +1077,7 @@

                                  Data Loading + diff --git a/v0.5.1/modules/io.html b/v0.5.1/modules/io.html index c503b8ff82..0706457520 100644 --- a/v0.5.1/modules/io.html +++ b/v0.5.1/modules/io.html @@ -235,10 +235,16 @@

                                  Using docTR

                                  Package Reference

                                    +
                                  • doctr.contrib
                                  • doctr.datasets
                                  • doctr.io
                                  • doctr.models
                                  • @@ -305,18 +311,19 @@

                                    WordA Word is an uninterrupted sequence of characters.

                                    -class doctr.io.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray)[source]
                                    +class doctr.io.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray, objectness_score: float, crop_orientation: Dict[str, Any])[source]

                                    Implements a word element

                                    -
                                    -
                                    Parameters:
                                    -
                                      -
                                    • value – the text string of the word

                                    • -
                                    • confidence – the confidence associated with the text prediction

                                    • -
                                    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to

                                    • -
                                    • size (the page's)

                                    • -
                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

                                    value: the text string of the word +confidence: the confidence associated with the text prediction +geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to +the page’s size +objectness_score: the objectness score of the detection +crop_orientation: the general orientation of the crop in degrees and its confidence

                                    +
                                    +
                                    @@ -325,18 +332,19 @@

                                    LineA Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines).

                                    -class doctr.io.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray | None = None)[source]
                                    +class doctr.io.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray | None = None, objectness_score: float | None = None)[source]

                                    Implements a line element as a collection of words

                                    -
                                    -
                                    Parameters:
                                    -
                                      -
                                    • words – list of word elements

                                    • -
• geometry – bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

                                    • -
                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

words: list of word elements +geometry: bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to

                                    +
                                    +

                                    the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing +all words in it.

                                    +
                                    +
                                    +
                                    @@ -347,16 +355,17 @@

                                    Artefact
                                    class doctr.io.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]

                                    Implements a non-textual element

                                    -
                                    -
                                    Parameters:
                                    -
                                      -
                                    • artefact_type – the type of artefact

                                    • -
                                    • confidence – the confidence of the type prediction

                                    • -
• geometry – bounding box of the artefact in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

                                    • -
                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

artefact_type: the type of artefact +confidence: the confidence of the type prediction +geometry: bounding box of the artefact in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to

                                    +
                                    +

                                    the page’s size.

                                    +
                                    +
                                    +
                                    @@ -365,19 +374,20 @@

                                    Block

                                    A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

                                    -class doctr.io.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray | None = None)[source]
                                    +class doctr.io.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray | None = None, objectness_score: float | None = None)[source]

                                    Implements a block element as a collection of lines and artefacts

                                    -
                                    -
                                    Parameters:
                                    -
                                      -
                                    • lines – list of line elements

                                    • -
                                    • artefacts – list of artefacts

                                    • -
• geometry – bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

                                    • -
                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

lines: list of line elements +artefacts: list of artefacts +geometry: bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to

                                    +
                                    +

                                    the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing +all lines and artefacts in it.

                                    +
                                    +
                                    +
                                    @@ -386,34 +396,34 @@

                                    PageA Page is a collection of Blocks that were on the same physical page.

                                    -class doctr.io.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
                                    +class doctr.io.Page(page: ndarray, blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]

                                    Implements a page element as a collection of blocks

                                    -
                                    -
                                    Parameters:
                                    -
                                      -
                                    • blocks – list of block elements

                                    • -
                                    • page_idx – the index of the page in the input raw document

                                    • -
                                    • dimensions – the page size in pixels in format (height, width)

                                    • -
• orientation – a dictionary with the value of the rotation angle in degrees and confidence of the prediction

                                    • -
                                    • language – a dictionary with the language value and confidence of the prediction

                                    • -
                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

page: image encoded as a numpy array in uint8 +blocks: list of block elements +page_idx: the index of the page in the input raw document +dimensions: the page size in pixels in format (height, width) +orientation: a dictionary with the value of the rotation angle in degrees and confidence of the prediction +language: a dictionary with the language value and confidence of the prediction

                                    +
                                    -show(page: ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) None[source]
                                    +show(interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) None[source]

                                    Overlay the result on a given image

                                    Parameters:
                                      -
                                    • page – image encoded as a numpy array in uint8

                                    • interactive – whether the display should be interactive

                                    • preserve_aspect_ratio – pass True if you passed True to the predictor

                                    • +
                                    • **kwargs – additional keyword arguments passed to the matplotlib.pyplot.show method

                                    +
                                    @@ -424,22 +434,18 @@

                                    Document
                                    class doctr.io.Document(pages: List[Page])[source]

                                    Implements a document element as a collection of pages

                                    -
                                    -
                                    Parameters:
                                    -

                                    pages – list of page elements

                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

                                    pages: list of page elements

                                    +
                                    -show(pages: List[ndarray], **kwargs) None[source]
                                    +show(**kwargs) None[source]

                                    Overlay the result on a given image

                                    -
                                    -
                                    Parameters:
                                    -

                                    pages – list of images encoded as numpy arrays in uint8

                                    -
                                    -
                                    +
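A short usage sketch, assuming result is a Document returned by a predictor:
>>> result.show()  # overlay the predictions on each page with matplotlib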
                                    @@ -449,98 +455,113 @@

                                    File reading
                                    -doctr.io.read_pdf(file: str | Path | bytes, scale: float = 2, **kwargs: Any) List[ndarray][source]
                                    +doctr.io.read_pdf(file: str | Path | bytes, scale: float = 2, rgb_mode: bool = True, password: str | None = None, **kwargs: Any) List[ndarray][source]

                                    Read a PDF file and convert it into an image in numpy format

                                    -
                                    >>> from doctr.documents import read_pdf
                                    +
                                    >>> from doctr.io import read_pdf
                                     >>> doc = read_pdf("path/to/your/doc.pdf")
                                     
                                    -
                                    -
                                    Parameters:
                                    -
                                      -
                                    • file – the path to the PDF file

                                    • -
                                    • scale – rendering scale (1 corresponds to 72dpi)

                                    • -
                                    • kwargs – additional parameters to pypdfium2._helpers.pdf_renderer.render_pdf_topil()

                                    • -
                                    -
                                    -
                                    Returns:
                                    -

                                    the list of pages decoded as numpy ndarray of shape H x W x C

                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

                                    file: the path to the PDF file +scale: rendering scale (1 corresponds to 72dpi) +rgb_mode: if True, the output will be RGB, otherwise BGR +password: a password to unlock the document, if encrypted +**kwargs: additional parameters to pypdfium2.PdfPage.render()

                                    +
                                    +
                                    +
                                    +

                                    Returns:

                                    +
                                    +

                                    the list of pages decoded as numpy ndarray of shape H x W x C

                                    +
                                    +
                                    doctr.io.read_img_as_numpy(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]

                                    Read an image file into numpy format

                                    -
                                    >>> from doctr.documents import read_img
                                    ->>> page = read_img("path/to/your/doc.jpg")
                                    +
                                    >>> from doctr.io import read_img_as_numpy
                                    +>>> page = read_img_as_numpy("path/to/your/doc.jpg")
                                     
                                    -
                                    -
                                    Parameters:
                                    -
                                      -
                                    • file – the path to the image file

                                    • -
                                    • output_size – the expected output size of each page in format H x W

                                    • -
                                    • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

                                    • -
                                    -
                                    -
                                    Returns:
                                    -

                                    the page decoded as numpy ndarray of shape H x W x 3

                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

                                    file: the path to the image file +output_size: the expected output size of each page in format H x W +rgb_output: whether the output ndarray channel order should be RGB instead of BGR.

                                    +
                                    +
                                    +
                                    +

                                    Returns:

                                    +
                                    +

                                    the page decoded as numpy ndarray of shape H x W x 3

                                    +
                                    +
                                    doctr.io.read_img_as_tensor(img_path: str | Path, dtype: DType = tf.float32) Tensor[source]

                                    Read an image file as a TensorFlow tensor

                                    -
                                    -
                                    Parameters:
                                    -
                                      -
                                    • img_path – location of the image file

                                    • -
                                    • dtype – the desired data type of the output tensor. If it is float-related, values will be divided by 255.

                                    • -
                                    -
                                    -
                                    Returns:
                                    -

                                    decoded image as a tensor

                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

                                    img_path: location of the image file +dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.

                                    +
                                    +
                                    +
                                    +

                                    Returns:

                                    +
                                    +

                                    decoded image as a tensor

                                    +
                                    +
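A short usage sketch (hypothetical path; with a float dtype, pixel values are rescaled to [0, 1] as noted above):
>>> import tensorflow as tf
>>> from doctr.io import read_img_as_tensor
>>> img = read_img_as_tensor("path/to/your/doc.jpg", dtype=tf.float32)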
                                    doctr.io.decode_img_as_tensor(img_content: bytes, dtype: DType = tf.float32) Tensor[source]

                                    Read a byte stream as a TensorFlow tensor

                                    -
                                    -
                                    Parameters:
                                    -
                                      -
                                    • img_content – bytes of a decoded image

                                    • -
                                    • dtype – the desired data type of the output tensor. If it is float-related, values will be divided by 255.

                                    • -
                                    -
                                    -
                                    Returns:
                                    -

                                    decoded image as a tensor

                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

                                    img_content: bytes of a decoded image +dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.

                                    +
                                    +
                                    +
                                    +

                                    Returns:

                                    +
                                    +

                                    decoded image as a tensor

                                    +
                                    +
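A short usage sketch (hypothetical path), useful when the image bytes already sit in memory, e.g. coming from an upload:
>>> from doctr.io import decode_img_as_tensor
>>> with open("path/to/your/doc.jpg", "rb") as f:
>>>     img = decode_img_as_tensor(f.read())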
                                    doctr.io.read_html(url: str, **kwargs: Any) bytes[source]

Read a web page and convert it into a PDF as a bytes stream

                                    -
                                    >>> from doctr.documents import read_html
                                    +
                                    >>> from doctr.io import read_html
                                     >>> doc = read_html("https://www.yoursite.com")
                                     
                                    -
                                    -
                                    Parameters:
                                    -

                                    url – URL of the target web page

                                    -
                                    -
                                    Returns:
                                    -

                                    decoded PDF file as a bytes stream

                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

                                    url: URL of the target web page +**kwargs: keyword arguments from weasyprint.HTML

                                    +
                                    +
                                    +
                                    +

                                    Returns:

                                    +
                                    +

                                    decoded PDF file as a bytes stream

                                    +
                                    +
                                    @@ -551,54 +572,69 @@

                                    File reading classmethod from_pdf(file: str | Path | bytes, **kwargs) List[ndarray][source]

                                    Read a PDF file

                                    -
                                    >>> from doctr.documents import DocumentFile
                                    +
                                    >>> from doctr.io import DocumentFile
                                     >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
                                     
                                    -
                                    -
                                    Parameters:
                                    -

                                    file – the path to the PDF file or a binary stream

                                    -
                                    -
                                    Returns:
                                    -

                                    the list of pages decoded as numpy ndarray of shape H x W x 3

                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

                                    file: the path to the PDF file or a binary stream +**kwargs: additional parameters to pypdfium2.PdfPage.render()

                                    +
                                    +
                                    +
                                    +

                                    Returns:

                                    +
                                    +

                                    the list of pages decoded as numpy ndarray of shape H x W x 3

                                    +
                                    +

                                    classmethod from_url(url: str, **kwargs) List[ndarray][source]

                                    Interpret a web page as a PDF document

                                    -
                                    >>> from doctr.documents import DocumentFile
                                    +
                                    >>> from doctr.io import DocumentFile
                                     >>> doc = DocumentFile.from_url("https://www.yoursite.com")
                                     
                                    -
                                    -
                                    Parameters:
                                    -

                                    url – the URL of the target web page

                                    -
                                    -
                                    Returns:
                                    -

                                    the list of pages decoded as numpy ndarray of shape H x W x 3

                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

                                    url: the URL of the target web page +**kwargs: additional parameters to pypdfium2.PdfPage.render()

                                    +
                                    +
                                    +
                                    +

                                    Returns:

                                    +
                                    +

                                    the list of pages decoded as numpy ndarray of shape H x W x 3

                                    +
                                    +
                                    classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]

                                    Read an image file (or a collection of image files) and convert it into an image in numpy format

                                    -
                                    >>> from doctr.documents import DocumentFile
                                    +
                                    >>> from doctr.io import DocumentFile
                                     >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
                                     
                                    -
                                    -
                                    Parameters:
                                    -

                                    files – the path to the image file or a binary stream, or a collection of those

                                    -
                                    -
                                    Returns:
                                    -

                                    the list of pages decoded as numpy ndarray of shape H x W x 3

                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

                                    files: the path to the image file or a binary stream, or a collection of those +**kwargs: additional parameters to doctr.io.image.read_img_as_numpy()

                                    +
                                    +
                                    +
                                    +

                                    Returns:

                                    +
                                    +

                                    the list of pages decoded as numpy ndarray of shape H x W x 3

                                    +
                                    +
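Putting it together, a typical end-to-end sketch (assuming the default pretrained predictor) pairs DocumentFile with ocr_predictor:
>>> from doctr.io import DocumentFile
>>> from doctr.models import ocr_predictor
>>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf")  # list of H x W x 3 numpy pages
>>> model = ocr_predictor(pretrained=True)
>>> result = model(pages)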
                                    @@ -720,7 +756,7 @@

                                    File reading + diff --git a/v0.5.1/modules/models.html b/v0.5.1/modules/models.html index d1a394239b..e836bd7887 100644 --- a/v0.5.1/modules/models.html +++ b/v0.5.1/modules/models.html @@ -235,10 +235,16 @@

                                    Using docTR

                                    Package Reference

                                  • doctr.models.detection
                                  • @@ -957,11 +1570,21 @@

                                    doctr.models.zoocrnn_mobilenet_v3_large()
                                  • sar_resnet31()
                                  • master()
                                  • +
                                  • vitstr_small()
                                  • +
                                  • vitstr_base()
                                  • +
                                  • parseq()
                                  • recognition_predictor()
                                • doctr.models.zoo +
                                • +
                                • doctr.models.factory
                                @@ -975,7 +1598,7 @@

                                doctr.models.zoo + diff --git a/v0.5.1/modules/transforms.html b/v0.5.1/modules/transforms.html index ed36d83f74..1684036838 100644 --- a/v0.5.1/modules/transforms.html +++ b/v0.5.1/modules/transforms.html @@ -235,10 +235,16 @@

                                Using docTR

                                Package Reference

                              • Composing transformations
                                  @@ -798,7 +830,7 @@

                                  Composing transformations + diff --git a/v0.5.1/modules/utils.html b/v0.5.1/modules/utils.html index 3a218197cb..f9836a1705 100644 --- a/v0.5.1/modules/utils.html +++ b/v0.5.1/modules/utils.html @@ -235,10 +235,16 @@

                                  Using docTR

                                  Package Reference

                                    +
                                  • doctr.contrib
                                  • doctr.datasets
                                  • doctr.io
                                  • doctr.models
                                  • @@ -314,38 +320,25 @@

                                    Visualization>>> plt.show() -
                                    -
                                    Parameters:
                                    -
                                      -
                                    • page – the exported Page of a Document

                                    • -
• image – np array of the page, needs to have the same shape as page[‘dimensions’]

                                    • -
                                    • words_only – whether only words should be displayed

                                    • -
                                    • display_artefacts – whether artefacts should be displayed

                                    • -
                                    • scale – figsize of the largest windows side

                                    • -
                                    • interactive – whether the plot should be interactive

                                    • -
                                    • add_labels – for static plot, adds text labels on top of bounding box

                                    • -
                                    -
                                    -
                                    - - -
                                    -
                                    -doctr.utils.visualization.synthesize_page(page: Dict[str, Any], draw_proba: bool = False, font_family: str | None = None) ndarray[source]
                                    -

Draw the content of the element page (OCR response) on a blank page.

                                    -
                                    -
                                    Parameters:
                                    -
                                      -
                                    • page – exported Page object to represent

                                    • -
                                    • draw_proba – if True, draw words in colors to represent confidence. Blue: p=1, red: p=0

                                    • -
                                    • font_size – size of the font, default font = 13

                                    • -
                                    • font_family – family of the font

                                    • -
                                    -
                                    -
                                    Returns:
                                    -

                                    the synthesized page

                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

page: the exported Page of a Document +image: np array of the page, needs to have the same shape as page[‘dimensions’] +words_only: whether only words should be displayed +display_artefacts: whether artefacts should be displayed +scale: figsize of the largest windows side +interactive: whether the plot should be interactive +add_labels: for static plot, adds text labels on top of bounding box +**kwargs: keyword arguments for the polygon patch

                                    +
                                    +
                                    +
                                    +

                                    Returns:

                                    +
                                    +

                                    the matplotlib figure

                                    +
                                    +
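A usage sketch, assuming result is a Document from a predictor and page_image is the matching numpy page (e.g. the first element returned by DocumentFile):
>>> import matplotlib.pyplot as plt
>>> from doctr.utils.visualization import visualize_page
>>> visualize_page(result.pages[0].export(), page_image, words_only=True)
>>> plt.show()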
                                    @@ -385,14 +378,13 @@

                                    Visualization update(gt: List[str], pred: List[str]) None[source]

                                    Update the state of the metric with new predictions

                                    -
                                    -
                                    Parameters:
                                    -
                                      -
• gt – list of ground-truth character sequences

                                    • -
                                    • pred – list of predicted character sequences

                                    • -
                                    -
                                    -
                                    +
                                    +

                                    Args:

                                    +
                                    +

gt: list of ground-truth character sequences +pred: list of predicted character sequences

                                    +
                                    +
                                    @@ -401,8 +393,11 @@

                                    Visualization
                                    Returns:
                                    -

                                    a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode -counterpart and its lower-case unidecode counterpart

                                    +

                                      +
a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii counterpart and its lower-case anyascii counterpart
                                    +
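A usage sketch of the matching behaviour described above:
>>> from doctr.utils.metrics import TextMatch
>>> metric = TextMatch()
>>> metric.update(gt=["Hello", "world"], pred=["hello", "world"])
>>> metric.summary()  # exact, lower-case and anyascii match rates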

                                    @@ -411,7 +406,7 @@

                                    Visualization
                                    -class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
                                    +class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, use_polygons: bool = False)[source]

                                    Implements common confusion metrics and mean IoU for localization evaluation.

                                    The aggregated metrics are computed as follows:

                                    @@ -442,28 +437,23 @@

>>> metric.summary()

-Parameters:
-
-    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match
-    • use_polygons – if set to True, predictions and targets will be expected to have rotated format
-    • mask_shape – if use_polygons is True, describes the spatial shape of the image used
-    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory
-
+Args:
+
+iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
+use_polygons: if set to True, predictions and targets will be expected to have rotated format
update(gts: ndarray, preds: ndarray) → None[source]

Updates the metric

-Parameters:
-
-    • gts – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
-    • preds – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
-
+Args:
+
+gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
+preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
                                    @@ -471,17 +461,18 @@

summary() → Tuple[float | None, float | None, float | None][source]

Computes the aggregated metrics

-Returns:
+Return type:

a tuple with the recall, precision and meanIoU scores
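A small illustrative sketch under the two-argument constructor shown above; the relative boxes are made-up values:

>>> import numpy as np
>>> from doctr.utils.metrics import LocalizationConfusion
>>> metric = LocalizationConfusion(iou_thresh=0.5)
>>> gts = np.array([[0.10, 0.10, 0.40, 0.40]])  # (N, 4) relative boxes
>>> preds = np.array([[0.11, 0.10, 0.39, 0.42], [0.60, 0.60, 0.80, 0.90]])  # (M, 4) relative boxes
>>> metric.update(gts, preds)
>>> recall, precision, mean_iou = metric.summary()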
                                    -class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
                                    +class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, use_polygons: bool = False)[source]

                                    Implements an end-to-end OCR metric.

                                    The aggregated metrics are computed as follows:

                                    @@ -516,30 +507,25 @@

>>> metric.summary()

-Parameters:
-
-    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match
-    • use_polygons – if set to True, predictions and targets will be expected to have rotated format
-    • mask_shape – if use_polygons is True, describes the spatial shape of the image used
-    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory
-
+Args:
+
+iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
+use_polygons: if set to True, predictions and targets will be expected to have rotated format
update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: List[str], pred_labels: List[str]) → None[source]

Updates the metric

-Parameters:
-
-    • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
-    • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
-    • gt_labels – a list of N string labels
-    • pred_labels – a list of M string labels
-
+Args:
+
+gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
+pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
+gt_labels: a list of N string labels
+pred_labels: a list of M string labels
                                    @@ -547,17 +533,18 @@

summary() → Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]

Computes the aggregated metrics

-Returns:
+Return type:

a tuple with the recall & precision for each string comparison and the mean IoU
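An illustrative sketch combining boxes and transcriptions, mirroring the update signature above; the values are invented and the returned dictionaries follow the string-comparison naming used earlier:

>>> import numpy as np
>>> from doctr.utils.metrics import OCRMetric
>>> metric = OCRMetric(iou_thresh=0.5)
>>> gt_boxes = np.array([[0.1, 0.1, 0.4, 0.4]])    # (N, 4) relative boxes
>>> pred_boxes = np.array([[0.1, 0.1, 0.4, 0.4]])  # (M, 4) relative boxes
>>> metric.update(gt_boxes, pred_boxes, ["Hello"], ["hello"])
>>> text_recall, text_precision, mean_iou = metric.summary()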
                                    -class doctr.utils.metrics.DetectionMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
                                    +class doctr.utils.metrics.DetectionMetric(iou_thresh: float = 0.5, use_polygons: bool = False)[source]

                                    Implements an object detection metric.

                                    The aggregated metrics are computed as follows:

                                    @@ -592,30 +579,25 @@

>>> metric.summary()

-Parameters:
-
-    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match
-    • use_polygons – if set to True, predictions and targets will be expected to have rotated format
-    • mask_shape – if use_polygons is True, describes the spatial shape of the image used
-    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory
-
+Args:
+
+iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
+use_polygons: if set to True, predictions and targets will be expected to have rotated format
update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: ndarray, pred_labels: ndarray) → None[source]

Updates the metric

-Parameters:
-
-    • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
-    • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
-    • gt_labels – an array of class indices of shape (N,)
-    • pred_labels – an array of class indices of shape (M,)
-
+Args:
+
+gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
+pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
+gt_labels: an array of class indices of shape (N,)
+pred_labels: an array of class indices of shape (M,)
                                    @@ -623,12 +605,13 @@

summary() → Tuple[float | None, float | None, float | None][source]

Computes the aggregated metrics

-Returns:
+Return type:

a tuple with the recall & precision for each class prediction and the mean IoU
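A final illustrative sketch for the class-aware variant; the class indices and boxes are invented for the example:

>>> import numpy as np
>>> from doctr.utils.metrics import DetectionMetric
>>> metric = DetectionMetric(iou_thresh=0.5)
>>> gt_boxes = np.array([[0.1, 0.1, 0.4, 0.4], [0.5, 0.5, 0.9, 0.9]])  # (N, 4) relative boxes
>>> pred_boxes = np.array([[0.1, 0.1, 0.4, 0.4]])                      # (M, 4) relative boxes
>>> gt_labels = np.array([0, 1])  # class indices, shape (N,)
>>> pred_labels = np.array([0])   # class indices, shape (M,)
>>> metric.update(gt_boxes, pred_boxes, gt_labels, pred_labels)
>>> recall, precision, mean_iou = metric.summary()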
                                    @@ -692,7 +675,6 @@

                                        @@ -729,7 +711,7 @@

diff --git a/v0.5.1/notebooks.html b/v0.5.1/notebooks.html
index 50c69b38cf..42abaa6cfd 100644
--- a/v0.5.1/notebooks.html
+++ b/v0.5.1/notebooks.html
@@ -235,10 +235,16 @@

Using docTR

Package Reference

+    • doctr.contrib
    • doctr.datasets
    • doctr.io
    • doctr.models

@@ -375,7 +381,7 @@

                                          docTR Notebooks + diff --git a/v0.5.1/objects.inv b/v0.5.1/objects.inv index dd8cd5f569..a22d2ce821 100644 Binary files a/v0.5.1/objects.inv and b/v0.5.1/objects.inv differ diff --git a/v0.5.1/search.html b/v0.5.1/search.html index 1ed48711a1..fea94ac955 100644 --- a/v0.5.1/search.html +++ b/v0.5.1/search.html @@ -227,32 +227,20 @@ - + diff --git a/v0.5.1/searchindex.js b/v0.5.1/searchindex.js index 46bed16bb2..231483d7a6 100644 --- a/v0.5.1/searchindex.js +++ b/v0.5.1/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. Permanent Ban": [[1, "permanent-ban"]], "Annotation typing": [[2, "annotation-typing"]], "Artefact": [[6, "artefact"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[5, "available-datasets"]], "Available architectures": [[12, "available-architectures"], [12, "id1"], [12, "id3"]], "Block": [[6, "block"]], "Changelog": [[0, null]], "Choosing the right model": [[12, null]], "Code quality": [[2, "code-quality"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[8, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Data Loading": [[5, "data-loading"]], "Detection predictors": [[12, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Docstring format": [[2, "docstring-format"]], "Document": [[6, "document"]], "Document structure": [[6, "document-structure"]], "End-to-End OCR": [[12, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[6, "file-reading"]], "Half-precision": [[11, "half-precision"]], "Import order": [[2, "import-order"]], "Installation": [[3, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[6, "line"]], "Lint verification": [[2, "lint-verification"]], "Main Features": [[4, "main-features"]], "Model compression": [[11, "model-compression"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[6, "page"]], "Post-training quantization": [[11, "post-training-quantization"]], "Preparing your model for inference": [[11, null]], "Prerequisites": [[3, "prerequisites"]], "Public datasets": [[5, "public-datasets"]], "Questions": [[2, "questions"]], "Recognition predictors": [[12, "recognition-predictors"]], "Scope": [[1, "scope"]], "Supported Vocabs": [[5, "supported-vocabs"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[8, "supported-transformations"]], "Task evaluation": [[9, "task-evaluation"]], "TensorFlow Lite": [[11, "tensorflow-lite"]], "Text Detection": [[12, "text-detection"]], "Text Recognition": [[12, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition model zoo": [[12, "id5"]], "Text recognition models": [[4, "text-recognition-models"]], "Two-stage approaches": [[12, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], 
"Using SavedModel": [[11, "using-savedmodel"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[12, "what-should-i-do-with-the-output"]], "Word": [[6, "word"]], "docTR Notebooks": [[10, null]], "docTR Vocabs": [[5, "id2"]], "docTR private datasets": [[5, "doctr-private-datasets"]], "docTR synthetic datasets": [[5, "doctr-synthetic-datasets"]], "docTR: Document Text Recognition": [[4, null]], "doctr.datasets": [[5, null]], "doctr.io": [[6, null]], "doctr.models": [[7, null]], "doctr.models.classification": [[7, "doctr-models-classification"]], "doctr.models.detection": [[7, "doctr-models-detection"]], "doctr.models.recognition": [[7, "doctr-models-recognition"]], "doctr.models.zoo": [[7, "doctr-models-zoo"]], "doctr.transforms": [[8, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[6, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[6, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[8, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[5, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[8, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[8, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[5, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[7, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[5, "doctr.datasets.loader.DataLoader", false]], 
"db_mobilenet_v3_large() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[7, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[5, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[5, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[6, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[6, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[5, "doctr.datasets.encode_sequences", false]], "from_images() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[5, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[8, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[8, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[5, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[5, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[5, "doctr.datasets.IIIT5K", false]], "imgur5k (class in doctr.datasets)": [[5, "doctr.datasets.IMGUR5K", false]], "lambdatransformation (class in doctr.transforms)": [[8, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[6, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "magc_resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.master", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_orientation() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[8, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[7, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[5, 
"doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[8, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[6, "doctr.io.Page", false]], "randomapply (class in doctr.transforms)": [[8, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[8, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[8, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[8, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[8, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[8, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[8, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[8, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[8, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[8, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[8, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[6, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[6, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[6, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[5, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[8, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[6, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[6, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[5, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[5, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[5, "doctr.datasets.SVT", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "synthtext (class in doctr.datasets)": [[5, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[9, 
"doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[8, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.visualize_page", false]], "word (class in doctr.io)": [[6, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[5, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[5, 0, 1, "", "CORD"], [5, 0, 1, "", "CharacterGenerator"], [5, 0, 1, "", "DetectionDataset"], [5, 0, 1, "", "DocArtefacts"], [5, 0, 1, "", "FUNSD"], [5, 0, 1, "", "IC03"], [5, 0, 1, "", "IC13"], [5, 0, 1, "", "IIIT5K"], [5, 0, 1, "", "IMGUR5K"], [5, 0, 1, "", "OCRDataset"], [5, 0, 1, "", "RecognitionDataset"], [5, 0, 1, "", "SROIE"], [5, 0, 1, "", "SVHN"], [5, 0, 1, "", "SVT"], [5, 0, 1, "", "SynthText"], [5, 0, 1, "", "WordGenerator"], [5, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[5, 0, 1, "", "DataLoader"]], "doctr.io": [[6, 0, 1, "", "Artefact"], [6, 0, 1, "", "Block"], [6, 0, 1, "", "Document"], [6, 0, 1, "", "DocumentFile"], [6, 0, 1, "", "Line"], [6, 0, 1, "", "Page"], [6, 0, 1, "", "Word"], [6, 1, 1, "", "decode_img_as_tensor"], [6, 1, 1, "", "read_html"], [6, 1, 1, "", "read_img_as_numpy"], [6, 1, 1, "", "read_img_as_tensor"], [6, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[6, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[6, 2, 1, "", "from_images"], [6, 2, 1, "", "from_pdf"], [6, 2, 1, "", "from_url"]], "doctr.io.Page": [[6, 2, 1, "", "show"]], "doctr.models": [[7, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[7, 1, 1, "", "crop_orientation_predictor"], [7, 1, 1, "", "magc_resnet31"], [7, 1, 1, "", "mobilenet_v3_large"], [7, 1, 1, "", "mobilenet_v3_large_r"], [7, 1, 1, "", "mobilenet_v3_small"], [7, 1, 1, "", "mobilenet_v3_small_orientation"], [7, 1, 1, "", "mobilenet_v3_small_r"], [7, 1, 1, "", "resnet18"], [7, 1, 1, "", "resnet31"], [7, 1, 1, "", "resnet34"], [7, 1, 1, "", "resnet50"], [7, 1, 1, "", "vgg16_bn_r"]], "doctr.models.detection": [[7, 1, 1, "", "db_mobilenet_v3_large"], [7, 1, 1, "", "db_resnet50"], [7, 1, 1, "", "detection_predictor"], [7, 1, 1, "", "linknet_resnet18"], [7, 1, 1, "", "linknet_resnet34"], [7, 1, 1, "", "linknet_resnet50"]], "doctr.models.recognition": [[7, 1, 1, "", "crnn_mobilenet_v3_large"], [7, 1, 1, "", "crnn_mobilenet_v3_small"], [7, 1, 1, "", "crnn_vgg16_bn"], [7, 1, 1, "", "master"], [7, 1, 1, "", "recognition_predictor"], [7, 1, 1, "", "sar_resnet31"]], "doctr.transforms": [[8, 0, 1, "", "ChannelShuffle"], [8, 0, 1, "", "ColorInversion"], [8, 0, 1, "", "Compose"], [8, 0, 1, "", "GaussianBlur"], [8, 0, 1, "", "GaussianNoise"], [8, 0, 1, "", "LambdaTransformation"], [8, 0, 1, "", "Normalize"], [8, 0, 1, "", "OneOf"], [8, 0, 1, "", "RandomApply"], [8, 0, 1, "", "RandomBrightness"], [8, 0, 1, "", "RandomContrast"], [8, 0, 1, "", "RandomCrop"], [8, 0, 1, "", "RandomGamma"], [8, 0, 1, "", "RandomHorizontalFlip"], [8, 0, 1, "", "RandomHue"], [8, 
0, 1, "", "RandomJpegQuality"], [8, 0, 1, "", "RandomRotate"], [8, 0, 1, "", "RandomSaturation"], [8, 0, 1, "", "RandomShadow"], [8, 0, 1, "", "Resize"], [8, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "DetectionMetric"], [9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 6, 9], "0": [1, 5, 8, 9, 12], "00": 12, "01": 12, "0123456789": 5, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "02": 12, "02562": 7, "03": 12, "035": [], "0361328125": 12, "04": 12, "05": [], "06": 12, "06640625": 12, "07": [], "08": [8, 12], "09": [], "0966796875": 12, "1": [5, 6, 7, 8, 9, 11, 12], "10": [5, 9, 12], "100": [5, 8, 9, 11, 12], "1000": 12, "101": [], "1024": [7, 9, 11, 12], "104": [], "106": [], "108": 5, "1095": [], "11": 12, "110": 9, "1107": [], "114": [], "115": [], "1156": [], "116": 5, "118": [], "11800h": [], "11th": [], "12": 12, "120": [], "123": 5, "126": 5, "1268": [], "128": [7, 12], "13": [9, 12], "130": [], "13068": [], "131": 5, "1337891": [], "1357421875": 12, "1396484375": 12, "14": 12, "1420": 12, "14470v1": [], "149": [], "15": 12, "150": [9, 12], "154": [], "1552": 12, "16": 7, "160": 7, "1630859375": 12, "1684": 12, "16x16": [], "17": [], "1778": 12, "1782": 12, "18": [7, 12], "185546875": 12, "19": [], "1900": 12, "1910": 7, "19342": [], "19370": [], "195": [], "19598": [], "199": 12, "1999": 12, "1m": 12, "2": [3, 4, 6, 8, 11, 12], "20": 12, "200": 9, "2000": [], "2003": [4, 5], "2012": 5, "2013": [4, 5], "2015": 5, "2019": 4, "2021": [], "207901": [], "21": 12, "2103": [], "2186": [], "21888": [], "22": [], "224": [7, 8, 11], "225": 8, "22672": [], "229": 8, "23": [], "233": [], "236": [], "24": [], "246": [], "249": [], "25": 12, "2504": 12, "255": [6, 7, 8, 9, 12], "256": 7, "257": [], "26": [], "26032": [], "264": [], "27": 12, "2700": [], "2710": 12, "2749": [], "28": [], "287": [], "29": 12, "296": [], "299": [], "2d": 12, "2m": 12, "3": [3, 4, 6, 7, 8, 9, 11, 12], "30": 12, "300": [], "3000": [], "301": [], "30595": 12, "30ghz": [], "31": [7, 12], "32": [5, 7, 8, 11, 12], "3232421875": 12, "33": 8, "33402": [], "33608": [], "34": [7, 12], "340": 12, "3456": 12, "35": 12, "3515625": 12, "36": 12, "360": [], "37": 12, "38": 12, "39": 12, "4": [7, 8, 9, 12], "40": [], "406": 8, "41": 12, "42": 12, "43": 12, "44": [], "45": 12, "456": 8, "46": 12, "47": 12, "472": [], "48": [7, 12], "485": 8, "49": 12, "49377": [], "5": [5, 8, 9, 12], "50": [7, 12], "51": 12, "51171875": 12, "512": 7, "52": [5, 12], "529": 12, "53": 12, "533": [], "54": [], "540": 12, "5478515625": 12, "55": [], "56": 12, "57": 12, "58": [], "580": 12, "5810546875": 12, "583": 12, "59": 12, "595": [], "597": 12, "5k": [4, 5], "5m": 12, "6": [3, 8, 12], "60": 8, "600": [7, 9, 12], 
"61": 12, "611": [], "62": 12, "625": [], "626": [], "629": [], "63": 12, "630": [], "64": [7, 8, 12], "640": [], "641": 12, "647": [], "65": 12, "66": 12, "660": [], "664": [], "666": [], "67": 12, "672": [], "68": 12, "689": [], "69": 12, "693": [], "694": [], "695": [], "6m": [], "7": 12, "70": [9, 12], "700": [], "701": [], "702": [], "707470": [], "71": 12, "7100000": [], "713": [], "7141797": [], "7149": [], "72": 12, "72dpi": 6, "73": 12, "73257": [], "733": [], "74": 12, "745": [], "75": [8, 12], "753": [], "7581382": [], "76": 12, "77": 12, "772": [], "772875": [], "78": 12, "780": [], "781": [], "783": [], "785": [], "789": [], "79": 12, "793533": [], "796": [], "798": [], "7m": 12, "8": [7, 8, 12], "80": 12, "800": [7, 9, 12], "81": 12, "817": [], "82": 12, "8275l": 12, "83": 12, "830": [], "84": 12, "849": [], "85": 12, "8564453125": 12, "857": 12, "85875": [], "86": 12, "860": [], "8603515625": 12, "862": [], "863": [], "87": 12, "8707": [], "875": [], "88": [], "89": 12, "8m": 12, "9": [], "90": 12, "90k": [], "90kdict32px": [], "91": 12, "913": [], "914085328578949": 12, "917": [], "92": 12, "921": [], "93": 12, "94": [5, 12], "95": [9, 12], "9578408598899841": 12, "96": 12, "97": 12, "98": 12, "99": 12, "9949972033500671": 12, "A": [1, 2, 4, 5, 6, 7, 10, 11], "And": 11, "As": 2, "Be": [], "Being": 1, "By": [], "For": [1, 2, 3, 12], "If": [2, 3, 6, 7, 11, 12], "In": [2, 5], "It": 8, "Its": [4, 7], "No": [1, 12], "Of": 5, "Or": [], "The": [1, 2, 5, 6, 9, 12], "Then": [], "To": [2, 3, 12], "_": [1, 5, 7, 11], "__call__": [], "_build": 2, "_helper": 6, "_i": 9, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "abdef": 5, "abl": [5, 12], "about": [1, 12], "abov": 12, "abstract": [], "abstractdataset": 5, "abus": 1, "accent": [], "accept": 1, "access": [4, 5, 6, 12], "account": [1, 11], "accur": [], "accuraci": 9, "achiev": 11, "act": 1, "action": 1, "activ": 4, "ad": [2, 7, 8], "adapt": 1, "add": [8, 9], "add_hook": [], "add_label": 9, "addit": [2, 6], "addition": [2, 11, 12], "address": [1, 6], "adjust": [2, 8], "advanc": 1, "advantag": [], "advis": 2, "aesthet": [4, 5], "affect": 1, "after": [2, 12], "ag": 1, "again": [], "aggreg": [5, 9], "aggress": 1, "align": [1, 6], "all": [1, 2, 5, 6, 8, 9, 12], "allow": 1, "along": 12, "alreadi": 2, "also": [1, 12], "alwai": [], "an": [1, 2, 4, 5, 6, 7, 9, 11, 12], "analysi": 6, "ancient_greek": [], "angl": [6, 8], "ani": [1, 5, 6, 7, 8, 9, 11, 12], "annot": 5, "anot": [], "anoth": [3, 5, 7], "answer": 1, "anyascii": [], "anyon": 4, "anyth": [], "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 5, 8], "applic": [4, 7], "appoint": 1, "appreci": [], "appropri": [1, 2, 12], "ar": [1, 2, 3, 5, 6, 8, 9, 10, 12], "arab": [], "arabic_diacrit": [], "arabic_lett": [], "arabic_punctu": [], "arbitrarili": [], "arch": 7, "architectur": [4, 7], "archiv": [], "area": 12, "arg": [5, 7], "argument": [5, 12], "around": 1, "arrai": [6, 8, 9], "art": 4, "artefact": [9, 10, 12], "artefact_typ": 6, "artifici": [], "arxiv": 7, "as_imag": [], "asarrai": 9, "ascii_lett": 5, "aspect": [4, 7, 8, 12], "assess": 9, "assign": 9, "associ": 6, "assum": 7, "assume_straight_pag": [7, 12], "astyp": [7, 9, 11, 12], "attack": 1, "attend": [4, 7], "attent": [1, 7], "autoclass": [], "autom": 4, "automat": [], "autoregress": [], "avail": [1, 4, 8], "averag": [8, 12], "avoid": [1, 3], "aw": [4, 12], "awar": [], "azur": [], "b": 9, "b_j": 9, "back": 2, "backbon": 7, "backend": 12, "background": [], "bangla": [], "bar": 
[], "bar_cod": [], "base": [4, 7], "baselin": [4, 7, 12], "bash": [], "batch": [5, 7, 8, 12], "batch_siz": 5, "bblanchon": [], "bbox": 12, "becaus": [], "been": [5, 9, 12], "befor": [5, 7, 8, 12], "begin": 9, "behavior": 1, "being": [9, 12], "belong": 12, "below": 12, "benchmark": 12, "best": 1, "beta": [], "better": [10, 12], "between": [8, 9], "bgr": 6, "bilinear": 8, "bin_thresh": [], "binar": [4, 7], "binari": [6, 12], "bit": [], "blank": 9, "block": [9, 12], "block_1_1": 12, "blue": 9, "blur": 8, "bmvc": 5, "bn": [], "bodi": [1, 12], "bool": [5, 6, 7, 8, 9], "boolean": [7, 12], "both": [4, 5, 8, 12], "bottom": [7, 12], "bound": [5, 6, 7, 8, 9, 12], "box": [5, 6, 7, 8, 9, 12], "box_thresh": [], "brew": 3, "bright": 8, "broadcast": 9, "browser": [2, 4], "build": [2, 3], "built": [], "byte": [6, 12], "c": [6, 9], "c5": 12, "c_j": 9, "cach": 5, "cache_sampl": 5, "cairo": 3, "call": [], "callabl": [5, 8], "can": [2, 3, 5, 11, 12], "capabl": [2, 10, 12], "case": [5, 9, 12], "catch": 2, "cf": 12, "cfg": 12, "challeng": 5, "challenge2_test_task12_imag": 5, "challenge2_test_task1_gt": 5, "challenge2_training_task12_imag": 5, "challenge2_training_task1_gt": 5, "chang": [], "changelog": [], "channel": [1, 2, 6, 8], "channel_prior": [], "channelshuffl": 8, "charact": [4, 5, 6, 9, 12], "charactergener": 5, "characterist": 1, "charg": 12, "charset": 12, "chart": 6, "check": [2, 12], "checkpoint": [], "chip": [], "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 5, 6, 8, 9, 12], "class_nam": [], "classif": [], "classif_mobilenet_v3_smal": 7, "classmethod": 6, "cleaner": 2, "clear": [], "clone": 3, "close": 2, "co": [], "code": [4, 6], "codecov": 2, "colab": 10, "collate_fn": 5, "collect": 6, "color": [8, 9], "colorinvers": 8, "column": 6, "com": [1, 3, 6], "combin": 12, "command": 2, "comment": 1, "commit": 1, "common": [1, 2, 8, 9], "commun": 1, "compar": 4, "comparison": [9, 12], "competit": 5, "compil": [10, 12], "complaint": 1, "complementari": 9, "complet": [], "compli": 2, "compon": 12, "compos": [5, 12], "comprehens": 12, "comput": [5, 9, 12], "conf_threshold": [], "confid": [6, 9, 12], "config": 2, "configur": 2, "confus": 9, "consecut": [8, 12], "consequ": 1, "consid": [1, 2, 5, 6, 9, 12], "consist": 12, "consolid": [4, 5], "constant": 8, "constraint": 11, "construct": 1, "consum": 9, "contact": 1, "contain": [5, 12], "content": [5, 6, 9, 12], "context": 7, "contib": [], "continu": 1, "contrast": 8, "contrast_factor": 8, "contrib": [], "contribut": 1, "contributor": 2, "conv_sequ": 11, "convent": 2, "convers": 6, "convert": [6, 8, 11], "convert_page_to_numpi": [], "convert_to_fp16": [], "convert_to_tflit": [], "convolut": 7, "coordin": [6, 12], "cord": [4, 5, 12], "core": 9, "corner": 12, "correct": 8, "correspond": [3, 6, 12], "could": 1, "counterpart": 9, "cover": 2, "coverag": 2, "cpu": [4, 12], "creat": [], "crnn": [4, 7], "crnn_mobilenet_v3_larg": [7, 12], "crnn_mobilenet_v3_smal": [7, 12], "crnn_resnet31": [], "crnn_vgg16_bn": [7, 12], "crop": [7, 8, 12], "crop_orient": [], "crop_orientation_predictor": 7, "crop_param": [], "croporientationpredictor": 7, "cuda": [], "currenc": 5, "current": 12, "custom": [], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": 4, "czczup": [], "czech": [], "d": 5, "daili": [], "danish": [], "data": [6, 8, 9], "dataload": 5, "dataset": [7, 12], "dataset_info": 5, "date": 12, "db": [], "db_crnn_resnet": [], "db_crnn_vgg": [], "db_mobilenet_v3_larg": [7, 12], "db_resnet34": [], "db_resnet50": [7, 11, 12], 
"db_resnet50_rot": 12, "db_sar_resnet": [], "db_sar_vgg": [], "dbnet": [4, 7], "deal": [], "decis": 1, "decod": 6, "decode_img_as_tensor": 6, "dedic": [], "deem": 1, "deep": [7, 12], "def": 11, "default": [6, 9, 11], "defer": 5, "defin": 9, "deform": [], "degre": 8, "degress": 6, "delet": [], "delimit": 12, "delta": 8, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": [7, 9], "descript": 10, "design": 8, "desir": 6, "det_arch": 7, "det_b": [], "det_model": [], "det_param": [], "det_predictor": [], "detail": 12, "detect": [5, 9, 10], "detect_languag": [], "detect_orient": [], "detection_predictor": [7, 12], "detection_task": [], "detectiondataset": 5, "detectionmetr": 9, "detectionpredictor": 7, "detector": [], "deterior": [], "determin": 1, "dev": 2, "develop": 3, "developp": 3, "deviat": 8, "devic": [], "dict": [6, 9, 12], "dictionari": [6, 9], "differ": 1, "differenti": [4, 7], "digit": [4, 5], "dimens": [6, 9, 12], "dimension": 8, "direct": 5, "directli": 12, "directori": [], "disabl": 1, "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 12, "discuss": 2, "disk": [], "disparag": 1, "displai": [6, 9], "display_artefact": 9, "distanc": [], "distribut": 8, "div": 12, "divers": 1, "divid": 6, "do": [2, 3, 11], "doc": [2, 6, 12], "docartefact": 5, "docstr": [], "doctr": [3, 11, 12], "doctr_cache_dir": [], "doctr_multiprocessing_dis": [], "document": [5, 7, 9, 10, 12], "documentbuild": [], "documentfil": 6, "doesn": [], "don": 12, "done": 8, "download": 5, "downsiz": 7, "draw": [8, 9], "draw_proba": 9, "drop": 5, "drop_last": 5, "dtype": [6, 7, 8, 9, 11], "dual": [], "dummi": [], "dummy_img": 12, "dummy_input": [], "dure": 1, "dutch": [], "dynam": 5, "dynamic_seq_length": 5, "e": [1, 2, 3, 6, 7], "each": [4, 5, 6, 8, 9, 12], "eas": 2, "easi": [4, 9], "easier": [], "easili": [6, 9, 11, 12], "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 5, 7], "either": [9, 12], "element": [5, 6, 7, 9, 12], "els": 2, "email": 1, "empathi": 1, "en": 12, "enabl": [5, 6], "enclos": 6, "encod": [4, 5, 6, 7, 12], "encode_sequ": 5, "encount": 2, "encrypt": [], "end": [4, 5, 7, 9], "english": 5, "enough": [2, 12], "ensur": 2, "entir": [], "entri": 5, "environ": 1, "eo": 5, "equiv": 12, "error": [], "estim": [], "etc": 6, "ethnic": 1, "evalu": [5, 12], "event": 1, "everyon": 1, "everyth": [2, 12], "exact": [9, 12], "exactmatch": [], "exampl": [1, 2, 4, 5, 7], "exchang": [], "exclud": [], "execut": [], "exist": [], "expand": 8, "expect": [2, 6, 8, 9], "experi": 1, "explan": [1, 12], "explicit": 1, "exploit": [4, 7], "export": [6, 7, 9, 10, 11, 12], "export_as_straight_box": [7, 12], "export_as_xml": 12, "export_model_to_onnx": [], "express": [1, 8], "extens": 6, "extern": 1, "extra": 3, "extract": [4, 5], "extract_arch": [], "extractor": 7, "f_": 9, "f_a": 9, "factor": 8, "fair": 1, "fairli": 1, "fallback": 11, "fals": [5, 6, 7, 8, 9, 11, 12], "famili": 9, "faq": 1, "fascan": [], "fast": 5, "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": [], "favorit": 12, "featur": [3, 7, 9, 10], "feed": [], "feedback": 1, "feel": 2, "felix92": [], "few": [3, 11], "figsiz": 9, "figur": 9, "file": [2, 5], "file_hash": [], "file_nam": [], "final": [7, 11], "find": [2, 3], "fine": 12, "finnish": [], "first": 2, "firsthand": 5, "fit": [7, 12], "fitz": [], "flag": 12, "flake8": 2, "flexibl": [], "flip": 8, "float": [6, 8, 9, 11], "float16": 11, "float32": [6, 7, 8, 11], "fn": 8, "focu": [], 
"focus": [1, 5], "folder": [2, 5, 11], "follow": [1, 2, 3, 5, 8, 9, 11, 12], "font": [5, 9], "font_famili": [5, 9], "font_siz": 9, "foral": 9, "forc": [], "forg": [], "form": [4, 5, 12], "format": [5, 6, 9, 11, 12], "forpost": [4, 5], "forum": 2, "fp": 12, "fp16": [], "frac": 9, "frame": 12, "framework": [3, 5, 12], "free": [1, 2], "french": [5, 12], "friendli": 4, "from": [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12], "from_hub": [], "from_imag": 6, "from_keras_model": 11, "from_pdf": 6, "from_url": 6, "full": [5, 9, 12], "fulli": [], "function": [5, 8, 9], "funsd": [4, 5, 12], "further": [], "futur": 5, "g": [6, 7], "g_": 9, "g_x": 9, "gamma": 8, "gaussian": 8, "gaussianblur": 8, "gaussiannois": 8, "gdk": 3, "gen": [], "gender": 1, "gener": [2, 5], "generic_cyrillic_lett": [], "geometri": [4, 6, 12], "geq": 9, "german": 5, "get": 12, "get_artefact": [], "get_lin": [], "get_text_word": [], "get_word": [], "gettextword": [], "git": [], "github": [2, 3], "give": 1, "given": [5, 6, 8, 9, 12], "global": 7, "go": 12, "good": 11, "googl": 2, "googlevis": 4, "gpu": 4, "gracefulli": 1, "graph": 6, "grayscal": 8, "ground": 9, "groung": 9, "group": 4, "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": 2, "guidanc": 5, "gvision": 12, "h": [6, 7, 8], "h_": 9, "ha": [2, 5, 9], "half": [], "handl": 5, "handwrit": 5, "handwritten": [], "harass": 1, "hardwar": [], "harm": 1, "hat": 9, "have": [1, 2, 5, 9, 11, 12], "head": [7, 12], "healthi": 1, "hebrew": [], "height": 6, "hello": [9, 12], "help": 11, "here": [3, 5, 8, 10, 12], "hf": [], "hf_hub_download": [], "high": 6, "higher": [3, 5], "hindi": [], "hindi_digit": [], "hocr": 12, "homebrew": 3, "hook": [], "horizont": [6, 8], "hous": 5, "how": [2, 5], "howev": 5, "hsv": 8, "html": [1, 2, 12], "http": [1, 3, 6, 7, 12], "hub": [], "hue": 8, "huggingfac": [], "hw": [], "i": [1, 2, 5, 6, 7, 8, 9, 11], "i7": [], "ic03": [4, 5], "ic13": [4, 5], "icdar": [4, 5], "icdar2019": 5, "id": 12, "ident": 1, "identifi": 4, "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [4, 5], "iiit5k": 5, "iiithw": [], "imag": [4, 5, 6, 7, 8, 9, 12], "imagenet": 7, "imageri": 1, "images_90k_norm": [], "img": [5, 8], "img_cont": 6, "img_fold": 5, "img_path": 6, "img_transform": 5, "imgur5k": [4, 5], "imgur5k_annot": 5, "imlist": [], "impact": 1, "implement": [5, 6, 8, 9, 11, 12], "import": [5, 6, 7, 8, 9, 11, 12], "improv": [], "inappropri": 1, "incid": 1, "includ": [1, 3, 5], "inclus": 1, "incom": 2, "increas": 8, "independ": [], "index": [2, 6], "indic": 9, "individu": 1, "infer": [4, 7, 8], "inference_input_typ": 11, "inference_output_typ": 11, "inform": [1, 2, 4, 5, 12], "inherit": 11, "ini": 2, "input": [2, 6, 7, 8, 12], "input_crop": 7, "input_pag": [7, 9, 12], "input_shap": 11, "input_t": 11, "input_tensor": 7, "inspir": [1, 8], "instal": [], "instanc": [1, 12], "instanti": 12, "instead": [5, 6, 7], "insult": 1, "int": [5, 6, 8, 9], "int64": [8, 9], "int8": 11, "integ": [9, 11], "integr": 4, "intel": [], "interact": [1, 6, 9], "interfac": [], "interoper": [], "interpol": 8, "interpret": [5, 6], "intersect": 9, "invert": 8, "investig": 1, "invis": 1, "invoic": 12, "involv": [1, 12], "io": [], "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [4, 7], "isn": 5, "isort": 2, "issu": [1, 2], "italian": [], "iter": [5, 8], "its": [5, 6, 8, 9, 12], "itself": [], "j": 9, "job": 2, "join": 2, "jpeg": 8, "jpegqual": 8, "jpg": [5, 6], "json": [5, 12], "json_output": 12, "jump": 2, "just": [1, 11], "keep": 2, "kei": [], "kera": [7, 11], "kernel": 8, "kernel_s": 11, 
"kernel_shap": 8, "keywoard": [], "keyword": [5, 7], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [1, 12], "know": 2, "kwarg": [5, 6, 7, 9], "l": 9, "l_j": 9, "label": [5, 8, 9], "label_fil": 5, "label_fold": 5, "label_path": 5, "labels_path": 5, "ladder": 1, "lambda": 8, "lambdatransform": 8, "lang": 12, "languag": [1, 4, 5, 6, 12], "larg": 7, "largest": 9, "last": [3, 5], "latenc": [], "later": 2, "latest": [3, 12], "latin": 5, "layer": [], "layout": 12, "lead": 1, "leader": 1, "learn": [1, 4, 7, 12], "least": 3, "left": [9, 12], "legacy_french": 5, "length": 5, "less": [], "let": [], "letter": [], "level": [1, 5, 9, 12], "levenshtein": [], "leverag": 10, "lf": [], "libffi": 3, "librari": [2, 3, 10], "light": 4, "lightweight": [], "like": 1, "limits_": 9, "line": [4, 9, 12], "line_1_1": 12, "link": [], "linknet": [4, 7], "linknet16": [], "linknet_resnet18": [7, 12], "linknet_resnet18_rot": 12, "linknet_resnet34": 7, "linknet_resnet50": 7, "linux": 3, "list": [5, 6, 8, 9], "ll": 9, "load": [4, 11], "load_state_dict": [], "load_weight": [], "loader": 5, "loc_pr": [], "local": [2, 4, 5, 7, 9, 12], "localis": 5, "localizationconfus": 9, "locat": [2, 6], "login": [], "login_to_hub": [], "logo": 6, "look": 12, "love": [], "lower": [8, 9], "m": [9, 12], "m1": [], "macbook": [], "machin": [], "maco": 3, "made": 4, "magc_resnet31": 7, "mai": [1, 12], "mail": 1, "main": 10, "maintain": 4, "mainten": 2, "make": [1, 2, 9, 11, 12], "mani": [5, 12], "manipul": [], "map": 5, "map_loc": [], "mask_shap": 9, "master": [4, 7, 12], "match": [9, 12], "mathcal": 9, "matplotlib": 9, "max": [8, 9], "max_angl": 8, "max_area": 8, "max_char": 5, "max_delta": 8, "max_dist": [], "max_gain": 8, "max_gamma": 8, "max_qual": 8, "max_ratio": 8, "maximum": [5, 8], "maxval": [7, 8, 11], "mbox": 9, "mean": [8, 9], "meaniou": 9, "meant": [6, 11], "measur": 12, "media": 1, "median": [], "meet": [], "member": 1, "memori": 9, "mention": 12, "merg": 5, "messag": 2, "meta": 12, "metadata": [], "metal": [], "method": [8, 12], "metric": [9, 12], "middl": [], "might": [11, 12], "min": 8, "min_area": 8, "min_char": 5, "min_gain": 8, "min_gamma": 8, "min_qual": 8, "min_ratio": 8, "min_val": 8, "minde": [1, 3, 4], "minim": [2, 4], "minimalist": [], "minimum": [3, 5, 8, 9], "minval": 8, "miss": 3, "mistak": 1, "mix": [], "mixed_float16": [], "mixed_precis": [], "mjsynth": [], "mnt": [], "mobilenet": 7, "mobilenet_v3_larg": 7, "mobilenet_v3_large_r": 7, "mobilenet_v3_smal": 7, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_orient": 7, "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 7, "mobilenetv3": 7, "mobilenetv3_larg": [], "mobilenetv3_smal": [], "modal": [], "mode": 3, "model": [5, 9], "model_nam": [], "model_path": [], "moder": 1, "modif": 2, "modifi": 7, "modul": [6, 8, 9, 12], "moment": 12, "more": [2, 9, 12], "most": 12, "mozilla": 1, "multi": [4, 7], "multilingu": [], "multipl": [5, 6, 8], "multipli": 8, "multiprocess": [], "my": [], "my_awesome_model": [], "my_hook": [], "mypi": 2, "n": [5, 9], "na": [], "name": [5, 7, 12], "nation": 1, "natur": [1, 4, 5], "nb": 12, "ndarrai": [5, 6, 8, 9], "necessari": 3, "need": [2, 3, 5, 9], "neg": 8, "nest": 12, "nestedobject": [], "network": [4, 7], "neural": [4, 7], "new": [2, 9], "newer": [], "next": 5, "nois": 8, "noisi": [4, 5], "non": [4, 5, 6, 7, 8, 9], "none": [5, 6, 8, 9, 12], "normal": [7, 8], "norwegian": [], "note": [0, 5], "now": 2, "np": [7, 8, 9, 11, 12], "num_output_channel": 8, "num_sampl": 5, "num_work": 5, "number": [5, 8, 9, 12], 
"numpi": [6, 7, 9, 12], "o": 3, "obb": [], "obj_detect": [], "object": [5, 9, 10, 12], "objectness_scor": [], "oblig": 1, "obtain": 12, "occupi": [], "ocr": [4, 5, 7, 9], "ocr_carea": 12, "ocr_db_crnn": 9, "ocr_lin": 12, "ocr_pag": 12, "ocr_par": 12, "ocr_predictor": [7, 12], "ocrdataset": 5, "ocrmetr": 9, "ocrpredictor": 7, "ocrx_word": 12, "offens": 1, "offici": 1, "offlin": 1, "offset": 8, "onc": 12, "one": [2, 5, 7, 8, 12], "oneof": 8, "ones": [5, 8, 9], "onli": [2, 7, 8, 9, 12], "onlin": 1, "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": 8, "opacity_rang": 8, "open": [1, 2], "oper": [2, 11], "opinion": 1, "opsset": 11, "optic": [4, 12], "optim": [4, 11], "option": 5, "order": [5, 6, 8], "org": [1, 7, 12], "organ": 6, "orient": [1, 6, 7, 12], "orientationpredictor": [], "other": [1, 2], "otherwis": [1, 9], "our": [7, 12], "out": [2, 7, 8, 9, 12], "outpout": 12, "output": [6, 8], "output_s": [6, 8], "outsid": [], "over": [3, 5, 9, 12], "overal": 1, "overlai": 6, "overview": [], "overwrit": [], "overwritten": [], "own": [4, 5], "p": [8, 9, 12], "packag": [2, 4, 9, 11], "pad": [5, 7, 8, 12], "page": [3, 5, 7, 9, 12], "page1": 6, "page2": 6, "page_1": 12, "page_idx": [6, 12], "page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 7, "par_1_1": 12, "paragraph": [], "paragraph_break": [], "param": [8, 12], "paramet": [4, 5, 6, 7, 8, 9], "pars": [4, 5], "parseq": [], "part": [5, 8, 12], "parti": 3, "partial": [], "particip": 1, "pass": [5, 6, 7, 12], "password": [], "patch": [], "path": [5, 6, 11], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "pattern": 1, "pdf": [6, 7, 10], "pdf_render": 6, "pdfpage": [], "peopl": 1, "per": [8, 12], "perform": [4, 6, 8, 9, 11, 12], "period": 1, "permiss": 1, "permut": [], "persian_lett": [], "person": [1, 5], "phase": 12, "photo": [], "physic": [1, 6], "pick": 8, "pictur": 6, "pip": [2, 3], "pipelin": [], "pixbuf": 3, "pixel": [6, 8, 12], "platinum": 12, "pleas": 2, "plot": 9, "plt": 9, "plug": [], "plugin": [], "png": 6, "point": [], "polici": [], "polish": [], "polit": 1, "polygon": [5, 12], "pool": 7, "portugues": 5, "posit": [1, 9], "possibl": [2, 9], "post": [1, 12], "postprocessor": [], "potenti": 7, "power": 4, "ppageno": 12, "pr": 2, "pre": 7, "precis": [9, 12], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 5, "predict": [6, 7, 9], "predictor": [4, 6, 7], "prefer": 5, "preinstal": [], "preprocessor": 12, "prerequisit": [], "present": 10, "preserv": [7, 8, 12], "preserve_aspect_ratio": [6, 7, 8, 12], "pretrain": [4, 7, 9, 11, 12], "pretrained_backbon": [], "print": 12, "prior": 5, "privaci": 1, "privat": [1, 12], "probabl": 8, "problem": 2, "procedur": 8, "process": [2, 4, 6, 12], "processor": 12, "produc": [10, 12], "product": 11, "profession": 1, "project": [2, 5], "promptli": 1, "proper": 2, "properli": 5, "properti": 11, "provid": [1, 2, 4, 5, 11, 12], "public": [1, 4], "publicli": 12, "publish": 1, "pull": [], "punctuat": 5, "pure": [], "purpos": 2, "push_to_hf_hub": [], "py": 2, "pydocstyl": 2, "pypdfium2": 6, "pyplot": 9, "python": 2, "python3": [], "pytorch": [3, 4, 8, 12], "q": 2, "qr": 6, "qr_code": [], "qualiti": 8, "quantiz": [], "quantize_model": [], "question": 1, "quickli": 4, "quicktour": 10, "r": [], "race": 1, "ramdisk": [], "rand": [7, 8, 9, 11, 12], "random": [7, 8, 9, 11, 12], "randomappli": 8, "randombright": 8, "randomcontrast": 8, "randomcrop": 8, "randomgamma": 8, "randomhorizontalflip": 8, "randomhu": 8, "randomjpegqu": 8, "randomli": 8, "randomres": [], 
"randomrot": 8, "randomsatur": 8, "randomshadow": 8, "rang": [8, 11], "rassi": [], "ratio": [7, 8, 12], "raw": [6, 9], "re": [], "read": [2, 4, 5, 7], "read_html": 6, "read_img": 6, "read_img_as_numpi": 6, "read_img_as_tensor": 6, "read_pdf": 6, "readi": 11, "real": [4, 7, 8], "reason": 1, "rebuild": [], "rebuilt": [], "recal": [9, 12], "receipt": [4, 5, 12], "reco_arch": 7, "reco_b": [], "reco_model": [], "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": [5, 9], "recognition_predictor": [7, 12], "recognition_task": [], "recognitiondataset": 5, "recognitionpredictor": 7, "rectangular": 7, "recurr": [], "red": 9, "reduc": [3, 8], "refer": [2, 3, 12], "regardless": 1, "region": [], "regroup": 9, "regular": [], "reject": 1, "rel": [6, 8, 9], "relat": [2, 6], "releas": [0, 3], "relev": [], "religion": 1, "relu": 11, "remov": 1, "render": 6, "render_pdf_topil": 6, "reorder": 2, "repo": [], "repo_id": [], "report": 1, "repositori": [2, 5], "repres": [1, 9, 12], "represent": [4, 7], "representative_dataset": 11, "request": 1, "requir": [3, 8], "research": 4, "residu": 7, "resiz": [8, 12], "resnet": 7, "resnet18": 7, "resnet31": 7, "resnet34": 7, "resnet50": 7, "resolv": 6, "resolve_block": [], "resolve_lin": [], "resourc": 11, "respect": 1, "respons": 9, "rest": [2, 8, 9], "restrict": [], "result": [2, 5, 6, 10, 12], "resum": 12, "return": [5, 6, 7, 9, 12], "reusabl": 12, "review": 1, "rgb": [6, 8], "rgb_mode": [], "rgb_output": 6, "right": [1, 7, 9], "road": 12, "robust": [4, 5], "root": [2, 5], "rotat": [5, 6, 7, 8, 9, 12], "rotated_bbox": [], "run": [2, 3, 7], "same": [2, 5, 6, 9, 12], "sampl": [5, 12], "sample_transform": 5, "sane": 2, "sar": [4, 7], "sar_resnet31": [7, 12], "sar_vgg16_bn": [], "satur": 8, "save": [5, 11], "saved_model": 11, "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "scheme": [], "score": 9, "scratch": [], "script": 2, "seamless": 4, "seamlessli": 12, "search": 7, "searchabl": 10, "sec": [], "second": 12, "section": [11, 12], "secur": 1, "see": [1, 2], "seemlessli": 4, "seen": 12, "segment": [4, 7, 12], "self": [], "semant": [4, 7], "send": 12, "sens": 9, "sensit": [5, 12], "separ": 12, "sequenc": [4, 5, 6, 7, 9, 12], "sequenti": [8, 11], "seri": 1, "serial": 11, "serialized_model": 11, "seriou": 1, "set": [1, 2, 5, 7, 9, 12], "set_global_polici": [], "sever": [6, 8, 12], "sex": 1, "sexual": 1, "sha256": [], "shade": 8, "shape": [6, 7, 8, 9, 11, 12], "share": [5, 12], "shift": 8, "shm": [], "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 5, "simpl": [4, 7], "simpler": 7, "sinc": [5, 12], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 12], "skew": 12, "slack": 2, "slightli": [], "small": [2, 7], "smallest": 6, "snapshot_download": [], "snippet": 12, "so": [2, 3, 5], "social": 1, "socio": 1, "some": [2, 3, 5, 10], "someth": 2, "somewher": 2, "sort": 1, "sourc": [5, 6, 7, 8, 9], "space": 1, "span": 12, "spanish": 5, "spatial": [6, 9], "special": [], "specif": [2, 3, 5, 9, 12], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": [], "sroie": [4, 5], "stabl": 3, "stackoverflow": 2, "stage": 4, "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": 8, "step": [], "still": 12, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 12], "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": [], "strictli": 9, "string": [5, 6, 9, 12], "strive": 3, "strong": [4, 7], "structur": 
12, "style": 2, "subset": [5, 12], "suggest": 2, "sum": 9, "summari": 9, "support": 12, "supported_op": 11, "supported_typ": 11, "sustain": 1, "svhn": [4, 5], "svt": 5, "swedish": [], "symbol": [], "symmetr": [7, 8, 12], "symmetric_pad": [7, 8, 12], "synthes": 9, "synthesize_pag": 9, "synthet": [], "synthtext": [4, 5], "system": 12, "t": [2, 5, 12], "tabl": [], "take": [1, 5, 11, 12], "target": [5, 6, 8, 9], "target_s": 5, "target_spec": 11, "task": [4, 5, 12], "task2": 5, "tax": 12, "team": [], "techminde": [], "templat": [2, 4], "tensor": [5, 6, 8, 12], "tensorflow": [3, 4, 6, 7, 8, 12], "tensorspec": [], "term": 1, "test": [], "test_set": 5, "text": [5, 6, 7, 9], "text_output": [], "textmatch": 9, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [4, 12], "textstylebrush": [4, 5], "textual": [4, 5, 6, 7, 12], "tf": [3, 6, 7, 8, 11], "tf_model": 11, "tflite": 11, "tflite_builtins_int8": 11, "tfliteconvert": 11, "than": [2, 3, 9], "thank": [], "thei": [1, 9, 12], "them": [3, 5, 12], "thi": [1, 2, 3, 5, 9, 11, 12], "thing": [11, 12], "third": 3, "those": [1, 3, 6, 12], "threaten": 1, "threshold": [], "through": [1, 5, 8], "tilman": [], "time": [1, 4, 5, 7, 9], "tini": [], "titl": [6, 12], "tm": [], "tmp": [], "togeth": [2, 6], "tograi": 8, "tool": 5, "top": [9, 12], "topic": 2, "torch": [3, 8], "torchvis": 8, "total": [], "toward": [1, 3], "train": [2, 5, 7, 8, 12], "train_it": 5, "train_load": 5, "train_pytorch": [], "train_set": 5, "train_tensorflow": [], "trainabl": [4, 7], "tranform": 8, "transcrib": 12, "transfer": [4, 5], "transfo": 8, "transform": 5, "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12], "truth": 9, "tune": 11, "tupl": [5, 6, 8, 9], "turn": [], "two": 6, "txt": [], "type": [6, 12], "typic": 12, "u": [1, 2, 12], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 12], "ukrainian": [], "unaccept": 1, "underli": 5, "underneath": 6, "understand": [4, 5, 12], "unfortun": 12, "unidecod": 9, "uniform": [7, 8, 11], "uniformli": 8, "uninterrupt": [6, 12], "union": 9, "unittest": 2, "unlock": [], "unoffici": [], "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 12], "updat": 9, "upgrad": [], "upper": [5, 8], "uppercas": [], "url": 6, "us": [1, 2, 3, 5, 7, 9, 12], "usabl": 12, "usag": [], "use_broadcast": 9, "use_polygon": [5, 9], "useabl": 12, "user": [3, 4, 6, 10], "utf": 12, "util": 11, "v0": [], "v1": [], "v3": 7, "valid": [], "valu": [2, 6, 8, 12], "valuabl": 4, "variabl": [], "varieti": 5, "variou": 12, "veri": 7, "verifi": 2, "version": [1, 2, 11, 12], "vgg": 7, "vgg16": [], "vgg16_bn": [], "vgg16_bn_r": 7, "via": 1, "vietnames": [], "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": 5, "visiondataset": 5, "visiontransform": [], "visual": 4, "visualize_pag": 9, "vit_": [], "vit_b": [], "vitstr": [], "vitstr_bas": [], "vitstr_smal": [], "viz": [], "vocab": 12, "vocabulari": 5, "w": [6, 7, 8, 9], "w3": 12, "wa": 1, "wai": [1, 4, 5], "want": [11, 12], "warm": [], "warmup": 12, "wasn": 2, "we": [1, 2, 3, 4, 5, 6, 8, 12], "weasyprint": [], "web": 6, "websit": 5, "welcom": 1, "well": [1, 11], "were": [1, 6, 12], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9, 12], "whether": [2, 5, 6, 8, 9], "which": [1, 12], "whichev": 3, "while": [8, 12], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [], "window": [3, 7, 9], "wish": [2, 11], "within": 1, "without": [1, 7], "wonder": 2, "word": [4, 5, 9, 12], "word_1_1": 12, "word_1_2": 12, "word_1_3": 12, "wordgener": 5, "words_onli": 9, "work": 12, 
"worker": 5, "workflow": 2, "worklow": 2, "world": [9, 12], "worth": [], "wrap": 12, "wrapper": [5, 8], "write": [], "written": [1, 6], "www": [1, 6, 12], "x": [6, 8, 9], "x12larg": 12, "x_ascend": 12, "x_descend": 12, "x_i": 9, "x_size": 12, "x_wconf": 12, "xeon": 12, "xhtml": 12, "xmax": 6, "xmin": 6, "xml": 12, "xml_bytes_str": 12, "xml_element": 12, "xml_output": 12, "xmln": 12, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "yield": 11, "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12], "your": [2, 4, 5, 6, 9, 12], "yoursit": 6, "zero": [8, 9], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": [], "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": [], "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "\u067e\u0686\u06a2\u06a4\u06af": [], "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": [], "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": [], "2023": [], "2024": [], "22": 0, "27": 0, "28": 0, "29": [], "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": [], "7": [], "8": [], "9": [], "advanc": [], "annot": 2, "approach": 12, "architectur": 12, "arg": [], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [5, 12], "aw": [], "backbon": [], "ban": 1, "block": 6, "bug": 2, "build": [], "changelog": 0, "choos": 12, "classif": 7, "code": [1, 2], "codebas": 2, "commit": 2, "commun": [], "compos": 8, "compress": 11, "conda": [], "conduct": 1, "connect": 2, "content": [], "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": [], "correct": 1, "coven": 1, "custom": [], "data": 5, "dataload": [], "dataset": [4, 5], "detect": [4, 7, 12], "develop": 2, "do": 12, "docstr": 2, "doctr": [2, 4, 5, 6, 7, 8, 9, 10], "document": [2, 4, 6], "end": 12, "enforc": 1, "evalu": 9, "export": [], "factori": [], "featur": [2, 4], "feedback": 2, "file": 6, "format": 2, "from": [], "gener": [], "get": [], "git": 3, "guidelin": 1, "half": 11, "hub": [], "huggingfac": [], "i": 12, "implement": [], "import": 2, "infer": 11, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 
[], "let": 2, "line": 6, "lint": 2, "linux": [], "lite": 11, "load": 5, "loader": [], "main": 4, "mode": 2, "model": [4, 7, 11, 12], "modifi": 2, "modul": [], "name": [], "note": [], "notebook": 10, "object": [], "ocr": 12, "onli": [], "onnx": [], "optim": [], "option": [], "order": 2, "orient": [], "our": 1, "output": 12, "own": [], "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "post": 11, "pre": [], "precis": 11, "predictor": 12, "prepar": 11, "prerequisit": 3, "pretrain": [], "privat": 5, "process": [], "public": 5, "push": [], "python": 3, "qualiti": 2, "quantiz": 11, "question": 2, "read": 6, "readi": [], "recognit": [4, 7, 12], "refer": [], "report": 2, "request": 2, "respons": 1, "return": [], "right": 12, "savedmodel": 11, "scope": 1, "share": [], "should": 12, "stage": 12, "standard": 1, "start": [], "structur": [2, 6], "style": [], "support": [4, 5, 8], "synthet": 5, "task": 9, "temporari": 1, "tensorflow": 11, "test": 2, "text": [4, 12], "train": 11, "transform": 8, "two": 12, "type": 2, "unit": 2, "us": 11, "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 12, "word": 6, "your": 11, "zoo": [4, 7, 12]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"Artefact": [[2, "artefact"]], "Available Datasets": [[1, "available-datasets"]], "Block": [[2, "block"]], "Build & train your predictor": [[3, "build-train-your-predictor"]], "Changelog": [[0, null]], "Composing transformations": [[6, "composing-transformations"]], "Data Loading": [[1, "data-loading"]], "Detection models": [[5, "detection-models"]], "Detection predictors": [[5, "detection-predictors"]], "DocTR Vocabs": [[1, "id1"]], "DocTR: Document Text Recognition": [[3, null]], "Document": [[2, "document"]], "Document structure": [[2, "document-structure"]], "End-to-End OCR": [[5, "end-to-end-ocr"]], "File reading": [[2, "file-reading"]], "Getting Started": [[3, "getting-started"]], "Installation": [[4, null]], "Line": [[2, "line"]], "Main Features": [[3, "main-features"]], "Model compression": [[5, "model-compression"]], "Model export": [[5, "model-export"]], "Model zoo": [[3, "model-zoo"]], "Notes": [[3, null]], "Package Reference": [[3, null]], "Page": [[2, "page"]], "Pre-processing for detection": [[5, "pre-processing-for-detection"]], "Pre-processing for recognition": [[5, "pre-processing-for-recognition"]], "Prerequisites": [[4, "prerequisites"]], "Recognition models": [[5, "recognition-models"]], "Recognition predictors": [[5, "recognition-predictors"]], "Supported Vocabs": [[1, "supported-vocabs"]], "Supported datasets": [[3, "supported-datasets"]], "Supported transformations": [[6, "supported-transformations"]], "Task evaluation": [[7, "task-evaluation"]], "Text Detection": [[5, "text-detection"]], "Text Recognition": [[5, "text-recognition"]], "Text detection models": [[3, "text-detection-models"]], "Text recognition model zoo": [[5, "id2"]], "Text recognition models": [[3, "text-recognition-models"]], "Two-stage approaches": [[5, "two-stage-approaches"]], "Using SavedModel": [[5, "using-savedmodel"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[7, "visualization"]], "Word": [[2, "word"]], "doctr.datasets": [[1, null]], "doctr.documents": [[2, null]], "doctr.models": [[5, null]], "doctr.transforms": [[6, null]], "doctr.utils": [[7, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 
(2021-05-28)": [[0, "v0-2-1-2021-05-28"]]}, "docnames": ["changelog", "datasets", "documents", "index", "installing", "models", "transforms", "utils"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "datasets.rst", "documents.rst", "index.rst", "installing.rst", "models.rst", "transforms.rst", "utils.rst"], "indexentries": {"artefact (class in doctr.documents)": [[2, "doctr.documents.Artefact", false]], "as_images() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.as_images", false]], "block (class in doctr.documents)": [[2, "doctr.documents.Block", false]], "colorinversion (class in doctr.transforms)": [[6, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[6, "doctr.transforms.Compose", false]], "convert_to_fp16() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_fp16", false]], "convert_to_tflite() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_tflite", false]], "cord (class in doctr.datasets)": [[1, "doctr.datasets.CORD", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_vgg16_bn", false]], "dataloader (class in doctr.datasets.loader)": [[1, "doctr.datasets.loader.DataLoader", false]], "db_resnet50() (in module doctr.models.detection)": [[5, "doctr.models.detection.db_resnet50", false]], "detection_predictor() (in module doctr.models.detection)": [[5, "doctr.models.detection.detection_predictor", false]], "document (class in doctr.documents)": [[2, "doctr.documents.Document", false]], "documentfile (class in doctr.documents)": [[2, "doctr.documents.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[1, "doctr.datasets.encode_sequences", false]], "from_images() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_images", false]], "from_pdf() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_pdf", false]], "from_url() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[1, "doctr.datasets.FUNSD", false]], "get_artefacts() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_artefacts", false]], "get_words() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_words", false]], "lambdatransformation (class in doctr.transforms)": [[6, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.documents)": [[2, "doctr.documents.Line", false]], "linknet16() (in module doctr.models.detection)": [[5, "doctr.models.detection.linknet16", false]], "localizationconfusion (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.LocalizationConfusion", false]], "master() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.master", false]], "normalize (class in doctr.transforms)": [[6, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models.zoo)": [[5, "doctr.models.zoo.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[1, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in 
doctr.transforms)": [[6, "doctr.transforms.OneOf", false]], "page (class in doctr.documents)": [[2, "doctr.documents.Page", false]], "pdf (class in doctr.documents)": [[2, "doctr.documents.PDF", false]], "quantize_model() (in module doctr.models.export)": [[5, "doctr.models.export.quantize_model", false]], "randomapply (class in doctr.transforms)": [[6, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[6, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[6, "doctr.transforms.RandomContrast", false]], "randomgamma (class in doctr.transforms)": [[6, "doctr.transforms.RandomGamma", false]], "randomhue (class in doctr.transforms)": [[6, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[6, "doctr.transforms.RandomJpegQuality", false]], "randomsaturation (class in doctr.transforms)": [[6, "doctr.transforms.RandomSaturation", false]], "read_html() (in module doctr.documents)": [[2, "doctr.documents.read_html", false]], "read_img() (in module doctr.documents)": [[2, "doctr.documents.read_img", false]], "read_pdf() (in module doctr.documents)": [[2, "doctr.documents.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.recognition_predictor", false]], "resize (class in doctr.transforms)": [[6, "doctr.transforms.Resize", false]], "sar_resnet31() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_resnet31", false]], "sar_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_vgg16_bn", false]], "show() (doctr.documents.document method)": [[2, "doctr.documents.Document.show", false]], "show() (doctr.documents.page method)": [[2, "doctr.documents.Page.show", false]], "sroie (class in doctr.datasets)": [[1, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[7, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[7, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[7, "doctr.utils.metrics.TextMatch.summary", false]], "textmatch (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[6, "doctr.transforms.ToGray", false]], "visiondataset (class in doctr.datasets.datasets)": [[1, "doctr.datasets.datasets.VisionDataset", false]], "visualize_page() (in module doctr.utils.visualization)": [[7, "doctr.utils.visualization.visualize_page", false]], "word (class in doctr.documents)": [[2, "doctr.documents.Word", false]]}, "objects": {"doctr.datasets": [[1, 0, 1, "", "CORD"], [1, 0, 1, "", "FUNSD"], [1, 0, 1, "", "OCRDataset"], [1, 0, 1, "", "SROIE"], [1, 1, 1, "", "encode_sequences"]], "doctr.datasets.datasets": [[1, 0, 1, "", "VisionDataset"]], "doctr.datasets.loader": [[1, 0, 1, "", "DataLoader"]], "doctr.documents": [[2, 0, 1, "", "Artefact"], [2, 0, 1, "", "Block"], [2, 0, 1, "", "Document"], [2, 0, 1, "", "DocumentFile"], [2, 0, 1, "", "Line"], [2, 0, 1, "", "PDF"], [2, 0, 1, "", "Page"], [2, 0, 1, "", "Word"], [2, 1, 1, "", "read_html"], [2, 1, 1, "", "read_img"], [2, 1, 1, "", "read_pdf"]], "doctr.documents.Document": [[2, 2, 1, "", "show"]], "doctr.documents.DocumentFile": [[2, 2, 1, "", "from_images"], [2, 2, 1, "", "from_pdf"], [2, 2, 1, "", "from_url"]], "doctr.documents.PDF": [[2, 2, 1, "", "as_images"], [2, 2, 1, "", "get_artefacts"], [2, 2, 
1, "", "get_words"]], "doctr.documents.Page": [[2, 2, 1, "", "show"]], "doctr.models.detection": [[5, 1, 1, "", "db_resnet50"], [5, 1, 1, "", "detection_predictor"], [5, 1, 1, "", "linknet16"]], "doctr.models.export": [[5, 1, 1, "", "convert_to_fp16"], [5, 1, 1, "", "convert_to_tflite"], [5, 1, 1, "", "quantize_model"]], "doctr.models.recognition": [[5, 1, 1, "", "crnn_vgg16_bn"], [5, 1, 1, "", "master"], [5, 1, 1, "", "recognition_predictor"], [5, 1, 1, "", "sar_resnet31"], [5, 1, 1, "", "sar_vgg16_bn"]], "doctr.models.zoo": [[5, 1, 1, "", "ocr_predictor"]], "doctr.transforms": [[6, 0, 1, "", "ColorInversion"], [6, 0, 1, "", "Compose"], [6, 0, 1, "", "LambdaTransformation"], [6, 0, 1, "", "Normalize"], [6, 0, 1, "", "OneOf"], [6, 0, 1, "", "RandomApply"], [6, 0, 1, "", "RandomBrightness"], [6, 0, 1, "", "RandomContrast"], [6, 0, 1, "", "RandomGamma"], [6, 0, 1, "", "RandomHue"], [6, 0, 1, "", "RandomJpegQuality"], [6, 0, 1, "", "RandomSaturation"], [6, 0, 1, "", "Resize"], [6, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[7, 0, 1, "", "LocalizationConfusion"], [7, 0, 1, "", "OCRMetric"], [7, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.LocalizationConfusion": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.OCRMetric": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.TextMatch": [[7, 2, 1, "", "summary"]], "doctr.utils.visualization": [[7, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 7], "0": [1, 3, 5, 6, 7], "00": 5, "01": 5, "0123456789": 1, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "02": 5, "02562": 5, "03": 3, "035": [], "0361328125": [], "04": [], "05": 3, "06": [], "06640625": [], "07": [], "08": 5, "09": [], "0966796875": [], "1": [1, 3, 5, 6, 7], "10": [1, 5, 7], "100": [5, 6, 7], "1000": 5, "101": [], "1024": [5, 7], "104": [], "106": [], "108": [], "1095": [], "11": 3, "110": 7, "1107": [], "114": [], "115": [], "1156": [], "116": [], "118": [], "11800h": [], "11th": [], "12": 5, "120": [], "123": [], "126": [], "1268": [], "128": 5, "13": 5, "130": [], "13068": [], "131": [], "1337891": [], "1357421875": [], "1396484375": [], "14": 5, "1420": [], "14470v1": [], "149": [], "15": 5, "150": 7, "154": 1, "1552": [], "16": 5, "160": 5, "1630859375": [], "1684": [], "16x16": [], "17": [], "1778": [], "1782": [], "18": 3, "185546875": [], "19": 5, "1900": [], "1910": 5, "19342": [], "19370": [], "195": [], "19598": [], "199": 5, "1999": [], "1m": 5, "2": [3, 5, 6], "20": 5, "200": 7, "2000": [], "2003": [], "2012": [], "2013": [], "2015": [], "2019": 3, "2021": 3, "207901": [], "21": 5, "2103": [], "2186": [], "21888": [], "22": [], "224": [5, 6], "225": 6, "22672": [], "229": 6, "23": [], "233": [], "236": [], "24": [], "246": [], "249": [], "25": 5, "2504": [], "255": [5, 6, 7], "256": 5, "257": [], "26": [], "26032": [], "264": [], "27": 5, "2700": [], "2710": [], "2749": [], "28": 3, "287": [], "29": 5, "296": [], "299": [], "2d": [], "3": [2, 3, 4, 5, 6, 7], "30": 5, "300": [], "3000": [], "301": [], "30595": 5, "30ghz": [], "31": 5, "32": [1, 5, 6], "3232421875": [], "33": [], "33402": [], "33608": [], "34": [], "340": [], "3456": [], "3515625": [], "36": [], "360": [], "37": [], "38": [], "39": 5, "4": [], "40": [], "406": 6, "41": [], "42": [], "43": 5, "44": [], 
"45": [], "456": 6, "46": 5, "47": 5, "472": [], "48": 5, "485": 6, "49": 5, "49377": [], "5": [1, 6, 7], "50": 5, "51": [], "51171875": [], "512": [], "52": [1, 5], "529": [], "53": 5, "533": [], "54": [], "540": [], "5478515625": [], "55": [], "56": [], "57": [], "58": [], "580": [], "5810546875": [], "583": [], "59": 5, "595": [], "597": [], "5k": [], "5m": 5, "6": [4, 5, 6], "60": 6, "600": [5, 7], "61": 5, "611": [], "62": 5, "625": [], "626": [], "629": [], "63": 5, "630": [], "64": [5, 6], "640": [], "641": [], "647": [], "65": 5, "66": 5, "660": [], "664": [], "666": [], "67": 5, "672": [], "68": 5, "689": [], "69": 5, "693": [], "694": [], "695": [], "6m": [], "7": 5, "70": [5, 7], "700": [], "701": [], "702": [], "707470": [], "71": [], "7100000": [], "713": [], "7141797": [], "7149": [], "72": [], "72dpi": [], "73": [], "73257": [], "733": [], "74": 5, "745": [], "75": 5, "753": [], "7581382": [], "76": [], "77": 5, "772": [], "772875": [], "78": 5, "780": [], "781": [], "783": [], "785": [], "789": [], "79": 5, "793533": [], "796": [], "798": [], "7m": [], "8": [5, 6], "80": [], "800": [5, 7], "81": 5, "817": [], "82": 5, "8275l": 5, "83": 5, "830": [], "84": [], "849": [], "85": 5, "8564453125": [], "857": [], "85875": [], "86": 5, "860": [], "8603515625": [], "862": [], "863": [], "87": 5, "8707": [], "875": [], "88": [], "89": 5, "8m": 5, "9": [], "90": 5, "90k": [], "90kdict32px": [], "91": 5, "913": [], "914085328578949": [], "917": [], "92": 5, "921": [], "93": [], "94": [], "95": 7, "9578408598899841": [], "96": 1, "97": [], "98": [], "99": [], "9949972033500671": [], "A": [1, 2, 3, 5], "And": 5, "As": [], "Be": [], "Being": [], "By": [], "For": [4, 5], "If": [2, 4, 5], "In": [1, 5], "It": 6, "Its": 5, "No": [], "Of": 1, "Or": [], "The": [1, 2, 5, 7], "Then": 5, "To": [], "_": [1, 5], "__call__": [], "_build": [], "_i": 7, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "abdef": [], "abl": [], "about": 5, "abov": 5, "abstract": 1, "abstractdataset": [], "abus": [], "accent": [], "accept": [], "access": [1, 2, 3], "account": [], "accur": [], "accuraci": 7, "achiev": [], "act": [], "action": [], "activ": [], "ad": 6, "adapt": [], "add": [6, 7], "add_hook": [], "add_label": 7, "addit": [], "addition": 5, "address": 2, "adjust": 6, "advanc": [], "advantag": [], "advis": [], "aesthet": [], "affect": [], "after": [], "ag": [], "again": [], "aggreg": [1, 7], "aggress": [], "align": 2, "all": [1, 2, 3, 5, 6, 7], "allow": [], "along": 5, "alreadi": [], "also": [], "alwai": [], "an": [1, 2, 3, 5, 7], "analysi": [2, 5], "ancient_greek": [], "angl": 2, "ani": [1, 2, 3, 5, 6, 7], "annot": 2, "anot": [], "anoth": [1, 4, 5], "answer": [], "anyascii": [], "anyon": 3, "anyth": [], "api": [], "apolog": [], "apologi": [], "app": [], "appear": [], "appli": [1, 6], "applic": 5, "appoint": [], "appreci": [], "appropri": [], "ar": [1, 2, 4, 5, 6, 7], "arab": [], "arabic_diacrit": [], "arabic_lett": [], "arabic_punctu": [], "arbitrarili": [], "arch": 5, "architectur": [3, 5], "archiv": [], "area": [], "argument": [1, 2], "around": 5, "arrai": [2, 7], "art": 3, "artefact": 7, "artefact_typ": 2, "artifici": [], "arxiv": 5, "as_imag": 2, "asarrai": 7, "ascii_lett": 1, "aspect": [3, 6], "assess": 7, "assign": 7, "associ": 2, "assum": [], "assume_straight_pag": [], "astyp": [5, 7], "attack": [], "attend": [3, 5], "attent": [], "autoclass": [], "autom": 3, "automat": [], "autoregress": [], "avail": [3, 5, 6], "averag": [5, 6], "avoid": [], "aw": [3, 5], "awar": [], 
"azur": [], "b": 7, "b_j": 7, "back": [], "backbon": 5, "backend": 5, "background": [], "bangla": [], "bar": [], "bar_cod": [], "base": 5, "baselin": 5, "batch": [1, 5, 6], "batch_siz": 1, "bblanchon": [], "bbox": [], "becaus": [], "been": [5, 7], "befor": 1, "begin": 7, "behavior": [], "being": [5, 7], "belong": [], "benchmark": [], "best": [], "beta": 3, "better": [], "between": [6, 7], "bgr": 2, "bilinear": [5, 6], "bin_thresh": [], "binar": [3, 5], "binari": 2, "bit": [], "block": [5, 7], "block_1_1": [], "blur": [], "bmvc": [], "bn": [], "bodi": [], "bool": [1, 2, 5, 6, 7], "boolean": [], "both": [3, 5, 6], "bottom": [], "bound": [1, 2, 6, 7], "box": [1, 2, 7], "box_thresh": [], "brew": 4, "bright": 6, "browser": [], "build": [], "built": [], "byte": [2, 5], "c": [], "c5": 5, "c_j": [], "cach": [], "cache_sampl": [], "cairo": 4, "call": [], "callabl": [1, 6], "can": [1, 4, 5], "capabl": 5, "case": [1, 7], "cf": 5, "cfg": [], "challeng": [], "challenge2_test_task12_imag": [], "challenge2_test_task1_gt": [], "challenge2_training_task12_imag": [], "challenge2_training_task1_gt": [], "chang": [], "changelog": 3, "channel": [2, 5, 6], "channel_prior": [], "channelshuffl": [], "charact": [1, 2, 3, 5, 7], "charactergener": [], "characterist": [], "charg": 5, "charset": [], "chart": 2, "check": [], "checkpoint": [], "chip": [], "ci": [], "clarifi": [], "clariti": [], "class": [1, 2, 6, 7], "class_nam": [], "classif": [], "classmethod": 2, "clear": [], "clone": 4, "close": [], "co": [], "code": [2, 3], "codecov": [], "colab": [], "collate_fn": [], "collect": 2, "color": 6, "colorinvers": 6, "column": 2, "com": [2, 4], "combin": 5, "command": [], "comment": [], "commit": [], "common": [6, 7], "commun": [], "compar": 3, "comparison": 7, "competit": 1, "compil": [], "complaint": [], "complementari": 7, "complet": [], "compon": 5, "compos": [1, 3, 5], "comprehens": [], "comput": [5, 7], "conf_threshold": [], "confid": 2, "config": [], "configur": [], "confus": 7, "consecut": [5, 6], "consequ": [], "consid": [1, 2, 7], "consist": [], "consolid": [1, 3], "constant": 6, "construct": [], "contact": [], "contain": [], "content": [1, 2], "context": [], "contib": [], "continu": [], "contrast": 6, "contrast_factor": 6, "contrib": [], "contribut": [], "contributor": [], "conv_sequ": 5, "convers": 2, "convert": [2, 5, 6], "convert_page_to_numpi": 2, "convert_to_fp16": 5, "convert_to_tflit": 5, "convolut": 3, "coordin": 2, "cord": [1, 3, 5], "core": 7, "corner": [], "correct": 6, "correspond": [4, 5], "could": [], "counterpart": 7, "cover": [], "coverag": [], "cpu": [3, 5], "creat": [], "crnn": [3, 5], "crnn_mobilenet_v3_larg": [], "crnn_mobilenet_v3_smal": [], "crnn_resnet31": 5, "crnn_vgg16_bn": 5, "crop": 5, "crop_orient": [], "crop_orientation_predictor": [], "crop_param": [], "cuda": [], "currenc": 1, "current": [], "custom": [], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": [], "czczup": [], "czech": [], "d": [], "daili": 3, "danish": [], "data": [2, 3, 5, 6, 7], "dataload": 1, "dataset": 5, "dataset_info": [], "date": [], "db": [], "db_crnn_resnet": 5, "db_crnn_vgg": 5, "db_mobilenet_v3_larg": [], "db_resnet34": [], "db_resnet50": 5, "db_sar_resnet": 5, "db_sar_vgg": 5, "dbnet": [3, 5], "deal": [], "decis": [], "decod": 2, "decode_img_as_tensor": [], "dedic": [], "deem": [], "deep": 5, "def": [], "default": [2, 5], "defer": 1, "defin": 7, "deform": 5, "degre": [], "degress": 2, "delet": [], "delimit": [], "delta": 6, "demo": [], "demonstr": [], 
"depend": [3, 4], "deploi": [], "deploy": [], "derogatori": [], "describ": 5, "descript": [], "design": 6, "desir": [], "det_arch": 5, "det_b": [], "det_model": [], "det_param": [], "det_predictor": [], "detail": [], "detect": [], "detect_languag": [], "detect_orient": [], "detection_predictor": 5, "detection_task": [], "detectiondataset": [], "detectionmetr": [], "detectionpredictor": 5, "detector": [], "deterior": [], "determin": [], "dev": [], "develop": [], "developp": 4, "deviat": 6, "devic": [], "dict": [2, 7], "dictionari": [2, 7], "differ": [], "differenti": [3, 5], "digit": 1, "dimens": [2, 5, 7], "dimension": 6, "direct": [], "directli": 5, "directori": [], "disabl": [], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 5, "discuss": [], "disk": [], "disparag": [], "displai": [2, 7], "display_artefact": 7, "distanc": [], "distribut": 6, "div": [], "divers": [], "divid": [], "do": 4, "doc": [2, 5], "docartefact": [], "docstr": [], "doctr": 4, "doctr_cache_dir": [], "doctr_multiprocessing_dis": [], "document": [1, 5, 7], "documentbuild": [], "documentfil": 2, "doesn": [], "don": [], "done": 6, "download": 1, "downsiz": [], "draw": 6, "drop": 1, "drop_last": 1, "dtype": 5, "dual": [], "dummi": [], "dummy_img": [], "dummy_input": [], "dure": [], "dutch": [], "dynam": [], "dynamic_seq_length": [], "e": [2, 4], "each": [1, 2, 3, 5, 6, 7], "eas": [], "easi": [3, 7], "easier": 5, "easili": [2, 5, 7], "econom": [], "edit": [], "educ": [], "effect": [], "effici": [1, 5], "either": 5, "element": [1, 2, 5], "els": [], "email": [], "empathi": [], "en": [], "enabl": 2, "enclos": 2, "encod": [1, 2, 5], "encode_sequ": 1, "encount": [], "encrypt": [], "end": [1, 3, 7], "english": [], "enough": 5, "ensur": [], "entir": 2, "entri": [], "environ": [], "eo": 1, "equiv": [], "error": [], "estim": [], "etc": 2, "ethnic": [], "evalu": [1, 3, 5], "event": [], "everyon": [], "everyth": [], "exact": 7, "exactmatch": [], "exampl": [1, 2, 5, 6, 7], "exchang": [], "exclud": 5, "execut": [], "exist": [], "expand": [], "expect": [2, 5, 6], "experi": 5, "explan": 5, "explicit": [], "exploit": 5, "export": [2, 3, 7], "export_as_straight_box": [], "export_as_xml": [], "export_model_to_onnx": [], "express": 6, "extens": 2, "extern": [], "extra": 4, "extract": [1, 3], "extract_arch": 1, "extractor": 5, "f_": 7, "f_a": 7, "factor": 6, "fair": [], "fairli": [], "fals": [1, 5, 6, 7], "faq": [], "fascan": [], "fast": 1, "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": [], "favorit": [], "featur": [5, 7], "feed": 5, "feedback": [], "feel": [], "felix92": [], "few": 4, "figsiz": 7, "figur": 7, "file": [1, 3], "file_hash": 1, "file_nam": 1, "final": [], "find": 4, "fine": 3, "finnish": [], "first": [], "firsthand": [], "fit": [], "fitz": 2, "flag": [], "flexibl": 7, "flip": [], "float": [2, 6, 7], "float32": 5, "fn": 6, "focu": [], "focus": [], "folder": [1, 5], "follow": [1, 4, 5, 6, 7], "font": [], "font_famili": [], "foral": 7, "forc": [], "forg": [], "form": [1, 3], "format": [2, 5], "forpost": [1, 3], "forum": [], "fp": 5, "fp16": 5, "frac": 7, "frame": 5, "framework": 1, "free": [], "french": [1, 5], "friendli": 3, "from": [1, 2, 3, 5, 6, 7], "from_hub": [], "from_imag": 2, "from_pdf": 2, "from_url": 2, "full": [1, 5, 7], "fulli": [], "function": [5, 6, 7], "funsd": [1, 3, 5], "further": [], "futur": [], "g": 2, "g_": 7, "g_x": 7, "gamma": 6, "gaussian": 6, "gaussianblur": [], "gaussiannois": [], "gdk": 4, "gen": [], "gender": [], "gener": [], 
"generic_cyrillic_lett": [], "geometri": 2, "geq": 7, "german": [], "get": 2, "get_artefact": 2, "get_word": 2, "gettextword": 2, "git": 3, "github": 4, "give": [], "given": [1, 2, 5, 7], "global": [], "go": [], "good": [], "googl": [], "googlevis": 3, "gpu": 3, "gracefulli": [], "graph": 2, "grayscal": 6, "ground": 7, "groung": [], "group": [], "gt": [], "gt_box": [], "gt_label": [], "gtk": 4, "guid": [], "guidanc": [], "gvision": 5, "h": 2, "h_": 7, "ha": [1, 7], "half": 5, "handl": 1, "handwrit": [], "handwritten": [], "harass": [], "hardwar": [], "harm": [], "hat": 7, "have": [1, 5, 7], "head": [], "healthi": [], "hebrew": [], "height": 2, "hello": 7, "help": [], "here": [1, 4, 6], "hf": [], "hf_hub_download": [], "high": 2, "higher": 4, "hindi": [], "hindi_digit": [], "hocr": [], "hook": [], "horizont": 2, "hous": [], "how": [], "howev": [], "hsv": 6, "html": [], "http": [2, 4, 5], "hub": [], "hue": 6, "huggingfac": [], "hw": [], "i": [1, 2, 5, 6, 7], "i7": [], "ic03": [], "ic13": [], "icdar": 3, "icdar2019": 1, "id": 5, "ident": [], "identifi": [3, 5], "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [], "iiit5k": [], "iiithw": [], "imag": [1, 2, 5, 6, 7], "imagenet": [], "imageri": [], "images_90k_norm": [], "img": [1, 6], "img_cont": [], "img_fold": 1, "img_path": [], "img_transform": [], "imgur5k": [], "imgur5k_annot": [], "imlist": [], "impact": [], "implement": [1, 2, 5, 6, 7], "import": [1, 2, 5, 6, 7], "improv": [], "inappropri": [], "incid": [], "includ": [4, 5], "inclus": [], "increas": 6, "independ": [], "index": 2, "indic": 7, "individu": [], "infer": [3, 6], "inform": [1, 3, 5], "inherit": [1, 5], "input": [2, 5, 6], "input_crop": [], "input_pag": [5, 7], "input_shap": 5, "input_t": 5, "input_tensor": 5, "inspir": 6, "instal": 3, "instanc": 5, "instanti": 5, "instead": [1, 2], "insult": [], "int": [1, 2, 5, 6, 7], "int64": [], "integ": 7, "integr": 3, "intel": [], "interact": [2, 7], "interfac": [], "interoper": [], "interpol": [5, 6], "interpret": [1, 2], "intersect": 7, "invert": 6, "investig": [], "invis": [], "invoic": 5, "involv": 5, "io": [], "iou": 7, "iou_thresh": 7, "iou_threshold": [], "irregular": 5, "isn": 1, "issu": [], "italian": [], "iter": 1, "its": [1, 2, 5, 7], "itself": [], "j": 7, "job": [], "join": [], "jpeg": 6, "jpegqual": 6, "jpg": [1, 2], "json": [], "json_output": [], "jump": [], "just": 5, "kei": [], "kera": 5, "kernel": [], "kernel_s": 5, "kernel_shap": [], "keywoard": [], "keyword": [1, 2], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [], "know": [], "kwarg": [1, 2, 5, 7], "l": 7, "l_j": 7, "label": [1, 7], "label_fil": 1, "label_fold": [], "label_path": [], "labels_path": [], "ladder": [], "lambda": 6, "lambdatransform": 6, "lang": [], "languag": [2, 3], "larg": [], "largest": 7, "last": [1, 4, 5], "latenc": [], "later": [], "latest": 4, "latin": 1, "layer": [], "layout": [], "lead": [], "leader": [], "learn": 5, "least": 4, "left": 7, "legacy_french": [], "length": 1, "less": [], "let": 5, "letter": [], "level": [5, 7], "levenshtein": [], "leverag": [], "lf": [], "libffi": 4, "librari": 4, "light": 3, "lightweight": [], "like": [], "limits_": 7, "line": [3, 7], "line_1_1": [], "link": [], "linknet": [3, 5], "linknet16": 5, "linknet_resnet18": [], "linknet_resnet34": [], "linknet_resnet50": [], "linux": 4, "list": [1, 2, 6], "ll": 7, "load": [3, 5], "load_state_dict": [], "load_weight": [], "loader": 1, "loc_pr": [], "local": [1, 3, 5, 7], "localis": [], "localizationconfus": 7, "locat": [], "login": [], "login_to_hub": 
[], "logo": 2, "love": [], "lower": [6, 7], "m": [5, 7], "m1": [], "macbook": [], "machin": [], "maco": 4, "made": 3, "magc_resnet31": [], "mai": [], "mail": [], "main": [], "maintain": 3, "mainten": [], "make": [5, 7], "mani": [], "manipul": [], "map": 1, "map_loc": [], "mask_shap": 7, "master": [3, 5], "match": [3, 7], "mathcal": 7, "matplotlib": 7, "max": 7, "max_angl": [], "max_area": [], "max_char": [], "max_delta": 6, "max_dist": [], "max_gain": 6, "max_gamma": 6, "max_qual": 6, "max_ratio": [], "maximum": 1, "maxval": [5, 6], "mbox": 7, "mean": [6, 7], "meaniou": 7, "meant": 2, "measur": 5, "media": [], "median": [], "meet": [], "member": [], "memori": [], "mention": [], "merg": [], "messag": [], "meta": [], "metadata": [], "metal": [], "method": 6, "metric": [5, 7], "middl": [], "might": 5, "min": [], "min_area": [], "min_char": [], "min_gain": 6, "min_gamma": 6, "min_qual": 6, "min_ratio": [], "min_val": 6, "minde": 4, "minim": [], "minimalist": [], "minimum": 7, "minval": 6, "miss": [], "mistak": [], "mix": 3, "mixed_float16": [], "mixed_precis": [], "mjsynth": [], "mnt": [], "mobilenet": [], "mobilenet_v3_larg": [], "mobilenet_v3_large_r": [], "mobilenet_v3_smal": [], "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": [], "mobilenetv3": [], "modal": [], "mode": 4, "model": [1, 7], "model_nam": [], "model_path": [], "moder": [], "modif": [], "modifi": [], "modul": [2, 5, 6, 7], "more": [], "most": 5, "mozilla": [], "multi": 3, "multilingu": [], "multipl": [1, 2, 6], "multipli": 6, "multiprocess": [], "my": [], "my_awesome_model": [], "my_hook": [], "n": [1, 5, 7], "na": [], "name": [1, 5], "nation": [], "natur": 3, "ndarrai": [1, 2, 7], "necessari": [], "need": [4, 7], "neg": 6, "nest": [], "nestedobject": [], "network": [3, 5], "neural": [3, 5], "new": [], "newer": [], "next": 1, "nois": [], "noisi": [1, 3], "non": [2, 3, 6, 7], "none": [1, 2, 7], "normal": [5, 6], "norwegian": [], "note": 0, "now": 3, "np": [5, 7], "num_output_channel": [], "num_sampl": [], "number": [1, 6, 7], "numpi": [2, 5, 7], "o": 4, "obb": [], "obj_detect": [], "object": 1, "objectness_scor": [], "oblig": [], "obtain": [], "occupi": [], "ocr": [1, 3, 7], "ocr_carea": [], "ocr_db_crnn": 7, "ocr_lin": [], "ocr_pag": [], "ocr_par": [], "ocr_predictor": 5, "ocrdataset": 1, "ocrmetr": 7, "ocrpredictor": 5, "ocrx_word": [], "offens": [], "offici": [], "offlin": [], "offset": 6, "onc": 5, "one": [1, 5, 6], "oneof": 6, "ones": 1, "onli": [6, 7], "onlin": [], "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": [], "opacity_rang": [], "open": [], "opinion": [], "optic": [3, 5], "optim": 3, "option": 1, "order": [1, 2, 5], "org": 5, "organ": 2, "orient": 2, "orientationpredictor": [], "other": [], "otherwis": 7, "our": 5, "out": [5, 6, 7], "outpout": [], "output": [2, 5, 6], "output_s": [2, 6], "outsid": [], "over": [4, 7], "overal": [], "overlai": 2, "overview": [], "overwrit": 1, "overwritten": [], "own": 3, "p": 6, "packag": 7, "pad": [1, 5, 6], "page": [4, 5, 7], "page1": 2, "page2": 2, "page_1": [], "page_idx": 2, "page_orientation_predictor": [], "page_param": [], "pair": 7, "pango": 4, "paper": 5, "par_1_1": [], "paragraph": [], "paragraph_break": [], "param": [5, 6], "paramet": [1, 2, 3, 5, 6, 7], "pars": [1, 3], "parseq": [], "part": 6, "parti": [], "partial": [], "particip": [], "pass": [1, 5], "password": [], "patch": [], "path": [1, 2, 5], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "pattern": [], "pdf": [2, 5], "pdfpage": [], 
"peopl": [], "per": [5, 6], "perform": [2, 3, 5, 6, 7], "period": [], "permiss": [], "permut": [], "persian_lett": [], "person": [], "phase": [], "photo": [], "physic": 2, "pick": 6, "pictur": 2, "pip": 4, "pipelin": [], "pixbuf": 4, "pixel": [2, 6], "platinum": 5, "pleas": [], "plot": 7, "plt": 7, "plug": [], "plugin": [], "png": 2, "point": [], "polici": [], "polish": [], "polit": [], "polygon": 1, "pool": [], "portugues": [], "posit": 7, "possibl": 7, "post": 5, "postprocessor": [], "potenti": 5, "power": 3, "ppageno": [], "pre": [], "precis": [5, 7], "pred": [], "pred_box": [], "pred_label": [], "predefin": 1, "predict": [2, 7], "predictor": [], "prefer": 1, "preinstal": [], "preprocessor": 5, "prerequisit": 3, "present": [], "preserv": 6, "preserve_aspect_ratio": 6, "pretrain": [3, 5, 7], "pretrained_backbon": [], "print": [], "prior": [], "privaci": [], "privat": 5, "probabl": 6, "problem": [], "procedur": 6, "process": [2, 3], "processor": 5, "produc": 5, "product": [], "profession": [], "project": [], "promptli": [], "proper": [], "properli": 1, "properti": 5, "provid": [3, 5], "public": 3, "publicli": [], "publish": [], "pull": [], "punctuat": 1, "pure": [], "purpos": [], "push_to_hf_hub": [], "py": [], "pypdfium2": [], "pyplot": 7, "python": 3, "python3": [], "pytorch": [3, 4], "q": [], "qr": 2, "qr_code": [], "qualiti": 6, "quantiz": 5, "quantize_model": 5, "question": [], "quickli": 3, "quicktour": [], "r": [], "race": [], "ramdisk": [], "rand": [5, 7], "random": [5, 6, 7], "randomappli": 6, "randombright": 6, "randomcontrast": 6, "randomcrop": [], "randomgamma": 6, "randomhorizontalflip": [], "randomhu": 6, "randomjpegqu": 6, "randomli": 6, "randomres": [], "randomrot": [], "randomsatur": 6, "randomshadow": [], "rang": 6, "rassi": [], "ratio": 6, "raw": [2, 7], "re": [], "read": [3, 5], "read_html": 2, "read_img": 2, "read_img_as_numpi": [], "read_img_as_tensor": [], "read_pdf": 2, "readi": [], "real": [5, 6], "reason": [], "rebuild": [], "rebuilt": [], "recal": [5, 7], "receipt": [1, 3, 5], "reco_arch": 5, "reco_b": [], "reco_model": [], "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": 7, "recognition_predictor": 5, "recognition_task": [], "recognitiondataset": [], "recognitionpredictor": 5, "rectangular": [], "recurr": 3, "reduc": 6, "refer": 4, "regardless": [], "region": [], "regroup": 7, "regular": [], "reject": [], "rel": 2, "relat": [], "releas": [0, 4], "relev": [], "religion": [], "relu": 5, "remov": [], "render": [], "repo": [], "repo_id": [], "report": [], "repositori": [], "repres": [2, 5], "represent": 5, "request": [], "requir": [4, 6], "research": 3, "residu": [], "resiz": [5, 6], "resnet": 5, "resnet18": [], "resnet31": [], "resnet34": [], "resnet50": [], "resolv": 2, "resolve_block": [], "resolve_lin": [], "resourc": [], "respect": [], "rest": [6, 7], "restrict": [], "result": [2, 5], "return": [1, 2, 5, 7], "reusabl": 5, "review": [], "rgb": [2, 6], "rgb_mode": [], "rgb_output": 2, "right": [5, 7], "robust": 3, "root": 1, "rotat": [1, 2], "rotated_bbox": [1, 7], "run": 4, "same": [2, 7], "sampl": 1, "sample_transform": 1, "sar": [3, 5], "sar_resnet31": 5, "sar_vgg16_bn": 5, "satur": 6, "save": [1, 5], "saved_model": 5, "scale": 7, "scale_rang": [], "scan": [1, 3], "scene": [3, 5], "scheme": 5, "score": 7, "scratch": 3, "script": [], "seamless": 3, "seamlessli": [], "search": [], "searchabl": [], "sec": [], "second": 5, "section": [], "secur": [], "see": [], "seemlessli": 3, "seen": 5, "segment": 5, "self": [], "semant": 5, "send": [], "sens": 
7, "sensit": [], "separ": 5, "sequenc": [1, 2, 5, 7], "sequenti": [5, 6], "seri": [], "serial": 5, "serialized_model": 5, "seriou": [], "set": [1, 5, 7], "set_global_polici": [], "sever": [2, 6], "sex": [], "sexual": [], "sha256": [], "shade": [], "shape": [2, 5, 6, 7], "share": [], "shift": 6, "shm": [], "should": [1, 2, 7], "show": [2, 3, 5, 7], "showcas": [], "shuffl": 1, "side": 7, "signatur": 2, "signific": 1, "simpl": 5, "simpler": [], "sinc": 1, "singl": [], "single_img_doc": [], "size": [1, 2, 5, 6], "skew": [], "slack": [], "slightli": [], "small": 3, "smallest": 2, "snapshot_download": [], "snippet": [], "so": [1, 4], "social": [], "socio": [], "some": [], "someth": [], "somewher": [], "sort": [], "sourc": [1, 2, 5, 6, 7], "space": [], "span": [], "spanish": [], "spatial": 2, "special": 3, "specif": [1, 5, 7], "specifi": 2, "speed": [3, 5], "sphinx": [], "sroie": [1, 3], "stabl": 4, "stackoverflow": [], "stage": 3, "standard": 6, "start": 1, "state": 3, "static": 7, "statist": 5, "statu": [], "std": 6, "step": [], "still": [], "str": [1, 2, 5, 6, 7], "straight": 1, "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 2, "street": [], "strict": [], "strictli": 7, "string": [1, 2, 5, 7], "strive": [], "strong": 5, "structur": [3, 5], "subset": [1, 5], "suggest": [], "sum": 7, "summari": 7, "support": 5, "sustain": [], "svhn": [], "svt": [], "swedish": [], "symbol": [], "symmetr": 6, "symmetric_pad": 6, "synthet": [], "synthtext": [], "system": [], "t": 1, "tabl": [], "take": [], "target": [1, 2, 5, 6], "target_s": 1, "task": [1, 3, 5], "task2": [], "team": [], "techminde": [], "templat": 2, "tensor": [1, 5, 6], "tensorflow": [3, 4, 5, 6], "tensorspec": [], "term": [], "test": [], "test_set": [], "text": [2, 7], "text_output": [], "textmatch": 7, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [3, 5], "textstylebrush": [], "textual": [1, 2, 3], "tf": [5, 6], "tf_model": 5, "tflite": 5, "than": [4, 7], "thank": [], "thei": [], "them": [1, 4], "thi": [4, 5, 7], "thing": [], "third": [], "those": [2, 4, 5], "threaten": [], "threshold": [], "through": [1, 6], "tilman": [], "time": [1, 5, 7], "tini": [], "titl": 2, "tm": [], "tmp": [], "togeth": [2, 5], "tograi": 6, "tool": [], "top": 7, "topic": [], "torch": [], "torchvis": 6, "total": [], "toward": [], "train": [1, 5, 6], "train_it": 1, "train_load": 1, "train_pytorch": [], "train_set": 1, "train_tensorflow": [], "trainabl": 5, "tranform": 6, "transcrib": [], "transfer": [], "transfo": 6, "transform": [1, 3], "translat": [], "troll": [], "true": [1, 2, 5, 6, 7], "truth": 7, "tune": 3, "tupl": [2, 5, 6, 7], "turn": [], "two": 2, "txt": [], "type": [2, 5], "typic": [], "u": [], "ucsd": [], "udac": [], "uint8": [2, 5, 7], "ukrainian": [], "unaccept": [], "underli": 1, "underneath": 2, "understand": [1, 3], "unidecod": 7, "uniform": [5, 6], "uniformli": [], "uninterrupt": 2, "union": 7, "unittest": [], "unlock": [], "unoffici": [], "unprofession": [], "unsolicit": [], "unsupervis": [], "unwelcom": [], "up": 5, "updat": 7, "upgrad": [], "upper": 6, "uppercas": [], "url": [1, 2], "us": [1, 4, 7], "usabl": 5, "usag": 5, "use_polygon": [], "useabl": [], "user": [2, 3, 4], "utf": [], "util": [3, 5], "v0": 3, "v1": [], "v3": [], "valid": [], "valu": [2, 6], "valuabl": 3, "variabl": [], "varieti": [], "veri": [], "verifi": 1, "version": 5, "vgg": 5, "vgg16": 5, "vgg16_bn_r": [], "via": 3, "vietnames": [], "view": [], "viewpoint": [], "violat": [], "visibl": [], "vision": [], "visiondataset": 
1, "visiontransform": [], "visual": 3, "visualize_pag": 7, "vit_": [], "vit_b": [], "vitstr": [], "vitstr_bas": [], "vitstr_smal": [], "viz": [], "vocab": [3, 5], "vocabulari": [], "w": [2, 7], "w3": [], "wa": [], "wai": [1, 3, 5], "want": [], "warm": 5, "warmup": [], "wasn": [], "we": [2, 3, 5, 6], "weasyprint": [], "web": 2, "websit": [], "welcom": 3, "well": [], "were": 2, "what": [], "when": [], "whenev": [], "where": [2, 7], "whether": [1, 2, 7], "which": 5, "whichev": 4, "while": 6, "why": [], "width": 2, "wiki": [], "wildreceipt": [], "window": [4, 7], "wish": [], "within": [], "without": 5, "wonder": [], "word": [3, 5, 7], "word_1_1": [], "word_1_2": [], "word_1_3": [], "wordgener": [], "words_onli": 7, "work": [], "worker": 1, "workflow": [], "worklow": [], "world": 7, "worth": [], "wrap": [], "wrapper": [1, 6], "write": [], "written": 2, "www": 2, "x": [2, 6, 7], "x12larg": 5, "x_ascend": [], "x_descend": [], "x_i": 7, "x_size": [], "x_wconf": [], "xeon": 5, "xhtml": [], "xmax": 2, "xmin": 2, "xml": [], "xml_bytes_str": [], "xml_element": [], "xml_output": [], "xmln": [], "y": 7, "y_i": 7, "y_j": 7, "yet": [], "ymax": 2, "ymin": 2, "yolov8": [], "you": [4, 5], "your": [1, 2, 5, 7], "yoursit": 2, "zero": [5, 6], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 1, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": [], "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": [], "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": [], "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": [], "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": [], "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": [], 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": [], "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "\u067e\u0686\u06a2\u06a4\u06af": [], "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "doctr.datasets", "doctr.documents", "DocTR: Document Text Recognition", "Installation", "doctr.models", "doctr.transforms", "doctr.utils"], "titleterms": {"": [], "0": 0, "01": [], "02": [], "03": 0, "04": [], "05": 0, "07": [], "08": [], "09": [], "1": 0, "10": [], "11": 0, "12": [], "18": 0, "2": 0, "2021": 0, "2022": [], "2023": [], "2024": [], "22": [], "27": [], "28": 0, "29": [], "3": [], "31": [], "4": [], "5": [], "6": [], "7": [], "8": [], "9": [], "advanc": [], "approach": 5, "architectur": [], "arg": [], "artefact": 2, "artefactdetect": [], "attribut": [], "avail": 1, "aw": [], "ban": [], "block": 2, "bug": [], "build": 3, "changelog": 0, "choos": [], "classif": [], "code": [], "codebas": [], "commit": [], "commun": [], "compos": 6, "compress": 5, "conda": [], "conduct": [], "connect": [], "content": [], "continu": [], "contrib": [], "contribut": [], "contributor": [], "convent": [], "correct": [], "coven": [], "custom": [], "data": 1, "dataload": [], "dataset": [1, 3], "detect": [3, 5], "develop": [], "do": [], "doctr": [1, 2, 3, 5, 6, 7], "document": [2, 3], "end": 5, "enforc": [], "evalu": 7, "export": 5, "factori": [], "featur": 3, "feedback": [], "file": 2, "from": [], "gener": [], "get": 3, "git": 4, "guidelin": [], "half": 
[], "hub": [], "huggingfac": [], "i": [], "implement": [], "infer": [], "instal": 4, "integr": [], "io": [], "lambda": [], "let": [], "line": 2, "linux": [], "load": 1, "loader": [], "main": 3, "mode": [], "model": [3, 5], "modifi": [], "modul": [], "name": [], "note": 3, "notebook": [], "object": [], "ocr": 5, "onli": [], "onnx": [], "optim": [], "option": [], "orient": [], "our": [], "output": [], "own": [], "packag": [3, 4], "page": 2, "perman": [], "pipelin": [], "pledg": [], "post": [], "pre": 5, "precis": [], "predictor": [3, 5], "prepar": [], "prerequisit": 4, "pretrain": [], "process": 5, "push": [], "python": 4, "qualiti": [], "question": [], "read": 2, "readi": [], "recognit": [3, 5], "refer": 3, "report": [], "request": [], "respons": [], "return": [], "right": [], "savedmodel": 5, "scope": [], "share": [], "should": [], "stage": 5, "standard": [], "start": 3, "structur": 2, "style": [], "support": [1, 3, 6], "synthet": [], "task": 7, "temporari": [], "test": [], "text": [3, 5], "train": 3, "transform": 6, "two": 5, "unit": [], "us": 5, "util": 7, "v0": 0, "verif": [], "via": 4, "visual": 7, "vocab": 1, "warn": [], "what": [], "word": 2, "your": 3, "zoo": [3, 5]}}) \ No newline at end of file diff --git a/v0.5.1/transforms.html b/v0.5.1/transforms.html index 0d1b5f7402..d42da50481 100644 --- a/v0.5.1/transforms.html +++ b/v0.5.1/transforms.html @@ -227,28 +227,21 @@ @@ -293,7 +286,7 @@

doctr.transforms — Drawing inspiration from torchvision, we express transformations as composable modules.
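For instance, here is a minimal sketch of chaining two of the transformations documented below (assuming the TensorFlow backend):

>>> import tensorflow as tf
>>> from doctr.transforms import Compose, Resize, ToGray
>>> # resize to a fixed shape, then convert to a 3-channel grayscale tensor
>>> transfo = Compose([Resize((512, 512)), ToGray()])
>>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))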

                                          Supported transformations

                                          -

                                          Here are all transformations that are available through docTR:

                                          +

                                          Here are all transformations that are available through DocTR:

                                          class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
                                          @@ -364,7 +357,7 @@

                                          Supported transformations
                                          -class doctr.transforms.ToGray(num_output_channels: int = 1)[source]
                                          +class doctr.transforms.ToGray[source]

Convert an RGB tensor (batch of images or a single image) to a 3-channel grayscale tensor

                                          Example::
>>> from doctr.transforms import ToGray
                                          @@ -524,88 +517,6 @@ 

                                          Supported transformations -
                                          -class doctr.transforms.RandomRotate(max_angle: float = 5.0, expand: bool = False)[source]
                                          -

                                          Randomly rotate a tensor image and its boxes

                                          -https://github.com/mindee/doctr/releases/download/v0.4.0/rotation_illustration.png -
                                          -
                                          Parameters:
                                          -
                                            -
                                          • max_angle – maximum angle for rotation, in degrees. Angles will be uniformly picked in -[-max_angle, max_angle]

                                          • -
                                          • expand – whether the image should be padded before the rotation

                                          • -
                                          -
                                          -
                                          -

                                          - -
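As a rough sketch of how this transform could be used (the call below assumes it is applied jointly to an image and its relative boxes, as suggested by the description above; the exact call signature may differ):

>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import RandomRotate
>>> transfo = RandomRotate(max_angle=10., expand=True)
>>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
>>> boxes = np.array([[0.1, 0.1, 0.4, 0.3]], dtype=np.float32)  # made-up relative (xmin, ymin, xmax, ymax) box
>>> rotated_img, rotated_boxes = transfo(image, boxes)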
                                          -
                                          -class doctr.transforms.RandomCrop(scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33))[source]
                                          -

                                          Randomly crop a tensor image and its boxes

                                          -
                                          -
                                          Parameters:
                                          -
                                            -
                                          • scale – tuple of floats, relative (min_area, max_area) of the crop

                                          • -
                                          • ratio – tuple of float, relative (min_ratio, max_ratio) where ratio = h/w

                                          • -
                                          -
                                          -
                                          -
                                          - -
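A minimal instantiation sketch (like RandomRotate, this transform is meant to be applied to an image together with its boxes):

>>> from doctr.transforms import RandomCrop
>>> transfo = RandomCrop(scale=(0.5, 1.0), ratio=(0.75, 1.33))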
                                          -
                                          -class doctr.transforms.GaussianBlur(kernel_shape: int | Iterable[int], std: Tuple[float, float])[source]
                                          -

Randomly blur a tensor image with a gaussian kernel

                                          -
                                          -
                                          Example::
                                          >>> from doctr.transforms import GaussianBlur
                                          ->>> import tensorflow as tf
                                          ->>> transfo = GaussianBlur(3, (.1, 5))
                                          ->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
                                          -
                                          -
                                          -
                                          -
                                          -
                                          -
                                          Parameters:
                                          -
                                            -
                                          • kernel_shape – size of the blurring kernel

                                          • -
                                          • std – min and max value of the standard deviation

                                          • -
                                          -
                                          -
                                          -
                                          - -
                                          -
                                          -class doctr.transforms.ChannelShuffle[source]
                                          -

                                          Randomly shuffle channel order of a given image

                                          -
                                          - -
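Following the same pattern as the other image transformations, a minimal sketch:

>>> import tensorflow as tf
>>> from doctr.transforms import ChannelShuffle
>>> transfo = ChannelShuffle()
>>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))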
                                          -
                                          -class doctr.transforms.GaussianNoise(mean: float = 0.0, std: float = 1.0)[source]
                                          -

                                          Adds Gaussian Noise to the input tensor

                                          -
                                          -
                                          Example::
                                          >>> from doctr.transforms import GaussianNoise
                                          ->>> import tensorflow as tf
                                          ->>> transfo = GaussianNoise(0., 1.)
                                          ->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
                                          -
                                          -
                                          -
                                          -
                                          -
                                          -
                                          Parameters:
                                          -
                                            -
                                          • mean – mean of the gaussian distribution

                                          • -
                                          • std – std of the gaussian distribution

                                          • -
                                          -
                                          -
                                          -
                                          -

                                          Composing transformations

                                          @@ -744,11 +655,6 @@

                                          Composing transformationsRandomHue
                                        • RandomGamma
                                        • RandomJpegQuality
                                        • -
                                        • RandomRotate
                                        • -
                                        • RandomCrop
                                        • -
                                        • GaussianBlur
                                        • -
                                        • ChannelShuffle
                                        • -
                                        • GaussianNoise
                                      • Composing transformations @@ -936,7 +1249,7 @@

What should I do with the output? + diff --git a/v0.5.1/using_model_export.html b/v0.5.1/using_model_export.html deleted file mode 100644 index 9b0acb00fe..0000000000 --- a/v0.5.1/using_model_export.html +++ /dev/null @@ -1,436 +0,0 @@ - Preparing your model for inference - docTR documentation

                                        Preparing your model for inference

                                        -

A well-trained model is a good achievement, but you might want to tune a few things to make it production-ready!

                                        -
                                        -

                                        Model compression

                                        -

                                        This section is meant to help you perform inference with compressed versions of your model.

                                        -
                                        -

                                        TensorFlow Lite

                                        -

                                        TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows:

                                        -
                                        >>> import tensorflow as tf
                                        ->>> from tensorflow.keras import Sequential
                                        ->>> from doctr.models import conv_sequence
                                        ->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> converter = tf.lite.TFLiteConverter.from_keras_model(model)
                                        ->>> serialized_model = converter.convert()
                                        -
                                        -
                                        -
                                        -
                                        -

                                        Half-precision

                                        -

If you want to convert the model to half-precision, use your TFLite converter as follows:

                                        -
                                        >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
                                        ->>> converter.target_spec.supported_types = [tf.float16]
                                        ->>> serialized_model = converter.convert()
                                        -
                                        -
                                        -
                                        -
                                        -

                                        Post-training quantization

                                        -

Finally, if you wish to quantize the model, configure your TFLite converter as follows:

                                        -
>>> import numpy as np
->>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
->>> # Float fallback for operators that do not have an integer implementation
->>> input_shape = (224, 224, 3)  # matches the model input defined above
->>> def representative_dataset():
->>>     for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)]
                                        ->>> converter.representative_dataset = representative_dataset
                                        ->>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
                                        ->>> converter.inference_input_type = tf.int8
                                        ->>> converter.inference_output_type = tf.int8
                                        ->>> serialized_model = converter.convert()
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -

                                        Using SavedModel

                                        -

                                        Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

                                        -
                                        >>> import tensorflow as tf
                                        ->>> from doctr.models import db_resnet50
                                        ->>> model = db_resnet50(pretrained=True)
                                        ->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
                                        ->>> _ = model(input_t, training=False)
                                        ->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
                                        -
                                        -
                                        -

                                        And loaded just as easily:

                                        -
                                        >>> import tensorflow as tf
                                        ->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
                                        -
                                        -
                                        -
                                        -
                                        - -
                                        -
                                        - -
                                        - -
                                        -
- \ No newline at end of file diff --git a/v0.5.1/using_models.html b/v0.5.1/using_models.html deleted file mode 100644 index 53cad99cac..0000000000 --- a/v0.5.1/using_models.html +++ /dev/null @@ -1,909 +0,0 @@ - Choosing the right model - docTR documentation

                                        Choosing the right model

                                        -

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

                                        -

                                        For a given task, docTR provides a Predictor, which is composed of 2 components:

                                        -
                                          -
                                        • PreProcessor: a module in charge of making inputs directly usable by the deep learning model.

                                        • -
                                        • Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable.

                                        • -
                                        -
                                        -

                                        Text Detection

                                        -

                                        The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don’t).

                                        -
                                        -

                                        Available architectures

                                        -

                                        The following architectures are currently supported:

                                        - -

                                        For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

                                        -
-
Architecture            Input shape       # params   FUNSD Recall   FUNSD Precision   CORD Recall   CORD Precision   FPS
db_resnet50             (1024, 1024, 3)   25.2 M     82.14          87.64             92.49         89.66            2.1
db_mobilenet_v3_large   (1024, 1024, 3)   4.2 M      79.35          84.03             81.14         66.85            -
-

                                        All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

                                        -

                                        Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

                                        -

FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

                                        -
                                        -
                                        -

                                        Detection predictors

                                        -

detection_predictor wraps your detection model so that it can be used seamlessly with your favorite deep learning framework.

                                        -
                                        >>> import numpy as np
                                        ->>> from doctr.models import detection_predictor
                                        ->>> predictor = detection_predictor('db_resnet50')
                                        ->>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
->>> out = predictor([dummy_img])
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -

                                        Text Recognition

                                        -

                                        The task consists of transcribing the character sequence in a given image.

                                        -
                                        -

                                        Available architectures

                                        -

                                        The following architectures are currently supported:

                                        - -

                                        For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

                                        -
-
Text recognition model zoo

Architecture              Input shape    # params   FUNSD   CORD    FPS
crnn_vgg16_bn             (32, 128, 3)   15.8M      87.18   92.93   12.8
crnn_mobilenet_v3_small   (32, 128, 3)   2.1M       86.21   90.56   -
crnn_mobilenet_v3_large   (32, 128, 3)   4.5M       86.95   92.03   -
sar_resnet31              (32, 128, 3)   56.2M      87.70   93.41   2.7
master                    (32, 128, 3)   67.7M      87.62   93.27   -
-
                                        -

                                        All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metric being used (exact match) are available in Task evaluation.

                                        -

While most of our recognition models were trained on our French vocab (cf. Supported Vocabs), you can easily access the vocab of any model as follows:

                                        -
                                        >>> from doctr.models import recognition_predictor
                                        ->>> predictor = recognition_predictor('crnn_vgg16_bn')
                                        ->>> print(predictor.model.cfg['vocab'])
                                        -
                                        -
                                        -

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model capabilities

                                        -

FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

                                        -
                                        -
                                        -

                                        Recognition predictors

                                        -

recognition_predictor wraps your recognition model so that it can be used seamlessly with your favorite deep learning framework.

                                        -
                                        >>> import numpy as np
                                        ->>> from doctr.models import recognition_predictor
                                        ->>> predictor = recognition_predictor('crnn_vgg16_bn')
                                        ->>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8)
->>> out = predictor([dummy_img])
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -

                                        End-to-End OCR

                                        -

                                        The task consists of both localizing and transcribing textual elements in a given image.

                                        -
                                        -

                                        Available architectures

                                        -

You can use any combination of detection and recognition models supported by docTR.

                                        -

                                        For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

                                        -
                                        - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

                                        FUNSD

                                        CORD

                                        Architecture

                                        Recall

                                        Precision

                                        FPS

                                        Recall

                                        Precision

                                        FPS

                                        db_resnet50 + crnn_vgg16_bn

                                        71.25

                                        76.02

                                        0.85

                                        84.00

                                        81.42

                                        1.6

                                        db_resnet50 + master

                                        71.03

                                        76.06

                                        84.49

                                        81.94

                                        db_resnet50 + sar_resnet31

                                        71.25

                                        76.29

                                        0.27

                                        84.50

                                        81.96

                                        0.83

                                        db_resnet50 + crnn_mobilenet_v3_small

                                        69.85

                                        74.80

                                        80.85

                                        78.42

                                        0.83

                                        db_resnet50 + crnn_mobilenet_v3_large

                                        70.57

                                        75.57

                                        82.57

                                        80.08

                                        0.83

                                        db_mobilenet_v3_large + crnn_vgg16_bn

                                        67.73

                                        71.73

                                        71.65

                                        59.03

                                        Gvision text detection

                                        59.50

                                        62.50

                                        75.30

                                        70.00

                                        Gvision doc. text detection

                                        64.00

                                        53.30

                                        68.90

                                        61.10

                                        AWS textract

                                        78.10

                                        83.00

                                        87.50

                                        66.00

                                        -
                                        -

                                        All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

                                        -

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

                                        -

FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

                                        -

                                        Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information.

                                        -
                                        - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

                                        Receipts

                                        Invoices

                                        IDs

                                        US Tax Forms

                                        Resumes

                                        Road Fines

                                        Architecture

                                        Recall

                                        Precision

                                        Recall

                                        Precision

                                        Recall

                                        Precision

                                        Recall

                                        Precision

                                        Recall

                                        Precision

                                        Recall

                                        Precision

                                        db_resnet50 + crnn_vgg16_bn (ours)

                                        78.70

                                        81.12

                                        65.80

                                        70.70

                                        50.25

                                        51.78

                                        79.08

                                        92.83

                                        db_resnet50 + master (ours)

                                        79.00

                                        81.42

                                        65.57

                                        69.86

                                        51.34

                                        52.90

                                        78.86

                                        92.57

                                        db_resnet50 + sar_resnet31 (ours)

                                        78.94

                                        81.37

                                        65.89

                                        70.79

                                        51.78

                                        53.35

                                        79.04

                                        92.78

                                        db_resnet50 + crnn_mobilenet_v3_small (ours)

                                        76.81

                                        79.15

                                        64.89

                                        69.61

                                        45.03

                                        46.38

                                        78.96

                                        92.11

                                        85.91

                                        87.20

                                        84.85

                                        85.86

                                        db_resnet50 + crnn_mobilenet_v3_large (ours)

                                        78.01

                                        80.39

                                        65.36

                                        70.11

                                        48.00

                                        49.43

                                        79.39

                                        92.62

                                        87.68

                                        89.00

                                        85.65

                                        86.67

                                        db_mobilenet_v3_large + crnn_vgg16_bn (ours)

                                        78.36

                                        74.93

                                        63.04

                                        68.41

                                        39.36

                                        41.75

                                        72.14

                                        89.97

                                        Gvision doc. text detection

                                        68.91

                                        59.89

                                        63.20

                                        52.85

                                        43.70

                                        29.21

                                        69.79

                                        65.68

                                        AWS textract

                                        75.77

                                        77.70

                                        70.47

                                        69.13

                                        46.39

                                        43.32

                                        84.31

                                        98.11

                                        -
                                        -
                                        -
                                        -

                                        Two-stage approaches

                                        -

Those architectures involve one stage of text detection, and one stage of text recognition. The text detection output is used to produce cropped images that are then passed to the text recognition block. Everything is wrapped up with ocr_predictor.

                                        -
                                        >>> import numpy as np
                                        ->>> from doctr.models import ocr_predictor
                                        ->>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
                                        ->>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
                                        ->>> out = model([input_page])
                                        -
                                        -
                                        -
                                        -
                                        -

                                        What should I do with the output?

                                        -

The ocr_predictor returns a Document object with a nested structure (with Page, Block, Line, Word, Artefact). -To get a better understanding of our document model, check our Document structure section.

                                        -

                                        Here is a typical Document layout:

                                        -
                                        Document(
                                        -  (pages): [Page(
                                        -    dimensions=(340, 600)
                                        -    (blocks): [Block(
                                        -      (lines): [Line(
                                        -        (words): [
                                        -          Word(value='No.', confidence=0.91),
                                        -          Word(value='RECEIPT', confidence=0.99),
                                        -          Word(value='DATE', confidence=0.96),
                                        -        ]
                                        -      )]
                                        -      (artefacts): []
                                        -    )]
                                        -  )]
                                        -)
                                        -
                                        -
                                        -

                                        You can also export them as a nested dict, more appropriate for JSON format:

                                        -
                                        json_output = result.export()
                                        -
                                        -
                                        -

                                        For reference, here is the JSON export for the same Document as above:

                                        -
                                        {
                                        -  'pages': [
                                        -      {
                                        -          'page_idx': 0,
                                        -          'dimensions': (340, 600),
                                        -          'orientation': {'value': None, 'confidence': None},
                                        -          'language': {'value': None, 'confidence': None},
                                        -          'blocks': [
                                        -              {
                                        -                  'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
                                        -                  'lines': [
                                        -                      {
                                        -                          'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
                                        -                          'words': [
                                        -                              {
                                        -                                  'value': 'No.',
                                        -                                  'confidence': 0.914085328578949,
                                        -                                  'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875))
                                        -                              },
                                        -                              {
                                        -                                  'value': 'RECEIPT',
                                        -                                  'confidence': 0.9949972033500671,
                                        -                                  'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375))
                                        -                              },
                                        -                              {
                                        -                                  'value': 'DATE',
                                        -                                  'confidence': 0.9578408598899841,
                                        -                                  'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625))
                                        -                              }
                                        -                          ]
                                        -                      }
                                        -                  ],
                                        -                  'artefacts': []
                                        -              }
                                        -          ]
                                        -      }
                                        -  ]
                                        -}
                                        -
                                        -
                                        -

To export the output as XML (hOCR format), you can use the export_as_xml method:

                                        -
                                        xml_output = result.export_as_xml()
                                        -for output in xml_output:
                                        -  xml_bytes_string = output[0]
                                        -  xml_element = output[1]
                                        -
                                        -
                                        -

                                        For reference, here is a sample XML byte string output:

                                        -
                                        <?xml version="1.0" encoding="UTF-8"?>
                                        -<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
                                        -  <head>
                                        -    <title>docTR - hOCR</title>
                                        -    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
                                        -    <meta name="ocr-system" content="doctr 0.5.0" />
                                        -    <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
                                        -  </head>
                                        -  <body>
                                        -    <div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
                                        -    <div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
                                        -      <p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
                                        -        <span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
                                        -          <span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
                                        -          <span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
                                        -          <span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
                                        -        </span>
                                        -      </p>
                                        -    </div>
                                        -  </body>
                                        -</html>
                                        -
                                        -
                                        -
                                        -
                                        -
                                        - -
                                        -
                                        - -
                                        - -
                                        -
- \ No newline at end of file diff --git a/v0.5.1/utils.html b/v0.5.1/utils.html index 21f708c953..1908ef4ff4 100644 --- a/v0.5.1/utils.html +++ b/v0.5.1/utils.html @@ -12,7 +12,7 @@ - + doctr.utils - docTR documentation @@ -227,28 +227,21 @@ @@ -327,25 +320,6 @@

                                        Visualization -
                                        -doctr.utils.visualization.synthesize_page(page: Dict[str, Any], draw_proba: bool = False, font_size: int = 13, font_family: str | None = None) ndarray[source]
                                        -

Draw the content of the element page (OCR response) on a blank page.

                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • page – exported Page object to represent

                                        • -
                                        • draw_proba – if True, draw words in colors to represent confidence. Blue: p=1, red: p=0

                                        • -
                                        • font_size – size of the font, default font = 13

                                        • -
                                        • font_family – family of the font

                                        • -
                                        -
                                        -
                                        Returns:
                                        -

                                        the synthesized page

                                        -
                                        -
                                        -
                                        -
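A rough usage sketch (the predictor call and the random input page are only assumptions for illustration):

>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> from doctr.models import ocr_predictor
>>> from doctr.utils.visualization import synthesize_page
>>> model = ocr_predictor(pretrained=True)
>>> # run OCR on a dummy page, then redraw the exported Page on a blank canvas
>>> result = model([(255 * np.random.rand(600, 800, 3)).astype(np.uint8)])
>>> synthesized = synthesize_page(result.pages[0].export(), draw_proba=True)
>>> plt.imshow(synthesized); plt.axis('off'); plt.show()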

                                        Task evaluation

                                        @@ -382,20 +356,6 @@

                                        Visualization -
                                        -update(gt: List[str], pred: List[str]) None[source]
                                        -

                                        Update the state of the metric with new predictions

                                        -
                                        -
                                        Parameters:
                                        -
                                          -
• gt – list of ground-truth character sequences

                                        • -
                                        • pred – list of predicted character sequences

                                        • -
                                        -
                                        -
                                        -
                                        -
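For context, a minimal usage sketch (assuming this is the TextMatch metric from doctr.utils.metrics, which exposes the update and summary methods documented here):

>>> from doctr.utils.metrics import TextMatch
>>> metric = TextMatch()
>>> # compare ground-truth strings with predicted strings
>>> metric.update(['Hello', 'world'], ['hello', 'world'])
>>> metric.summary()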
                                        summary() Dict[str, float][source]
                                        @@ -412,14 +372,14 @@

                                        Visualization
                                        -class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
                                        +class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]

                                        Implements common confusion metrics and mean IoU for localization evaluation.

                                        The aggregated metrics are computed as follows:

                                        \[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ +Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]

                                        with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and @@ -448,28 +408,9 @@

                                        Visualization
                                        Parameters:
                                        -
                                          -
                                        • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

                                        • -
                                        • use_polygons – if set to True, predictions and targets will be expected to have rotated format

                                        • -
                                        • mask_shape – if use_polygons is True, describes the spatial shape of the image used

                                        • -
                                        • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

                                        • -
                                        +

                                        iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

                                        -
                                        -
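A rough usage sketch, relying on the update and summary methods documented below (the boxes are made-up relative coordinates):

>>> import numpy as np
>>> from doctr.utils.metrics import LocalizationConfusion
>>> metric = LocalizationConfusion(iou_thresh=0.5)
>>> gts = np.array([[0.1, 0.1, 0.5, 0.5]], dtype=np.float32)                          # ground-truth boxes
>>> preds = np.array([[0.12, 0.11, 0.48, 0.52], [0.6, 0.6, 0.8, 0.9]], dtype=np.float32)  # predicted boxes
>>> metric.update(gts, preds)
>>> recall, precision, mean_iou = metric.summary()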
                                        -update(gts: ndarray, preds: ndarray) None[source]
                                        -

                                        Updates the metric

                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • gts – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

                                        • -
                                        • preds – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

                                        • -
                                        -
                                        -
                                        -
                                        -
                                        summary() Tuple[float | None, float | None, float | None][source]
                                        @@ -485,15 +426,15 @@

                                        Visualization
                                        -class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
                                        -

                                        Implements an end-to-end OCR metric.

                                        +class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source] +

                                        Implements end-to-end OCR metric.

                                        The aggregated metrics are computed as follows:

                                        \[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ +Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]

                                        with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and @@ -525,116 +466,16 @@

                                        Visualization
                                        Parameters:
                                        -
                                          -
                                        • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

                                        • -
                                        • use_polygons – if set to True, predictions and targets will be expected to have rotated format

                                        • -
                                        • mask_shape – if use_polygons is True, describes the spatial shape of the image used

                                        • -
                                        • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

                                        • -
                                        -
                                        -

                                        -
                                        -
                                        -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: List[str], pred_labels: List[str]) None[source]
                                        -

                                        Updates the metric

                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

                                        • -
                                        • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

                                        • -
                                        • gt_labels – a list of N string labels

                                        • -
                                        • pred_labels – a list of M string labels

                                        • -
                                        +

                                        iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

                                        -
                                        -
                                        summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]

                                        Computes the aggregated metrics

                                        Returns:
                                        -

                                        a tuple with the recall & precision for each string comparison and the mean IoU

                                        -
                                        -
                                        -
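A minimal usage sketch for OCRMetric, following the update(gt_boxes, pred_boxes, gt_labels, pred_labels) and summary() signatures documented above (hedged: box values and strings are illustrative).

>>> import numpy as np
>>> from doctr.utils.metrics import OCRMetric
>>> metric = OCRMetric(iou_thresh=0.5)
>>> metric.update(np.asarray([[0.05, 0.05, 0.5, 0.5]]), np.asarray([[0.06, 0.05, 0.52, 0.5]]), ['hello'], ['hello'])
>>> recall, precision, mean_iou = metric.summary()  # recall and precision come back as dicts, one entry per string comparison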
                                        - - - -
                                        -
                                        -class doctr.utils.metrics.DetectionMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
                                        -

                                        Implements an object detection metric.

                                        -

                                        The aggregated metrics are computed as follows:

                                        -
                                        -
                                        -\[\begin{split}\forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, -\forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ -Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ -Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
                                        -
                                        -

                                        with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, C}\) defined as:

                                        -
                                        -
                                        -\[\begin{split}\forall (b, c) \in \mathcal{B} \times \mathcal{C}, -h_{B,C}(b, c) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
                                        -
                                        -

                                        where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{C}\) is the set of possible class indices, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

                                        -
                                        -
                                        Example::
                                        >>> import numpy as np
                                        ->>> from doctr.utils import DetectionMetric
                                        ->>> metric = DetectionMetric(iou_thresh=0.5)
                                        ->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
                                        -np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64))
                                        ->>> metric.summary()
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

                                        • -
                                        • use_polygons – if set to True, predictions and targets will be expected to have rotated format

                                        • -
                                        • mask_shape – if use_polygons is True, describes the spatial shape of the image used

                                        • -
                                        • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

                                        • -
                                        -
                                        -
                                        -
                                        -
                                        -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: ndarray, pred_labels: ndarray) None[source]
                                        -

                                        Updates the metric

                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

                                        • -
                                        • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

                                        • -
                                        • gt_labels – an array of class indices of shape (N,)

                                        • -
                                        • pred_labels – an array of class indices of shape (M,)

                                        • -
                                        -
                                        -
                                        -
                                        - -
                                        -
                                        -summary() Tuple[float | None, float | None, float | None][source]
                                        -

                                        Computes the aggregated metrics

                                        -
                                        -
                                        Returns:
                                        -

                                        a tuple with the recall & precision for each class prediction and the mean IoU

                                        +

                                        a tuple with the recall & precision for each string comparison flexibility and the mean IoU

                                        @@ -649,15 +490,7 @@

+
diff --git a/v0.6.0/_modules/doctr/datasets/classification/tensorflow.html b/v0.6.0/_modules/doctr/datasets/classification/tensorflow.html
deleted file mode 100644
index 829b6efb9d..0000000000
--- a/v0.6.0/_modules/doctr/datasets/classification/tensorflow.html
+++ /dev/null
@@ -1,366 +0,0 @@
-doctr.datasets.classification.tensorflow - docTR documentation
                                        Source code for doctr.datasets.classification.tensorflow

                                        -# Copyright (C) 2021, Mindee.
                                        -
                                        -# This program is licensed under the Apache License version 2.
                                        -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                        -
                                        -import tensorflow as tf
                                        -
                                        -from .base import _CharacterGenerator
                                        -
                                        -__all__ = ['CharacterGenerator']
                                        -
                                        -
                                        -
                                        -[docs] -class CharacterGenerator(_CharacterGenerator): - """Implements a character image generation dataset - - Example:: - >>> from doctr.datasets import CharacterGenerator - >>> ds = CharacterGenerator(vocab='abdef') - >>> img, target = ds[0] - - Args: - vocab: vocabulary to take the character from - num_samples: number of samples that will be generated iterating over the dataset - cache_samples: whether generated images should be cached firsthand - sample_transforms: composable transformations that will be applied to each image - """ - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - @staticmethod - def collate_fn(samples): - - images, targets = zip(*samples) - images = tf.stack(images, axis=0) - - return images, tf.convert_to_tensor(targets)
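A short, hedged sketch of how collate_fn batches samples from this dataset; it follows the docstring example above, and the vocab value is illustrative.

>>> from doctr.datasets import CharacterGenerator
>>> ds = CharacterGenerator(vocab='abcdef')
>>> batch = [ds[i] for i in range(4)]
>>> images, targets = CharacterGenerator.collate_fn(batch)  # images are stacked along a new batch axis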
\ No newline at end of file
diff --git a/v0.6.0/_modules/doctr/datasets/cord.html b/v0.6.0/_modules/doctr/datasets/cord.html
index 45265b22b1..3b89955bd8 100644
--- a/v0.6.0/_modules/doctr/datasets/cord.html
+++ b/v0.6.0/_modules/doctr/datasets/cord.html
@@ -226,35 +226,20 @@

                                        Source code for doctr.datasets.cord

                                        -# Copyright (C) 2021-2022, Mindee.
                                        +# Copyright (C) 2021, Mindee.
                                         
                                        -# This program is licensed under the Apache License 2.0.
                                        -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                        +# This program is licensed under the Apache License version 2.
                                        +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                         
                                        -import json
                                         import os
                                        -from pathlib import Path
                                        -from typing import Any, Dict, List, Tuple, Union
                                        -
                                        +import json
                                         import numpy as np
                                        -from tqdm import tqdm
                                        +from pathlib import Path
                                        +from typing import List, Dict, Any, Tuple, Optional, Callable
                                         
                                         from .datasets import VisionDataset
                                        -from .utils import convert_target_to_relative, crop_bboxes_from_image
                                        +from doctr.utils.geometry import fit_rbbox
                                         
                                        -__all__ = ["CORD"]
                                        +__all__ = ['CORD']
                                         
                                         
                                         
                                        -[docs] +[docs] class CORD(VisionDataset): """CORD dataset from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" <https://openreview.net/pdf?id=SJl3z659UH>`_. - .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/cord-grid.png&src=0 - :align: center - - >>> from doctr.datasets import CORD - >>> train_set = CORD(train=True, download=True) - >>> img, target = train_set[0] + Example:: + >>> from doctr.datasets import CORD + >>> train_set = CORD(train=True, download=True) + >>> img, target = train_set[0] Args: train: whether the subset should be the training one - use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) - recognition_task: whether the dataset should be used for recognition task + sample_transforms: composable transformations that will be applied to each image + rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) **kwargs: keyword arguments from `VisionDataset`. """ + TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_train.zip', + '45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8') - TRAIN = ( - "https://doctr-static.mindee.com/models?id=v0.1.1/cord_train.zip&src=0", - "45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8", - ) - - TEST = ( - "https://doctr-static.mindee.com/models?id=v0.1.1/cord_test.zip&src=0", - "8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58", - ) + TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/cord_test.zip', + '8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58') def __init__( self, train: bool = True, - use_polygons: bool = False, - recognition_task: bool = False, + sample_transforms: Optional[Callable[[Any], Any]] = None, + rotated_bbox: bool = False, **kwargs: Any, ) -> None: url, sha256 = self.TRAIN if train else self.TEST - super().__init__( - url, - None, - sha256, - True, - pre_transforms=convert_target_to_relative if not recognition_task else None, - **kwargs, - ) + super().__init__(url, None, sha256, True, **kwargs) - # List images - tmp_root = os.path.join(self.root, "image") - self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = [] + # # List images + self.root = os.path.join(self._root, 'image') + self.data: List[Tuple[str, Dict[str, Any]]] = [] self.train = train - np_dtype = np.float32 - for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))): + self.sample_transforms = sample_transforms + for img_path in os.listdir(self.root): # File existence check - if not os.path.exists(os.path.join(tmp_root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") - + if not os.path.exists(os.path.join(self.root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") stem = Path(img_path).stem _targets = [] - with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f: + with open(os.path.join(self._root, 'json', f"{stem}.json"), 'rb') as f: label = json.load(f) for line in label["valid_line"]: for word in line["words"]: if len(word["text"]) > 0: x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"] y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"] - box: Union[List[float], np.ndarray] - if use_polygons: - # (x, y) coordinates of top left, top right, bottom right, bottom left 
corners - box = np.array( - [ - [x[0], y[0]], - [x[1], y[1]], - [x[2], y[2]], - [x[3], y[3]], - ], - dtype=np_dtype, - ) + if rotated_bbox: + box = list(fit_rbbox(np.array([ + [x[0], y[0]], + [x[1], y[1]], + [x[2], y[2]], + [x[3], y[3]], + ], dtype=np.float32))) else: - # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax + # Reduce 8 coords to 4 box = [min(x), min(y), max(x), max(y)] - _targets.append((word["text"], box)) + _targets.append((word['text'], box)) text_targets, box_targets = zip(*_targets) - if recognition_task: - crops = crop_bboxes_from_image( - img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0) - ) - for crop, label in zip(crops, list(text_targets)): - self.data.append((crop, label)) - else: - self.data.append( - (img_path, dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets))) - ) - - self.root = tmp_root + self.data.append(( + img_path, + dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=text_targets) + )) def extra_repr(self) -> str: return f"train={self.train}"
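For clarity, the straight-box branch above reduces the 8 quad coordinates to an axis-aligned box with min/max; a hedged sketch with made-up values:

>>> x, y = (10, 90, 88, 12), (20, 22, 60, 58)  # x1..x4, y1..y4 from the "quad" entry
>>> box = [min(x), min(y), max(x), max(y)]
>>> box
[10, 20, 90, 60]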
                                        @@ -443,7 +398,7 @@

                                        Source code for doctr.datasets.cord

                                               
                                             
                                           
                                        -
                                        + diff --git a/v0.6.0/_modules/doctr/datasets/datasets/tensorflow.html b/v0.6.0/_modules/doctr/datasets/datasets/tensorflow.html index 8a191ecfc7..fddca20034 100644 --- a/v0.6.0/_modules/doctr/datasets/datasets/tensorflow.html +++ b/v0.6.0/_modules/doctr/datasets/datasets/tensorflow.html @@ -236,7 +236,7 @@

                                        Package Reference

                                        • doctr.datasets
                                        • -
                                        • doctr.io
                                        • +
                                        • doctr.documents
                                        • doctr.models
                                        • doctr.transforms
                                        • doctr.utils
                                        • @@ -284,7 +284,6 @@

                                          Source code for doctr.datasets.datasets.tensorflow

                                          from typing import List, Any, Tuple import tensorflow as tf -from doctr.io import read_img_as_tensor from .base import _AbstractDataset, _VisionDataset @@ -293,14 +292,11 @@

                                          Source code for doctr.datasets.datasets.tensorflow

                                          class AbstractDataset(_AbstractDataset): - @staticmethod - def _get_img_shape(img: Any) -> Tuple[int, int]: - return img.shape[:2] - def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]: img_name, target = self.data[index] # Read image - img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float16 if self.fp16 else tf.float32) + img = tf.io.read_file(os.path.join(self.root, img_name)) + img = tf.image.decode_jpeg(img, channels=3) return img, target @@ -350,7 +346,7 @@
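A hedged sketch of the read/decode path used in _read_sample above (the file path is made up):

>>> import tensorflow as tf
>>> raw = tf.io.read_file("sample.jpg")
>>> img = tf.image.decode_jpeg(raw, channels=3)  # uint8 tensor of shape (H, W, 3)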

                                          Source code for doctr.datasets.datasets.tensorflow

                                          +
                                          diff --git a/v0.6.0/_modules/doctr/datasets/detection.html b/v0.6.0/_modules/doctr/datasets/detection.html index a6da836fa7..43e148dc88 100644 --- a/v0.6.0/_modules/doctr/datasets/detection.html +++ b/v0.6.0/_modules/doctr/datasets/detection.html @@ -235,12 +235,15 @@

                                          Package Reference

                                          Source code for doctr.datasets.detection

                                          -# Copyright (C) 2021-2022, Mindee.
                                          +# Copyright (C) 2021-2024, Mindee.
                                           
                                           # This program is licensed under the Apache License 2.0.
                                           # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                           
                                           import json
                                           import os
                                          -from typing import Any, List, Tuple
                                          +from typing import Any, Dict, List, Tuple, Type, Union
                                           
                                           import numpy as np
                                           
                                          -from doctr.io.image import get_img_shape
                                          -from doctr.utils.geometry import convert_to_relative_coords
                                          +from doctr.file_utils import CLASS_NAME
                                           
                                           from .datasets import AbstractDataset
                                          +from .utils import pre_transform_multiclass
                                           
                                           __all__ = ["DetectionDataset"]
                                           
                                          @@ -320,6 +323,7 @@ 

                                          Source code for doctr.datasets.detection

                                               >>> img, target = train_set[0]
                                           
                                               Args:
                                          +    ----
                                                   img_folder: folder with all the images of the dataset
                                                   label_path: path to the annotations of each image
                                                   use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                                          @@ -335,27 +339,60 @@ 

                                          Source code for doctr.datasets.detection

                                               ) -> None:
                                                   super().__init__(
                                                       img_folder,
                                          -            pre_transforms=lambda img, boxes: (img, convert_to_relative_coords(boxes, get_img_shape(img))),
                                          +            pre_transforms=pre_transform_multiclass,
                                                       **kwargs,
                                                   )
                                           
                                                   # File existence check
                                          +        self._class_names: List = []
                                                   if not os.path.exists(label_path):
                                                       raise FileNotFoundError(f"unable to locate {label_path}")
                                                   with open(label_path, "rb") as f:
                                                       labels = json.load(f)
                                           
                                          -        self.data: List[Tuple[str, np.ndarray]] = []
                                          +        self.data: List[Tuple[str, Tuple[np.ndarray, List[str]]]] = []
                                                   np_dtype = np.float32
                                                   for img_name, label in labels.items():
                                                       # File existence check
                                                       if not os.path.exists(os.path.join(self.root, img_name)):
                                                           raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
                                           
                                          -            polygons: np.ndarray = np.asarray(label["polygons"], dtype=np_dtype)
                                          -            geoms = polygons if use_polygons else np.concatenate((polygons.min(axis=1), polygons.max(axis=1)), axis=1)
                                          +            geoms, polygons_classes = self.format_polygons(label["polygons"], use_polygons, np_dtype)
                                           
                                          -            self.data.append((img_name, np.asarray(geoms, dtype=np_dtype)))
                                          + self.data.append((img_name, (np.asarray(geoms, dtype=np_dtype), polygons_classes))) + + def format_polygons( + self, polygons: Union[List, Dict], use_polygons: bool, np_dtype: Type + ) -> Tuple[np.ndarray, List[str]]: + """Format polygons into an array + + Args: + ---- + polygons: the bounding boxes + use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) + np_dtype: dtype of array + + Returns: + ------- + geoms: bounding boxes as np array + polygons_classes: list of classes for each bounding box + """ + if isinstance(polygons, list): + self._class_names += [CLASS_NAME] + polygons_classes = [CLASS_NAME for _ in polygons] + _polygons: np.ndarray = np.asarray(polygons, dtype=np_dtype) + elif isinstance(polygons, dict): + self._class_names += list(polygons.keys()) + polygons_classes = [k for k, v in polygons.items() for _ in v] + _polygons = np.concatenate([np.asarray(poly, dtype=np_dtype) for poly in polygons.values() if poly], axis=0) + else: + raise TypeError(f"polygons should be a dictionary or list, it was {type(polygons)}") + geoms = _polygons if use_polygons else np.concatenate((_polygons.min(axis=1), _polygons.max(axis=1)), axis=1) + return geoms, polygons_classes + + @property + def class_names(self): + return sorted(set(self._class_names))
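To make the two label layouts accepted by format_polygons concrete, here is a hedged sketch of what a single image entry of the labels JSON can look like (class names and coordinates are made up; only the structure matters):

>>> # single-class layout: "polygons" is a plain list, every box is assigned the default CLASS_NAME
>>> label = {"polygons": [[[15, 20], [200, 20], [200, 60], [15, 60]]]}
>>> # multi-class layout: "polygons" maps each class name to its own list of 4-point boxes
>>> label = {"polygons": {"words": [[[15, 20], [200, 20], [200, 60], [15, 60]]], "stamps": [[[210, 20], [300, 20], [300, 80], [210, 80]]]}}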
                                          @@ -389,7 +426,7 @@

                                          Source code for doctr.datasets.detection

                                                 
                                               
                                             
                                          -
                                          + diff --git a/v0.6.0/_modules/doctr/datasets/doc_artefacts.html b/v0.6.0/_modules/doctr/datasets/doc_artefacts.html index c695c96704..172122a216 100644 --- a/v0.6.0/_modules/doctr/datasets/doc_artefacts.html +++ b/v0.6.0/_modules/doctr/datasets/doc_artefacts.html @@ -235,12 +235,15 @@

                                          Package Reference

                                            +
                                          • doctr.contrib
                                          • doctr.datasets
                                          • doctr.io
                                          • doctr.models
                                          • @@ -290,7 +293,7 @@

                                            Source code for doctr.datasets.doc_artefacts

                                            -# Copyright (C) 2021-2022, Mindee.
                                            +# Copyright (C) 2021-2024, Mindee.
                                             
                                             # This program is licensed under the Apache License 2.0.
                                             # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                            @@ -320,6 +323,7 @@ 

                                            Source code for doctr.datasets.doc_artefacts

                                                >>> img, target = train_set[0]
                                             
                                                 Args:
                                            +    ----
                                                     train: whether the subset should be the training one
                                                     use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                                                     **kwargs: keyword arguments from `VisionDataset`.
                                            @@ -335,7 +339,6 @@ 

                                            Source code for doctr.datasets.doc_artefacts

                                            use_polygons: bool = False,
                                                     **kwargs: Any,
                                                 ) -> None:
                                            -
                                                     super().__init__(self.URL, None, self.SHA256, True, **kwargs)
                                                     self.train = train
                                             
                                            @@ -407,7 +410,7 @@ 

                                            Source code for doctr.datasets.doc_artefacts

                                               
                                            -
                                            +
                                            diff --git a/v0.6.0/_modules/doctr/datasets/funsd.html b/v0.6.0/_modules/doctr/datasets/funsd.html index e347b71ba8..2f5494dc2a 100644 --- a/v0.6.0/_modules/doctr/datasets/funsd.html +++ b/v0.6.0/_modules/doctr/datasets/funsd.html @@ -226,35 +226,20 @@

                                            Source code for doctr.datasets.funsd

                                            -# Copyright (C) 2021-2022, Mindee.
                                            +# Copyright (C) 2021, Mindee.
                                             
                                            -# This program is licensed under the Apache License 2.0.
                                            -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                            +# This program is licensed under the Apache License version 2.
                                            +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                             
                                            -import json
                                             import os
                                            -from pathlib import Path
                                            -from typing import Any, Dict, List, Tuple, Union
                                            -
                                            +import json
                                             import numpy as np
                                            -from tqdm import tqdm
                                            +from pathlib import Path
                                            +from typing import List, Dict, Any, Tuple, Optional, Callable
                                             
                                             from .datasets import VisionDataset
                                            -from .utils import convert_target_to_relative, crop_bboxes_from_image
                                             
                                            -__all__ = ["FUNSD"]
                                            +__all__ = ['FUNSD']
                                             
                                             
                                             
                                            -[docs] +[docs] class FUNSD(VisionDataset): """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" <https://arxiv.org/pdf/1905.13538.pdf>`_. - .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0 - :align: center - - >>> from doctr.datasets import FUNSD - >>> train_set = FUNSD(train=True, download=True) - >>> img, target = train_set[0] + Example:: + >>> from doctr.datasets import FUNSD + >>> train_set = FUNSD(train=True, download=True) + >>> img, target = train_set[0] Args: train: whether the subset should be the training one - use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones) - recognition_task: whether the dataset should be used for recognition task + sample_transforms: composable transformations that will be applied to each image + rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones) **kwargs: keyword arguments from `VisionDataset`. """ - URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip" - SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f" - FILE_NAME = "funsd.zip" + URL = 'https://guillaumejaume.github.io/FUNSD/dataset.zip' + SHA256 = 'c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f' + FILE_NAME = 'funsd.zip' def __init__( self, train: bool = True, - use_polygons: bool = False, - recognition_task: bool = False, + sample_transforms: Optional[Callable[[Any], Any]] = None, + rotated_bbox: bool = False, **kwargs: Any, ) -> None: - super().__init__( - self.URL, - self.FILE_NAME, - self.SHA256, - True, - pre_transforms=convert_target_to_relative if not recognition_task else None, - **kwargs, - ) + super().__init__(self.URL, self.FILE_NAME, self.SHA256, True, **kwargs) self.train = train - np_dtype = np.float32 + self.sample_transforms = sample_transforms # Use the subset - subfolder = os.path.join("dataset", "training_data" if train else "testing_data") + subfolder = os.path.join('dataset', 'training_data' if train else 'testing_data') # # List images - tmp_root = os.path.join(self.root, subfolder, "images") - self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = [] - for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))): + self.root = os.path.join(self._root, subfolder, 'images') + self.data: List[Tuple[str, Dict[str, Any]]] = [] + for img_path in os.listdir(self.root): # File existence check - if not os.path.exists(os.path.join(tmp_root, img_path)): - raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}") - + if not os.path.exists(os.path.join(self.root, img_path)): + raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}") stem = Path(img_path).stem - with open(os.path.join(self.root, subfolder, "annotations", f"{stem}.json"), "rb") as f: + with open(os.path.join(self._root, subfolder, 'annotations', f"{stem}.json"), 'rb') as f: data = json.load(f) - _targets = [ - (word["text"], word["box"]) - for block in data["form"] - for word in block["words"] - if len(word["text"]) > 0 - ] + _targets = [(word['text'], word['box']) for block in data['form'] + for word in block['words'] if len(word['text']) > 0] text_targets, box_targets = zip(*_targets) - if use_polygons: - # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners + if rotated_bbox: + # box_targets: xmin, 
ymin, xmax, ymax -> x, y, w, h, alpha = 0 box_targets = [ [ - [box[0], box[1]], - [box[2], box[1]], - [box[2], box[3]], - [box[0], box[3]], - ] - for box in box_targets + (box[0] + box[2]) / 2, (box[1] + box[3]) / 2, box[2] - box[0], box[3] - box[1], 0 + ] for box in box_targets ] - if recognition_task: - crops = crop_bboxes_from_image( - img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=np_dtype) - ) - for crop, label in zip(crops, list(text_targets)): - # filter labels with unknown characters - if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]): - self.data.append((crop, label)) - else: - self.data.append( - ( - img_path, - dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)), - ) - ) - - self.root = tmp_root + self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=int), labels=text_targets))) def extra_repr(self) -> str: return f"train={self.train}"
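The rotated_bbox branch above converts a straight box to (x_center, y_center, width, height, alpha=0); a hedged sketch with made-up values:

>>> box = [10, 20, 90, 60]  # xmin, ymin, xmax, ymax
>>> [(box[0] + box[2]) / 2, (box[1] + box[3]) / 2, box[2] - box[0], box[3] - box[1], 0]
[50.0, 40.0, 80, 40, 0]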
                                            @@ -439,7 +387,7 @@

                                            Source code for doctr.datasets.funsd

                                                   
                                                 
                                               
                                            -
                                            +
                                            diff --git a/v0.6.0/_modules/doctr/datasets/generator/tensorflow.html b/v0.6.0/_modules/doctr/datasets/generator/tensorflow.html index 0c2ee4b290..1d6494d28c 100644 --- a/v0.6.0/_modules/doctr/datasets/generator/tensorflow.html +++ b/v0.6.0/_modules/doctr/datasets/generator/tensorflow.html @@ -235,12 +235,15 @@

                                            Package Reference

                                              +
                                            • doctr.contrib
                                            • doctr.datasets
                                            • doctr.io
                                            • doctr.models
                                            • @@ -290,7 +293,7 @@

                                              Source code for doctr.datasets.generator.tensorflow

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -312,6 +315,7 @@ 

                                              Source code for doctr.datasets.generator.tensorflow

                                              >>> img, target = ds[0] Args: + ---- vocab: vocabulary to take the character from num_samples: number of samples that will be generated iterating over the dataset cache_samples: whether generated images should be cached firsthand @@ -325,7 +329,6 @@

                                              Source code for doctr.datasets.generator.tensorflow

                                              @staticmethod def collate_fn(samples): - images, targets = zip(*samples) images = tf.stack(images, axis=0) @@ -343,6 +346,7 @@

                                              Source code for doctr.datasets.generator.tensorflow

                                              >>> img, target = ds[0] Args: + ---- vocab: vocabulary to take the character from min_chars: minimum number of characters in a word max_chars: maximum number of characters in a word @@ -387,7 +391,7 @@

                                              Source code for doctr.datasets.generator.tensorflow

                                              +
                                              diff --git a/v0.6.0/_modules/doctr/datasets/ic03.html b/v0.6.0/_modules/doctr/datasets/ic03.html index 5c4159262c..6680bbc6d7 100644 --- a/v0.6.0/_modules/doctr/datasets/ic03.html +++ b/v0.6.0/_modules/doctr/datasets/ic03.html @@ -235,12 +235,15 @@

                                              Package Reference

                                              Source code for doctr.datasets.ic03

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -322,9 +325,11 @@ 

                                              Source code for doctr.datasets.ic03

                                                   >>> img, target = train_set[0]
                                               
                                                   Args:
                                              +    ----
                                                       train: whether the subset should be the training one
                                                       use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                                                       recognition_task: whether the dataset should be used for recognition task
                                              +        detection_task: whether the dataset should be used for detection task
                                                       **kwargs: keyword arguments from `VisionDataset`.
                                                   """
                                               
                                              @@ -344,9 +349,9 @@ 

                                              Source code for doctr.datasets.ic03

                                                       train: bool = True,
                                                       use_polygons: bool = False,
                                                       recognition_task: bool = False,
                                              +        detection_task: bool = False,
                                                       **kwargs: Any,
                                                   ) -> None:
                                              -
                                                       url, sha256, file_name = self.TRAIN if train else self.TEST
                                                       super().__init__(
                                                           url,
                                              @@ -356,8 +361,14 @@ 

                                              Source code for doctr.datasets.ic03

                                                           pre_transforms=convert_target_to_relative if not recognition_task else None,
                                                           **kwargs,
                                                       )
                                              +        if recognition_task and detection_task:
                                              +            raise ValueError(
                                              +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                                              +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                                              +            )
                                              +
                                                       self.train = train
                                              -        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
                                              +        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                                       np_dtype = np.float32
                                               
                                                       # Load xml data
                                              @@ -368,7 +379,7 @@ 

                                              Source code for doctr.datasets.ic03

                                                       xml_root = xml_tree.getroot()
                                               
                                                       for image in tqdm(iterable=xml_root, desc="Unpacking IC03", total=len(xml_root)):
                                              -            name, resolution, rectangles = image
                                              +            name, _resolution, rectangles = image
                                               
                                                           # File existence check
                                                           if not os.path.exists(os.path.join(tmp_root, name.text)):
                                              @@ -411,6 +422,8 @@ 

                                              Source code for doctr.datasets.ic03

                                                                   for crop, label in zip(crops, labels):
                                                                       if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                                                                           self.data.append((crop, label))
                                              +                elif detection_task:
                                              +                    self.data.append((name.text, boxes))
                                                               else:
                                                                   self.data.append((name.text, dict(boxes=boxes, labels=labels)))
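A hedged usage sketch of the new detection_task switch added above (download details omitted; with this flag the target is a plain box array rather than a dict of boxes and labels):

>>> from doctr.datasets import IC03
>>> ds = IC03(train=True, detection_task=True)
>>> img, boxes = ds[0]  # boxes is a numpy array of shape (N, 4), or (N, 4, 2) with use_polygons=True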
                                               
                                              @@ -451,7 +464,7 @@ 

                                              Source code for doctr.datasets.ic03

                                                     
                                                   
                                                 
                                              -
                                              +
                                              diff --git a/v0.6.0/_modules/doctr/datasets/ic13.html b/v0.6.0/_modules/doctr/datasets/ic13.html index ad7decd47b..b7c4d9612e 100644 --- a/v0.6.0/_modules/doctr/datasets/ic13.html +++ b/v0.6.0/_modules/doctr/datasets/ic13.html @@ -235,12 +235,15 @@

                                              Package Reference

                                              Source code for doctr.datasets.ic13

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -327,10 +330,12 @@ 

                                              Source code for doctr.datasets.ic13

                                                   >>> img, target = test_set[0]
                                               
                                                   Args:
                                              +    ----
                                                       img_folder: folder with all the images of the dataset
                                                       label_folder: folder with all annotation files for the images
                                                       use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                                                       recognition_task: whether the dataset should be used for recognition task
                                              +        detection_task: whether the dataset should be used for detection task
                                                       **kwargs: keyword arguments from `AbstractDataset`.
                                                   """
                                               
                                              @@ -340,11 +345,17 @@ 

                                              Source code for doctr.datasets.ic13

                                                       label_folder: str,
                                                       use_polygons: bool = False,
                                                       recognition_task: bool = False,
                                              +        detection_task: bool = False,
                                                       **kwargs: Any,
                                                   ) -> None:
                                                       super().__init__(
                                                           img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
                                                       )
                                              +        if recognition_task and detection_task:
                                              +            raise ValueError(
                                              +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                                              +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                                              +            )
                                               
                                                       # File existence check
                                                       if not os.path.exists(label_folder) or not os.path.exists(img_folder):
                                              @@ -352,13 +363,12 @@ 

                                              Source code for doctr.datasets.ic13

                                                               f"unable to locate {label_folder if not os.path.exists(label_folder) else img_folder}"
                                                           )
                                               
                                              -        self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
                                              +        self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                                       np_dtype = np.float32
                                               
                                                       img_names = os.listdir(img_folder)
                                               
                                                       for img_name in tqdm(iterable=img_names, desc="Unpacking IC13", total=len(img_names)):
                                              -
                                                           img_path = Path(img_folder, img_name)
                                                           label_path = Path(label_folder, "gt_" + Path(img_name).stem + ".txt")
                                               
                                              @@ -389,6 +399,8 @@ 

                                              Source code for doctr.datasets.ic13

                                                               crops = crop_bboxes_from_image(img_path=img_path, geoms=box_targets)
                                                               for crop, label in zip(crops, labels):
                                                                   self.data.append((crop, label))
                                              +            elif detection_task:
                                              +                self.data.append((img_path, box_targets))
                                                           else:
                                                               self.data.append((img_path, dict(boxes=box_targets, labels=labels)))
                                              @@ -424,7 +436,7 @@

                                              Source code for doctr.datasets.ic13

                                                     
                                                   
                                                 
                                              -
                                              +
                                              diff --git a/v0.6.0/_modules/doctr/datasets/iiit5k.html b/v0.6.0/_modules/doctr/datasets/iiit5k.html index a57ca892ab..4759d20b24 100644 --- a/v0.6.0/_modules/doctr/datasets/iiit5k.html +++ b/v0.6.0/_modules/doctr/datasets/iiit5k.html @@ -235,12 +235,15 @@

                                              Package Reference

                                              Source code for doctr.datasets.iiit5k

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -324,9 +327,11 @@ 

                                              Source code for doctr.datasets.iiit5k

                                                   >>> img, target = train_set[0]
                                               
                                                   Args:
                                              +    ----
                                                       train: whether the subset should be the training one
                                                       use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                                                       recognition_task: whether the dataset should be used for recognition task
                                              +        detection_task: whether the dataset should be used for detection task
                                                       **kwargs: keyword arguments from `VisionDataset`.
                                                   """
                                               
                                              @@ -338,9 +343,9 @@ 

                                              Source code for doctr.datasets.iiit5k

                                                       train: bool = True,
                                                       use_polygons: bool = False,
                                                       recognition_task: bool = False,
                                              +        detection_task: bool = False,
                                                       **kwargs: Any,
                                                   ) -> None:
                                              -
                                                       super().__init__(
                                                           self.URL,
                                                           None,
                                              @@ -349,6 +354,12 @@ 

                                              Source code for doctr.datasets.iiit5k

                                                           pre_transforms=convert_target_to_relative if not recognition_task else None,
                                                           **kwargs,
                                                       )
                                              +        if recognition_task and detection_task:
                                              +            raise ValueError(
                                              +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                                              +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                                              +            )
                                              +
                                                       self.train = train
                                               
                                                       # Load mat data
                                              @@ -356,7 +367,7 @@ 

                                              Source code for doctr.datasets.iiit5k

                                                       mat_file = "trainCharBound" if self.train else "testCharBound"
                                                       mat_data = sio.loadmat(os.path.join(tmp_root, f"{mat_file}.mat"))[mat_file][0]
                                               
                                              -        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
                                              +        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                                       np_dtype = np.float32
                                               
                                                       for img_path, label, box_targets in tqdm(iterable=mat_data, desc="Unpacking IIIT5K", total=len(mat_data)):
                                              @@ -367,28 +378,31 @@ 

                                              Source code for doctr.datasets.iiit5k

                                                           if not os.path.exists(os.path.join(tmp_root, _raw_path)):
                                                               raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, _raw_path)}")
                                               
                                              +            if use_polygons:
                                              +                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                                              +                box_targets = [
                                              +                    [
                                              +                        [box[0], box[1]],
                                              +                        [box[0] + box[2], box[1]],
                                              +                        [box[0] + box[2], box[1] + box[3]],
                                              +                        [box[0], box[1] + box[3]],
                                              +                    ]
                                              +                    for box in box_targets
                                              +                ]
                                              +            else:
                                              +                # xmin, ymin, xmax, ymax
                                              +                box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]
                                              +
                                                           if recognition_task:
                                                               self.data.append((_raw_path, _raw_label))
                                              +            elif detection_task:
                                              +                self.data.append((_raw_path, np.asarray(box_targets, dtype=np_dtype)))
                                                           else:
                                              -                if use_polygons:
                                              -                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                                              -                    box_targets = [
                                              -                        [
                                              -                            [box[0], box[1]],
                                              -                            [box[0] + box[2], box[1]],
                                              -                            [box[0] + box[2], box[1] + box[3]],
                                              -                            [box[0], box[1] + box[3]],
                                              -                        ]
                                              -                        for box in box_targets
                                              -                    ]
                                              -                else:
                                              -                    # xmin, ymin, xmax, ymax
                                              -                    box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]
                                              -
                                                               # label are casted to list where each char corresponds to the character's bounding box
                                              -                self.data.append(
                                              -                    (_raw_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(_raw_label)))
                                              -                )
                                              +                self.data.append((
                                              +                    _raw_path,
                                              +                    dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(_raw_label)),
                                              +                ))
                                               
                                                       self.root = tmp_root
                                               
                                              @@ -427,7 +441,7 @@ 

                                              Source code for doctr.datasets.iiit5k

                                                     
                                                   
                                                 
                                              -
                                              +
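
In the IIIT5K hunk above, the (x, y, w, h) character boxes are now converted before the task branch, so the detection branch reuses the same geometries. A minimal standalone sketch of that conversion (the helper name `convert_xywh` is ours, for illustration only):

    import numpy as np

    def convert_xywh(boxes, use_polygons=False):
        # boxes: iterable of (x, y, w, h) tuples
        if use_polygons:
            # (x, y) coordinates of top left, top right, bottom right, bottom left corners
            return [[[x, y], [x + w, y], [x + w, y + h], [x, y + h]] for x, y, w, h in boxes]
        # xmin, ymin, xmax, ymax
        return [[x, y, x + w, y + h] for x, y, w, h in boxes]

    # convert_xywh([(10, 20, 30, 5)]) -> [[10, 20, 40, 25]]
    # np.asarray(convert_xywh([(10, 20, 30, 5)], use_polygons=True), dtype=np.float32).shape -> (1, 4, 2)
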
                                              diff --git a/v0.6.0/_modules/doctr/datasets/imgur5k.html b/v0.6.0/_modules/doctr/datasets/imgur5k.html index 1663d73a38..10d5c082d2 100644 --- a/v0.6.0/_modules/doctr/datasets/imgur5k.html +++ b/v0.6.0/_modules/doctr/datasets/imgur5k.html @@ -235,12 +235,15 @@

                                              Package Reference

                                              Source code for doctr.datasets.imgur5k

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -334,11 +337,13 @@ 

                                              Source code for doctr.datasets.imgur5k

                                                   >>> img, target = test_set[0]
                                               
                                                   Args:
                                              +    ----
                                                       img_folder: folder with all the images of the dataset
                                                       label_path: path to the annotations file of the dataset
                                                       train: whether the subset should be the training one
                                                       use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                                                       recognition_task: whether the dataset should be used for recognition task
                                              +        detection_task: whether the dataset should be used for detection task
                                                       **kwargs: keyword arguments from `AbstractDataset`.
                                                   """
                                               
                                              @@ -349,17 +354,23 @@ 

                                              Source code for doctr.datasets.imgur5k

                                                       train: bool = True,
                                                       use_polygons: bool = False,
                                                       recognition_task: bool = False,
                                              +        detection_task: bool = False,
                                                       **kwargs: Any,
                                                   ) -> None:
                                                       super().__init__(
                                                           img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
                                                       )
                                              +        if recognition_task and detection_task:
                                              +            raise ValueError(
                                              +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                                              +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                                              +            )
                                               
                                                       # File existence check
                                                       if not os.path.exists(label_path) or not os.path.exists(img_folder):
                                                           raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
                                               
                                              -        self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
                                              +        self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                                       self.train = train
                                                       np_dtype = np.float32
                                               
                                              @@ -425,6 +436,8 @@ 

                                              Source code for doctr.datasets.imgur5k

                                                                               tmp_img = Image.fromarray(crop)
                                                                               tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
                                                                               reco_images_counter += 1
                                              +                elif detection_task:
                                              +                    self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
                                                               else:
                                                                   self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=labels)))
                                               
                                              @@ -471,7 +484,7 @@ 

                                              Source code for doctr.datasets.imgur5k

                                                     
                                                   
                                                 
                                              -
                                              +
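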
                                              diff --git a/v0.6.0/_modules/doctr/datasets/loader.html b/v0.6.0/_modules/doctr/datasets/loader.html index 0e7bdaac13..ba5bc217e0 100644 --- a/v0.6.0/_modules/doctr/datasets/loader.html +++ b/v0.6.0/_modules/doctr/datasets/loader.html @@ -226,35 +226,20 @@

                                              Source code for doctr.datasets.loader

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021, Mindee.
                                               
                                              -# This program is licensed under the Apache License 2.0.
                                              -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              +# This program is licensed under the Apache License version 2.
                                              +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                               
                                               import math
                                              -from typing import Callable, Optional
                                              -
                                              -import numpy as np
                                               import tensorflow as tf
                                              +import numpy as np
                                              +from typing import Optional
                                               
                                              -from doctr.utils.multithreading import multithread_exec
                                              +from .multithreading import multithread_exec
                                               
                                               __all__ = ["DataLoader"]
                                               
                                              @@ -324,23 +308,23 @@ 

                                              Source code for doctr.datasets.loader

                                               
                                               
                                               
-[docs]
+[docs]
 class DataLoader:
     """Implements a dataset wrapper for fast data loading

-    >>> from doctr.datasets import CORD, DataLoader
-    >>> train_set = CORD(train=True, download=True)
-    >>> train_loader = DataLoader(train_set, batch_size=32)
-    >>> train_iter = iter(train_loader)
-    >>> images, targets = next(train_iter)
+    Example::
+        >>> from doctr.datasets import FUNSD, DataLoader
+        >>> train_set = CORD(train=True, download=True)
+        >>> train_loader = DataLoader(train_set, batch_size=32)
+        >>> train_iter = iter(train_loader)
+        >>> images, targets = next(train_iter)

     Args:
         dataset: the dataset
         shuffle: whether the samples should be shuffled before passing it to the iterator
         batch_size: number of elements in each batch
         drop_last: if `True`, drops the last batch if it isn't full
-        num_workers: number of workers to use for data loading
-        collate_fn: function to merge samples into a batch
+        workers: number of workers to use for data loading
     """

     def __init__(
@@ -349,24 +333,17 @@

                                              Source code for doctr.datasets.loader

                                                       shuffle: bool = True,
                                                       batch_size: int = 1,
                                                       drop_last: bool = False,
                                              -        num_workers: Optional[int] = None,
                                              -        collate_fn: Optional[Callable] = None,
                                              +        workers: Optional[int] = None,
                                                   ) -> None:
                                                       self.dataset = dataset
                                                       self.shuffle = shuffle
                                                       self.batch_size = batch_size
                                                       nb = len(self.dataset) / batch_size
                                                       self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
                                              -        if collate_fn is None:
                                              -            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
                                              -        else:
                                              -            self.collate_fn = collate_fn
                                              -        self.num_workers = num_workers
                                              +        self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, 'collate_fn') else default_collate
                                              +        self.workers = workers
                                                       self.reset()
                                               
                                              -    def __len__(self) -> int:
                                              -        return self.num_batches
                                              -
                                                   def reset(self) -> None:
                                                       # Updates indices after each epoch
                                                       self._num_yielded = 0
                                              @@ -382,9 +359,9 @@ 

                                              Source code for doctr.datasets.loader

                                                       if self._num_yielded < self.num_batches:
                                                           # Get next indices
                                                           idx = self._num_yielded * self.batch_size
                                              -            indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]
                                              +            indices = self.indices[idx: min(len(self.dataset), idx + self.batch_size)]
                                               
                                              -            samples = list(multithread_exec(self.dataset.__getitem__, indices, threads=self.num_workers))
                                              +            samples = multithread_exec(self.dataset.__getitem__, indices, threads=self.workers)
                                               
                                                           batch_data = self.collate_fn(samples)
                                               
                                              @@ -425,7 +402,7 @@ 

                                              Source code for doctr.datasets.loader

                                                     
                                                   
                                                 
                                              -
                                              +
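
The loader hunks above revert to the older DataLoader signature (`workers` instead of `num_workers`/`collate_fn`, no `__len__`). The batching behaviour itself is unchanged; a rough sketch of an epoch based on the docstring example above (downloads CORD on first use):

    >>> import math
    >>> from doctr.datasets import CORD, DataLoader
    >>> train_set = CORD(train=True, download=True)
    >>> train_loader = DataLoader(train_set, shuffle=True, batch_size=32, drop_last=False)
    >>> images, targets = next(iter(train_loader))
    >>> # batches per epoch: ceil(len(dataset) / batch_size) with drop_last=False, floor otherwise
    >>> train_loader.num_batches == math.ceil(len(train_set) / 32)
    True
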
                                              diff --git a/v0.6.0/_modules/doctr/datasets/mjsynth.html b/v0.6.0/_modules/doctr/datasets/mjsynth.html index 35e5475785..91ee335673 100644 --- a/v0.6.0/_modules/doctr/datasets/mjsynth.html +++ b/v0.6.0/_modules/doctr/datasets/mjsynth.html @@ -235,12 +235,15 @@

                                              Package Reference

                                              Source code for doctr.datasets.mjsynth

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -324,6 +327,7 @@ 

                                              Source code for doctr.datasets.mjsynth

                                                   >>> img, target = test_set[0]
                                               
                                                   Args:
                                              +    ----
                                                       img_folder: folder with all the images of the dataset
                                                       label_path: path to the file with the labels
                                                       train: whether the subset should be the training one
                                              @@ -430,7 +434,7 @@ 

                                              Source code for doctr.datasets.mjsynth

                                                     
                                                   
                                                 
                                              -
                                              +
                                              diff --git a/v0.6.0/_modules/doctr/datasets/ocr.html b/v0.6.0/_modules/doctr/datasets/ocr.html index 6bc825392e..2c4fb1b838 100644 --- a/v0.6.0/_modules/doctr/datasets/ocr.html +++ b/v0.6.0/_modules/doctr/datasets/ocr.html @@ -226,35 +226,20 @@

                                              Source code for doctr.datasets.ocr

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021, Mindee.
                                               
                                              -# This program is licensed under the Apache License 2.0.
                                              -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              +# This program is licensed under the Apache License version 2.
                                              +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                               
                                              -import json
                                               import os
                                              -from pathlib import Path
                                              -from typing import Any, Dict, List, Tuple
                                              -
                                              +import json
                                               import numpy as np
                                              +from pathlib import Path
                                              +from typing import List, Dict, Any, Tuple, Optional, Callable
                                               
                                               from .datasets import AbstractDataset
                                              +from doctr.utils.geometry import fit_rbbox
                                               
                                              -__all__ = ["OCRDataset"]
                                              +
                                              +__all__ = ['OCRDataset']
                                               
                                               
                                               
-[docs]
+[docs]
 class OCRDataset(AbstractDataset):
     """Implements an OCR dataset

-    >>> from doctr.datasets import OCRDataset
-    >>> train_set = OCRDataset(img_folder="/path/to/images",
-    >>>                        label_file="/path/to/labels.json")
-    >>> img, target = train_set[0]
-
     Args:
         img_folder: local path to image folder (all jpg at the root)
         label_file: local path to the label file
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        **kwargs: keyword arguments from `AbstractDataset`.
+        sample_transforms: composable transformations that will be applied to each image
+        rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones)
+        **kwargs: keyword arguments from `VisionDataset`.
     """

     def __init__(
         self,
         img_folder: str,
         label_file: str,
-        use_polygons: bool = False,
+        sample_transforms: Optional[Callable[[Any], Any]] = None,
+        rotated_bbox: bool = False,
         **kwargs: Any,
     ) -> None:
-        super().__init__(img_folder, **kwargs)
+
+        self.sample_transforms = sample_transforms
+        self.root = img_folder

         # List images
         self.data: List[Tuple[str, Dict[str, Any]]] = []
-        np_dtype = np.float32
-        with open(label_file, "rb") as f:
+        with open(label_file, 'rb') as f:
             data = json.load(f)

-        for img_name, annotations in data.items():
+        for file_dic in data:
             # Get image path
-            img_name = Path(img_name)
+            img_name = Path(os.path.basename(file_dic["raw-archive-filepath"])).stem + '.jpg'
             # File existence check
             if not os.path.exists(os.path.join(self.root, img_name)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")

             # handle empty images
-            if len(annotations["typed_words"]) == 0:
-                self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[])))
+            if (len(file_dic["coordinates"]) == 0 or
+               (len(file_dic["coordinates"]) == 1 and file_dic["coordinates"][0] == "N/A")):
+                self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np.float32), labels=[])))
                 continue

-            # Unpack the straight boxes (xmin, ymin, xmax, ymax)
-            geoms = [list(map(float, obj["geometry"][:4])) for obj in annotations["typed_words"]]
-            if use_polygons:
-                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                geoms = [
-                    [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]]  # type: ignore[list-item]
-                    for geom in geoms
-                ]
-
-            text_targets = [obj["value"] for obj in annotations["typed_words"]]
+            is_valid: List[bool] = []
+            box_targets: List[List[float]] = []
+            for box in file_dic["coordinates"]:
+                if rotated_bbox:
+                    x, y, w, h, alpha = fit_rbbox(np.asarray(box, dtype=np.float32))
+                    box = [x, y, w, h, alpha]
+                    is_valid.append(w > 0 and h > 0)
+                else:
+                    xs, ys = zip(*box)
+                    box = [min(xs), min(ys), max(xs), max(ys)]
+                    is_valid.append(box[0] < box[2] and box[1] < box[3])
+                if is_valid[-1]:
+                    box_targets.append(box)

-            self.data.append((img_name, dict(boxes=np.asarray(geoms, dtype=np_dtype), labels=text_targets)))
+            text_targets = [word for word, _valid in zip(file_dic["string"], is_valid) if _valid]
+            self.data.append((img_name, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))
                                              @@ -395,7 +384,7 @@

                                              Source code for doctr.datasets.ocr

                                                     
                                                   
                                                 
                                              -
                                              +
                                              diff --git a/v0.6.0/_modules/doctr/datasets/recognition.html b/v0.6.0/_modules/doctr/datasets/recognition.html index 49113214d2..52424168a9 100644 --- a/v0.6.0/_modules/doctr/datasets/recognition.html +++ b/v0.6.0/_modules/doctr/datasets/recognition.html @@ -235,12 +235,15 @@

                                              Package Reference

                                              Source code for doctr.datasets.recognition

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -316,6 +319,7 @@ 

                                              Source code for doctr.datasets.recognition

                                                   >>> img, target = train_set[0]
                                               
                                                   Args:
                                              +    ----
                                                       img_folder: path to the images folder
        labels_path: path to the json file containing all labels (character sequences)
                                                       **kwargs: keyword arguments from `AbstractDataset`.
                                              @@ -330,7 +334,7 @@ 

                                              Source code for doctr.datasets.recognition

                                                       super().__init__(img_folder, **kwargs)
                                               
                                                       self.data: List[Tuple[str, str]] = []
                                              -        with open(labels_path) as f:
                                              +        with open(labels_path, encoding="utf-8") as f:
                                                           labels = json.load(f)
                                               
                                                       for img_name, label in labels.items():
                                              @@ -380,7 +384,7 @@ 

                                              Source code for doctr.datasets.recognition

                                                     
                                                   
                                                 
                                              -
                                              +
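
The recognition hunk above only changes how the labels file is opened (explicit UTF-8). For reference, the file it reads is a JSON object mapping each image name to its character sequence, as in this illustrative sketch (file and image names are made up):

    import json

    labels = {"img_0.png": "hello", "img_1.png": "café"}  # accented labels are why utf-8 matters
    with open("labels.json", "w", encoding="utf-8") as f:
        json.dump(labels, f, ensure_ascii=False)
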
                                              diff --git a/v0.6.0/_modules/doctr/datasets/sroie.html b/v0.6.0/_modules/doctr/datasets/sroie.html index ef02ecf611..0425870abb 100644 --- a/v0.6.0/_modules/doctr/datasets/sroie.html +++ b/v0.6.0/_modules/doctr/datasets/sroie.html @@ -226,35 +226,20 @@

                                              Source code for doctr.datasets.sroie

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021, Mindee.
                                               
                                              -# This program is licensed under the Apache License 2.0.
                                              -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              +# This program is licensed under the Apache License version 2.
                                              +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                               
                                              -import csv
                                               import os
                                              -from pathlib import Path
                                              -from typing import Any, Dict, List, Tuple, Union
                                              -
                                              +import csv
                                               import numpy as np
                                              -from tqdm import tqdm
                                              +from pathlib import Path
                                              +from typing import List, Dict, Any, Tuple, Optional, Callable
                                               
                                               from .datasets import VisionDataset
                                              -from .utils import convert_target_to_relative, crop_bboxes_from_image
                                               
                                              -__all__ = ["SROIE"]
                                              +__all__ = ['SROIE']
                                               
                                               
                                               
-[docs]
+[docs]
 class SROIE(VisionDataset):
     """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction"
     <https://arxiv.org/pdf/2103.10213.pdf>`_.

-    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/sroie-grid.png&src=0
-        :align: center
-
-    >>> from doctr.datasets import SROIE
-    >>> train_set = SROIE(train=True, download=True)
-    >>> img, target = train_set[0]
+    Example::
+        >>> from doctr.datasets import SROIE
+        >>> train_set = SROIE(train=True, download=True)
+        >>> img, target = train_set[0]

     Args:
         train: whether the subset should be the training one
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        recognition_task: whether the dataset should be used for recognition task
+        sample_transforms: composable transformations that will be applied to each image
+        rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones)
         **kwargs: keyword arguments from `VisionDataset`.
     """

-    TRAIN = (
-        "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_train_task1.zip&src=0",
-        "d4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f",
-    )
-    TEST = (
-        "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_test.zip&src=0",
-        "41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2",
-    )
+    TRAIN = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_train_task1.zip',
+             'd4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f')
+    TEST = ('https://github.com/mindee/doctr/releases/download/v0.1.1/sroie2019_test.zip',
+            '41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2')

     def __init__(
         self,
         train: bool = True,
-        use_polygons: bool = False,
-        recognition_task: bool = False,
+        sample_transforms: Optional[Callable[[Any], Any]] = None,
+        rotated_bbox: bool = False,
         **kwargs: Any,
     ) -> None:
         url, sha256 = self.TRAIN if train else self.TEST
-        super().__init__(
-            url,
-            None,
-            sha256,
-            True,
-            pre_transforms=convert_target_to_relative if not recognition_task else None,
-            **kwargs,
-        )
+        super().__init__(url, None, sha256, True, **kwargs)
+        self.sample_transforms = sample_transforms
         self.train = train

-        tmp_root = os.path.join(self.root, "images")
-        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
-        np_dtype = np.float32
-
-        for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))):
+        if rotated_bbox:
+            raise NotImplementedError
+        # # List images
+        self.root = os.path.join(self._root, 'images')
+        self.data: List[Tuple[str, Dict[str, Any]]] = []
+        for img_path in os.listdir(self.root):
             # File existence check
-            if not os.path.exists(os.path.join(tmp_root, img_path)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
-
+            if not os.path.exists(os.path.join(self.root, img_path)):
+                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_path)}")
             stem = Path(img_path).stem
-            with open(os.path.join(self.root, "annotations", f"{stem}.txt"), encoding="latin") as f:
-                _rows = [row for row in list(csv.reader(f, delimiter=",")) if len(row) > 0]
-
-            labels = [",".join(row[8:]) for row in _rows]
-            # reorder coordinates (8 -> (4,2) ->
-            # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines
-            coords: np.ndarray = np.stack(
-                [np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2)) for row in _rows], axis=0
-            )
-
-            if not use_polygons:
-                # xmin, ymin, xmax, ymax
-                coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1)
-
-            if recognition_task:
-                crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=coords)
-                for crop, label in zip(crops, labels):
-                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
-                        self.data.append((crop, label))
-            else:
-                self.data.append((img_path, dict(boxes=coords, labels=labels)))
-
-        self.root = tmp_root
+            _targets = []
+            with open(os.path.join(self._root, 'annotations', f"{stem}.txt"), encoding='latin') as f:
+                for row in csv.reader(f, delimiter=','):
+                    # Safeguard for blank lines
+                    if len(row) > 0:
+                        # Label may contain commas
+                        label = ",".join(row[8:])
+                        # Reduce 8 coords to 4
+                        p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8])
+                        left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x)
+                        top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y)
+                        if len(label) > 0:
+                            _targets.append((label, [left, top, right, bot]))
+
+            text_targets, box_targets = zip(*_targets)
+
+            self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np.float32), labels=text_targets)))

     def extra_repr(self) -> str:
         return f"train={self.train}"
                                              @@ -427,7 +391,7 @@

                                              Source code for doctr.datasets.sroie

                                                     
                                                   
                                                 
                                              -
                                              +
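
The replacement SROIE code above folds each annotation row's eight coordinates into a straight box by taking coordinate-wise extrema. The same reduction in isolation, with an illustrative row (values made up):

    row = ["38", "52", "204", "52", "204", "85", "38", "85", "TOTAL", "AMOUNT"]
    label = ",".join(row[8:])  # labels may contain commas, hence the re-join
    p1_x, p1_y, p2_x, p2_y, p3_x, p3_y, p4_x, p4_y = map(int, row[:8])
    left, right = min(p1_x, p2_x, p3_x, p4_x), max(p1_x, p2_x, p3_x, p4_x)
    top, bot = min(p1_y, p2_y, p3_y, p4_y), max(p1_y, p2_y, p3_y, p4_y)
    assert (label, [left, top, right, bot]) == ("TOTAL,AMOUNT", [38, 52, 204, 85])
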
                                              diff --git a/v0.6.0/_modules/doctr/datasets/svhn.html b/v0.6.0/_modules/doctr/datasets/svhn.html index 920c621edb..44f36099fa 100644 --- a/v0.6.0/_modules/doctr/datasets/svhn.html +++ b/v0.6.0/_modules/doctr/datasets/svhn.html @@ -235,12 +235,15 @@

                                              Package Reference

                                              Source code for doctr.datasets.svhn

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -322,9 +325,11 @@ 

                                              Source code for doctr.datasets.svhn

                                                   >>> img, target = train_set[0]
                                               
                                                   Args:
                                              +    ----
                                                       train: whether the subset should be the training one
                                                       use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                                                       recognition_task: whether the dataset should be used for recognition task
                                              +        detection_task: whether the dataset should be used for detection task
                                                       **kwargs: keyword arguments from `VisionDataset`.
                                                   """
                                               
                                              @@ -345,9 +350,9 @@ 

                                              Source code for doctr.datasets.svhn

                                                       train: bool = True,
                                                       use_polygons: bool = False,
                                                       recognition_task: bool = False,
                                              +        detection_task: bool = False,
                                                       **kwargs: Any,
                                                   ) -> None:
                                              -
                                                       url, sha256, name = self.TRAIN if train else self.TEST
                                                       super().__init__(
                                                           url,
                                              @@ -357,8 +362,14 @@ 

                                              Source code for doctr.datasets.svhn

                                                           pre_transforms=convert_target_to_relative if not recognition_task else None,
                                                           **kwargs,
                                                       )
                                              +        if recognition_task and detection_task:
                                              +            raise ValueError(
                                              +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                                              +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                                              +            )
                                              +
                                                       self.train = train
                                              -        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
                                              +        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                                       np_dtype = np.float32
                                               
                                                       tmp_root = os.path.join(self.root, "train" if train else "test")
                                              @@ -416,6 +427,8 @@ 

                                              Source code for doctr.datasets.svhn

                                                                   for crop, label in zip(crops, label_targets):
                                                                       if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                                                                           self.data.append((crop, label))
                                              +                elif detection_task:
                                              +                    self.data.append((img_name, box_targets))
                                                               else:
                                                                   self.data.append((img_name, dict(boxes=box_targets, labels=label_targets)))
                                               
                                              @@ -456,7 +469,7 @@ 

                                              Source code for doctr.datasets.svhn

                                                     
                                                   
                                                 
                                              -
                                              +
                                              diff --git a/v0.6.0/_modules/doctr/datasets/svt.html b/v0.6.0/_modules/doctr/datasets/svt.html index 5dc615bffb..ff75309df4 100644 --- a/v0.6.0/_modules/doctr/datasets/svt.html +++ b/v0.6.0/_modules/doctr/datasets/svt.html @@ -235,12 +235,15 @@

                                              Package Reference

                                              Source code for doctr.datasets.svt

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -322,9 +325,11 @@ 

                                              Source code for doctr.datasets.svt

                                                   >>> img, target = train_set[0]
                                               
                                                   Args:
                                              +    ----
                                                       train: whether the subset should be the training one
                                                       use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                                                       recognition_task: whether the dataset should be used for recognition task
                                              +        detection_task: whether the dataset should be used for detection task
                                                       **kwargs: keyword arguments from `VisionDataset`.
                                                   """
                                               
                                              @@ -336,9 +341,9 @@ 

                                              Source code for doctr.datasets.svt

                                                       train: bool = True,
                                                       use_polygons: bool = False,
                                                       recognition_task: bool = False,
                                              +        detection_task: bool = False,
                                                       **kwargs: Any,
                                                   ) -> None:
                                              -
                                                       super().__init__(
                                                           self.URL,
                                                           None,
                                              @@ -347,8 +352,14 @@ 

                                              Source code for doctr.datasets.svt

                                                           pre_transforms=convert_target_to_relative if not recognition_task else None,
                                                           **kwargs,
                                                       )
                                              +        if recognition_task and detection_task:
                                              +            raise ValueError(
                                              +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                                              +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                                              +            )
                                              +
                                                       self.train = train
                                              -        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
                                              +        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                                       np_dtype = np.float32
                                               
                                                       # Load xml data
                                              @@ -361,7 +372,7 @@ 

                                              Source code for doctr.datasets.svt

                                                       xml_root = xml_tree.getroot()
                                               
                                                       for image in tqdm(iterable=xml_root, desc="Unpacking SVT", total=len(xml_root)):
                                              -            name, _, _, resolution, rectangles = image
                                              +            name, _, _, _resolution, rectangles = image
                                               
                                                           # File existence check
                                                           if not os.path.exists(os.path.join(tmp_root, name.text)):
                                              @@ -402,6 +413,8 @@ 

                                              Source code for doctr.datasets.svt

                                                               for crop, label in zip(crops, labels):
                                                                   if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                                                                       self.data.append((crop, label))
                                              +            elif detection_task:
                                              +                self.data.append((name.text, boxes))
                                                           else:
                                                               self.data.append((name.text, dict(boxes=boxes, labels=labels)))
                                               
                                              @@ -442,7 +455,7 @@ 

                                              Source code for doctr.datasets.svt

                                                     
                                                   
                                                 
                                              -
                                              +
                                              diff --git a/v0.6.0/_modules/doctr/datasets/synthtext.html b/v0.6.0/_modules/doctr/datasets/synthtext.html index bc75e7d5a0..b3cef0e63f 100644 --- a/v0.6.0/_modules/doctr/datasets/synthtext.html +++ b/v0.6.0/_modules/doctr/datasets/synthtext.html @@ -235,12 +235,15 @@

                                              Package Reference

                                              Source code for doctr.datasets.synthtext

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -325,9 +328,11 @@ 

                                              Source code for doctr.datasets.synthtext

                                                   >>> img, target = train_set[0]
                                               
                                                   Args:
                                              +    ----
                                                       train: whether the subset should be the training one
                                                       use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
                                                       recognition_task: whether the dataset should be used for recognition task
                                              +        detection_task: whether the dataset should be used for detection task
                                                       **kwargs: keyword arguments from `VisionDataset`.
                                                   """
                                               
                                              @@ -339,9 +344,9 @@ 

                                              Source code for doctr.datasets.synthtext

                                                       train: bool = True,
                                                       use_polygons: bool = False,
                                                       recognition_task: bool = False,
                                              +        detection_task: bool = False,
                                                       **kwargs: Any,
                                                   ) -> None:
                                              -
                                                       super().__init__(
                                                           self.URL,
                                                           None,
                                              @@ -350,8 +355,14 @@ 

                                              Source code for doctr.datasets.synthtext

                                                           pre_transforms=convert_target_to_relative if not recognition_task else None,
                                                           **kwargs,
                                                       )
                                              +        if recognition_task and detection_task:
                                              +            raise ValueError(
                                              +                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
                                              +                + "To get the whole dataset with boxes and labels leave both parameters to False."
                                              +            )
                                              +
                                                       self.train = train
                                              -        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
                                              +        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
                                                       np_dtype = np.float32
                                               
                                                       # Load mat data
                                              @@ -405,6 +416,8 @@ 

                                              Source code for doctr.datasets.synthtext

                                                                           tmp_img = Image.fromarray(crop)
                                                                           tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
                                                                           reco_images_counter += 1
                                              +            elif detection_task:
                                              +                self.data.append((img_path[0], np.asarray(word_boxes, dtype=np_dtype)))
                                                           else:
                                                               self.data.append((img_path[0], dict(boxes=np.asarray(word_boxes, dtype=np_dtype), labels=labels)))
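For reference, a minimal usage sketch of the new `detection_task` flag, assuming the `SynthText` API shown in the hunks above (download and caching handled by the usual `VisionDataset` keyword arguments):

    from doctr.datasets import SynthText

    # Full targets (default): dict with "boxes" and "labels"
    full_set = SynthText(train=True, download=True)
    img, target = full_set[0]

    # Detection-only targets introduced above: a bare array of word boxes
    det_set = SynthText(train=True, detection_task=True, download=True)
    img, boxes = det_set[0]

    # Mutually exclusive flags, as enforced by the new check:
    # SynthText(recognition_task=True, detection_task=True)  # raises ValueError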
                                               
                                              @@ -453,7 +466,7 @@ 

Source code for doctr.datasets.synthtext

                                              diff --git a/v0.6.0/_modules/doctr/datasets/utils.html b/v0.6.0/_modules/doctr/datasets/utils.html index bd2a7e1de1..499d3fff84 100644 --- a/v0.6.0/_modules/doctr/datasets/utils.html +++ b/v0.6.0/_modules/doctr/datasets/utils.html @@ -226,35 +226,20 @@

                                              Source code for doctr.datasets.utils

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021, Mindee.
                                               
                                              -# This program is licensed under the Apache License 2.0.
                                              -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              +# This program is licensed under the Apache License version 2.
                                              +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                               
                                               import string
                                               import unicodedata
                                              -from collections.abc import Sequence
                                              -from functools import partial
                                              -from pathlib import Path
                                              -from typing import Any, Dict, List, Optional
                                              -from typing import Sequence as SequenceType
                                              -from typing import Tuple, TypeVar, Union
                                              -
                                               import numpy as np
                                              -from PIL import Image
                                              -
                                              -from doctr.io.image import get_img_shape
                                              -from doctr.utils.geometry import convert_to_relative_coords, extract_crops, extract_rcrops
                                              +from typing import List, Optional, Any
                                               
                                               from .vocabs import VOCABS
                                               
                                              -__all__ = ["translate", "encode_string", "decode_sequence", "encode_sequences"]
                                              -
                                              -ImageTensor = TypeVar("ImageTensor")
                                              +__all__ = ['translate', 'encode_sequence', 'decode_sequence', 'encode_sequences']
                                               
                                               
                                               def translate(
                                                   input_string: str,
                                                   vocab_name: str,
                                              -    unknown_char: str = "■",
                                              +    unknown_char: str = '■',
                                               ) -> str:
                                                   """Translate a string input in a given vocabulary
                                               
                                              @@ -335,7 +308,7 @@ 

                                              Source code for doctr.datasets.utils

                                                   if VOCABS.get(vocab_name) is None:
                                                       raise KeyError("output vocabulary must be in vocabs dictionnary")
                                               
                                              -    translated = ""
                                              +    translated = ''
                                                   for char in input_string:
                                                       if char not in VOCABS[vocab_name]:
                                                           # we need to translate char into a vocab char
                                              @@ -343,15 +316,15 @@ 

                                              Source code for doctr.datasets.utils

                                                               # remove whitespaces
                                                               continue
                                                           # normalize character if it is not in vocab
                                              -            char = unicodedata.normalize("NFD", char).encode("ascii", "ignore").decode("ascii")
                                              -            if char == "" or char not in VOCABS[vocab_name]:
                                              +            char = unicodedata.normalize('NFD', char).encode('ascii', 'ignore').decode('ascii')
                                              +            if char == '' or char not in VOCABS[vocab_name]:
                                                               # if normalization fails or char still not in vocab, return unknown character)
                                                               char = unknown_char
                                                       translated += char
                                                   return translated
                                               
                                               
                                              -def encode_string(
                                              +def encode_sequence(
                                                   input_string: str,
                                                   vocab: str,
                                               ) -> List[int]:
                                              @@ -364,36 +337,30 @@ 

                                              Source code for doctr.datasets.utils

                                                   Returns:
                                                       A list encoding the input_string"""
                                               
                                              -    try:
                                              -        return list(map(vocab.index, input_string))
                                              -    except ValueError:
                                              -        raise ValueError("some characters cannot be found in 'vocab'")
                                              +    return list(map(vocab.index, input_string))  # type: ignore[arg-type]
                                               
                                               
                                               def decode_sequence(
                                              -    input_seq: Union[np.ndarray, SequenceType[int]],
                                              +    input_array: np.array,
                                                   mapping: str,
                                               ) -> str:
                                                   """Given a predefined mapping, decode the sequence of numbers to a string
                                               
                                                   Args:
                                              -        input_seq: array to decode
                                              +        input_array: array to decode
                                                       mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
                                               
                                                   Returns:
                                              -        A string, decoded from input_seq
                                              -    """
                                              +        A string, decoded from input_array"""
                                               
                                              -    if not isinstance(input_seq, (Sequence, np.ndarray)):
                                              -        raise TypeError("Invalid sequence type")
                                              -    if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
                                              +    if not input_array.dtype == np.int_ or input_array.max() >= len(mapping):
                                                       raise AssertionError("Input must be an array of int, with max less than mapping size")
                                              -
                                              -    return "".join(map(mapping.__getitem__, input_seq))
                                              +    decoded = ''.join(mapping[idx] for idx in input_array)
                                              +    return decoded
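A small round-trip sketch of the two helpers restored above (names and behaviour exactly as shown in this hunk):

    import numpy as np
    from doctr.datasets.utils import decode_sequence, encode_sequence

    vocab = "abc"
    indices = encode_sequence("cab", vocab)            # [2, 0, 1]
    text = decode_sequence(np.array(indices), vocab)   # "cab"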
                                               
                                               
                                               
-[docs]
+[docs]
def encode_sequences(
    sequences: List[str],
    vocab: str,
@@ -401,7 +368,6 @@

                                              Source code for doctr.datasets.utils

                                                   eos: int = -1,
                                                   sos: Optional[int] = None,
                                                   pad: Optional[int] = None,
                                              -    dynamic_seq_length: bool = False,
                                                   **kwargs: Any,
                                               ) -> np.ndarray:
                                                   """Encode character sequences using a given vocab as mapping
                                              @@ -413,7 +379,6 @@ 

                                              Source code for doctr.datasets.utils

                                                       eos: encoding of End Of String
                                                       sos: optional encoding of Start Of String
                                                       pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
                                              -        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size
                                               
                                                   Returns:
                                                       the padded encoded data as a tensor
                                              @@ -422,32 +387,29 @@ 

                                              Source code for doctr.datasets.utils

                                                   if 0 <= eos < len(vocab):
                                                       raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
                                               
                                              -    if not isinstance(target_size, int) or dynamic_seq_length:
                                              -        # Maximum string length + EOS
                                              -        max_length = max(len(w) for w in sequences) + 1
                                              -        if isinstance(sos, int):
                                              -            max_length += 1
                                              -        if isinstance(pad, int):
                                              -            max_length += 1
                                              -        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)
                                              +    if not isinstance(target_size, int):
                                              +        target_size = max(len(w) for w in sequences)
                                              +        if sos:
                                              +            target_size += 1
                                              +        if pad:
                                              +            target_size += 1
                                               
                                                   # Pad all sequences
                                              -    if isinstance(pad, int):  # pad with padding symbol
                                              +    if pad:  # pad with padding symbol
                                                       if 0 <= pad < len(vocab):
                                                           raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
                                                       # In that case, add EOS at the end of the word before padding
                                              -        default_symbol = pad
                                              +        encoded_data = np.full([len(sequences), target_size], pad, dtype=np.int32)
                                                   else:  # pad with eos symbol
                                              -        default_symbol = eos
                                              -    encoded_data: np.ndarray = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)
                                              +        encoded_data = np.full([len(sequences), target_size], eos, dtype=np.int32)
                                               
                                              -    # Encode the strings
                                              -    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
                                              -        if isinstance(pad, int):  # add eos at the end of the sequence
                                              -            seq.append(eos)
                                              -        encoded_data[idx, : min(len(seq), target_size)] = seq[: min(len(seq), target_size)]
                                              +    for idx, seq in enumerate(sequences):
                                              +        encoded_seq = encode_sequence(seq, vocab)
                                              +        if pad:  # add eos at the end of the sequence
                                              +            encoded_seq.append(eos)
                                              +        encoded_data[idx, :min(len(encoded_seq), target_size)] = encoded_seq[:min(len(encoded_seq), target_size)]
                                               
                                              -    if isinstance(sos, int):  # place sos symbol at the beginning of each sequence
                                              +    if sos:  # place eos symbol at the beginning of each sequence
                                                       if 0 <= sos < len(vocab):
                                                           raise ValueError("argument 'sos' needs to be outside of vocab possible indices")
                                                       encoded_data = np.roll(encoded_data, 1)
                                              @@ -455,29 +417,6 @@ 

                                              Source code for doctr.datasets.utils

                                               
                                                   return encoded_data
-
-
-def convert_target_to_relative(img: ImageTensor, target: Dict[str, Any]) -> Tuple[ImageTensor, Dict[str, Any]]:
-
-    target["boxes"] = convert_to_relative_coords(target["boxes"], get_img_shape(img))
-    return img, target
-
-
-def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]:
-    """Crop a set of bounding boxes from an image
-    Args:
-        img_path: path to the image
-        geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4)
-    Returns:
-        a list of cropped images
-    """
-    img: np.ndarray = np.array(Image.open(img_path).convert("RGB"))
-    # Polygon
-    if geoms.ndim == 3 and geoms.shape[1:] == (4, 2):
-        return extract_rcrops(img, geoms.astype(dtype=int))
-    if geoms.ndim == 2 and geoms.shape[1] == 4:
-        return extract_crops(img, geoms.astype(dtype=int))
-    raise ValueError("Invalid geometry format")
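Illustrative call to the restored `encode_sequences`, assuming the v0.6.0 signature `(sequences, vocab, target_size=None, eos=-1, sos=None, pad=None)` shown above:

    import numpy as np
    from doctr.datasets.utils import encode_sequences

    vocab = "abcdefghijklmnopqrstuvwxyz"
    encoded = encode_sequences(["cat", "dog"], vocab, target_size=5)
    # Unused slots are filled with eos (-1), e.g.:
    # array([[ 2,  0, 19, -1, -1],
    #        [ 3, 14,  6, -1, -1]], dtype=int32)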
                                              @@ -510,7 +449,7 @@

Source code for doctr.datasets.utils

                                              diff --git a/v0.6.0/_modules/doctr/io/elements.html b/v0.6.0/_modules/doctr/io/elements.html index 1c83175f14..73dbca5877 100644 --- a/v0.6.0/_modules/doctr/io/elements.html +++ b/v0.6.0/_modules/doctr/io/elements.html @@ -235,12 +235,15 @@

                                              Package Reference

                                              Source code for doctr.io.elements

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -304,16 +307,21 @@ 

                                              Source code for doctr.io.elements

                                               from xml.etree.ElementTree import Element as ETElement
                                               from xml.etree.ElementTree import SubElement
                                               
                                              -import matplotlib.pyplot as plt
                                               import numpy as np
                                               
                                               import doctr
                                              +from doctr.file_utils import requires_package
                                               from doctr.utils.common_types import BoundingBox
                                               from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
                                              +from doctr.utils.reconstitution import synthesize_kie_page, synthesize_page
                                               from doctr.utils.repr import NestedObject
                                              -from doctr.utils.visualization import synthesize_page, visualize_page
                                               
                                              -__all__ = ["Element", "Word", "Artefact", "Line", "Block", "Page", "Document"]
                                              +try:  # optional dependency for visualization
                                              +    from doctr.utils.visualization import visualize_kie_page, visualize_page
                                              +except ModuleNotFoundError:
                                              +    pass
                                              +
                                              +__all__ = ["Element", "Word", "Artefact", "Line", "Prediction", "Block", "Page", "KIEPage", "Document"]
                                               
                                               
                                               class Element(NestedObject):
                                              @@ -331,10 +339,14 @@ 

                                              Source code for doctr.io.elements

                                               
                                                   def export(self) -> Dict[str, Any]:
                                                       """Exports the object into a nested dict format"""
                                              -
                                                       export_dict = {k: getattr(self, k) for k in self._exported_keys}
                                                       for children_name in self._children_names:
                                              -            export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
                                              +            if children_name in ["predictions"]:
                                              +                export_dict[children_name] = {
                                              +                    k: [item.export() for item in c] for k, c in getattr(self, children_name).items()
                                              +                }
                                              +            else:
                                              +                export_dict[children_name] = [c.export() for c in getattr(self, children_name)]
                                               
                                                       return export_dict
                                               
                                              @@ -352,20 +364,32 @@ 

                                              Source code for doctr.io.elements

                                                   """Implements a word element
                                               
                                                   Args:
                                              +    ----
                                                       value: the text string of the word
                                                       confidence: the confidence associated with the text prediction
                                                       geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
                                                       the page's size
                                              +        objectness_score: the objectness score of the detection
                                              +        crop_orientation: the general orientation of the crop in degrees and its confidence
                                                   """
                                               
                                              -    _exported_keys: List[str] = ["value", "confidence", "geometry"]
                                              +    _exported_keys: List[str] = ["value", "confidence", "geometry", "objectness_score", "crop_orientation"]
                                                   _children_names: List[str] = []
                                               
                                              -    def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, np.ndarray]) -> None:
                                              +    def __init__(
                                              +        self,
                                              +        value: str,
                                              +        confidence: float,
                                              +        geometry: Union[BoundingBox, np.ndarray],
                                              +        objectness_score: float,
                                              +        crop_orientation: Dict[str, Any],
                                              +    ) -> None:
                                                       super().__init__()
                                                       self.value = value
                                                       self.confidence = confidence
                                                       self.geometry = geometry
                                              +        self.objectness_score = objectness_score
                                              +        self.crop_orientation = crop_orientation
                                               
                                                   def render(self) -> str:
                                                       """Renders the full text of the element"""
                                              @@ -387,6 +411,7 @@ 

                                              Source code for doctr.io.elements

                                                   """Implements a non-textual element
                                               
                                                   Args:
                                              +    ----
                                                       artefact_type: the type of artefact
                                                       confidence: the confidence of the type prediction
                                                       geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
                                              @@ -422,13 +447,14 @@ 

                                              Source code for doctr.io.elements

                                                   """Implements a line element as a collection of words
                                               
                                                   Args:
                                              +    ----
                                                       words: list of word elements
                                                       geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
                                                           the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
                                                           all words in it.
                                                   """
                                               
                                              -    _exported_keys: List[str] = ["geometry"]
                                              +    _exported_keys: List[str] = ["geometry", "objectness_score"]
                                                   _children_names: List[str] = ["words"]
                                                   words: List[Word] = []
                                               
                                              @@ -436,7 +462,11 @@ 

                                              Source code for doctr.io.elements

                                                       self,
                                                       words: List[Word],
                                                       geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
                                              +        objectness_score: Optional[float] = None,
                                                   ) -> None:
                                              +        # Compute the objectness score of the line
                                              +        if objectness_score is None:
                                              +            objectness_score = float(np.mean([w.objectness_score for w in words]))
                                                       # Resolve the geometry using the smallest enclosing bounding box
                                                       if geometry is None:
                                                           # Check whether this is a rotated or straight box
                                              @@ -445,6 +475,7 @@ 

                                              Source code for doctr.io.elements

                                               
                                                       super().__init__(words=words)
                                                       self.geometry = geometry
                                              +        self.objectness_score = objectness_score
                                               
                                                   def render(self) -> str:
                                                       """Renders the full text of the element"""
                                              @@ -453,21 +484,31 @@ 

                                              Source code for doctr.io.elements

                                                   @classmethod
                                                   def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
                                                       kwargs = {k: save_dict[k] for k in cls._exported_keys}
                                              -        kwargs.update(
                                              -            {
                                              -                "words": [Word.from_dict(_dict) for _dict in save_dict["words"]],
                                              -            }
                                              -        )
                                              +        kwargs.update({
                                              +            "words": [Word.from_dict(_dict) for _dict in save_dict["words"]],
                                              +        })
                                                       return cls(**kwargs)
+class Prediction(Word):
+    """Implements a prediction element"""
+
+    def render(self) -> str:
+        """Renders the full text of the element"""
+        return self.value
+
+    def extra_repr(self) -> str:
+        return f"value='{self.value}', confidence={self.confidence:.2}, bounding_box={self.geometry}"
+
+
[docs]
class Block(Element):
    """Implements a block element as a collection of lines and artefacts

    Args:
+    ----
        lines: list of line elements
        artefacts: list of artefacts
        geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
@@ -475,7 +516,7 @@

                                              Source code for doctr.io.elements

                                                           all lines and artefacts in it.
                                                   """
                                               
                                              -    _exported_keys: List[str] = ["geometry"]
                                              +    _exported_keys: List[str] = ["geometry", "objectness_score"]
                                                   _children_names: List[str] = ["lines", "artefacts"]
                                                   lines: List[Line] = []
                                                   artefacts: List[Artefact] = []
                                              @@ -485,7 +526,11 @@ 

                                              Source code for doctr.io.elements

                                                       lines: List[Line] = [],
                                                       artefacts: List[Artefact] = [],
                                                       geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
                                              +        objectness_score: Optional[float] = None,
                                                   ) -> None:
                                              +        # Compute the objectness score of the line
                                              +        if objectness_score is None:
                                              +            objectness_score = float(np.mean([w.objectness_score for line in lines for w in line.words]))
                                                       # Resolve the geometry using the smallest enclosing bounding box
                                                       if geometry is None:
                                                           line_boxes = [word.geometry for line in lines for word in line.words]
                                              @@ -497,6 +542,7 @@ 

                                              Source code for doctr.io.elements

                                               
                                                       super().__init__(lines=lines, artefacts=artefacts)
                                                       self.geometry = geometry
                                              +        self.objectness_score = objectness_score
                                               
                                                   def render(self, line_break: str = "\n") -> str:
                                                       """Renders the full text of the element"""
                                              @@ -505,12 +551,10 @@ 

                                              Source code for doctr.io.elements

                                                   @classmethod
                                                   def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
                                                       kwargs = {k: save_dict[k] for k in cls._exported_keys}
                                              -        kwargs.update(
                                              -            {
                                              -                "lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
                                              -                "artefacts": [Artefact.from_dict(_dict) for _dict in save_dict["artefacts"]],
                                              -            }
                                              -        )
                                              +        kwargs.update({
                                              +            "lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
                                              +            "artefacts": [Artefact.from_dict(_dict) for _dict in save_dict["artefacts"]],
                                              +        })
                                                       return cls(**kwargs)
                                              @@ -521,6 +565,8 @@

                                              Source code for doctr.io.elements

                                                   """Implements a page element as a collection of blocks
                                               
                                                   Args:
                                              +    ----
                                              +        page: image encoded as a numpy array in uint8
                                                       blocks: list of block elements
                                                       page_idx: the index of the page in the input raw document
                                                       dimensions: the page size in pixels in format (height, width)
                                              @@ -534,6 +580,7 @@ 

                                              Source code for doctr.io.elements

                                               
                                                   def __init__(
                                                       self,
                                              +        page: np.ndarray,
                                                       blocks: List[Block],
                                                       page_idx: int,
                                                       dimensions: Tuple[int, int],
                                              @@ -541,6 +588,7 @@ 

                                              Source code for doctr.io.elements

                                                       language: Optional[Dict[str, Any]] = None,
                                                   ) -> None:
                                                       super().__init__(blocks=blocks)
                                              +        self.page = page
                                                       self.page_idx = page_idx
                                                       self.dimensions = dimensions
                                                       self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
                                              @@ -555,25 +603,29 @@ 

                                              Source code for doctr.io.elements

                                               
                                               
[docs]
-    def show(self, page: np.ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
+    def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
        """Overlay the result on a given image

        Args:
-            page: image encoded as a numpy array in uint8
            interactive: whether the display should be interactive
            preserve_aspect_ratio: pass True if you passed True to the predictor
+            **kwargs: additional keyword arguments passed to the matplotlib.pyplot.show method
        """
-        visualize_page(self.export(), page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
+        requires_package("matplotlib", "`.show()` requires matplotlib & mplcursors installed")
+        requires_package("mplcursors", "`.show()` requires matplotlib & mplcursors installed")
+        import matplotlib.pyplot as plt
+
+        visualize_page(self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
        plt.show(**kwargs)
    def synthesize(self, **kwargs) -> np.ndarray:
        """Synthesize the page from the predictions

-        Returns:
+        Returns
+        -------
            synthesized page
        """
-
        return synthesize_page(self.export(), **kwargs)

    def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
@@ -581,9 +633,11 @@

                                              Source code for doctr.io.elements

                                                       convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
                                               
                                                       Args:
                                              +        ----
                                                           file_title: the title of the XML file
                                               
                                                       Returns:
                                              +        -------
                                                           a tuple of the XML byte string, and its ElementTree
                                                       """
                                                       p_idx = self.page_idx
                                              @@ -688,12 +742,158 @@ 

                                              Source code for doctr.io.elements

                                               
                                               
                                               
                                              +class KIEPage(Element):
                                              +    """Implements a KIE page element as a collection of predictions
                                              +
                                              +    Args:
                                              +    ----
                                              +        predictions: Dictionary with list of block elements for each detection class
                                              +        page: image encoded as a numpy array in uint8
                                              +        page_idx: the index of the page in the input raw document
                                              +        dimensions: the page size in pixels in format (height, width)
                                              +        orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction
                                              +        language: a dictionary with the language value and confidence of the prediction
                                              +    """
                                              +
                                              +    _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"]
                                              +    _children_names: List[str] = ["predictions"]
                                              +    predictions: Dict[str, List[Prediction]] = {}
                                              +
                                              +    def __init__(
                                              +        self,
                                              +        page: np.ndarray,
                                              +        predictions: Dict[str, List[Prediction]],
                                              +        page_idx: int,
                                              +        dimensions: Tuple[int, int],
                                              +        orientation: Optional[Dict[str, Any]] = None,
                                              +        language: Optional[Dict[str, Any]] = None,
                                              +    ) -> None:
                                              +        super().__init__(predictions=predictions)
                                              +        self.page = page
                                              +        self.page_idx = page_idx
                                              +        self.dimensions = dimensions
                                              +        self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
                                              +        self.language = language if isinstance(language, dict) else dict(value=None, confidence=None)
                                              +
                                              +    def render(self, prediction_break: str = "\n\n") -> str:
                                              +        """Renders the full text of the element"""
                                              +        return prediction_break.join(
                                              +            f"{class_name}: {p.render()}" for class_name, predictions in self.predictions.items() for p in predictions
                                              +        )
                                              +
                                              +    def extra_repr(self) -> str:
                                              +        return f"dimensions={self.dimensions}"
                                              +
                                              +    def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
                                              +        """Overlay the result on a given image
                                              +
                                              +        Args:
                                              +            interactive: whether the display should be interactive
                                              +            preserve_aspect_ratio: pass True if you passed True to the predictor
                                              +            **kwargs: keyword arguments passed to the matplotlib.pyplot.show method
                                              +        """
                                              +        requires_package("matplotlib", "`.show()` requires matplotlib & mplcursors installed")
                                              +        requires_package("mplcursors", "`.show()` requires matplotlib & mplcursors installed")
                                              +        import matplotlib.pyplot as plt
                                              +
                                              +        visualize_kie_page(
                                              +            self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio
                                              +        )
                                              +        plt.show(**kwargs)
                                              +
                                              +    def synthesize(self, **kwargs) -> np.ndarray:
                                              +        """Synthesize the page from the predictions
                                              +
                                              +        Args:
                                              +        ----
                                              +            **kwargs: keyword arguments passed to the matplotlib.pyplot.show method
                                              +
                                              +        Returns:
                                              +        -------
                                              +            synthesized page
                                              +        """
                                              +        return synthesize_kie_page(self.export(), **kwargs)
                                              +
                                              +    def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
                                              +        """Export the page as XML (hOCR-format)
                                              +        convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
                                              +
                                              +        Args:
                                              +        ----
                                              +            file_title: the title of the XML file
                                              +
                                              +        Returns:
                                              +        -------
                                              +            a tuple of the XML byte string, and its ElementTree
                                              +        """
                                              +        p_idx = self.page_idx
                                              +        prediction_count: int = 1
                                              +        height, width = self.dimensions
                                              +        language = self.language if "language" in self.language.keys() else "en"
                                              +        # Create the XML root element
                                              +        page_hocr = ETElement("html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": str(language)})
                                              +        # Create the header / SubElements of the root element
                                              +        head = SubElement(page_hocr, "head")
                                              +        SubElement(head, "title").text = file_title
                                              +        SubElement(head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html; charset=utf-8"})
                                              +        SubElement(
                                              +            head,
                                              +            "meta",
                                              +            attrib={"name": "ocr-system", "content": f"python-doctr {doctr.__version__}"},  # type: ignore[attr-defined]
                                              +        )
                                              +        SubElement(
                                              +            head,
                                              +            "meta",
                                              +            attrib={"name": "ocr-capabilities", "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word"},
                                              +        )
                                              +        # Create the body
                                              +        body = SubElement(page_hocr, "body")
                                              +        SubElement(
                                              +            body,
                                              +            "div",
                                              +            attrib={
                                              +                "class": "ocr_page",
                                              +                "id": f"page_{p_idx + 1}",
                                              +                "title": f"image; bbox 0 0 {width} {height}; ppageno 0",
                                              +            },
                                              +        )
                                              +        # iterate over the blocks / lines / words and create the XML elements in body line by line with the attributes
                                              +        for class_name, predictions in self.predictions.items():
                                              +            for prediction in predictions:
                                              +                if len(prediction.geometry) != 2:
                                              +                    raise TypeError("XML export is only available for straight bounding boxes for now.")
                                              +                (xmin, ymin), (xmax, ymax) = prediction.geometry
                                              +                prediction_div = SubElement(
                                              +                    body,
                                              +                    "div",
                                              +                    attrib={
                                              +                        "class": "ocr_carea",
                                              +                        "id": f"{class_name}_prediction_{prediction_count}",
                                              +                        "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                                              +                        {int(round(xmax * width))} {int(round(ymax * height))}",
                                              +                    },
                                              +                )
                                              +                prediction_div.text = prediction.value
                                              +                prediction_count += 1
                                              +
                                              +        return ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr)
                                              +
                                              +    @classmethod
                                              +    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
                                              +        kwargs = {k: save_dict[k] for k in cls._exported_keys}
                                              +        kwargs.update({
                                              +            "predictions": [Prediction.from_dict(predictions_dict) for predictions_dict in save_dict["predictions"]]
                                              +        })
                                              +        return cls(**kwargs)
                                              +
                                              +
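A hedged sketch of building the `KIEPage` defined above from `Prediction` elements (all values are placeholders):

    import numpy as np
    from doctr.io.elements import KIEPage, Prediction

    pred = Prediction(
        value="ACME Corp",
        confidence=0.98,
        geometry=((0.1, 0.05), (0.4, 0.1)),
        objectness_score=0.9,
        crop_orientation={"value": 0, "confidence": None},
    )
    kie_page = KIEPage(
        page=np.zeros((1024, 768, 3), dtype=np.uint8),
        predictions={"company": [pred]},
        page_idx=0,
        dimensions=(1024, 768),
    )
    print(kie_page.render())           # "company: ACME Corp"
    xml_bytes, tree = kie_page.export_as_xml()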
                                               
[docs]
class Document(Element):
    """Implements a document element as a collection of pages

    Args:
+    ----
        pages: list of page elements
    """
@@ -712,32 +912,30 @@

                                              Source code for doctr.io.elements

                                               
                                               
[docs]
-    def show(self, pages: List[np.ndarray], **kwargs) -> None:
-        """Overlay the result on a given image
-
-        Args:
-            pages: list of images encoded as numpy arrays in uint8
-        """
-        for img, result in zip(pages, self.pages):
-            result.show(img, **kwargs)
+    def show(self, **kwargs) -> None:
+        """Overlay the result on a given image"""
+        for result in self.pages:
+            result.show(**kwargs)
    def synthesize(self, **kwargs) -> List[np.ndarray]:
        """Synthesize all pages from their predictions

-        Returns:
+        Returns
+        -------
            list of synthesized pages
        """
-
        return [page.synthesize() for page in self.pages]

    def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
        """Export the document as XML (hOCR-format)

        Args:
+        ----
            **kwargs: additional keyword arguments passed to the Page.export_as_xml method

        Returns:
+        -------
            list of tuple of (bytes, ElementTree)
        """
        return [page.export_as_xml(**kwargs) for page in self.pages]
@@ -748,6 +946,24 @@

                                              Source code for doctr.io.elements

                                                       kwargs.update({"pages": [Page.from_dict(page_dict) for page_dict in save_dict["pages"]]})
                                                       return cls(**kwargs)
+
+
+class KIEDocument(Document):
+    """Implements a document element as a collection of pages
+
+    Args:
+    ----
+        pages: list of page elements
+    """
+
+    _children_names: List[str] = ["pages"]
+    pages: List[KIEPage] = []  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        pages: List[KIEPage],
+    ) -> None:
+        super().__init__(pages=pages)  # type: ignore[arg-type]
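End to end, the updated `Document`/`Page` API above no longer needs the raw images passed back in; a sketch assuming the standard `ocr_predictor` pipeline:

    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor

    predictor = ocr_predictor(pretrained=True)
    pages = DocumentFile.from_images(["sample.jpg"])
    result = predictor(pages)

    result.show()                    # needs matplotlib & mplcursors
    previews = result.synthesize()   # list of np.ndarray, one per page
    xml_outputs = result.export_as_xml()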
                                              @@ -780,7 +996,7 @@

                                              Source code for doctr.io.elements

                                                     
                                                   
                                                 
                                              -
                                              +
                                              diff --git a/v0.6.0/_modules/doctr/io/html.html b/v0.6.0/_modules/doctr/io/html.html index 363d346048..d5495fcd8a 100644 --- a/v0.6.0/_modules/doctr/io/html.html +++ b/v0.6.0/_modules/doctr/io/html.html @@ -235,12 +235,15 @@


                                              Source code for doctr.io.html

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                               
                                               from typing import Any
                                               
                                              -from weasyprint import HTML
                                              -
                                               __all__ = ["read_html"]
                                               
                                               
                                              @@ -307,15 +308,19 @@ 

                                              Source code for doctr.io.html

                                               def read_html(url: str, **kwargs: Any) -> bytes:
                                                   """Read a PDF file and convert it into an image in numpy format
                                               
                                              -    >>> from doctr.documents import read_html
                                              +    >>> from doctr.io import read_html
                                                   >>> doc = read_html("https://www.yoursite.com")
                                               
                                                   Args:
                                              +    ----
                                                       url: URL of the target web page
                                              +        **kwargs: keyword arguments from `weasyprint.HTML`
                                               
                                                   Returns:
                                              +    -------
                                                       decoded PDF file as a bytes stream
                                                   """
                                              +    from weasyprint import HTML
                                               
                                                   return HTML(url, **kwargs).write_pdf()
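With weasyprint now imported inside the function, it only needs to be installed when `read_html` is actually called; a minimal usage sketch:

    from doctr.io import DocumentFile, read_html

    pdf_bytes = read_html("https://www.yoursite.com")   # bytes of a rendered PDF
    pages = DocumentFile.from_pdf(pdf_bytes)            # rasterise it like any other PDF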
                                              @@ -351,7 +356,7 @@

                                              Source code for doctr.io.html

                                                     
                                                   
                                                 
                                              -
                                              +
                                              diff --git a/v0.6.0/_modules/doctr/io/image/base.html b/v0.6.0/_modules/doctr/io/image/base.html index 4f3e51ee42..1ba249a68a 100644 --- a/v0.6.0/_modules/doctr/io/image/base.html +++ b/v0.6.0/_modules/doctr/io/image/base.html @@ -235,12 +235,15 @@


                                              Source code for doctr.io.image.base

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -315,18 +318,19 @@ 

                                              Source code for doctr.io.image.base

                                               ) -> np.ndarray:
                                                   """Read an image file into numpy format
                                               
                                              -    >>> from doctr.documents import read_img
                                              -    >>> page = read_img("path/to/your/doc.jpg")
                                              +    >>> from doctr.io import read_img_as_numpy
                                              +    >>> page = read_img_as_numpy("path/to/your/doc.jpg")
                                               
                                                   Args:
                                              +    ----
                                                       file: the path to the image file
                                                       output_size: the expected output size of each page in format H x W
                                                       rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
                                               
                                                   Returns:
                                              +    -------
                                                       the page decoded as numpy ndarray of shape H x W x 3
                                                   """
                                              -
                                                   if isinstance(file, (str, Path)):
                                                       if not Path(file).is_file():
                                                           raise FileNotFoundError(f"unable to access {file}")
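A small usage sketch for the renamed helper; the keyword arguments follow the docstring above (output size given as H x W):

    from doctr.io import read_img_as_numpy

    page = read_img_as_numpy("path/to/your/doc.jpg", output_size=(1024, 1024), rgb_output=True)
    print(page.shape)   # expected (1024, 1024, 3), an H x W x 3 uint8 array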
                                              @@ -380,7 +384,7 @@ 

                                              Source code for doctr.io.image.base

                                                     
                                                   
                                                 
                                              -
                                              +
                                              diff --git a/v0.6.0/_modules/doctr/io/image/tensorflow.html b/v0.6.0/_modules/doctr/io/image/tensorflow.html index b40f3670bc..f9faeeab1c 100644 --- a/v0.6.0/_modules/doctr/io/image/tensorflow.html +++ b/v0.6.0/_modules/doctr/io/image/tensorflow.html @@ -235,12 +235,15 @@


                                              Source code for doctr.io.image.tensorflow

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                              @@ -300,28 +303,25 @@ 

                                              Source code for doctr.io.image.tensorflow

                                               import numpy as np
                                               import tensorflow as tf
                                               from PIL import Image
                                              -
                                              -if tf.__version__ >= "2.6.0":
                                              -    from tensorflow.keras.utils import img_to_array
                                              -else:
                                              -    from tensorflow.keras.preprocessing.image import img_to_array
                                              +from tensorflow.keras.utils import img_to_array
                                               
                                               from doctr.utils.common_types import AbstractPath
                                               
                                               __all__ = ["tensor_from_pil", "read_img_as_tensor", "decode_img_as_tensor", "tensor_from_numpy", "get_img_shape"]
                                               
                                               
                                              -def tensor_from_pil(pil_img: Image, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
                                              +def tensor_from_pil(pil_img: Image.Image, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
                                                   """Convert a PIL Image to a TensorFlow tensor
                                               
                                                   Args:
                                              +    ----
                                                       pil_img: a PIL image
                                                       dtype: the output tensor data type
                                               
                                                   Returns:
                                              +    -------
                                                       decoded image as tensor
                                                   """
                                              -
                                                   npy_img = img_to_array(pil_img)
                                               
                                                   return tensor_from_numpy(npy_img, dtype)
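A hedged sketch of the PIL conversion helper, assuming the TensorFlow backend of docTR is installed (the module's `__all__` above exports `tensor_from_pil`):

    import tensorflow as tf
    from PIL import Image
    from doctr.io.image.tensorflow import tensor_from_pil

    pil_img = Image.open("path/to/your/doc.jpg").convert("RGB")
    float_tensor = tensor_from_pil(pil_img)                  # tf.float32, values scaled to [0, 1]
    uint8_tensor = tensor_from_pil(pil_img, dtype=tf.uint8)  # raw 0-255 values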
                                              @@ -333,13 +333,14 @@ 

                                              Source code for doctr.io.image.tensorflow

                                                   """Read an image file as a TensorFlow tensor
                                               
                                                   Args:
                                              +    ----
                                                       img_path: location of the image file
                                                       dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
                                               
                                                   Returns:
                                              +    -------
                                                       decoded image as a tensor
                                                   """
                                              -
                                                   if dtype not in (tf.uint8, tf.float16, tf.float32):
                                                       raise ValueError("insupported value for dtype")
                                               
                                              @@ -360,13 +361,14 @@ 

                                              Source code for doctr.io.image.tensorflow

                                                   """Read a byte stream as a TensorFlow tensor
                                               
                                                   Args:
                                              +    ----
                                                       img_content: bytes of a decoded image
                                                       dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
                                               
                                                   Returns:
                                              +    -------
                                                       decoded image as a tensor
                                                   """
                                              -
                                                   if dtype not in (tf.uint8, tf.float16, tf.float32):
                                                       raise ValueError("insupported value for dtype")
                                               
                                              @@ -384,13 +386,14 @@ 

                                              Source code for doctr.io.image.tensorflow

                                                   """Read an image file as a TensorFlow tensor
                                               
                                                   Args:
                                              -        img: image encoded as a numpy array of shape (H, W, C) in np.uint8
                                              +    ----
                                              +        npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
                                                       dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
                                               
                                                   Returns:
                                              +    -------
                                                       same image as a tensor of shape (H, W, C)
                                                   """
                                              -
                                                   if dtype not in (tf.uint8, tf.float16, tf.float32):
                                                       raise ValueError("insupported value for dtype")
                                               
                                              @@ -404,6 +407,7 @@ 

                                              Source code for doctr.io.image.tensorflow

                                               
                                               
                                               def get_img_shape(img: tf.Tensor) -> Tuple[int, int]:
                                              +    """Get the shape of an image"""
                                                   return img.shape[:2]
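The helpers above combine as follows; a minimal sketch assuming the TensorFlow backend (all three names are exported by this module, as shown in its `__all__`):

    import tensorflow as tf
    from doctr.io.image.tensorflow import decode_img_as_tensor, get_img_shape, read_img_as_tensor

    img = read_img_as_tensor("path/to/your/doc.jpg", dtype=tf.float32)
    with open("path/to/your/doc.jpg", "rb") as f:
        same_img = decode_img_as_tensor(f.read(), dtype=tf.float32)
    print(get_img_shape(img))   # (H, W)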
                                               
                                              @@ -437,7 +441,7 @@

                                              Source code for doctr.io.image.tensorflow

                                                     
                                                   
                                                 
                                              -
                                              +
                                              diff --git a/v0.6.0/_modules/doctr/io/pdf.html b/v0.6.0/_modules/doctr/io/pdf.html index a4043623bb..91baf96f7b 100644 --- a/v0.6.0/_modules/doctr/io/pdf.html +++ b/v0.6.0/_modules/doctr/io/pdf.html @@ -235,12 +235,15 @@


                                              Source code for doctr.io.pdf

                                              -# Copyright (C) 2021-2022, Mindee.
                                              +# Copyright (C) 2021-2024, Mindee.
                                               
                                               # This program is licensed under the Apache License 2.0.
                                               # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                               
                                              -from pathlib import Path
                                               from typing import Any, List, Optional
                                               
                                               import numpy as np
                                              @@ -317,31 +319,27 @@ 

                                              Source code for doctr.io.pdf

                                               ) -> List[np.ndarray]:
                                                   """Read a PDF file and convert it into an image in numpy format
                                               
                                              -    >>> from doctr.documents import read_pdf
                                              +    >>> from doctr.io import read_pdf
                                                   >>> doc = read_pdf("path/to/your/doc.pdf")
                                               
                                                   Args:
                                              +    ----
                                                       file: the path to the PDF file
                                                       scale: rendering scale (1 corresponds to 72dpi)
                                                       rgb_mode: if True, the output will be RGB, otherwise BGR
                                                       password: a password to unlock the document, if encrypted
                                              -        kwargs: additional parameters to :meth:`pypdfium2.PdfDocument.render_to`
                                              +        **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
                                               
                                                   Returns:
                                              +    -------
                                                       the list of pages decoded as numpy ndarray of shape H x W x C
                                                   """
                                              -
                                              -    if isinstance(file, Path):
                                              -        file = str(file)
                                              -    if not isinstance(file, (str, bytes)):
                                              -        raise TypeError("unsupported object type for argument 'file'")
                                              -
                                                   # Rasterise pages to numpy ndarrays with pypdfium2
                                              -    with pdfium.PdfDocument(file, password=password) as pdf:
                                              -        return [
                                              -            img
                                              -            for img, _ in pdf.render_to(pdfium.BitmapConv.numpy_ndarray, scale=scale, rev_byteorder=rgb_mode, **kwargs)
                                              -        ]
+    pdf = pdfium.PdfDocument(file, password=password)
+    try:
+        return [page.render(scale=scale, rev_byteorder=rgb_mode, **kwargs).to_numpy() for page in pdf]
+    finally:
+        pdf.close()
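A short sketch of calling `read_pdf` with the updated pypdfium2 rendering path shown above; per the docstring, `scale=2` corresponds to roughly 144 dpi since 1 maps to 72 dpi:

    from doctr.io import read_pdf

    pages = read_pdf("path/to/your/doc.pdf", scale=2)
    print(len(pages), pages[0].shape)   # one H x W x 3 numpy array per page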
                                              @@ -375,7 +373,7 @@

                                              Source code for doctr.io.pdf

                                                     
                                                   
                                                 
-
+
diff --git a/v0.6.0/_modules/doctr/io/reader.html b/v0.6.0/_modules/doctr/io/reader.html index 83e636dd7a..49cdc7d152 100644 --- a/v0.6.0/_modules/doctr/io/reader.html +++ b/v0.6.0/_modules/doctr/io/reader.html @@ -235,12 +235,15 @@

@@ -290,7 +293,7 @@

                                                Source code for doctr.io.reader

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021-2024, Mindee.
                                                 
                                                 # This program is licensed under the Apache License 2.0.
                                                 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                @@ -300,6 +303,7 @@ 

                                                Source code for doctr.io.reader

                                                 
                                                 import numpy as np
                                                 
                                                +from doctr.file_utils import requires_package
                                                 from doctr.utils.common_types import AbstractFile
                                                 
                                                 from .html import read_html
                                                @@ -320,16 +324,18 @@ 

                                                Source code for doctr.io.reader

                                                     def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]:
                                                         """Read a PDF file
                                                 
                                                -        >>> from doctr.documents import DocumentFile
                                                +        >>> from doctr.io import DocumentFile
                                                         >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
                                                 
                                                         Args:
                                                +        ----
                                                             file: the path to the PDF file or a binary stream
                                                +            **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
                                                 
                                                         Returns:
                                                +        -------
                                                             the list of pages decoded as numpy ndarray of shape H x W x 3
                                                         """
                                                -
                                                         return read_pdf(file, **kwargs)
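As a usage sketch, `DocumentFile.from_pdf` accepts either a path or raw bytes and forwards extra keyword arguments to the PDF rasteriser:

    from doctr.io import DocumentFile

    pages = DocumentFile.from_pdf("path/to/your/doc.pdf", scale=2)

    with open("path/to/your/doc.pdf", "rb") as f:
        pages = DocumentFile.from_pdf(f.read())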
                                                @@ -339,15 +345,23 @@

                                                Source code for doctr.io.reader

                                                     def from_url(cls, url: str, **kwargs) -> List[np.ndarray]:
                                                         """Interpret a web page as a PDF document
                                                 
                                                -        >>> from doctr.documents import DocumentFile
                                                +        >>> from doctr.io import DocumentFile
                                                         >>> doc = DocumentFile.from_url("https://www.yoursite.com")
                                                 
                                                         Args:
                                                +        ----
                                                             url: the URL of the target web page
                                                +            **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
                                                 
                                                         Returns:
                                                +        -------
                                                             the list of pages decoded as numpy ndarray of shape H x W x 3
                                                         """
                                                +        requires_package(
                                                +            "weasyprint",
                                                +            "`.from_url` requires weasyprint installed.\n"
                                                +            + "Installation instructions: https://doc.courtbouillon.org/weasyprint/stable/first_steps.html#installation",
                                                +        )
                                                         pdf_stream = read_html(url)
                                                         return cls.from_pdf(pdf_stream, **kwargs)
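With the `requires_package` guard, a missing weasyprint now surfaces early as an import error pointing to the installation guide (sketch, assuming `requires_package` raises ImportError when the module is absent):

    from doctr.io import DocumentFile

    try:
        pages = DocumentFile.from_url("https://www.yoursite.com")
    except ImportError as e:
        # weasyprint is not installed; the message links to its installation instructions
        print(e)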
                                                @@ -358,13 +372,16 @@

                                                Source code for doctr.io.reader

                                                     def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]:
                                                         """Read an image file (or a collection of image files) and convert it into an image in numpy format
                                                 
                                                -        >>> from doctr.documents import DocumentFile
                                                +        >>> from doctr.io import DocumentFile
                                                         >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
                                                 
                                                         Args:
                                                +        ----
                                                             files: the path to the image file or a binary stream, or a collection of those
                                                +            **kwargs: additional parameters to :meth:`doctr.io.image.read_img_as_numpy`
                                                 
                                                         Returns:
                                                +        -------
                                                             the list of pages decoded as numpy ndarray of shape H x W x 3
                                                         """
                                                         if isinstance(files, (str, Path, bytes)):
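And a matching sketch for `from_images`, which accepts a single path or stream as well as a collection of them:

    from doctr.io import DocumentFile

    pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
    single = DocumentFile.from_images("path/to/your/page1.png")
    print(len(pages), len(single))   # 2 and 1 pages, each an H x W x 3 numpy array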
                                                @@ -405,7 +422,7 @@ 

                                                Source code for doctr.io.reader

                                                       
                                                     
                                                   
                                                -
                                                +
diff --git a/v0.6.0/_modules/doctr/models/backbones/mobilenet/tensorflow.html b/v0.6.0/_modules/doctr/models/backbones/mobilenet/tensorflow.html deleted file mode 100644 index a0f857205e..0000000000 --- a/v0.6.0/_modules/doctr/models/backbones/mobilenet/tensorflow.html +++ /dev/null @@ -1,688 +0,0 @@

                                                Source code for doctr.models.backbones.mobilenet.tensorflow

                                                -# Copyright (C) 2021, Mindee.
                                                -
                                                -# This program is licensed under the Apache License version 2.
                                                -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                -
                                                -# Greatly inspired by https://github.com/pytorch/vision/blob/master/torchvision/models/mobilenetv3.py
                                                -
                                                -from typing import Any, Dict, List, Optional, Tuple, Union
                                                -
                                                -import tensorflow as tf
                                                -from tensorflow.keras import layers
                                                -from tensorflow.keras.models import Sequential
                                                -
                                                -from ....datasets import VOCABS
                                                -from ...utils import conv_sequence, load_pretrained_params
                                                -
                                                -__all__ = ["MobileNetV3", "mobilenet_v3_small", "mobilenet_v3_small_r", "mobilenet_v3_large",
                                                -           "mobilenet_v3_large_r"]
                                                -
                                                -
                                                -default_cfgs: Dict[str, Dict[str, Any]] = {
                                                -    'mobilenet_v3_large': {
                                                -        'mean': (0.694, 0.695, 0.693),
                                                -        'std': (0.299, 0.296, 0.301),
                                                -        'input_shape': (32, 32, 3),
                                                -        'vocab': VOCABS['legacy_french'],
                                                -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/mobilenet_v3_large-d27d66f2.zip'
                                                -    },
                                                -    'mobilenet_v3_large_r': {
                                                -        'mean': (0.694, 0.695, 0.693),
                                                -        'std': (0.299, 0.296, 0.301),
                                                -        'input_shape': (32, 32, 3),
                                                -        'vocab': VOCABS['french'],
                                                -        'url': None,
                                                -    },
                                                -    'mobilenet_v3_small': {
                                                -        'mean': (0.694, 0.695, 0.693),
                                                -        'std': (0.299, 0.296, 0.301),
                                                -        'input_shape': (32, 32, 3),
                                                -        'vocab': VOCABS['legacy_french'],
                                                -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/mobilenet_v3_small-d624c4de.zip'
                                                -    },
                                                -    'mobilenet_v3_small_r': {
                                                -        'mean': (0.694, 0.695, 0.693),
                                                -        'std': (0.299, 0.296, 0.301),
                                                -        'input_shape': (32, 32, 3),
                                                -        'vocab': VOCABS['french'],
                                                -        'url': None,
                                                -    }
                                                -}
                                                -
                                                -
                                                -def hard_swish(x: tf.Tensor) -> tf.Tensor:
                                                -    return x * tf.nn.relu6(x + 3.) / 6.0
                                                -
                                                -
                                                -def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int:
                                                -    if min_value is None:
                                                -        min_value = divisor
                                                -    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
                                                -    # Make sure that round down does not go down by more than 10%.
                                                -    if new_v < 0.9 * v:
                                                -        new_v += divisor
                                                -    return new_v
                                                -
                                                -
                                                -class SqueezeExcitation(Sequential):
                                                -    """Squeeze and Excitation.
                                                -    """
                                                -    def __init__(self, chan: int, squeeze_factor: int = 4) -> None:
                                                -        super().__init__(
                                                -            [
                                                -                layers.GlobalAveragePooling2D(),
                                                -                layers.Dense(chan // squeeze_factor, activation='relu'),
                                                -                layers.Dense(chan, activation='hard_sigmoid'),
                                                -                layers.Reshape((1, 1, chan))
                                                -            ]
                                                -        )
                                                -
                                                -    def call(self, inputs: tf.Tensor, **kwargs: Any) -> tf.Tensor:
                                                -        x = super().call(inputs, **kwargs)
                                                -        x = tf.math.multiply(inputs, x)
                                                -        return x
                                                -
                                                -
                                                -class InvertedResidualConfig:
                                                -    def __init__(
                                                -        self,
                                                -        input_channels: int,
                                                -        kernel: int,
                                                -        expanded_channels: int,
                                                -        out_channels: int,
                                                -        use_se: bool,
                                                -        activation: str,
                                                -        stride: Union[int, Tuple[int, int]],
                                                -        width_mult: float = 1,
                                                -    ) -> None:
                                                -        self.input_channels = self.adjust_channels(input_channels, width_mult)
                                                -        self.kernel = kernel
                                                -        self.expanded_channels = self.adjust_channels(expanded_channels, width_mult)
                                                -        self.out_channels = self.adjust_channels(out_channels, width_mult)
                                                -        self.use_se = use_se
                                                -        self.use_hs = activation == "HS"
                                                -        self.stride = stride
                                                -
                                                -    @staticmethod
                                                -    def adjust_channels(channels: int, width_mult: float):
                                                -        return _make_divisible(channels * width_mult, 8)
                                                -
                                                -
                                                -class InvertedResidual(layers.Layer):
                                                -    """InvertedResidual for mobilenet
                                                -
                                                -    Args:
                                                -        conf: configuration object for inverted residual
                                                -    """
                                                -    def __init__(
                                                -        self,
                                                -        conf: InvertedResidualConfig,
                                                -        **kwargs: Any,
                                                -    ) -> None:
                                                -        _kwargs = {'input_shape': kwargs.pop('input_shape')} if isinstance(kwargs.get('input_shape'), tuple) else {}
                                                -        super().__init__(**kwargs)
                                                -
                                                -        act_fn = hard_swish if conf.use_hs else tf.nn.relu
                                                -
                                                -        _is_s1 = (isinstance(conf.stride, tuple) and conf.stride == (1, 1)) or conf.stride == 1
                                                -        self.use_res_connect = _is_s1 and conf.input_channels == conf.out_channels
                                                -
                                                -        _layers = []
                                                -        # expand
                                                -        if conf.expanded_channels != conf.input_channels:
                                                -            _layers.extend(conv_sequence(conf.expanded_channels, act_fn, kernel_size=1, bn=True, **_kwargs))
                                                -
                                                -        # depth-wise
                                                -        _layers.extend(conv_sequence(
                                                -            conf.expanded_channels, act_fn, kernel_size=conf.kernel, strides=conf.stride, bn=True,
                                                -            groups=conf.expanded_channels,
                                                -        ))
                                                -
                                                -        if conf.use_se:
                                                -            _layers.append(SqueezeExcitation(conf.expanded_channels))
                                                -
                                                -        # project
                                                -        _layers.extend(conv_sequence(
                                                -            conf.out_channels, None, kernel_size=1, bn=True,
                                                -        ))
                                                -
                                                -        self.block = Sequential(_layers)
                                                -
                                                -    def call(
                                                -        self,
                                                -        inputs: tf.Tensor,
                                                -        **kwargs: Any,
                                                -    ) -> tf.Tensor:
                                                -
                                                -        out = self.block(inputs, **kwargs)
                                                -        if self.use_res_connect:
                                                -            out = tf.add(out, inputs)
                                                -
                                                -        return out
                                                -
                                                -
                                                -class MobileNetV3(Sequential):
                                                -    """Implements MobileNetV3, inspired from both:
                                                -    <https://github.com/xiaochus/MobileNetV3/tree/master/model>`_.
                                                -    and <https://pytorch.org/vision/stable/_modules/torchvision/models/mobilenetv3.html>`_.
                                                -    """
                                                -
                                                -    def __init__(
                                                -        self,
                                                -        layout: List[InvertedResidualConfig],
                                                -        input_shape: Tuple[int, int, int],
                                                -        include_top: bool = False,
                                                -        head_chans: int = 1024,
                                                -        num_classes: int = 1000,
                                                -    ) -> None:
                                                -
                                                -        _layers = [
                                                -            Sequential(conv_sequence(layout[0].input_channels, hard_swish, True, kernel_size=3, strides=2,
                                                -                       input_shape=input_shape), name="stem")
                                                -        ]
                                                -
                                                -        for idx, conf in enumerate(layout):
                                                -            _layers.append(
                                                -                InvertedResidual(conf, name=f"inverted_{idx}"),
                                                -            )
                                                -
                                                -        _layers.append(
                                                -            Sequential(
                                                -                conv_sequence(6 * layout[-1].out_channels, hard_swish, True, kernel_size=1),
                                                -                name="final_block"
                                                -            )
                                                -        )
                                                -
                                                -        if include_top:
                                                -            _layers.extend([
                                                -                layers.GlobalAveragePooling2D(),
                                                -                layers.Dense(head_chans, activation=hard_swish),
                                                -                layers.Dropout(0.2),
                                                -                layers.Dense(num_classes),
                                                -            ])
                                                -
                                                -        super().__init__(_layers)
                                                -
                                                -
                                                -def _mobilenet_v3(
                                                -    arch: str,
                                                -    pretrained: bool,
                                                -    input_shape: Optional[Tuple[int, int, int]] = None,
                                                -    **kwargs: Any
                                                -) -> MobileNetV3:
                                                -    input_shape = input_shape or default_cfgs[arch]['input_shape']
                                                -
                                                -    # cf. Table 1 & 2 of the paper
                                                -    if arch.startswith("mobilenet_v3_small"):
                                                -        inverted_residual_setting = [
                                                -            InvertedResidualConfig(16, 3, 16, 16, True, "RE", 2),  # C1
                                                -            InvertedResidualConfig(16, 3, 72, 24, False, "RE", (2, 1) if arch.endswith("_r") else 2),  # C2
                                                -            InvertedResidualConfig(24, 3, 88, 24, False, "RE", 1),
                                                -            InvertedResidualConfig(24, 5, 96, 40, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C3
                                                -            InvertedResidualConfig(40, 5, 240, 40, True, "HS", 1),
                                                -            InvertedResidualConfig(40, 5, 240, 40, True, "HS", 1),
                                                -            InvertedResidualConfig(40, 5, 120, 48, True, "HS", 1),
                                                -            InvertedResidualConfig(48, 5, 144, 48, True, "HS", 1),
                                                -            InvertedResidualConfig(48, 5, 288, 96, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C4
                                                -            InvertedResidualConfig(96, 5, 576, 96, True, "HS", 1),
                                                -            InvertedResidualConfig(96, 5, 576, 96, True, "HS", 1),
                                                -        ]
                                                -        head_chans = 1024
                                                -    else:
                                                -        inverted_residual_setting = [
                                                -            InvertedResidualConfig(16, 3, 16, 16, False, "RE", 1),
                                                -            InvertedResidualConfig(16, 3, 64, 24, False, "RE", 2),  # C1
                                                -            InvertedResidualConfig(24, 3, 72, 24, False, "RE", 1),
                                                -            InvertedResidualConfig(24, 5, 72, 40, True, "RE", (2, 1) if arch.endswith("_r") else 2),  # C2
                                                -            InvertedResidualConfig(40, 5, 120, 40, True, "RE", 1),
                                                -            InvertedResidualConfig(40, 5, 120, 40, True, "RE", 1),
                                                -            InvertedResidualConfig(40, 3, 240, 80, False, "HS", (2, 1) if arch.endswith("_r") else 2),  # C3
                                                -            InvertedResidualConfig(80, 3, 200, 80, False, "HS", 1),
                                                -            InvertedResidualConfig(80, 3, 184, 80, False, "HS", 1),
                                                -            InvertedResidualConfig(80, 3, 184, 80, False, "HS", 1),
                                                -            InvertedResidualConfig(80, 3, 480, 112, True, "HS", 1),
                                                -            InvertedResidualConfig(112, 3, 672, 112, True, "HS", 1),
                                                -            InvertedResidualConfig(112, 5, 672, 160, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C4
                                                -            InvertedResidualConfig(160, 5, 960, 160, True, "HS", 1),
                                                -            InvertedResidualConfig(160, 5, 960, 160, True, "HS", 1),
                                                -        ]
                                                -        head_chans = 1280
                                                -
                                                -    kwargs['num_classes'] = kwargs.get('num_classes', len(default_cfgs[arch]['vocab']))
                                                -
                                                -    # Build the model
                                                -    model = MobileNetV3(
                                                -        inverted_residual_setting,
                                                -        input_shape,
                                                -        head_chans=head_chans,
                                                -        **kwargs,
                                                -    )
                                                -    # Load pretrained parameters
                                                -    if pretrained:
                                                -        load_pretrained_params(model, default_cfgs[arch]['url'])
                                                -
                                                -    return model
                                                -
                                                -
                                                -
-[docs]
-def mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Small architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenetv3_large
-        >>> model = mobilenetv3_small(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-
-    return _mobilenet_v3('mobilenet_v3_small', pretrained, **kwargs)
-
-
-[docs]
-def mobilenet_v3_small_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Small architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_, with rectangular pooling.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_small_r
-        >>> model = mobilenet_v3_small_r(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-
-    return _mobilenet_v3('mobilenet_v3_small_r', pretrained, **kwargs)
-
-
-[docs]
-def mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Large architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenetv3_large
-        >>> model = mobilenetv3_large(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-    return _mobilenet_v3('mobilenet_v3_large', pretrained, **kwargs)
-
-
-[docs]
-def mobilenet_v3_large_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Large architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_large_r
-        >>> model = mobilenet_v3_large_r(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-    return _mobilenet_v3('mobilenet_v3_large_r', pretrained, **kwargs)
-
\ No newline at end of file
diff --git a/v0.6.0/_modules/doctr/models/backbones/resnet/tensorflow.html b/v0.6.0/_modules/doctr/models/backbones/resnet/tensorflow.html deleted file mode 100644 index d959be9a0f..0000000000 --- a/v0.6.0/_modules/doctr/models/backbones/resnet/tensorflow.html +++ /dev/null @@ -1,522 +0,0 @@

                                                Source code for doctr.models.backbones.resnet.tensorflow

                                                -# Copyright (C) 2021, Mindee.
                                                -
                                                -# This program is licensed under the Apache License version 2.
                                                -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                -
                                                -from typing import Any, Dict, List, Optional, Tuple
                                                -
                                                -import tensorflow as tf
                                                -from tensorflow.keras import layers
                                                -from tensorflow.keras.models import Sequential
                                                -
                                                -from ...utils import conv_sequence, load_pretrained_params
                                                -
                                                -__all__ = ['ResNet', 'resnet31', 'ResnetStage']
                                                -
                                                -
                                                -default_cfgs: Dict[str, Dict[str, Any]] = {
                                                -    'resnet31': {'num_blocks': (1, 2, 5, 3), 'output_channels': (256, 256, 512, 512),
                                                -                 'conv_seq': (True, True, True, True), 'pooling': ((2, 2), (2, 1), None, None),
                                                -                 'url': None},
                                                -}
                                                -
                                                -
                                                -class ResnetBlock(layers.Layer):
                                                -
                                                -    """Implements a resnet31 block with shortcut
                                                -
                                                -    Args:
                                                -        conv_shortcut: Use of shortcut
                                                -        output_channels: number of channels to use in Conv2D
                                                -        kernel_size: size of square kernels
                                                -        strides: strides to use in the first convolution of the block
                                                -    """
                                                -    def __init__(
                                                -        self,
                                                -        output_channels: int,
                                                -        conv_shortcut: bool,
                                                -        strides: int = 1,
                                                -        **kwargs
                                                -    ) -> None:
                                                -
                                                -        super().__init__(**kwargs)
                                                -        if conv_shortcut:
                                                -            self.shortcut = Sequential(
                                                -                [
                                                -                    layers.Conv2D(
                                                -                        filters=output_channels,
                                                -                        strides=strides,
                                                -                        padding='same',
                                                -                        kernel_size=1,
                                                -                        use_bias=False,
                                                -                        kernel_initializer='he_normal'
                                                -                    ),
                                                -                    layers.BatchNormalization()
                                                -                ]
                                                -            )
                                                -        else:
                                                -            self.shortcut = layers.Lambda(lambda x: x)
                                                -        self.conv_block = Sequential(
                                                -            self.conv_resnetblock(output_channels, 3, strides)
                                                -        )
                                                -        self.act = layers.Activation('relu')
                                                -
                                                -    @staticmethod
                                                -    def conv_resnetblock(
                                                -        output_channels: int,
                                                -        kernel_size: int,
                                                -        strides: int = 1,
                                                -    ) -> List[layers.Layer]:
                                                -        return [
                                                -            *conv_sequence(output_channels, activation='relu', bn=True, strides=strides, kernel_size=kernel_size),
                                                -            layers.Conv2D(output_channels, kernel_size, padding='same', use_bias=False, kernel_initializer='he_normal'),
                                                -            layers.BatchNormalization(),
                                                -        ]
                                                -
                                                -    def call(
                                                -        self,
                                                -        inputs: tf.Tensor
                                                -    ) -> tf.Tensor:
                                                -        clone = self.shortcut(inputs)
                                                -        conv_out = self.conv_block(inputs)
                                                -        out = self.act(clone + conv_out)
                                                -
                                                -        return out
                                                -
                                                -
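As an aside, a minimal sketch of how a single ResnetBlock behaves (not part of the original file; the 64-channel feature map and the stride-2 setting are illustrative values, assuming the class above is in scope):

import tensorflow as tf

# Stride-2 block with a convolutional shortcut: both branches halve the
# spatial resolution, so the element-wise addition in call() lines up.
block = ResnetBlock(output_channels=64, conv_shortcut=True, strides=2)
feat = tf.random.uniform(shape=[1, 32, 32, 64], maxval=1, dtype=tf.float32)
out = block(feat)  # expected shape: (1, 16, 16, 64)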
                                                -class ResnetStage(Sequential):
                                                -
                                                -    """Implements a resnet31 stage
                                                -
                                                -    Args:
                                                -        num_blocks: number of blocks inside the stage
                                                -        output_channels: number of channels to use in Conv2D
                                                -        downsample: if true, performs a /2 downsampling at the first block of the stage
                                                -    """
                                                -    def __init__(
                                                -        self,
                                                -        num_blocks: int,
                                                -        output_channels: int,
                                                -        downsample: bool = False,
                                                -    ) -> None:
                                                -
                                                -        super().__init__()
                                                -        final_blocks = [
                                                -            ResnetBlock(output_channels, conv_shortcut=False) for _ in range(1, num_blocks)
                                                -        ]
                                                -        if downsample is True:
                                                -            self.add(ResnetBlock(output_channels, conv_shortcut=True, strides=2))
                                                -        else:
                                                -            self.add(ResnetBlock(output_channels, conv_shortcut=True))
                                                -        for final_block in final_blocks:
                                                -            self.add(final_block)
                                                -
                                                -
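Likewise, a short sketch of a stage (illustrative block count and width, assuming the class above is in scope):

# Three blocks: only the first uses a convolutional shortcut, and with
# downsample=True it also applies a stride of 2; the remaining blocks keep
# identity shortcuts at the same resolution.
stage = ResnetStage(num_blocks=3, output_channels=256, downsample=True)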
                                                -class ResNet(Sequential):
                                                -
                                                -    """Resnet class with two convolutions and a maxpooling before the first stage
                                                -
                                                -    Args:
-        num_blocks: number of resnet blocks in each stage
-        output_channels: number of channels in each stage
-        conv_seq: whether to add a conv_sequence after each stage
                                                -        pooling: pooling to add after each stage (if None, no pooling)
                                                -        input_shape: shape of inputs
                                                -        include_top: whether the classifier head should be instantiated
                                                -    """
                                                -
                                                -    def __init__(
                                                -        self,
                                                -        num_blocks: Tuple[int, int, int, int],
                                                -        output_channels: Tuple[int, int, int, int],
                                                -        conv_seq: Tuple[bool, bool, bool, bool],
                                                -        pooling: Tuple[
                                                -            Optional[Tuple[int, int]],
                                                -            Optional[Tuple[int, int]],
                                                -            Optional[Tuple[int, int]],
                                                -            Optional[Tuple[int, int]]
                                                -        ],
                                                -        input_shape: Tuple[int, int, int] = (640, 640, 3),
                                                -        include_top: bool = False,
                                                -    ) -> None:
                                                -
                                                -        _layers = [
                                                -            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
                                                -            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
                                                -            layers.MaxPool2D(pool_size=2, strides=2, padding='valid'),
                                                -        ]
                                                -        for n_blocks, out_channels, conv, pool in zip(num_blocks, output_channels, conv_seq, pooling):
                                                -            _layers.append(ResnetStage(n_blocks, out_channels))
                                                -            if conv:
                                                -                _layers.extend(conv_sequence(out_channels, activation='relu', bn=True, kernel_size=3))
                                                -            if pool:
                                                -                _layers.append(layers.MaxPool2D(pool_size=pool, strides=pool, padding='valid'))
                                                -        super().__init__(_layers)
                                                -
                                                -
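Before the factory function below, a hedged sketch of how the resnet31 configuration drives this constructor and yields the documented (H, W) --> (H/8, W/4) downsizing (the 64 x 256 input size is an arbitrary illustration, assuming the class and default_cfgs above are in scope):

import tensorflow as tf

cfg = default_cfgs['resnet31']
model = ResNet(cfg['num_blocks'], cfg['output_channels'], cfg['conv_seq'], cfg['pooling'], input_shape=(64, 256, 3))
out = model(tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32))
# expected out.shape: (1, 8, 64, 512), i.e. height divided by 8 and width by 4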
                                                -def _resnet(arch: str, pretrained: bool, **kwargs: Any) -> ResNet:
                                                -
                                                -    # Build the model
                                                -    model = ResNet(
                                                -        default_cfgs[arch]['num_blocks'],
                                                -        default_cfgs[arch]['output_channels'],
                                                -        default_cfgs[arch]['conv_seq'],
                                                -        default_cfgs[arch]['pooling'],
                                                -        **kwargs
                                                -    )
                                                -    # Load pretrained parameters
                                                -    if pretrained:
                                                -        load_pretrained_params(model, default_cfgs[arch]['url'])
                                                -
                                                -    return model
                                                -
                                                -
                                                -
-[docs] -def resnet31(pretrained: bool = False, **kwargs: Any) -> ResNet: - """Resnet31 architecture with rectangular pooling windows as described in - `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition", - <https://arxiv.org/pdf/1811.00751.pdf>`_. Downsizing: (H, W) --> (H/8, W/4) - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import resnet31 - >>> model = resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained: boolean, True if model is pretrained - - Returns: - A resnet31 model - """ - - return _resnet('resnet31', pretrained, **kwargs)
                                                - -
                                                -
                                                -
                                                -
                                                - - -
                                                -
\ No newline at end of file
diff --git a/v0.6.0/_modules/doctr/models/backbones/vgg/tensorflow.html b/v0.6.0/_modules/doctr/models/backbones/vgg/tensorflow.html
deleted file mode 100644
index 48c285257a..0000000000
--- a/v0.6.0/_modules/doctr/models/backbones/vgg/tensorflow.html
+++ /dev/null
@@ -1,413 +0,0 @@

                                                Source code for doctr.models.backbones.vgg.tensorflow

                                                -# Copyright (C) 2021, Mindee.
                                                -
                                                -# This program is licensed under the Apache License version 2.
                                                -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                -
                                                -from typing import Any, Dict, Tuple
                                                -
                                                -from tensorflow.keras import layers
                                                -from tensorflow.keras.models import Sequential
                                                -
                                                -from ...utils import conv_sequence, load_pretrained_params
                                                -
                                                -__all__ = ['VGG', 'vgg16_bn']
                                                -
                                                -
                                                -default_cfgs: Dict[str, Dict[str, Any]] = {
                                                -    'vgg16_bn': {'num_blocks': (2, 2, 3, 3, 3), 'planes': (64, 128, 256, 512, 512),
                                                -                 'rect_pools': (False, False, True, True, True),
                                                -                 'url': None},
                                                -}
                                                -
                                                -
                                                -class VGG(Sequential):
                                                -    """Implements the VGG architecture from `"Very Deep Convolutional Networks for Large-Scale Image Recognition"
                                                -    <https://arxiv.org/pdf/1409.1556.pdf>`_.
                                                -
                                                -    Args:
-        num_blocks: number of convolutional blocks in each stage
-        planes: number of output channels in each stage
-        rect_pools: whether square pooling kernels should be replaced with rectangular ones
                                                -        input_shape: shapes of the input tensor
                                                -        include_top: whether the classifier head should be instantiated
                                                -    """
                                                -    def __init__(
                                                -        self,
                                                -        num_blocks: Tuple[int, int, int, int, int],
                                                -        planes: Tuple[int, int, int, int, int],
                                                -        rect_pools: Tuple[bool, bool, bool, bool, bool],
                                                -        input_shape: Tuple[int, int, int] = (512, 512, 3),
                                                -        include_top: bool = False,
                                                -    ) -> None:
                                                -
                                                -        _layers = []
                                                -        # Specify input_shape only for the first layer
                                                -        kwargs = {"input_shape": input_shape}
                                                -        for nb_blocks, out_chan, rect_pool in zip(num_blocks, planes, rect_pools):
                                                -            for _ in range(nb_blocks):
                                                -                _layers.extend(conv_sequence(out_chan, 'relu', True, kernel_size=3, **kwargs))  # type: ignore[arg-type]
                                                -                kwargs = {}
                                                -            _layers.append(layers.MaxPooling2D((2, 1 if rect_pool else 2)))
                                                -        super().__init__(_layers)
                                                -
                                                -
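A similar hedged sketch for this backbone (illustrative input size, assuming the class and default_cfgs above are in scope); the three rectangular (2, 1) pools preserve horizontal resolution in the last stages:

import tensorflow as tf

cfg = default_cfgs['vgg16_bn']
model = VGG(cfg['num_blocks'], cfg['planes'], cfg['rect_pools'], input_shape=(512, 512, 3))
out = model(tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32))
# expected out.shape: (1, 16, 128, 512), i.e. height divided by 32 but width only by 4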
                                                -def _vgg(arch: str, pretrained: bool, **kwargs: Any) -> VGG:
                                                -
                                                -    # Build the model
                                                -    model = VGG(default_cfgs[arch]['num_blocks'], default_cfgs[arch]['planes'],
                                                -                default_cfgs[arch]['rect_pools'], **kwargs)
                                                -    # Load pretrained parameters
                                                -    if pretrained:
                                                -        load_pretrained_params(model, default_cfgs[arch]['url'])
                                                -
                                                -    return model
                                                -
                                                -
                                                -
                                                -[docs] -def vgg16_bn(pretrained: bool = False, **kwargs: Any) -> VGG: - """VGG-16 architecture as described in `"Very Deep Convolutional Networks for Large-Scale Image Recognition" - <https://arxiv.org/pdf/1409.1556.pdf>`_, modified by adding batch normalization. - - Example:: - >>> import tensorflow as tf - >>> from doctr.models import vgg16_bn - >>> model = vgg16_bn(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - - Returns: - VGG feature extractor - """ - - return _vgg('vgg16_bn', pretrained, **kwargs)
                                                - -
                                                -
                                                -
                                                -
                                                - - -
                                                -
\ No newline at end of file
diff --git a/v0.6.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html b/v0.6.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
index 7497de5e61..e181ef6a1f 100644
--- a/v0.6.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
+++ b/v0.6.0/_modules/doctr/models/classification/magc_resnet/tensorflow.html
@@ -235,12 +235,15 @@

                                                Package Reference

                                                Source code for doctr.models.classification.magc_resnet.tensorflow

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021-2024, Mindee.
                                                 
                                                 # This program is licensed under the Apache License 2.0.
                                                 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                @@ -301,7 +304,7 @@ 

                                                Source code for doctr.models.classification.magc_resnet.tensorflow

                                                from typing import Any, Dict, List, Optional, Tuple import tensorflow as tf -from tensorflow.keras import layers +from tensorflow.keras import activations, layers from tensorflow.keras.models import Sequential from doctr.datasets import VOCABS @@ -318,7 +321,7 @@

                                                Source code for doctr.models.classification.magc_resnet.tensorflow

                                                "std": (0.299, 0.296, 0.301), "input_shape": (32, 32, 3), "classes": list(VOCABS["french"]), - "url": None, + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/magc_resnet31-16aa7d71.weights.h5&src=0", }, } @@ -328,6 +331,7 @@

                                                Source code for doctr.models.classification.magc_resnet.tensorflow

                                                <https://arxiv.org/pdf/1910.02562.pdf>`_. Args: + ---- inplanes: input channels headers: number of headers to split channels attn_scale: if True, re-scale attention to counteract the variance distibutions @@ -348,6 +352,7 @@

                                                Source code for doctr.models.classification.magc_resnet.tensorflow

                                                self.headers = headers # h self.inplanes = inplanes # C self.attn_scale = attn_scale + self.ratio = ratio self.planes = int(inplanes * ratio) self.single_header_inplanes = int(inplanes / headers) # C / h @@ -388,7 +393,7 @@

                                                Source code for doctr.models.classification.magc_resnet.tensorflow

                                                if self.attn_scale and self.headers > 1: context_mask = context_mask / math.sqrt(self.single_header_inplanes) # B*h, 1, H*W, 1 - context_mask = tf.keras.activations.softmax(context_mask, axis=2) + context_mask = activations.softmax(context_mask, axis=2) # Compute context # B*h, 1, C/h, 1 @@ -420,7 +425,6 @@

                                                Source code for doctr.models.classification.magc_resnet.tensorflow

                                                origin_stem: bool = True, **kwargs: Any, ) -> ResNet: - kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"])) kwargs["input_shape"] = kwargs.get("input_shape", default_cfgs[arch]["input_shape"]) kwargs["classes"] = kwargs.get("classes", default_cfgs[arch]["classes"]) @@ -445,7 +449,11 @@

                                                Source code for doctr.models.classification.magc_resnet.tensorflow

                                                ) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]["url"]) + # The number of classes is not the same as the number of classes in the pretrained model => + # skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"]) + ) return model @@ -464,12 +472,14 @@
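The skip_mismatch change above is what makes the pretrained checkpoints usable for fine-tuning on a different label set; a hedged usage sketch (the 10-class value is an arbitrary illustration, not something taken from this diff, assuming the magc_resnet31 factory exported by doctr.models as in the docstring examples):

from doctr.models import magc_resnet31

# Backbone weights come from the checkpoint; the final classification layer is
# skipped (left randomly initialised) because 10 differs from the pretrained
# vocabulary size, instead of raising a weight shape mismatch.
model = magc_resnet31(pretrained=True, num_classes=10)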

                                                Source code for doctr.models.classification.magc_resnet.tensorflow

                                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the ResNet architecture Returns: + ------- A feature extractor model """ - return _magc_resnet( "magc_resnet31", pretrained, @@ -515,7 +525,7 @@

                                                Source code for doctr.models.classification.magc_resnet.tensorflow

                                                -
                                                +
                                                diff --git a/v0.6.0/_modules/doctr/models/classification/mobilenet/tensorflow.html b/v0.6.0/_modules/doctr/models/classification/mobilenet/tensorflow.html index fbb797d4fc..c9545166e7 100644 --- a/v0.6.0/_modules/doctr/models/classification/mobilenet/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/classification/mobilenet/tensorflow.html @@ -235,12 +235,15 @@

                                                Package Reference

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021-2024, Mindee.
                                                 
                                                 # This program is licensed under the Apache License 2.0.
                                                 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                @@ -313,7 +316,8 @@ 

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                "mobilenet_v3_small_r", "mobilenet_v3_large", "mobilenet_v3_large_r", - "mobilenet_v3_small_orientation", + "mobilenet_v3_small_crop_orientation", + "mobilenet_v3_small_page_orientation", ] @@ -323,35 +327,42 @@

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                "std": (0.299, 0.296, 0.301), "input_shape": (32, 32, 3), "classes": list(VOCABS["french"]), - "url": "https://doctr-static.mindee.com/models?id=v0.4.1/mobilenet_v3_large-47d25d7e.zip&src=0", + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_large-d857506e.weights.h5&src=0", }, "mobilenet_v3_large_r": { "mean": (0.694, 0.695, 0.693), "std": (0.299, 0.296, 0.301), "input_shape": (32, 32, 3), "classes": list(VOCABS["french"]), - "url": "https://doctr-static.mindee.com/models?id=v0.4.1/mobilenet_v3_large_r-a108e192.zip&src=0", + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_large_r-eef2e3c6.weights.h5&src=0", }, "mobilenet_v3_small": { "mean": (0.694, 0.695, 0.693), "std": (0.299, 0.296, 0.301), "input_shape": (32, 32, 3), "classes": list(VOCABS["french"]), - "url": "https://doctr-static.mindee.com/models?id=v0.4.1/mobilenet_v3_small-8a32c32c.zip&src=0", + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small-3fcebad7.weights.h5&src=0", }, "mobilenet_v3_small_r": { "mean": (0.694, 0.695, 0.693), "std": (0.299, 0.296, 0.301), "input_shape": (32, 32, 3), "classes": list(VOCABS["french"]), - "url": "https://doctr-static.mindee.com/models?id=v0.4.1/mobilenet_v3_small_r-3d61452e.zip&src=0", + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small_r-dd50218d.weights.h5&src=0", }, - "mobilenet_v3_small_orientation": { + "mobilenet_v3_small_crop_orientation": { "mean": (0.694, 0.695, 0.693), "std": (0.299, 0.296, 0.301), "input_shape": (128, 128, 3), - "classes": [0, 90, 180, 270], - "url": "https://doctr-static.mindee.com/models?id=v0.4.1/classif_mobilenet_v3_small-1ea8db03.zip&src=0", + "classes": [0, -90, 180, 90], + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small_crop_orientation-ef019b6b.weights.h5&src=0", + }, + "mobilenet_v3_small_page_orientation": { + "mean": (0.694, 0.695, 0.693), + "std": (0.299, 0.296, 0.301), + "input_shape": (512, 512, 3), + "classes": [0, -90, 180, 90], + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small_page_orientation-0071d55d.weights.h5&src=0", }, } @@ -374,14 +385,12 @@

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                """Squeeze and Excitation.""" def __init__(self, chan: int, squeeze_factor: int = 4) -> None: - super().__init__( - [ - layers.GlobalAveragePooling2D(), - layers.Dense(chan // squeeze_factor, activation="relu"), - layers.Dense(chan, activation="hard_sigmoid"), - layers.Reshape((1, 1, chan)), - ] - ) + super().__init__([ + layers.GlobalAveragePooling2D(), + layers.Dense(chan // squeeze_factor, activation="relu"), + layers.Dense(chan, activation="hard_sigmoid"), + layers.Reshape((1, 1, chan)), + ]) def call(self, inputs: tf.Tensor, **kwargs: Any) -> tf.Tensor: x = super().call(inputs, **kwargs) @@ -418,6 +427,7 @@

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                """InvertedResidual for mobilenet Args: + ---- conf: configuration object for inverted residual """ @@ -471,7 +481,6 @@

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                inputs: tf.Tensor, **kwargs: Any, ) -> tf.Tensor: - out = self.block(inputs, **kwargs) if self.use_res_connect: out = tf.add(out, inputs) @@ -494,7 +503,6 @@

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                cfg: Optional[Dict[str, Any]] = None, input_shape: Optional[Tuple[int, int, int]] = None, ) -> None: - _layers = [ Sequential( conv_sequence( @@ -514,21 +522,18 @@

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                ) if include_top: - _layers.extend( - [ - layers.GlobalAveragePooling2D(), - layers.Dense(head_chans, activation=hard_swish), - layers.Dropout(0.2), - layers.Dense(num_classes), - ] - ) + _layers.extend([ + layers.GlobalAveragePooling2D(), + layers.Dense(head_chans, activation=hard_swish), + layers.Dropout(0.2), + layers.Dense(num_classes), + ]) super().__init__(_layers) self.cfg = cfg def _mobilenet_v3(arch: str, pretrained: bool, rect_strides: bool = False, **kwargs: Any) -> MobileNetV3: - kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"])) kwargs["input_shape"] = kwargs.get("input_shape", default_cfgs[arch]["input_shape"]) kwargs["classes"] = kwargs.get("classes", default_cfgs[arch]["classes"]) @@ -587,7 +592,11 @@

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                ) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]["url"]) + # The number of classes is not the same as the number of classes in the pretrained model => + # skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"]) + ) return model @@ -606,12 +615,14 @@

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the MobileNetV3 architecture Returns: + ------- a keras.Model """ - return _mobilenet_v3("mobilenet_v3_small", pretrained, False, **kwargs)
                                                @@ -630,12 +641,14 @@

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the MobileNetV3 architecture Returns: + ------- a keras.Model """ - return _mobilenet_v3("mobilenet_v3_small_r", pretrained, True, **kwargs)
                                                @@ -654,9 +667,12 @@

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the MobileNetV3 architecture Returns: + ------- a keras.Model """ return _mobilenet_v3("mobilenet_v3_large", pretrained, False, **kwargs)
                                                @@ -677,36 +693,67 @@

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the MobileNetV3 architecture Returns: + ------- a keras.Model """ return _mobilenet_v3("mobilenet_v3_large_r", pretrained, True, **kwargs)
                                                -
                                                -[docs] -def mobilenet_v3_small_orientation(pretrained: bool = False, **kwargs: Any) -> MobileNetV3: +
                                                +[docs] +def mobilenet_v3_small_crop_orientation(pretrained: bool = False, **kwargs: Any) -> MobileNetV3: """MobileNetV3-Small architecture as described in `"Searching for MobileNetV3", <https://arxiv.org/pdf/1905.02244.pdf>`_. >>> import tensorflow as tf - >>> from doctr.models import mobilenet_v3_small_orientation - >>> model = mobilenet_v3_small_orientation(pretrained=False) + >>> from doctr.models import mobilenet_v3_small_crop_orientation + >>> model = mobilenet_v3_small_crop_orientation(pretrained=False) >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32) >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the MobileNetV3 architecture Returns: + ------- a keras.Model """ + return _mobilenet_v3("mobilenet_v3_small_crop_orientation", pretrained, include_top=True, **kwargs)
                                                + + - return _mobilenet_v3("mobilenet_v3_small_orientation", pretrained, include_top=True, **kwargs)
                                                +
                                                +[docs] +def mobilenet_v3_small_page_orientation(pretrained: bool = False, **kwargs: Any) -> MobileNetV3: + """MobileNetV3-Small architecture as described in + `"Searching for MobileNetV3", + <https://arxiv.org/pdf/1905.02244.pdf>`_. + + >>> import tensorflow as tf + >>> from doctr.models import mobilenet_v3_small_page_orientation + >>> model = mobilenet_v3_small_page_orientation(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + ---- + pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the MobileNetV3 architecture + + Returns: + ------- + a keras.Model + """ + return _mobilenet_v3("mobilenet_v3_small_page_orientation", pretrained, include_top=True, **kwargs)
                                                @@ -740,7 +787,7 @@

                                                Source code for doctr.models.classification.mobilenet.tensorflow

                                                -
                                                +
                                                diff --git a/v0.6.0/_modules/doctr/models/classification/resnet/tensorflow.html b/v0.6.0/_modules/doctr/models/classification/resnet/tensorflow.html index 2179fb6c94..620d4f0635 100644 --- a/v0.6.0/_modules/doctr/models/classification/resnet/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/classification/resnet/tensorflow.html @@ -235,12 +235,15 @@

                                                Package Reference

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021-2024, Mindee.
                                                 
                                                 # This program is licensed under the Apache License 2.0.
                                                 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                @@ -316,44 +319,44 @@ 

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                "std": (0.299, 0.296, 0.301), "input_shape": (32, 32, 3), "classes": list(VOCABS["french"]), - "url": "https://doctr-static.mindee.com/models?id=v0.4.1/resnet18-d4634669.zip&src=0", + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet18-f42d3854.weights.h5&src=0", }, "resnet31": { "mean": (0.694, 0.695, 0.693), "std": (0.299, 0.296, 0.301), "input_shape": (32, 32, 3), "classes": list(VOCABS["french"]), - "url": "https://doctr-static.mindee.com/models?id=v0.5.0/resnet31-5a47a60b.zip&src=0", + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet31-ab75f78c.weights.h5&src=0", }, "resnet34": { "mean": (0.694, 0.695, 0.693), "std": (0.299, 0.296, 0.301), "input_shape": (32, 32, 3), "classes": list(VOCABS["french"]), - "url": "https://doctr-static.mindee.com/models?id=v0.5.0/resnet34-5dcc97ca.zip&src=0", + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet34-03967df9.weights.h5&src=0", }, "resnet50": { "mean": (0.694, 0.695, 0.693), "std": (0.299, 0.296, 0.301), "input_shape": (32, 32, 3), "classes": list(VOCABS["french"]), - "url": "https://doctr-static.mindee.com/models?id=v0.5.0/resnet50-e75e4cdf.zip&src=0", + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet50-82358f34.weights.h5&src=0", }, "resnet34_wide": { "mean": (0.694, 0.695, 0.693), "std": (0.299, 0.296, 0.301), "input_shape": (32, 32, 3), "classes": list(VOCABS["french"]), - "url": "https://doctr-static.mindee.com/models?id=v0.5.0/resnet34_wide-c1271816.zip&src=0", + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet34_wide-b18fdf79.weights.h5&src=0", }, } class ResnetBlock(layers.Layer): - """Implements a resnet31 block with shortcut Args: + ---- conv_shortcut: Use of shortcut output_channels: number of channels to use in Conv2D kernel_size: size of square kernels @@ -361,22 +364,19 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                """ def __init__(self, output_channels: int, conv_shortcut: bool, strides: int = 1, **kwargs) -> None: - super().__init__(**kwargs) if conv_shortcut: - self.shortcut = Sequential( - [ - layers.Conv2D( - filters=output_channels, - strides=strides, - padding="same", - kernel_size=1, - use_bias=False, - kernel_initializer="he_normal", - ), - layers.BatchNormalization(), - ] - ) + self.shortcut = Sequential([ + layers.Conv2D( + filters=output_channels, + strides=strides, + padding="same", + kernel_size=1, + use_bias=False, + kernel_initializer="he_normal", + ), + layers.BatchNormalization(), + ]) else: self.shortcut = layers.Lambda(lambda x: x) self.conv_block = Sequential(self.conv_resnetblock(output_channels, 3, strides)) @@ -416,6 +416,7 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                """Implements a ResNet architecture Args: + ---- num_blocks: number of resnet block in each stage output_channels: number of channels in each stage stage_downsample: whether the first residual block of a stage should downsample @@ -444,7 +445,6 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                cfg: Optional[Dict[str, Any]] = None, input_shape: Optional[Tuple[int, int, int]] = None, ) -> None: - inplanes = stem_channels if origin_stem: _layers = [ @@ -471,12 +471,10 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                inplanes = out_chan if include_top: - _layers.extend( - [ - layers.GlobalAveragePooling2D(), - layers.Dense(num_classes), - ] - ) + _layers.extend([ + layers.GlobalAveragePooling2D(), + layers.Dense(num_classes), + ]) super().__init__(_layers) self.cfg = cfg @@ -493,7 +491,6 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                origin_stem: bool = True, **kwargs: Any, ) -> ResNet: - kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"])) kwargs["input_shape"] = kwargs.get("input_shape", default_cfgs[arch]["input_shape"]) kwargs["classes"] = kwargs.get("classes", default_cfgs[arch]["classes"]) @@ -510,7 +507,11 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                ) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]["url"]) + # The number of classes is not the same as the number of classes in the pretrained model => + # skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"]) + ) return model @@ -528,12 +529,14 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the ResNet architecture Returns: + ------- A classification model """ - return _resnet( "resnet18", pretrained, @@ -562,12 +565,14 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the ResNet architecture Returns: + ------- A classification model """ - return _resnet( "resnet31", pretrained, @@ -596,12 +601,14 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the ResNet architecture Returns: + ------- A classification model """ - return _resnet( "resnet34", pretrained, @@ -629,12 +636,14 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the ResNet architecture Returns: + ------- A classification model """ - kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs["resnet50"]["classes"])) kwargs["input_shape"] = kwargs.get("input_shape", default_cfgs["resnet50"]["input_shape"]) kwargs["classes"] = kwargs.get("classes", default_cfgs["resnet50"]["classes"]) @@ -658,7 +667,13 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs["resnet50"]["url"]) + # The number of classes is not the same as the number of classes in the pretrained model => + # skip the mismatching layers for fine tuning + load_pretrained_params( + model, + default_cfgs["resnet50"]["url"], + skip_mismatch=kwargs["num_classes"] != len(default_cfgs["resnet50"]["classes"]), + ) return model
                                                @@ -675,12 +690,14 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                >>> out = model(input_tensor) Args: + ---- pretrained: boolean, True if model is pretrained + **kwargs: keyword arguments of the ResNet architecture Returns: + ------- A classification model """ - return _resnet( "resnet34_wide", pretrained, @@ -725,7 +742,7 @@

                                                Source code for doctr.models.classification.resnet.tensorflow

                                                -
                                                +
                                                diff --git a/v0.6.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.6.0/_modules/doctr/models/classification/textnet/tensorflow.html index 8f38b3470e..407e480818 100644 --- a/v0.6.0/_modules/doctr/models/classification/textnet/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/classification/textnet/tensorflow.html @@ -302,7 +302,7 @@

                                                Source code for doctr.models.classification.textnet.tensorflow

                                                from copy import deepcopy from typing import Any, Dict, List, Optional, Tuple -from keras import Sequential, layers +from tensorflow.keras import Sequential, layers from doctr.datasets import VOCABS diff --git a/v0.6.0/_modules/doctr/models/classification/vgg/tensorflow.html b/v0.6.0/_modules/doctr/models/classification/vgg/tensorflow.html index e966cb3913..66ee6dcdd8 100644 --- a/v0.6.0/_modules/doctr/models/classification/vgg/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/classification/vgg/tensorflow.html @@ -235,12 +235,15 @@

                                                Package Reference

                                                Source code for doctr.models.classification.vgg.tensorflow

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021-2024, Mindee.
                                                 
                                                 # This program is licensed under the Apache License 2.0.
                                                 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                @@ -314,7 +317,7 @@ 

                                                Source code for doctr.models.classification.vgg.tensorflow

                                                "std": (1.0, 1.0, 1.0), "input_shape": (32, 32, 3), "classes": list(VOCABS["french"]), - "url": "https://doctr-static.mindee.com/models?id=v0.4.1/vgg16_bn_r-c5836cea.zip&src=0", + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vgg16_bn_r-b4d69212.weights.h5&src=0", }, } @@ -324,6 +327,7 @@

                                                Source code for doctr.models.classification.vgg.tensorflow

                                                <https://arxiv.org/pdf/1409.1556.pdf>`_. Args: + ---- num_blocks: number of convolutional block in each stage planes: number of output channels in each stage rect_pools: whether pooling square kernels should be replace with rectangular ones @@ -342,7 +346,6 @@

                                                Source code for doctr.models.classification.vgg.tensorflow

                                                input_shape: Optional[Tuple[int, int, int]] = None, cfg: Optional[Dict[str, Any]] = None, ) -> None: - _layers = [] # Specify input_shape only for the first layer kwargs = {"input_shape": input_shape} @@ -361,7 +364,6 @@

                                                Source code for doctr.models.classification.vgg.tensorflow

                                                def _vgg( arch: str, pretrained: bool, num_blocks: List[int], planes: List[int], rect_pools: List[bool], **kwargs: Any ) -> VGG: - kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"])) kwargs["input_shape"] = kwargs.get("input_shape", default_cfgs[arch]["input_shape"]) kwargs["classes"] = kwargs.get("classes", default_cfgs[arch]["classes"]) @@ -376,7 +378,11 @@

                                                Source code for doctr.models.classification.vgg.tensorflow

                                                model = VGG(num_blocks, planes, rect_pools, cfg=_cfg, **kwargs) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]["url"]) + # The number of classes is not the same as the number of classes in the pretrained model => + # skip the mismatching layers for fine tuning + load_pretrained_params( + model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"]) + ) return model @@ -395,12 +401,14 @@

                                                Source code for doctr.models.classification.vgg.tensorflow

                                                >>> out = model(input_tensor) Args: + ---- pretrained (bool): If True, returns a model pre-trained on ImageNet + **kwargs: keyword arguments of the VGG architecture Returns: + ------- VGG feature extractor """ - return _vgg( "vgg16_bn_r", pretrained, [2, 2, 3, 3, 3], [64, 128, 256, 512, 512], [False, False, True, True, True], **kwargs )
                                                @@ -437,7 +445,7 @@

                                                Source code for doctr.models.classification.vgg.tensorflow

                                                +
                                                diff --git a/v0.6.0/_modules/doctr/models/classification/vit/tensorflow.html b/v0.6.0/_modules/doctr/models/classification/vit/tensorflow.html index 2a9aae9d66..7059d1f1d8 100644 --- a/v0.6.0/_modules/doctr/models/classification/vit/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/classification/vit/tensorflow.html @@ -235,12 +235,15 @@

                                                Package Reference

                                                Source code for doctr.models.classification.vit.tensorflow

                                                -# Copyright (C) 2022, Mindee.
                                                +# Copyright (C) 2021-2024, Mindee.
                                                 
                                                 # This program is licensed under the Apache License 2.0.
                                                 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                @@ -300,7 +303,6 @@ 


 import tensorflow as tf
 from tensorflow.keras import Sequential, layers
-from tensorflow_addons.layers import GELU

 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import EncoderBlock
@@ -318,14 +320,14 @@


                                                "std": (0.299, 0.296, 0.301), "input_shape": (3, 32, 32), "classes": list(VOCABS["french"]), - "url": "https://doctr-static.mindee.com/models?id=v0.5.1/vit_s-7a23bea4.zip&src=0", + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vit_s-69bc459e.weights.h5&src=0", }, "vit_b": { "mean": (0.694, 0.695, 0.693), "std": (0.299, 0.296, 0.301), "input_shape": (32, 32, 3), "classes": list(VOCABS["french"]), - "url": "https://doctr-static.mindee.com/models?id=v0.5.1/vit_b-983c86b5.zip&src=0", + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vit_b-c64705bd.weights.h5&src=0", }, } @@ -334,6 +336,7 @@


                                                """Classifier head for Vision Transformer Args: + ---- num_classes: number of output classes """ @@ -353,10 +356,12 @@


     <https://arxiv.org/pdf/2010.11929.pdf>`_.

     Args:
+    ----
         d_model: dimension of the transformer layers
         num_layers: number of transformer layers
         num_heads: number of attention heads
         ffd_ratio: multiplier for the hidden dimension of the feedforward layer
+        patch_size: size of the patches
         input_shape: size of the input image
         dropout: dropout rate
         num_classes: number of output classes
@@ -369,16 +374,23 @@


         num_layers: int,
         num_heads: int,
         ffd_ratio: int,
+        patch_size: Tuple[int, int] = (4, 4),
         input_shape: Tuple[int, int, int] = (32, 32, 3),
         dropout: float = 0.0,
         num_classes: int = 1000,
         include_top: bool = True,
         cfg: Optional[Dict[str, Any]] = None,
     ) -> None:
-
         _layers = [
-            PatchEmbedding(input_shape, d_model),
-            EncoderBlock(num_layers, num_heads, d_model, d_model * ffd_ratio, dropout, activation_fct=GELU()),
+            PatchEmbedding(input_shape, d_model, patch_size),
+            EncoderBlock(
+                num_layers,
+                num_heads,
+                d_model,
+                d_model * ffd_ratio,
+                dropout,
+                activation_fct=layers.Activation("gelu"),
+            ),
         ]
         if include_top:
             _layers.append(ClassifierHead(num_classes))
@@ -392,7 +404,6 @@


     pretrained: bool,
     **kwargs: Any,
 ) -> VisionTransformer:
-
     kwargs["num_classes"] = kwargs.get("num_classes", len(default_cfgs[arch]["classes"]))
     kwargs["input_shape"] = kwargs.get("input_shape", default_cfgs[arch]["input_shape"])
     kwargs["classes"] = kwargs.get("classes", default_cfgs[arch]["classes"])
@@ -407,7 +418,11 @@


     model = VisionTransformer(cfg=_cfg, **kwargs)
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]["url"])
+        # The number of classes is not the same as the number of classes in the pretrained model =>
+        # skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"])
+        )

     return model
@@ -428,12 +443,14 @@


     >>> out = model(input_tensor)

     Args:
+    ----
         pretrained: boolean, True if model is pretrained
+        **kwargs: keyword arguments of the VisionTransformer architecture

     Returns:
+    -------
         A feature extractor model
     """
-
     return _vit(
         "vit_s",
         pretrained,
@@ -460,12 +477,14 @@


     >>> out = model(input_tensor)

     Args:
+    ----
         pretrained: boolean, True if model is pretrained
+        **kwargs: keyword arguments of the VisionTransformer architecture

     Returns:
+    -------
         A feature extractor model
     """
-
     return _vit(
         "vit_b",
         pretrained,
@@ -508,7 +527,7 @@


                                                +
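Besides the new patch_size argument and refreshed checkpoints, the ViT hunks drop the tensorflow_addons GELU layer in favor of the activation built into Keras. A small standalone check, not taken from the diff, showing that layers.Activation("gelu") should agree with tf.nn.gelu up to numerical tolerance:

    import tensorflow as tf
    from tensorflow.keras import layers

    x = tf.random.normal((2, 16))
    # Keras' "gelu" activation computes the same (non-approximate) GELU as tf.nn.gelu.
    delta = tf.reduce_max(tf.abs(layers.Activation("gelu")(x) - tf.nn.gelu(x)))
    print(float(delta))  # expected to be ~0.0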
                                                diff --git a/v0.6.0/_modules/doctr/models/classification/zoo.html b/v0.6.0/_modules/doctr/models/classification/zoo.html index 48f112a19d..9ecb9674f6 100644 --- a/v0.6.0/_modules/doctr/models/classification/zoo.html +++ b/v0.6.0/_modules/doctr/models/classification/zoo.html @@ -235,12 +235,15 @@


                                                Source code for doctr.models.classification.zoo

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021-2024, Mindee.
                                                 
                                                 # This program is licensed under the Apache License 2.0.
                                                 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                @@ -301,9 +304,9 @@ 


 from .. import classification
 from ..preprocessor import PreProcessor
-from .predictor import CropOrientationPredictor
+from .predictor import OrientationPredictor

-__all__ = ["crop_orientation_predictor"]
+__all__ = ["crop_orientation_predictor", "page_orientation_predictor"]

 ARCHS: List[str] = [
     "magc_resnet31",
@@ -316,25 +319,39 @@


                                                < "resnet34", "resnet50", "resnet34_wide", + "textnet_tiny", + "textnet_small", + "textnet_base", "vgg16_bn_r", "vit_s", "vit_b", ] -ORIENTATION_ARCHS: List[str] = ["mobilenet_v3_small_orientation"] +ORIENTATION_ARCHS: List[str] = ["mobilenet_v3_small_crop_orientation", "mobilenet_v3_small_page_orientation"] -def _crop_orientation_predictor(arch: str, pretrained: bool, **kwargs: Any) -> CropOrientationPredictor: +def _orientation_predictor( + arch: Any, pretrained: bool, model_type: str, disabled: bool = False, **kwargs: Any +) -> OrientationPredictor: + if disabled: + # Case where the orientation predictor is disabled + return OrientationPredictor(None, None) - if arch not in ORIENTATION_ARCHS: - raise ValueError(f"unknown architecture '{arch}'") + if isinstance(arch, str): + if arch not in ORIENTATION_ARCHS: + raise ValueError(f"unknown architecture '{arch}'") + + # Load directly classifier from backbone + _model = classification.__dict__[arch](pretrained=pretrained) + else: + if not isinstance(arch, classification.MobileNetV3): + raise ValueError(f"unknown architecture: {type(arch)}") + _model = arch - # Load directly classifier from backbone - _model = classification.__dict__[arch](pretrained=pretrained) kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"]) kwargs["std"] = kwargs.get("std", _model.cfg["std"]) - kwargs["batch_size"] = kwargs.get("batch_size", 64) + kwargs["batch_size"] = kwargs.get("batch_size", 128 if model_type == "crop" else 4) input_shape = _model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:] - predictor = CropOrientationPredictor( + predictor = OrientationPredictor( PreProcessor(input_shape, preserve_aspect_ratio=True, symmetric_pad=True, **kwargs), _model ) return predictor @@ -343,25 +360,54 @@


                                                [docs] def crop_orientation_predictor( - arch: str = "mobilenet_v3_small_orientation", pretrained: bool = False, **kwargs: Any -) -> CropOrientationPredictor: - """Orientation classification architecture. + arch: Any = "mobilenet_v3_small_crop_orientation", pretrained: bool = False, **kwargs: Any +) -> OrientationPredictor: + """Crop orientation classification architecture. >>> import numpy as np >>> from doctr.models import crop_orientation_predictor - >>> model = crop_orientation_predictor(arch='classif_mobilenet_v3_small', pretrained=True) - >>> input_crop = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> model = crop_orientation_predictor(arch='mobilenet_v3_small_crop_orientation', pretrained=True) + >>> input_crop = (255 * np.random.rand(256, 256, 3)).astype(np.uint8) >>> out = model([input_crop]) Args: - arch: name of the architecture to use (e.g. 'mobilenet_v3_small') + ---- + arch: name of the architecture to use (e.g. 'mobilenet_v3_small_crop_orientation') pretrained: If True, returns a model pre-trained on our recognition crops dataset + **kwargs: keyword arguments to be passed to the OrientationPredictor Returns: - CropOrientationPredictor + ------- + OrientationPredictor """ + return _orientation_predictor(arch, pretrained, model_type="crop", **kwargs)
                                                + + + +
                                                +[docs] +def page_orientation_predictor( + arch: Any = "mobilenet_v3_small_page_orientation", pretrained: bool = False, **kwargs: Any +) -> OrientationPredictor: + """Page orientation classification architecture. + + >>> import numpy as np + >>> from doctr.models import page_orientation_predictor + >>> model = page_orientation_predictor(arch='mobilenet_v3_small_page_orientation', pretrained=True) + >>> input_page = (255 * np.random.rand(512, 512, 3)).astype(np.uint8) + >>> out = model([input_page]) - return _crop_orientation_predictor(arch, pretrained, **kwargs)
                                                + Args: + ---- + arch: name of the architecture to use (e.g. 'mobilenet_v3_small_page_orientation') + pretrained: If True, returns a model pre-trained on our recognition crops dataset + **kwargs: keyword arguments to be passed to the OrientationPredictor + + Returns: + ------- + OrientationPredictor + """ + return _orientation_predictor(arch, pretrained, model_type="page", **kwargs)
                                                @@ -395,7 +441,7 @@


                                                -
                                                +
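The classification zoo hunks rename CropOrientationPredictor to OrientationPredictor, add page_orientation_predictor, and let arch be an already instantiated MobileNetV3 classifier rather than an architecture name. A hedged sketch of that last path, reusing the mobilenet_v3_small_crop_orientation backbone listed in ORIENTATION_ARCHS; the dummy crop and the batch size are illustrative:

    import numpy as np

    from doctr.models import crop_orientation_predictor
    from doctr.models.classification import mobilenet_v3_small_crop_orientation

    # Build (or fine-tune) the classifier yourself, then hand the instance to the factory;
    # per the hunk above it is forwarded as-is after an isinstance(MobileNetV3) check.
    backbone = mobilenet_v3_small_crop_orientation(pretrained=True)
    predictor = crop_orientation_predictor(arch=backbone, batch_size=32)

    crops = [(255 * np.random.rand(256, 256, 3)).astype(np.uint8)]
    out = predictor(crops)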
                                                diff --git a/v0.6.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html b/v0.6.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html index bf685ae9fb..dc65e2ed03 100644 --- a/v0.6.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/detection/differentiable_binarization/tensorflow.html @@ -226,35 +226,20 @@

                                                Source code for doctr.models.detection.differentiable_binarization.tensorflow

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021, Mindee.
                                                 
                                                -# This program is licensed under the Apache License 2.0.
                                                -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                +# This program is licensed under the Apache License version 2.
                                                +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                 
                                                 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
                                                 
                                                 from copy import deepcopy
                                                -from typing import Any, Dict, List, Optional, Tuple
                                                -
                                                -import numpy as np
                                                 import tensorflow as tf
                                                 from tensorflow import keras
                                                 from tensorflow.keras import layers
                                                -from tensorflow.keras.applications import ResNet50
                                                +from typing import List, Tuple, Optional, Any, Dict
                                                 
                                                -from doctr.models.utils import IntermediateLayerGetter, conv_sequence, load_pretrained_params
                                                 from doctr.utils.repr import NestedObject
                                                -
                                                -from ...classification import mobilenet_v3_large
                                                +from doctr.models.utils import IntermediateLayerGetter, load_pretrained_params, conv_sequence
                                                 from .base import DBPostProcessor, _DBNet
                                                 
                                                -__all__ = ["DBNet", "db_resnet50", "db_mobilenet_v3_large"]
                                                +__all__ = ['DBNet', 'db_resnet50']
                                                 
                                                 
                                                 default_cfgs: Dict[str, Dict[str, Any]] = {
                                                -    "db_resnet50": {
                                                -        "mean": (0.798, 0.785, 0.772),
                                                -        "std": (0.264, 0.2749, 0.287),
                                                -        "input_shape": (1024, 1024, 3),
                                                -        "url": "https://doctr-static.mindee.com/models?id=v0.2.0/db_resnet50-adcafc63.zip&src=0",
                                                -    },
                                                -    "db_mobilenet_v3_large": {
                                                -        "mean": (0.798, 0.785, 0.772),
                                                -        "std": (0.264, 0.2749, 0.287),
                                                -        "input_shape": (1024, 1024, 3),
                                                -        "url": "https://doctr-static.mindee.com/models?id=v0.3.1/db_mobilenet_v3_large-8c16d5bf.zip&src=0",
                                                +    'db_resnet50': {
                                                +        'mean': (0.798, 0.785, 0.772),
                                                +        'std': (0.264, 0.2749, 0.287),
                                                +        'backbone': 'ResNet50',
                                                +        'fpn_layers': ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"],
                                                +        'fpn_channels': 128,
                                                +        'input_shape': (1024, 1024, 3),
                                                +        'rotated_bbox': False,
                                                +        'url': 'https://github.com/mindee/doctr/releases/download/v0.2.0/db_resnet50-adcafc63.zip',
                                                     },
                                                 }
                                                 
                                                @@ -345,9 +323,9 @@ 

                                                Source code for doctr.models.detection.differentiable_binarization.tensorflo ) -> None: super().__init__() self.channels = channels - self.upsample = layers.UpSampling2D(size=(2, 2), interpolation="nearest") - self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer="he_normal") for _ in range(4)] - self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2**idx) for idx in range(4)] + self.upsample = layers.UpSampling2D(size=(2, 2), interpolation='nearest') + self.inner_blocks = [layers.Conv2D(channels, 1, strides=1, kernel_initializer='he_normal') for _ in range(4)] + self.layer_blocks = [self.build_upsampling(channels, dilation_factor=2 ** idx) for idx in range(4)] @staticmethod def build_upsampling( @@ -365,10 +343,10 @@

                                                Source code for doctr.models.detection.differentiable_binarization.tensorflo """ - _layers = conv_sequence(channels, "relu", True, kernel_size=3) + _layers = conv_sequence(channels, 'relu', True, kernel_size=3) if dilation_factor > 1: - _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation="nearest")) + _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation='nearest')) module = keras.Sequential(_layers) @@ -401,21 +379,15 @@

                                                Source code for doctr.models.detection.differentiable_binarization.tensorflo Args: feature extractor: the backbone serving as feature extractor fpn_channels: number of channels each extracted feature maps is mapped to - num_classes: number of output channels in the segmentation map - assume_straight_pages: if True, fit straight bounding boxes only - exportable: onnx exportable returns only logits - cfg: the configuration dict of the model """ - _children_names: List[str] = ["feat_extractor", "fpn", "probability_head", "threshold_head", "postprocessor"] + _children_names: List[str] = ['feat_extractor', 'fpn', 'probability_head', 'threshold_head', 'postprocessor'] def __init__( self, feature_extractor: IntermediateLayerGetter, - fpn_channels: int = 128, # to be set to 256 to represent the author's initial idea - num_classes: int = 1, - assume_straight_pages: bool = True, - exportable: bool = False, + fpn_channels: int = 128, + rotated_bbox: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: @@ -423,8 +395,7 @@

                                                Source code for doctr.models.detection.differentiable_binarization.tensorflo self.cfg = cfg self.feat_extractor = feature_extractor - self.exportable = exportable - self.assume_straight_pages = assume_straight_pages + self.rotated_bbox = rotated_bbox self.fpn = FeaturePyramidNetwork(channels=fpn_channels) # Initialize kernels @@ -433,26 +404,31 @@

                                                Source code for doctr.models.detection.differentiable_binarization.tensorflo self.probability_head = keras.Sequential( [ - *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), layers.BatchNormalization(), - layers.Activation("relu"), - layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + layers.Activation('relu'), + layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), ] ) self.threshold_head = keras.Sequential( [ - *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]), - layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"), + *conv_sequence(64, 'relu', True, kernel_size=3, input_shape=output_shape[1:]), + layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer='he_normal'), layers.BatchNormalization(), - layers.Activation("relu"), - layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"), + layers.Activation('relu'), + layers.Conv2DTranspose(1, 2, strides=2, kernel_initializer='he_normal'), ] ) - self.postprocessor = DBPostProcessor(assume_straight_pages=assume_straight_pages) + self.postprocessor = DBPostProcessor(rotated_bbox=rotated_bbox) - def compute_loss(self, out_map: tf.Tensor, thresh_map: tf.Tensor, target: List[np.ndarray]) -> tf.Tensor: + def compute_loss( + self, + out_map: tf.Tensor, + thresh_map: tf.Tensor, + target: List[Dict[str, Any]] + ) -> tf.Tensor: """Compute a batch of gts, masks, thresh_gts, thresh_masks from a list of boxes and a list of masks for each image. From there it computes the loss with the model output @@ -468,48 +444,48 @@

                                                Source code for doctr.models.detection.differentiable_binarization.tensorflo prob_map = tf.math.sigmoid(tf.squeeze(out_map, axis=[-1])) thresh_map = tf.math.sigmoid(tf.squeeze(thresh_map, axis=[-1])) - seg_target, seg_mask, thresh_target, thresh_mask = self.build_target(target, out_map.shape[:3]) - seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) + seg_target, seg_mask, thresh_target, thresh_mask = self.compute_target(target, out_map.shape[:3]) + seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - thresh_target = tf.convert_to_tensor(thresh_target, dtype=out_map.dtype) + thresh_target = tf.convert_to_tensor(thresh_target, dtype=tf.float32) thresh_mask = tf.convert_to_tensor(thresh_mask, dtype=tf.bool) # Compute balanced BCE loss for proba_map - bce_scale = 5.0 + bce_scale = 5. bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map, from_logits=True)[seg_mask] neg_target = 1 - seg_target[seg_mask] positive_count = tf.math.reduce_sum(seg_target[seg_mask]) - negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3.0 * positive_count]) + negative_count = tf.math.reduce_min([tf.math.reduce_sum(neg_target), 3. * positive_count]) negative_loss = bce_loss * neg_target negative_loss, _ = tf.nn.top_k(negative_loss, tf.cast(negative_count, tf.int32)) sum_losses = tf.math.reduce_sum(bce_loss * seg_target[seg_mask]) + tf.math.reduce_sum(negative_loss) balanced_bce_loss = sum_losses / (positive_count + negative_count + 1e-6) # Compute dice loss for approxbin_map - bin_map = 1 / (1 + tf.exp(-50.0 * (prob_map[seg_mask] - thresh_map[seg_mask]))) + bin_map = 1 / (1 + tf.exp(-50. * (prob_map[seg_mask] - thresh_map[seg_mask]))) bce_min = tf.math.reduce_min(bce_loss) - weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1.0 + weights = (bce_loss - bce_min) / (tf.math.reduce_max(bce_loss) - bce_min) + 1. inter = tf.math.reduce_sum(bin_map * seg_target[seg_mask] * weights) union = tf.math.reduce_sum(bin_map) + tf.math.reduce_sum(seg_target[seg_mask]) + 1e-8 dice_loss = 1 - 2.0 * inter / union # Compute l1 loss for thresh_map - l1_scale = 10.0 + l1_scale = 10. if tf.reduce_any(thresh_mask): l1_loss = tf.math.reduce_mean(tf.math.abs(thresh_map[thresh_mask] - thresh_target[thresh_mask])) else: - l1_loss = tf.constant(0.0) + l1_loss = tf.constant(0.) return l1_scale * l1_loss + bce_scale * balanced_bce_loss + dice_loss def call( self, x: tf.Tensor, - target: Optional[List[np.ndarray]] = None, + target: Optional[List[Dict[str, Any]]] = None, return_model_output: bool = False, - return_preds: bool = False, + return_boxes: bool = False, **kwargs: Any, ) -> Dict[str, Any]: @@ -518,139 +494,69 @@

                                                Source code for doctr.models.detection.differentiable_binarization.tensorflo logits = self.probability_head(feat_concat, **kwargs) out: Dict[str, tf.Tensor] = {} - if self.exportable: - out["logits"] = logits - return out - - if return_model_output or target is None or return_preds: + if return_model_output or target is None or return_boxes: prob_map = tf.math.sigmoid(logits) if return_model_output: out["out_map"] = prob_map - if target is None or return_preds: - # Post-process boxes (keep only text predictions) - out["preds"] = [preds[0] for preds in self.postprocessor(prob_map.numpy())] + if target is None or return_boxes: + # Post-process boxes + out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) if target is not None: thresh_map = self.threshold_head(feat_concat, **kwargs) loss = self.compute_loss(logits, thresh_map, target) - out["loss"] = loss + out['loss'] = loss return out -def _db_resnet( - arch: str, - pretrained: bool, - backbone_fn, - fpn_layers: List[str], - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any, -) -> DBNet: - - pretrained_backbone = pretrained_backbone and not pretrained +def _db_resnet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> DBNet: # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg["input_shape"] = input_shape or _cfg["input_shape"] + _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['fpn_channels'] = kwargs.get('fpn_channels', _cfg['fpn_channels']) + _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) # Feature extractor - feat_extractor = IntermediateLayerGetter( - backbone_fn( - weights="imagenet" if pretrained_backbone else None, - include_top=False, - pooling=None, - input_shape=_cfg["input_shape"], - ), - fpn_layers, + resnet = tf.keras.applications.__dict__[_cfg['backbone']]( + include_top=False, + weights=None, + input_shape=_cfg['input_shape'], + pooling=None, ) - # Build the model - model = DBNet(feat_extractor, cfg=_cfg, **kwargs) - # Load pretrained parameters - if pretrained: - load_pretrained_params(model, _cfg["url"]) - - return model - - -def _db_mobilenet( - arch: str, - pretrained: bool, - backbone_fn, - fpn_layers: List[str], - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any, -) -> DBNet: - - pretrained_backbone = pretrained_backbone and not pretrained - - # Patch the config - _cfg = deepcopy(default_cfgs[arch]) - _cfg["input_shape"] = input_shape or _cfg["input_shape"] - - # Feature extractor feat_extractor = IntermediateLayerGetter( - backbone_fn( - input_shape=_cfg["input_shape"], - include_top=False, - pretrained=pretrained_backbone, - ), - fpn_layers, + resnet, + _cfg['fpn_layers'], ) + kwargs['fpn_channels'] = _cfg['fpn_channels'] + kwargs['rotated_bbox'] = _cfg['rotated_bbox'] + # Build the model model = DBNet(feat_extractor, cfg=_cfg, **kwargs) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg["url"]) + load_pretrained_params(model, _cfg['url']) return model
                                                -[docs] +[docs] def db_resnet50(pretrained: bool = False, **kwargs: Any) -> DBNet: """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" <https://arxiv.org/pdf/1911.08947.pdf>`_, using a ResNet-50 backbone. - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _db_resnet( - "db_resnet50", - pretrained, - ResNet50, - ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], - **kwargs, - )
                                                - - - -
                                                -[docs] -def db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> DBNet: - """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization" - <https://arxiv.org/pdf/1911.08947.pdf>`_, using a mobilenet v3 large backbone. - - >>> import tensorflow as tf - >>> from doctr.models import db_mobilenet_v3_large - >>> model = db_mobilenet_v3_large(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + Example:: + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: pretrained (bool): If True, returns a model pre-trained on our text detection dataset @@ -659,13 +565,7 @@

                                                Source code for doctr.models.detection.differentiable_binarization.tensorflo text detection architecture """ - return _db_mobilenet( - "db_mobilenet_v3_large", - pretrained, - mobilenet_v3_large, - ["inverted_2", "inverted_5", "inverted_11", "final_block"], - **kwargs, - )

                                                + return _db_resnet('db_resnet50', pretrained, **kwargs)

                                                @@ -699,7 +599,7 @@


                                                -
                                                +
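For context on the DBNet loss shown in the hunks above: compute_loss sums a balanced BCE on the probability map (scale 5.0), a dice loss on the approximate binary map, and an L1 loss on the threshold map (scale 10.0). The snippet below is a standalone sketch of the differentiable binarization step those terms rely on, B = 1 / (1 + exp(-k * (P - T))) with k = 50; the toy tensors are made up:

    import tensorflow as tf

    k = 50.0
    prob_map = tf.constant([[0.2, 0.6], [0.9, 0.4]])    # P: probability map (toy values)
    thresh_map = tf.constant([[0.5, 0.5], [0.5, 0.5]])  # T: threshold map (toy values)
    # Steep sigmoid: pixels below the threshold collapse towards 0, pixels above towards 1,
    # while the map stays differentiable so it can feed the dice term of the loss.
    approx_bin_map = 1.0 / (1.0 + tf.exp(-k * (prob_map - thresh_map)))
    print(approx_bin_map.numpy().round(3))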
                                                diff --git a/v0.6.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.6.0/_modules/doctr/models/detection/fast/tensorflow.html index dc7d8f50f2..af51a9abeb 100644 --- a/v0.6.0/_modules/doctr/models/detection/fast/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/detection/fast/tensorflow.html @@ -305,7 +305,7 @@

                                                Source code for doctr.models.detection.fast.tensorflow

                                                import numpy as np import tensorflow as tf -from keras import Model, Sequential, layers +from tensorflow.keras import Model, Sequential, layers from doctr.file_utils import CLASS_NAME from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params diff --git a/v0.6.0/_modules/doctr/models/detection/linknet/tensorflow.html b/v0.6.0/_modules/doctr/models/detection/linknet/tensorflow.html index 1aa7020064..9f836ce462 100644 --- a/v0.6.0/_modules/doctr/models/detection/linknet/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/detection/linknet/tensorflow.html @@ -226,35 +226,20 @@

                                                Source code for doctr.models.detection.linknet.tensorflow

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021, Mindee.
                                                 
                                                -# This program is licensed under the Apache License 2.0.
                                                -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                +# This program is licensed under the Apache License version 2.
                                                +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                 
                                                 # Credits: post-processing adapted from https://github.com/xuannianz/DifferentiableBinarization
                                                 
                                                 from copy import deepcopy
                                                -from typing import Any, Dict, List, Optional, Tuple
                                                -
                                                -import numpy as np
                                                 import tensorflow as tf
                                                 from tensorflow import keras
                                                -from tensorflow.keras import Model, Sequential, layers
                                                +from tensorflow.keras import layers, Sequential
                                                +from typing import Dict, Any, Tuple, Optional, List
                                                 
                                                -from doctr.models.classification import resnet18, resnet34, resnet50
                                                -from doctr.models.utils import IntermediateLayerGetter, conv_sequence, load_pretrained_params
                                                 from doctr.utils.repr import NestedObject
                                                -
                                                +from doctr.models.backbones import ResnetStage
                                                +from doctr.models.utils import conv_sequence, load_pretrained_params
                                                 from .base import LinkNetPostProcessor, _LinkNet
                                                 
                                                -__all__ = ["LinkNet", "linknet_resnet18", "linknet_resnet34", "linknet_resnet50", "linknet_resnet18_rotation"]
                                                +__all__ = ['LinkNet', 'linknet16']
                                                 
                                                 
                                                 default_cfgs: Dict[str, Dict[str, Any]] = {
                                                -    "linknet_resnet18": {
                                                -        "mean": (0.798, 0.785, 0.772),
                                                -        "std": (0.264, 0.2749, 0.287),
                                                -        "input_shape": (1024, 1024, 3),
                                                -        "url": None,
                                                -    },
                                                -    "linknet_resnet18_rotation": {
                                                -        "mean": (0.798, 0.785, 0.772),
                                                -        "std": (0.264, 0.2749, 0.287),
                                                -        "input_shape": (1024, 1024, 3),
                                                -        "url": "https://doctr-static.mindee.com/models?id=v0.5.0/linknet_resnet18-a48e6ed3.zip&src=0",
                                                -    },
                                                -    "linknet_resnet34": {
                                                -        "mean": (0.798, 0.785, 0.772),
                                                -        "std": (0.264, 0.2749, 0.287),
                                                -        "input_shape": (1024, 1024, 3),
                                                -        "url": None,
                                                -    },
                                                -    "linknet_resnet50": {
                                                -        "mean": (0.798, 0.785, 0.772),
                                                -        "std": (0.264, 0.2749, 0.287),
                                                -        "input_shape": (1024, 1024, 3),
                                                -        "url": None,
                                                +    'linknet16': {
                                                +        'mean': (0.798, 0.785, 0.772),
                                                +        'std': (0.264, 0.2749, 0.287),
                                                +        'num_classes': 1,
                                                +        'input_shape': (1024, 1024, 3),
                                                +        'rotated_bbox': False,
                                                +        'url': None,
                                                     },
                                                 }
                                                 
                                                 
                                                -def decoder_block(in_chan: int, out_chan: int, stride: int, **kwargs: Any) -> Sequential:
                                                +def decoder_block(in_chan: int, out_chan: int) -> Sequential:
                                                     """Creates a LinkNet decoder block"""
                                                 
                                                -    return Sequential(
                                                -        [
                                                -            *conv_sequence(in_chan // 4, "relu", True, kernel_size=1, **kwargs),
                                                -            layers.Conv2DTranspose(
                                                -                filters=in_chan // 4,
                                                -                kernel_size=3,
                                                -                strides=stride,
                                                -                padding="same",
                                                -                use_bias=False,
                                                -                kernel_initializer="he_normal",
                                                -            ),
                                                -            layers.BatchNormalization(),
                                                -            layers.Activation("relu"),
                                                -            *conv_sequence(out_chan, "relu", True, kernel_size=1),
                                                -        ]
                                                -    )
                                                +    return Sequential([
                                                +        *conv_sequence(in_chan // 4, 'relu', True, kernel_size=1),
                                                +        layers.Conv2DTranspose(
                                                +            filters=in_chan // 4,
                                                +            kernel_size=3,
                                                +            strides=2,
                                                +            padding="same",
                                                +            use_bias=False,
                                                +            kernel_initializer='he_normal'
                                                +        ),
                                                +        layers.BatchNormalization(),
                                                +        layers.Activation('relu'),
                                                +        *conv_sequence(out_chan, 'relu', True, kernel_size=1),
                                                +    ])
                                                 
                                                 
                                                -class LinkNetFPN(Model, NestedObject):
                                                -    """LinkNet Decoder module"""
                                                +class LinkNetFPN(layers.Layer, NestedObject):
                                                +    """LinkNet Encoder-Decoder module"""
                                                 
                                                     def __init__(
                                                         self,
                                                -        out_chans: int,
                                                -        in_shapes: List[Tuple[int, ...]],
                                                     ) -> None:
                                                 
                                                         super().__init__()
                                                -        self.out_chans = out_chans
                                                -        strides = [2] * (len(in_shapes) - 1) + [1]
                                                -        i_chans = [s[-1] for s in in_shapes[::-1]]
                                                -        o_chans = i_chans[1:] + [out_chans]
                                                -        self.decoders = [
                                                -            decoder_block(in_chan, out_chan, s, input_shape=in_shape)
                                                -            for in_chan, out_chan, s, in_shape in zip(i_chans, o_chans, strides, in_shapes[::-1])
                                                -        ]
                                                -
                                                -    def call(self, x: List[tf.Tensor]) -> tf.Tensor:
                                                -        out = 0
                                                -        for decoder, fmap in zip(self.decoders, x[::-1]):
                                                -            out = decoder(out + fmap)
                                                -        return out
                                                +        self.encoder_1 = ResnetStage(num_blocks=2, output_channels=64, downsample=True)
                                                +        self.encoder_2 = ResnetStage(num_blocks=2, output_channels=128, downsample=True)
                                                +        self.encoder_3 = ResnetStage(num_blocks=2, output_channels=256, downsample=True)
                                                +        self.encoder_4 = ResnetStage(num_blocks=2, output_channels=512, downsample=True)
                                                +        self.decoder_1 = decoder_block(in_chan=64, out_chan=64)
                                                +        self.decoder_2 = decoder_block(in_chan=128, out_chan=64)
                                                +        self.decoder_3 = decoder_block(in_chan=256, out_chan=128)
                                                +        self.decoder_4 = decoder_block(in_chan=512, out_chan=256)
                                                 
                                                -    def extra_repr(self) -> str:
                                                -        return f"out_chans={self.out_chans}"
                                                +    def call(
                                                +        self,
                                                +        x: tf.Tensor
                                                +    ) -> tf.Tensor:
                                                +        x_1 = self.encoder_1(x)
                                                +        x_2 = self.encoder_2(x_1)
                                                +        x_3 = self.encoder_3(x_2)
                                                +        x_4 = self.encoder_4(x_3)
                                                +        y_4 = self.decoder_4(x_4)
                                                +        y_3 = self.decoder_3(y_4 + x_3)
                                                +        y_2 = self.decoder_2(y_3 + x_2)
                                                +        y_1 = self.decoder_1(y_2 + x_1)
                                                +        return y_1
                                                 
                                                 
                                                 class LinkNet(_LinkNet, keras.Model):
                                                @@ -397,69 +364,61 @@ 


                                                <https://arxiv.org/pdf/1707.03718.pdf>`_. Args: - feature extractor: the backbone serving as feature extractor - fpn_channels: number of channels each extracted feature maps is mapped to - num_classes: number of output channels in the segmentation map - assume_straight_pages: if True, fit straight bounding boxes only - exportable: onnx exportable returns only logits - cfg: the configuration dict of the model + num_classes: number of channels for the output """ - _children_names: List[str] = ["feat_extractor", "fpn", "classifier", "postprocessor"] + _children_names: List[str] = ['stem', 'fpn', 'classifier', 'postprocessor'] def __init__( self, - feat_extractor: IntermediateLayerGetter, - fpn_channels: int = 64, num_classes: int = 1, - assume_straight_pages: bool = True, - exportable: bool = False, + input_shape: Tuple[int, int, int] = (512, 512, 3), + rotated_bbox: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: super().__init__(cfg=cfg) - self.exportable = exportable - self.assume_straight_pages = assume_straight_pages - - self.feat_extractor = feat_extractor - - self.fpn = LinkNetFPN(fpn_channels, [_shape[1:] for _shape in self.feat_extractor.output_shape]) - self.fpn.build(self.feat_extractor.output_shape) - - self.classifier = Sequential( - [ - layers.Conv2DTranspose( - filters=32, - kernel_size=3, - strides=2, - padding="same", - use_bias=False, - kernel_initializer="he_normal", - input_shape=self.fpn.decoders[-1].output_shape[1:], - ), - layers.BatchNormalization(), - layers.Activation("relu"), - *conv_sequence(32, "relu", True, kernel_size=3, strides=1), - layers.Conv2DTranspose( - filters=num_classes, - kernel_size=2, - strides=2, - padding="same", - use_bias=True, - kernel_initializer="he_normal", - ), - ] - ) - - self.postprocessor = LinkNetPostProcessor(assume_straight_pages=assume_straight_pages) + self.rotated_bbox = rotated_bbox + + self.stem = Sequential([ + *conv_sequence(64, 'relu', True, strides=2, kernel_size=7, input_shape=input_shape), + layers.MaxPool2D(pool_size=(3, 3), strides=2, padding='same'), + ]) + + self.fpn = LinkNetFPN() + + self.classifier = Sequential([ + layers.Conv2DTranspose( + filters=32, + kernel_size=3, + strides=2, + padding="same", + use_bias=False, + kernel_initializer='he_normal' + ), + layers.BatchNormalization(), + layers.Activation('relu'), + *conv_sequence(32, 'relu', True, strides=1, kernel_size=3), + layers.Conv2DTranspose( + filters=num_classes, + kernel_size=2, + strides=2, + padding="same", + use_bias=False, + kernel_initializer='he_normal' + ), + ]) + + self.postprocessor = LinkNetPostProcessor(rotated_bbox=rotated_bbox) def compute_loss( self, out_map: tf.Tensor, - target: List[np.ndarray], - gamma: float = 2.0, - alpha: float = 0.5, - eps: float = 1e-8, + target: List[Dict[str, Any]], + focal_loss: bool = False, + alpha: float = .5, + gamma: float = 2., + edge_factor: float = 2., ) -> tf.Tensor: """Compute linknet loss, BCE with boosted box edges or focal loss. Focal loss implementation based on <https://github.com/tensorflow/addons/>`_. @@ -467,206 +426,113 @@


                                                Args: out_map: output feature map of the model of shape N x H x W x 1 target: list of dictionary where each dict has a `boxes` and a `flags` entry - gamma: modulating factor in the focal loss formula + focal_loss: if True, use focal loss instead of BCE + edge_factor: boost factor for box edges (in case of BCE) alpha: balancing factor in the focal loss formula + gammma: modulating factor in the focal loss formula Returns: A loss tensor """ - seg_target, seg_mask = self.build_target(target, out_map.shape[1:3]) - - seg_target = tf.convert_to_tensor(seg_target, dtype=out_map.dtype) + seg_target, seg_mask, edge_mask = self.compute_target(target, out_map.shape[:3]) + seg_target = tf.convert_to_tensor(seg_target, dtype=tf.float32) + edge_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) seg_mask = tf.convert_to_tensor(seg_mask, dtype=tf.bool) - seg_mask = tf.cast(seg_mask, tf.float32) - bce_loss = tf.keras.losses.binary_crossentropy(seg_target, out_map, from_logits=True)[..., None] - proba_map = tf.sigmoid(out_map) + # Get the cross_entropy for each entry + bce = tf.keras.losses.binary_crossentropy( + seg_target[seg_mask], + tf.squeeze(out_map, axis=[-1])[seg_mask], + from_logits=True) + + if focal_loss: + if gamma and gamma < 0: + raise ValueError("Value of gamma should be greater than or equal to zero.") + + # Convert logits to prob, compute gamma factor + pred_prob = tf.sigmoid(tf.squeeze(out_map, axis=[-1])[seg_mask]) + p_t = (seg_target[seg_mask] * pred_prob) + ((1 - seg_target[seg_mask]) * (1 - pred_prob)) + modulating_factor = tf.pow((1.0 - p_t), gamma) - # Focal loss - if gamma < 0: - raise ValueError("Value of gamma should be greater than or equal to zero.") - # Convert logits to prob, compute gamma factor - p_t = (seg_target * proba_map) + ((1 - seg_target) * (1 - proba_map)) - alpha_t = seg_target * alpha + (1 - seg_target) * (1 - alpha) - # Unreduced loss - focal_loss = alpha_t * (1 - p_t) ** gamma * bce_loss - # Class reduced - focal_loss = tf.reduce_sum(seg_mask * focal_loss, (0, 1, 2)) / tf.reduce_sum(seg_mask, (0, 1, 2)) + # Compute alpha factor + alpha_factor = seg_target[seg_mask] * alpha + (1 - seg_target[seg_mask]) * (1 - alpha) - # Dice loss - inter = tf.math.reduce_sum(seg_mask * proba_map * seg_target, (0, 1, 2)) - cardinality = tf.math.reduce_sum(seg_mask * (proba_map + seg_target), (0, 1, 2)) - dice_loss = 1 - 2 * (inter + eps) / (cardinality + eps) + # compute the final loss + loss = tf.reduce_mean(alpha_factor * modulating_factor * bce) - return tf.reduce_mean(focal_loss) + tf.reduce_mean(dice_loss) + else: + # Compute BCE loss with highlighted edges + loss = tf.math.multiply( + 1 + (edge_factor - 1) * tf.cast(edge_mask, tf.float32), + bce + ) + loss = tf.reduce_mean(loss) + + return loss def call( self, x: tf.Tensor, - target: Optional[List[np.ndarray]] = None, + target: Optional[List[Dict[str, Any]]] = None, return_model_output: bool = False, - return_preds: bool = False, + return_boxes: bool = False, + focal_loss: bool = True, **kwargs: Any, ) -> Dict[str, Any]: - feat_maps = self.feat_extractor(x, **kwargs) - logits = self.fpn(feat_maps, **kwargs) - logits = self.classifier(logits, **kwargs) + logits = self.stem(x) + logits = self.fpn(logits) + logits = self.classifier(logits) out: Dict[str, tf.Tensor] = {} - if self.exportable: - out["logits"] = logits - return out - - if return_model_output or target is None or return_preds: + if return_model_output or target is None or return_boxes: prob_map = 
tf.math.sigmoid(logits) if return_model_output: out["out_map"] = prob_map - if target is None or return_preds: + if target is None or return_boxes: # Post-process boxes - out["preds"] = [preds[0] for preds in self.postprocessor(prob_map.numpy())] + out["preds"] = self.postprocessor(tf.squeeze(prob_map, axis=-1).numpy()) if target is not None: - loss = self.compute_loss(logits, target) - out["loss"] = loss + loss = self.compute_loss(logits, target, focal_loss) + out['loss'] = loss return out -def _linknet( - arch: str, - pretrained: bool, - backbone_fn, - fpn_layers: List[str], - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any, -) -> LinkNet: - - pretrained_backbone = pretrained_backbone and not pretrained +def _linknet(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> LinkNet: # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] - - # Feature extractor - feat_extractor = IntermediateLayerGetter( - backbone_fn( - pretrained=pretrained_backbone, - include_top=False, - input_shape=_cfg["input_shape"], - ), - fpn_layers, - ) + _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['num_classes'] = kwargs.get('num_classes', _cfg['num_classes']) + _cfg['rotated_bbox'] = kwargs.get('rotated_bbox', _cfg['rotated_bbox']) + kwargs['num_classes'] = _cfg['num_classes'] + kwargs['input_shape'] = _cfg['input_shape'] + kwargs['rotated_bbox'] = _cfg['rotated_bbox'] # Build the model - model = LinkNet(feat_extractor, cfg=_cfg, **kwargs) + model = LinkNet(cfg=_cfg, **kwargs) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg["url"]) + load_pretrained_params(model, _cfg['url']) return model -
                                                -[docs] -def linknet_resnet18(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - >>> import tensorflow as tf - >>> from doctr.models import linknet_resnet18 - >>> model = linknet_resnet18(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet( - "linknet_resnet18", - pretrained, - resnet18, - ["resnet_block_1", "resnet_block_3", "resnet_block_5", "resnet_block_7"], - **kwargs, - )
                                                - - - -
                                                -[docs] -def linknet_resnet18_rotation(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - >>> import tensorflow as tf - >>> from doctr.models import linknet_resnet18_rotation - >>> model = linknet_resnet18_rotation(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet( - "linknet_resnet18_rotation", - pretrained, - resnet18, - ["resnet_block_1", "resnet_block_3", "resnet_block_5", "resnet_block_7"], - **kwargs, - )
                                                - - - -
                                                -[docs] -def linknet_resnet34(pretrained: bool = False, **kwargs: Any) -> LinkNet: - """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" - <https://arxiv.org/pdf/1707.03718.pdf>`_. - - >>> import tensorflow as tf - >>> from doctr.models import linknet_resnet34 - >>> model = linknet_resnet34(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text detection dataset - - Returns: - text detection architecture - """ - - return _linknet( - "linknet_resnet34", - pretrained, - resnet34, - ["resnet_block_2", "resnet_block_6", "resnet_block_12", "resnet_block_15"], - **kwargs, - )
                                                - - - -
                                                -[docs] -def linknet_resnet50(pretrained: bool = False, **kwargs: Any) -> LinkNet: +
                                                +[docs] +def linknet16(pretrained: bool = False, **kwargs: Any) -> LinkNet: """LinkNet as described in `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" <https://arxiv.org/pdf/1707.03718.pdf>`_. - >>> import tensorflow as tf - >>> from doctr.models import linknet_resnet50 - >>> model = linknet_resnet50(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + Example:: + >>> import tensorflow as tf + >>> from doctr.models import linknet16 + >>> model = linknet16(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: pretrained (bool): If True, returns a model pre-trained on our text detection dataset @@ -675,13 +541,7 @@


                                                text detection architecture """ - return _linknet( - "linknet_resnet50", - pretrained, - resnet50, - ["conv2_block3_out", "conv3_block4_out", "conv4_block6_out", "conv5_block3_out"], - **kwargs, - )
                                                + return _linknet('linknet16', pretrained, **kwargs)
                                                @@ -715,7 +575,7 @@


                                                +
                                                diff --git a/v0.6.0/_modules/doctr/models/detection/zoo.html b/v0.6.0/_modules/doctr/models/detection/zoo.html index 58cce8ba72..23a2f451e3 100644 --- a/v0.6.0/_modules/doctr/models/detection/zoo.html +++ b/v0.6.0/_modules/doctr/models/detection/zoo.html @@ -226,35 +226,20 @@


                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021, Mindee.
                                                 
                                                -# This program is licensed under the Apache License 2.0.
                                                -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                +# This program is licensed under the Apache License version 2.
                                                +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                 
                                                -from typing import Any, List
                                                +from typing import Any
                                                 
                                                 from doctr.file_utils import is_tf_available, is_torch_available
                                                -
                                                -from .. import detection
                                                +from .core import DetectionPredictor
                                                 from ..preprocessor import PreProcessor
                                                -from .predictor import DetectionPredictor
                                                +from .. import detection
                                                 
                                                -__all__ = ["detection_predictor"]
                                                 
                                                -ARCHS: List[str]
                                                -ROT_ARCHS: List[str]
                                                +__all__ = ["detection_predictor"]
                                                 
                                                 
                                                 if is_tf_available():
                                                -    ARCHS = ["db_resnet50", "db_mobilenet_v3_large", "linknet_resnet18", "linknet_resnet34", "linknet_resnet50"]
                                                -    ROT_ARCHS = ["linknet_resnet18_rotation"]
                                                +    ARCHS = ['db_resnet50', 'linknet16']
                                                 elif is_torch_available():
                                                -    ARCHS = [
                                                -        "db_resnet34",
                                                -        "db_resnet50",
                                                -        "db_mobilenet_v3_large",
                                                -        "linknet_resnet18",
                                                -        "linknet_resnet34",
                                                -        "linknet_resnet50",
                                                -    ]
                                                -    ROT_ARCHS = ["db_resnet50_rotation"]
                                                -
                                                -
                                                -def _predictor(arch: Any, pretrained: bool, assume_straight_pages: bool = True, **kwargs: Any) -> DetectionPredictor:
                                                -
                                                -    if isinstance(arch, str):
                                                -        if arch not in ARCHS + ROT_ARCHS:
                                                -            raise ValueError(f"unknown architecture '{arch}'")
                                                -
                                                -        if arch not in ROT_ARCHS and not assume_straight_pages:
                                                -            raise AssertionError(
                                                -                "You are trying to use a model trained on straight pages while not assuming"
                                                -                " your pages are straight. If you have only straight documents, don't pass"
                                                -                " assume_straight_pages=False, otherwise you should use one of these archs:"
                                                -                f"{ROT_ARCHS}"
                                                -            )
                                                +    ARCHS = ['db_resnet34', 'db_resnet50', 'db_mobilenet_v3', 'linknet16']
                                                 
                                                -        _model = detection.__dict__[arch](
                                                -            pretrained=pretrained,
                                                -            pretrained_backbone=kwargs.get("pretrained_backbone", True),
                                                -            assume_straight_pages=assume_straight_pages,
                                                -        )
                                                -    else:
                                                -        if not isinstance(arch, (detection.DBNet, detection.LinkNet)):
                                                -            raise ValueError(f"unknown architecture: {type(arch)}")
                                                 
                                                -        _model = arch
                                                -        _model.assume_straight_pages = assume_straight_pages
                                                +def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> DetectionPredictor:
                                                 
                                                -    kwargs.pop("pretrained_backbone", None)
                                                +    if arch not in ARCHS:
                                                +        raise ValueError(f"unknown architecture '{arch}'")
                                                 
                                                -    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
                                                -    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
                                                -    kwargs["batch_size"] = kwargs.get("batch_size", 1)
                                                +    # Detection
                                                +    _model = detection.__dict__[arch](pretrained=pretrained)
                                                +    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
                                                +    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
                                                +    kwargs['batch_size'] = kwargs.get('batch_size', 1)
                                                     predictor = DetectionPredictor(
                                                -        PreProcessor(_model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:], **kwargs),
                                                -        _model,
                                                +        PreProcessor(_model.cfg['input_shape'][:2], **kwargs),
                                                +        _model
                                                     )
                                                     return predictor
                                                 
                                                 
                                                 
                                                -[docs] -def detection_predictor( - arch: Any = "db_resnet50", pretrained: bool = False, assume_straight_pages: bool = True, **kwargs: Any -) -> DetectionPredictor: +[docs] +def detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) -> DetectionPredictor: """Text detection architecture. - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> model = detection_predictor(arch='db_resnet50', pretrained=True) - >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) - >>> out = model([input_page]) + Example:: + >>> import numpy as np + >>> from doctr.models import detection_predictor + >>> model = detection_predictor(pretrained=True) + >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8) + >>> out = model([input_page]) Args: - arch: name of the architecture or model itself to use (e.g. 'db_resnet50') + arch: name of the architecture to use ('db_resnet50') pretrained: If True, returns a model pre-trained on our text detection dataset - assume_straight_pages: If True, fit straight boxes to the page Returns: Detection predictor """ - return _predictor(arch, pretrained, assume_straight_pages, **kwargs)
                                                + return _predictor(arch, pretrained, **kwargs)
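The keyword arguments passed to detection_predictor are forwarded through _predictor to the PreProcessor, so batching and normalisation can be overridden at construction time. A minimal usage sketch under that assumption (not taken from the patch; the architecture name and values are illustrative, and pretrained=False avoids downloading weights):

import numpy as np
from doctr.models import detection_predictor

# batch_size, mean and std default to the model config but can be overridden here
predictor = detection_predictor(arch='db_resnet50', pretrained=False, batch_size=2)
pages = [(255 * np.random.rand(600, 800, 3)).astype(np.uint8) for _ in range(2)]
out = predictor(pages)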
                                                @@ -418,7 +368,7 @@

                                                Source code for doctr.models.detection.zoo

                                                       
                                                     
                                                   
                                                -
                                                +
                                                diff --git a/v0.6.0/_modules/doctr/models/factory/hub.html b/v0.6.0/_modules/doctr/models/factory/hub.html index f8f4b67e5c..47274933f2 100644 --- a/v0.6.0/_modules/doctr/models/factory/hub.html +++ b/v0.6.0/_modules/doctr/models/factory/hub.html @@ -235,12 +235,15 @@



                                                -# Copyright (C) 2022, Mindee.
                                                +# Copyright (C) 2021-2024, Mindee.
                                                 
                                                 # This program is licensed under the Apache License 2.0.
                                                 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                @@ -305,36 +308,43 @@ 


                                                 from pathlib import Path
                                                 from typing import Any
                                                 
                                                -from huggingface_hub import HfApi, HfFolder, Repository, hf_hub_download, snapshot_download
                                                +from huggingface_hub import (
                                                +    HfApi,
                                                +    Repository,
                                                +    get_token,
                                                +    get_token_permission,
                                                +    hf_hub_download,
                                                +    login,
                                                +)
                                                 
                                                 from doctr import models
                                                 from doctr.file_utils import is_tf_available, is_torch_available
                                                 
                                                 if is_torch_available():
                                                     import torch
                                                +elif is_tf_available():
                                                +    import tensorflow as tf
                                                 
                                                 __all__ = ["login_to_hub", "push_to_hf_hub", "from_hub", "_save_model_and_config_for_hf_hub"]
                                                 
                                                 
                                                 AVAILABLE_ARCHS = {
                                                -    "classification": models.classification.zoo.ARCHS,
                                                -    "detection": models.detection.zoo.ARCHS + models.detection.zoo.ROT_ARCHS,
                                                +    "classification": models.classification.zoo.ARCHS + models.classification.zoo.ORIENTATION_ARCHS,
                                                +    "detection": models.detection.zoo.ARCHS,
                                                     "recognition": models.recognition.zoo.ARCHS,
                                                -    "obj_detection": ["fasterrcnn_mobilenet_v3_large_fpn"] if is_torch_available() else None,
                                                 }
                                                 
                                                 
                                                 
                                                [docs] -def login_to_hub() -> None: +def login_to_hub() -> None: # pragma: no cover """Login to huggingface hub""" - access_token = HfFolder.get_token() - if access_token is not None and HfApi()._is_valid_token(access_token): + access_token = get_token() + if access_token is not None and get_token_permission(access_token): logging.info("Huggingface Hub token found and valid") - HfApi().set_access_token(access_token) + login(token=access_token, write_permission=True) else: - subprocess.call(["huggingface-cli", "login"]) - HfApi().set_access_token(HfFolder().get_token()) + login() # check if git lfs is installed try: subprocess.call(["git", "lfs", "version"]) @@ -351,6 +361,7 @@


                                                     """Save model and config to disk for pushing to huggingface hub
                                                 
                                                     Args:
                                                +    ----
                                                         model: TF or PyTorch model to be saved
                                                         save_dir: directory to save model and config
                                                         arch: architecture name
                                                @@ -362,7 +373,9 @@ 


                                                         weights_path = save_directory / "pytorch_model.bin"
                                                         torch.save(model.state_dict(), weights_path)
                                                     elif is_tf_available():
                                                -        weights_path = save_directory / "tf_model" / "weights"
                                                +        weights_path = save_directory / "tf_model.weights.h5"
+        # NOTE: `model.build` is not an option because it doesn't run in eager mode
                                                +        _ = model(tf.ones((1, *model.cfg["input_shape"])), training=False)
                                                         model.save_weights(str(weights_path))
                                                 
                                                     config_path = save_directory / "config.json"
                                                @@ -378,7 +391,7 @@ 


                                                 
                                                 
                                                [docs] -def push_to_hf_hub(model: Any, model_name: str, task: str, **kwargs) -> None: +def push_to_hf_hub(model: Any, model_name: str, task: str, **kwargs) -> None: # pragma: no cover """Save model and its configuration on HF hub >>> from doctr.models import login_to_hub, push_to_hf_hub @@ -388,6 +401,7 @@


                                                     >>> push_to_hf_hub(model, 'my-model', 'recognition', arch='crnn_mobilenet_v3_small')
                                                 
                                                     Args:
                                                +    ----
                                                         model: TF or PyTorch model to be saved
                                                         model_name: name of the model which is also the repository name
                                                         task: task name
                                                @@ -398,8 +412,8 @@ 


                                                 
                                                     if run_config is None and arch is None:
                                                         raise ValueError("run_config or arch must be specified")
                                                -    if task not in ["classification", "detection", "recognition", "obj_detection"]:
                                                -        raise ValueError("task must be one of classification, detection, recognition, obj_detection")
                                                +    if task not in ["classification", "detection", "recognition"]:
                                                +        raise ValueError("task must be one of classification, detection, recognition")
                                                 
                                                     # default readme
                                                     readme = textwrap.dedent(
                                                @@ -453,7 +467,7 @@ 


                                                                                   \n{json.dumps(vars(run_config), indent=2, ensure_ascii=False)}"""
                                                         )
                                                 
                                                -    if arch not in AVAILABLE_ARCHS[task]:  # type: ignore
                                                +    if arch not in AVAILABLE_ARCHS[task]:
                                                         raise ValueError(
                                                             f"Architecture: {arch} for task: {task} not found.\
                                                                          \nAvailable architectures: {AVAILABLE_ARCHS}"
                                                @@ -462,11 +476,10 @@ 


                                                     commit_message = f"Add {model_name} model"
                                                 
                                                     local_cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "hub", model_name)
                                                -    repo_url = HfApi().create_repo(model_name, token=HfFolder.get_token(), exist_ok=False)
                                                -    repo = Repository(local_dir=local_cache_dir, clone_from=repo_url, use_auth_token=True)
                                                +    repo_url = HfApi().create_repo(model_name, token=get_token(), exist_ok=False)
                                                +    repo = Repository(local_dir=local_cache_dir, clone_from=repo_url)
                                                 
                                                     with repo.commit(commit_message):
                                                -
                                                         _save_model_and_config_for_hf_hub(model, repo.local_dir, arch=arch, task=task)
                                                         readme_path = Path(repo.local_dir) / "README.md"
                                                         readme_path.write_text(readme)
                                                @@ -484,13 +497,14 @@ 


                                                     >>> model = from_hub("mindee/fasterrcnn_mobilenet_v3_large_fpn")
                                                 
                                                     Args:
                                                +    ----
                                                         repo_id: HuggingFace model hub repo
                                                         kwargs: kwargs of `hf_hub_download` or `snapshot_download`
                                                 
                                                     Returns:
                                                +    -------
                                                         Model loaded with the checkpoint
                                                     """
                                                -
                                                     # Get the config
                                                     with open(hf_hub_download(repo_id, filename="config.json", **kwargs), "rb") as f:
                                                         cfg = json.load(f)
                                                @@ -508,14 +522,6 @@ 


                                                         model = models.detection.__dict__[arch](pretrained=False)
                                                     elif task == "recognition":
                                                         model = models.recognition.__dict__[arch](pretrained=False, input_shape=cfg["input_shape"], vocab=cfg["vocab"])
                                                -    elif task == "obj_detection" and is_torch_available():
                                                -        model = models.obj_detection.__dict__[arch](
                                                -            pretrained=False,
                                                -            image_mean=cfg["mean"],
                                                -            image_std=cfg["std"],
                                                -            max_size=cfg["input_shape"][-1],
                                                -            num_classes=len(cfg["classes"]),
                                                -        )
                                                 
                                                     # update model cfg
                                                     model.cfg = cfg
                                                @@ -525,8 +531,10 @@ 


                                                         state_dict = torch.load(hf_hub_download(repo_id, filename="pytorch_model.bin", **kwargs), map_location="cpu")
                                                         model.load_state_dict(state_dict)
                                                     else:  # tf
                                                -        repo_path = snapshot_download(repo_id, **kwargs)
                                                -        model.load_weights(os.path.join(repo_path, "tf_model", "weights"))
                                                +        weights = hf_hub_download(repo_id, filename="tf_model.weights.h5", **kwargs)
+        # NOTE: `model.build` is not an option because it doesn't run in eager mode
                                                +        _ = model(tf.ones((1, *model.cfg["input_shape"])), training=False)
                                                +        model.load_weights(weights)
                                                 
                                                     return model
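Both _save_model_and_config_for_hf_hub and from_hub now rely on the same TensorFlow pattern: the variables of a lazily-built Keras model are materialised with one dummy forward pass before save_weights/load_weights is called. A small self-contained sketch of that pattern (the layer stack, shapes and file name below are illustrative assumptions, not taken from doctr):

import tensorflow as tf

# a lazily-built model: variables do not exist until the first call
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(8, 3, padding="same"),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(4),
])
_ = model(tf.ones((1, 32, 32, 3)), training=False)  # create the variables eagerly
model.save_weights("demo.weights.h5")               # same extension as the hub checkpoint
model.load_weights("demo.weights.h5")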
                                                @@ -562,7 +570,7 @@


                                                       
                                                     
                                                   
                                                -
                                                +
                                                diff --git a/v0.6.0/_modules/doctr/models/recognition/crnn/tensorflow.html b/v0.6.0/_modules/doctr/models/recognition/crnn/tensorflow.html index ebcd2c17dc..7b8529c26d 100644 --- a/v0.6.0/_modules/doctr/models/recognition/crnn/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/recognition/crnn/tensorflow.html @@ -226,35 +226,20 @@


                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021, Mindee.
                                                 
                                                -# This program is licensed under the Apache License 2.0.
                                                -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                +# This program is licensed under the Apache License version 2.
                                                +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                 
                                                 from copy import deepcopy
                                                -from typing import Any, Dict, List, Optional, Tuple, Union
                                                -
                                                 import tensorflow as tf
                                                 from tensorflow.keras import layers
                                                -from tensorflow.keras.models import Model, Sequential
                                                -
                                                -from doctr.datasets import VOCABS
                                                +from tensorflow.keras.models import Sequential, Model
                                                +from typing import Tuple, Dict, Any, Optional, List
                                                 
                                                -from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r
                                                -from ...utils.tensorflow import load_pretrained_params
                                                +from ... import backbones
                                                +from ...utils import load_pretrained_params
                                                 from ..core import RecognitionModel, RecognitionPostProcessor
                                                 
                                                -__all__ = ["CRNN", "crnn_vgg16_bn", "crnn_mobilenet_v3_small", "crnn_mobilenet_v3_large"]
                                                +__all__ = ['CRNN', 'crnn_vgg16_bn', 'crnn_resnet31', 'CTCPostProcessor']
                                                 
                                                 default_cfgs: Dict[str, Dict[str, Any]] = {
                                                -    "crnn_vgg16_bn": {
                                                -        "mean": (0.694, 0.695, 0.693),
                                                -        "std": (0.299, 0.296, 0.301),
                                                -        "input_shape": (32, 128, 3),
                                                -        "vocab": VOCABS["legacy_french"],
                                                -        "url": "https://doctr-static.mindee.com/models?id=v0.3.0/crnn_vgg16_bn-76b7f2c6.zip&src=0",
                                                -    },
                                                -    "crnn_mobilenet_v3_small": {
                                                -        "mean": (0.694, 0.695, 0.693),
                                                -        "std": (0.299, 0.296, 0.301),
                                                -        "input_shape": (32, 128, 3),
                                                -        "vocab": VOCABS["french"],
                                                -        "url": "https://doctr-static.mindee.com/models?id=v0.3.1/crnn_mobilenet_v3_small-7f36edec.zip&src=0",
                                                +    'crnn_vgg16_bn': {
                                                +        'mean': (.5, .5, .5),
                                                +        'std': (1., 1., 1.),
                                                +        'backbone': 'vgg16_bn', 'rnn_units': 128,
                                                +        'input_shape': (32, 128, 3),
                                                +        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
                                                +                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
                                                +        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/crnn_vgg16_bn-748c855f.zip',
                                                     },
                                                -    "crnn_mobilenet_v3_large": {
                                                -        "mean": (0.694, 0.695, 0.693),
                                                -        "std": (0.299, 0.296, 0.301),
                                                -        "input_shape": (32, 128, 3),
                                                -        "vocab": VOCABS["french"],
                                                -        "url": None,
                                                +    'crnn_resnet31': {
                                                +        'mean': (0.694, 0.695, 0.693),
                                                +        'std': (0.299, 0.296, 0.301),
                                                +        'backbone': 'resnet31', 'rnn_units': 128,
                                                +        'input_shape': (32, 128, 3),
                                                +        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
                                                +                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
                                                +        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.1/crnn_resnet31-69ab71db.zip',
                                                     },
                                                 }
                                                 
                                                @@ -347,56 +326,37 @@ 


                                                def __call__( self, - logits: tf.Tensor, - beam_width: int = 1, - top_paths: int = 1, - ) -> Union[List[Tuple[str, float]], List[Tuple[List[str], List[float]]]]: + logits: tf.Tensor + ) -> List[Tuple[str, float]]: """ Performs decoding of raw output with CTC and decoding of CTC predictions with label_to_idx mapping dictionnary Args: logits: raw output of the model, shape BATCH_SIZE X SEQ_LEN X NUM_CLASSES + 1 - beam_width: An int scalar >= 0 (beam search beam width). - top_paths: An int scalar >= 0, <= beam_width (controls output size). Returns: A list of decoded words of length BATCH_SIZE - """ # Decode CTC _decoded, _log_prob = tf.nn.ctc_beam_search_decoder( tf.transpose(logits, perm=[1, 0, 2]), - tf.fill(tf.shape(logits)[:1], tf.shape(logits)[1]), - beam_width=beam_width, - top_paths=top_paths, + tf.fill(logits.shape[0], logits.shape[1]), + beam_width=1, top_paths=1, ) - - _decoded = tf.sparse.concat( - 1, - [tf.sparse.expand_dims(dec, axis=1) for dec in _decoded], - expand_nonconcat_dims=True, - ) # dim : batchsize x beamwidth x actual_max_len_predictions - out_idxs = tf.sparse.to_dense(_decoded, default_value=len(self.vocab)) + out_idxs = tf.sparse.to_dense(_decoded[0], default_value=len(self.vocab)) + probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) # Map it to characters _decoded_strings_pred = tf.strings.reduce_join( inputs=tf.nn.embedding_lookup(tf.constant(self._embedding, dtype=tf.string), out_idxs), - axis=-1, + axis=-1 ) _decoded_strings_pred = tf.strings.split(_decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value="not valid")[ - :, :, 0 - ] # dim : batch_size x beam_width - - if top_paths == 1: - probs = tf.math.exp(tf.squeeze(_log_prob, axis=1)) # dim : batchsize - decoded_strings_pred = tf.squeeze(decoded_strings_pred, axis=1) - word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] - else: - probs = tf.math.exp(_log_prob) # dim : batchsize x beamwidth - word_values = [[word.decode() for word in words] for words in decoded_strings_pred.numpy().tolist()] + decoded_strings_pred = tf.sparse.to_dense(_decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] + word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] + return list(zip(word_values, probs.numpy().tolist())) @@ -408,22 +368,16 @@
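For context, the decoding step in this hunk boils down to one tf.nn.ctc_beam_search_decoder call followed by an index-to-character lookup. A self-contained sketch on dummy logits and a toy vocabulary (every name, shape and value here is illustrative, not taken from the patch):

import tensorflow as tf

vocab = "abc"
batch_size, seq_len = 2, 10
logits = tf.random.uniform((batch_size, seq_len, len(vocab) + 1))  # +1 for the CTC blank

decoded, log_prob = tf.nn.ctc_beam_search_decoder(
    tf.transpose(logits, perm=[1, 0, 2]),  # time-major: SEQ_LEN x BATCH x (VOCAB + 1)
    tf.fill([batch_size], seq_len),        # every sequence uses its full length
    beam_width=1,
    top_paths=1,
)
out_idxs = tf.sparse.to_dense(decoded[0], default_value=len(vocab))  # pad with the blank index
probs = tf.math.exp(tf.squeeze(log_prob, axis=1))

chars = tf.constant(list(vocab) + [""], dtype=tf.string)  # blank maps to the empty string
words = tf.strings.reduce_join(tf.nn.embedding_lookup(chars, out_idxs), axis=-1)
print([(w.decode(), p) for w, p in zip(words.numpy().tolist(), probs.numpy().tolist())])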


                                                feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding rnn_units: number of units in the LSTM layers - exportable: onnx exportable returns only logits - beam_width: beam width for beam search decoding - top_paths: number of top paths for beam search decoding cfg: configuration dictionary """ - _children_names: List[str] = ["feat_extractor", "decoder", "postprocessor"] + _children_names: List[str] = ['feat_extractor', 'decoder', 'postprocessor'] def __init__( self, feature_extractor: tf.keras.Model, vocab: str, rnn_units: int = 128, - exportable: bool = False, - beam_width: int = 1, - top_paths: int = 1, cfg: Optional[Dict[str, Any]] = None, ) -> None: # Initialize kernels @@ -433,23 +387,19 @@


                                                self.vocab = vocab self.max_length = w self.cfg = cfg - self.exportable = exportable self.feat_extractor = feature_extractor self.decoder = Sequential( [ layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), layers.Bidirectional(layers.LSTM(units=rnn_units, return_sequences=True)), - layers.Dense(units=len(vocab) + 1), + layers.Dense(units=len(vocab) + 1) ] ) self.decoder.build(input_shape=(None, w, h * c)) self.postprocessor = CTCPostProcessor(vocab=vocab) - self.beam_width = beam_width - self.top_paths = top_paths - def compute_loss( self, model_output: tf.Tensor, @@ -458,15 +408,16 @@


                                                """Compute CTC loss for the model. Args: + gt: the encoded tensor with gt labels model_output: predicted logits of the model - target: lengths of each gt word inside the batch + seq_len: lengths of each gt word inside the batch Returns: The loss of the model on the batch """ - gt, seq_len = self.build_target(target) + gt, seq_len = self.compute_target(target) batch_len = model_output.shape[0] - input_length = tf.fill((batch_len,), model_output.shape[1]) + input_length = model_output.shape[1] * tf.ones(shape=(batch_len)) ctc_loss = tf.nn.ctc_loss( gt, model_output, seq_len, input_length, logits_time_major=False, blank_index=len(self.vocab) ) @@ -478,14 +429,9 @@


                                                target: Optional[List[str]] = None, return_model_output: bool = False, return_preds: bool = False, - beam_width: int = 1, - top_paths: int = 1, **kwargs: Any, ) -> Dict[str, Any]: - if kwargs.get("training", False) and target is None: - raise ValueError("Need to provide labels during training") - features = self.feat_extractor(x, **kwargs) # B x H x W x C --> B x W x H x C transposed_feat = tf.transpose(features, perm=[0, 2, 1, 3]) @@ -495,66 +441,57 @@


                                                logits = self.decoder(features_seq, **kwargs) out: Dict[str, tf.Tensor] = {} - if self.exportable: - out["logits"] = logits - return out - if return_model_output: out["out_map"] = logits if target is None or return_preds: # Post-process boxes - out["preds"] = self.postprocessor(logits, beam_width=beam_width, top_paths=top_paths) + out["preds"] = self.postprocessor(logits) if target is not None: - out["loss"] = self.compute_loss(logits, target) + out['loss'] = self.compute_loss(logits, target) return out -def _crnn( - arch: str, - pretrained: bool, - backbone_fn, - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any, -) -> CRNN: - - pretrained_backbone = pretrained_backbone and not pretrained - - kwargs["vocab"] = kwargs.get("vocab", default_cfgs[arch]["vocab"]) +def _crnn(arch: str, pretrained: bool, input_shape: Optional[Tuple[int, int, int]] = None, **kwargs: Any) -> CRNN: + # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg["vocab"] = kwargs["vocab"] - _cfg["input_shape"] = input_shape or default_cfgs[arch]["input_shape"] + _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) + _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) - feat_extractor = backbone_fn( - input_shape=_cfg["input_shape"], + # Feature extractor + feat_extractor = backbones.__dict__[_cfg['backbone']]( + input_shape=_cfg['input_shape'], include_top=False, - pretrained=pretrained_backbone, ) + kwargs['vocab'] = _cfg['vocab'] + kwargs['rnn_units'] = _cfg['rnn_units'] + # Build the model model = CRNN(feat_extractor, cfg=_cfg, **kwargs) # Load pretrained parameters if pretrained: - load_pretrained_params(model, _cfg["url"]) + load_pretrained_params(model, _cfg['url']) return model
                                                -[docs] +[docs] def crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> CRNN: """CRNN with a VGG-16 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - >>> import tensorflow as tf - >>> from doctr.models import crnn_vgg16_bn - >>> model = crnn_vgg16_bn(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + Example:: + >>> import tensorflow as tf + >>> from doctr.models import crnn_vgg16_bn + >>> model = crnn_vgg16_bn(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: pretrained (bool): If True, returns a model pre-trained on our text recognition dataset @@ -563,21 +500,20 @@


                                                text recognition architecture """ - return _crnn("crnn_vgg16_bn", pretrained, vgg16_bn_r, **kwargs)
                                                + return _crnn('crnn_vgg16_bn', pretrained, **kwargs)
                                                -
                                                -[docs] -def crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a MobileNet V3 Small backbone as described in `"An End-to-End Trainable Neural Network for Image-based +def crnn_resnet31(pretrained: bool = False, **kwargs: Any) -> CRNN: + """CRNN with a resnet31 backbone as described in `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - >>> import tensorflow as tf - >>> from doctr.models import crnn_mobilenet_v3_small - >>> model = crnn_mobilenet_v3_small(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + Example:: + >>> import tensorflow as tf + >>> from doctr.models import crnn_resnet31 + >>> model = crnn_resnet31(pretrained=True) + >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: pretrained (bool): If True, returns a model pre-trained on our text recognition dataset @@ -586,31 +522,7 @@


                                                text recognition architecture """ - return _crnn("crnn_mobilenet_v3_small", pretrained, mobilenet_v3_small_r, **kwargs)
                                                - - - -
                                                -[docs] -def crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> CRNN: - """CRNN with a MobileNet V3 Large backbone as described in `"An End-to-End Trainable Neural Network for Image-based - Sequence Recognition and Its Application to Scene Text Recognition" <https://arxiv.org/pdf/1507.05717.pdf>`_. - - >>> import tensorflow as tf - >>> from doctr.models import crnn_mobilenet_v3_large - >>> model = crnn_mobilenet_v3_large(pretrained=True) - >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) - - Args: - pretrained (bool): If True, returns a model pre-trained on our text recognition dataset - - Returns: - text recognition architecture - """ - - return _crnn("crnn_mobilenet_v3_large", pretrained, mobilenet_v3_large_r, **kwargs)
                                                - + return _crnn('crnn_resnet31', pretrained, **kwargs)
                                                @@ -643,7 +555,7 @@


                                                +
                                                diff --git a/v0.6.0/_modules/doctr/models/recognition/master/tensorflow.html b/v0.6.0/_modules/doctr/models/recognition/master/tensorflow.html index 8421f650b7..6d9bff4577 100644 --- a/v0.6.0/_modules/doctr/models/recognition/master/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/recognition/master/tensorflow.html @@ -226,35 +226,20 @@


                                                -# Copyright (C) 2021-2022, Mindee.
                                                -
                                                -# This program is licensed under the Apache License 2.0.
                                                -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                +# Copyright (C) 2021, Mindee.
                                                 
                                                -from copy import deepcopy
                                                -from typing import Any, Dict, List, Optional, Tuple
                                                +# This program is licensed under the Apache License version 2.
                                                +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                 
                                                 import tensorflow as tf
                                                -from tensorflow.keras import Model, layers
                                                -
                                                -from doctr.datasets import VOCABS
                                                -from doctr.models.classification import magc_resnet31
                                                -from doctr.models.modules.transformer import Decoder, PositionalEncoding
                                                +from tensorflow.keras import layers, Sequential, Model
                                                +from typing import Tuple, List, Dict, Any, Optional
                                                +from copy import deepcopy
                                                 
                                                -from ...utils.tensorflow import load_pretrained_params
                                                +from ..core import RecognitionPostProcessor
                                                +from ...backbones.resnet import ResnetStage
                                                +from ...utils import conv_sequence, load_pretrained_params
                                                +from ..transformer import Decoder, positional_encoding, create_look_ahead_mask, create_padding_mask
                                                +from ....datasets import VOCABS
                                                 from .base import _MASTER, _MASTERPostProcessor
                                                 
                                                -__all__ = ["MASTER", "master"]
                                                +
                                                +__all__ = ['MASTER', 'master', 'MASTERPostProcessor']
                                                 
                                                 
                                                 default_cfgs: Dict[str, Dict[str, Any]] = {
                                                -    "master": {
                                                -        "mean": (0.694, 0.695, 0.693),
                                                -        "std": (0.299, 0.296, 0.301),
                                                -        "input_shape": (32, 128, 3),
                                                -        "vocab": VOCABS["french"],
                                                -        "url": None,
                                                +    'master': {
                                                +        'mean': (.5, .5, .5),
                                                +        'std': (1., 1., 1.),
                                                +        'input_shape': (48, 160, 3),
                                                +        'vocab': VOCABS['french'],
                                                +        'url': None,
                                                     },
                                                 }
                                                 
                                                 
                                                +class MAGC(layers.Layer):
                                                +
                                                +    """Implements the Multi-Aspect Global Context Attention, as described in
+    `<https://arxiv.org/pdf/1910.02562.pdf>`_.
                                                +
                                                +    Args:
                                                +        inplanes: input channels
                                                +        headers: number of headers to split channels
+        att_scale: if True, re-scale attention to counteract the variance distributions
                                                +        **kwargs
                                                +    """
                                                +
                                                +    def __init__(
                                                +        self,
                                                +        inplanes: int,
                                                +        headers: int = 1,
                                                +        att_scale: bool = False,
                                                +        **kwargs
                                                +    ) -> None:
                                                +        super().__init__(**kwargs)
                                                +
                                                +        self.headers = headers  # h
                                                +        self.inplanes = inplanes  # C
                                                +        self.att_scale = att_scale
                                                +
                                                +        self.single_header_inplanes = int(inplanes / headers)  # C / h
                                                +
                                                +        self.conv_mask = tf.keras.layers.Conv2D(
                                                +            filters=1,
                                                +            kernel_size=1,
                                                +            kernel_initializer=tf.initializers.he_normal()
                                                +        )
                                                +
                                                +        self.transform = tf.keras.Sequential(
                                                +            [
                                                +                tf.keras.layers.Conv2D(
                                                +                    filters=self.inplanes,
                                                +                    kernel_size=1,
                                                +                    kernel_initializer=tf.initializers.he_normal()
                                                +                ),
                                                +                tf.keras.layers.LayerNormalization([1, 2, 3]),
                                                +                tf.keras.layers.ReLU(),
                                                +                tf.keras.layers.Conv2D(
                                                +                    filters=self.inplanes,
                                                +                    kernel_size=1,
                                                +                    kernel_initializer=tf.initializers.he_normal()
                                                +                ),
                                                +            ],
                                                +            name='transform'
                                                +        )
                                                +
                                                +    @tf.function
                                                +    def context_modeling(self, inputs: tf.Tensor) -> tf.Tensor:
                                                +        b, h, w, c = (tf.shape(inputs)[i] for i in range(4))
                                                +
                                                +        # B, H, W, C -->> B*h, H, W, C/h
                                                +        x = tf.reshape(inputs, shape=(b, h, w, self.headers, self.single_header_inplanes))
                                                +        x = tf.transpose(x, perm=(0, 3, 1, 2, 4))
                                                +        x = tf.reshape(x, shape=(b * self.headers, h, w, self.single_header_inplanes))
                                                +
+        # Compute shortcut
                                                +        shortcut = x
                                                +        # B*h, 1, H*W, C/h
                                                +        shortcut = tf.reshape(shortcut, shape=(b * self.headers, 1, h * w, self.single_header_inplanes))
                                                +        # B*h, 1, C/h, H*W
                                                +        shortcut = tf.transpose(shortcut, perm=[0, 1, 3, 2])
                                                +
                                                +        # Compute context mask
                                                +        # B*h, H, W, 1,
                                                +        context_mask = self.conv_mask(x)
                                                +        # B*h, 1, H*W, 1
                                                +        context_mask = tf.reshape(context_mask, shape=(b * self.headers, 1, h * w, 1))
                                                +        # scale variance
                                                +        if self.att_scale and self.headers > 1:
+            context_mask = context_mask / tf.sqrt(float(self.single_header_inplanes))
                                                +        # B*h, 1, H*W, 1
                                                +        context_mask = tf.keras.activations.softmax(context_mask, axis=2)
                                                +
                                                +        # Compute context
                                                +        # B*h, 1, C/h, 1
                                                +        context = tf.matmul(shortcut, context_mask)
                                                +        context = tf.reshape(context, shape=(b, 1, c, 1))
                                                +        # B, 1, 1, C
                                                +        context = tf.transpose(context, perm=(0, 1, 3, 2))
                                                +        # Set shape to resolve shape when calling this module in the Sequential MAGCResnet
                                                +        batch, chan = inputs.get_shape().as_list()[0], inputs.get_shape().as_list()[-1]
                                                +        context.set_shape([batch, 1, 1, chan])
                                                +        return context
                                                +
                                                +    def call(self, inputs: tf.Tensor, **kwargs) -> tf.Tensor:
                                                +        # Context modeling: B, H, W, C  ->  B, 1, 1, C
                                                +        context = self.context_modeling(inputs)
                                                +        # Transform: B, 1, 1, C  ->  B, 1, 1, C
                                                +        transformed = self.transform(context)
                                                +        return inputs + transformed
                                                +
                                                +
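A minimal smoke-test sketch for the MAGC block above (hypothetical usage, assuming the class and its `conv_mask`/`transform` sub-layers are defined as in this diff and that TensorFlow is imported; the shapes below are illustrative only):

import tensorflow as tf

# MAGC is residual: the transformed (N, 1, 1, C) context is broadcast-added back to the input,
# so the output shape must match the input shape.
magc = MAGC(inplanes=256, headers=1, att_scale=True)  # args mirror their use in MAGCResnet below
dummy = tf.random.uniform((2, 12, 40, 256), dtype=tf.float32)  # (N, H, W, C)
out = magc(dummy)
assert out.shape == dummy.shape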
                                                +class MAGCResnet(Sequential):
                                                +
                                                +    """Implements the modified resnet with MAGC layers, as described in paper.
                                                +
                                                +    Args:
+        headers: number of headers to split channels into in MAGC layers
                                                +        input_shape: shape of the model input (without batch dim)
                                                +    """
                                                +
                                                +    def __init__(
                                                +        self,
                                                +        headers: int = 1,
                                                +        input_shape: Tuple[int, int, int] = (48, 160, 3),
                                                +    ) -> None:
                                                +        _layers = [
                                                +            # conv_1x
                                                +            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
                                                +            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
                                                +            layers.MaxPooling2D((2, 2), (2, 2)),
                                                +            # conv_2x
                                                +            ResnetStage(num_blocks=1, output_channels=256),
                                                +            MAGC(inplanes=256, headers=headers, att_scale=True),
                                                +            *conv_sequence(out_channels=256, activation='relu', bn=True, kernel_size=3),
                                                +            layers.MaxPooling2D((2, 2), (2, 2)),
                                                +            # conv_3x
                                                +            ResnetStage(num_blocks=2, output_channels=512),
                                                +            MAGC(inplanes=512, headers=headers, att_scale=True),
                                                +            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
                                                +            layers.MaxPooling2D((2, 1), (2, 1)),
                                                +            # conv_4x
                                                +            ResnetStage(num_blocks=5, output_channels=512),
                                                +            MAGC(inplanes=512, headers=headers, att_scale=True),
                                                +            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
                                                +            # conv_5x
                                                +            ResnetStage(num_blocks=3, output_channels=512),
                                                +            MAGC(inplanes=512, headers=headers, att_scale=True),
                                                +            *conv_sequence(out_channels=512, activation='relu', bn=True, kernel_size=3),
                                                +        ]
                                                +        super().__init__(_layers)
                                                +
                                                +
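A rough usage sketch for the backbone above (hypothetical; the input shape is the default from this diff, the `ResnetStage` and `conv_sequence` helpers are assumed from the surrounding module, and the printed spatial size assumes 'same'-padded convolutions, which is not verified here):

import tensorflow as tf

backbone = MAGCResnet()  # headers=1 by default
features = backbone(tf.zeros((1, 48, 160, 3)))
# Roughly (1, 6, 40, 512): H is divided by 8 and W by 4 by the three max-pooling layers.
print(features.shape)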
                                                 class MASTER(_MASTER, Model):
                                                 
                                                     """Implements MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
                                                     Implementation based on the official TF implementation: <https://github.com/jiangxiluning/MASTER-TF>`_.
                                                 
                                                     Args:
                                                -        feature_extractor: the backbone serving as feature extractor
                                                         vocab: vocabulary, (without EOS, SOS, PAD)
                                                         d_model: d parameter for the transformer decoder
                                                +        headers: headers for the MAGC module
                                                         dff: depth of the pointwise feed-forward layer
        num_heads: number of heads for the multi-head attention module
                                                         num_layers: number of decoder layers to stack
                                                         max_length: maximum length of character sequence handled by the model
                                                -        dropout: dropout probability of the decoder
                                                -        input_shape: size of the image inputs
                                                -        exportable: onnx exportable returns only logits
                                                -        cfg: dictionary containing information about the model
                                                +        input_size: size of the image inputs
                                                     """
                                                 
                                                     def __init__(
                                                         self,
                                                -        feature_extractor: tf.keras.Model,
                                                         vocab: str,
                                                         d_model: int = 512,
                                                +        headers: int = 1,
                                                         dff: int = 2048,
                                                -        num_heads: int = 8,  # number of heads in the transformer decoder
                                                +        num_heads: int = 8,
                                                         num_layers: int = 3,
                                                         max_length: int = 50,
                                                -        dropout: float = 0.2,
                                                -        input_shape: Tuple[int, int, int] = (32, 128, 3),  # different from the paper
                                                -        exportable: bool = False,
                                                +        input_shape: Tuple[int, int, int] = (48, 160, 3),
                                                         cfg: Optional[Dict[str, Any]] = None,
                                                     ) -> None:
                                                         super().__init__()
                                                 
                                                -        self.exportable = exportable
                                                -        self.max_length = max_length
                                                -        self.d_model = d_model
                                                         self.vocab = vocab
                                                +        self.max_length = max_length
                                                         self.cfg = cfg
                                                         self.vocab_size = len(vocab)
                                                 
                                                -        self.feat_extractor = feature_extractor
                                                -        self.positional_encoding = PositionalEncoding(self.d_model, dropout, max_len=input_shape[0] * input_shape[1])
                                                +        self.feature_extractor = MAGCResnet(headers=headers, input_shape=input_shape)
                                                +        self.seq_embedding = layers.Embedding(self.vocab_size + 3, d_model)  # 3 more classes: EOS/PAD/SOS
                                                 
                                                         self.decoder = Decoder(
                                                             num_layers=num_layers,
                                                -            d_model=self.d_model,
                                                +            d_model=d_model,
                                                             num_heads=num_heads,
                                                -            vocab_size=self.vocab_size + 3,  # EOS, SOS, PAD
                                                             dff=dff,
                                                -            dropout=dropout,
                                                -            maximum_position_encoding=self.max_length,
                                                +            vocab_size=self.vocab_size,
                                                +            maximum_position_encoding=max_length,
                                                         )
                                                -
                                                +        self.feature_pe = positional_encoding(input_shape[0] * input_shape[1], d_model)
                                                         self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
                                                +
                                                         self.postprocessor = MASTERPostProcessor(vocab=self.vocab)
                                                 
                                                     @tf.function
                                                -    def make_source_and_target_mask(self, source: tf.Tensor, target: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
                                                -        # [1, 1, 1, ..., 0, 0, 0] -> 0 is masked
                                                -        # (N, 1, 1, max_length)
                                                -        target_pad_mask = tf.cast(tf.math.not_equal(target, self.vocab_size + 2), dtype=tf.uint8)
                                                -        target_pad_mask = target_pad_mask[:, tf.newaxis, tf.newaxis, :]
                                                -        target_length = target.shape[1]
                                                -        # sub mask filled diagonal with 1 = see 0 = masked (max_length, max_length)
                                                -        target_sub_mask = tf.linalg.band_part(tf.ones((target_length, target_length)), -1, 0)
                                                -        # source mask filled with ones (max_length, positional_encoded_seq_len)
                                                -        source_mask = tf.ones((target_length, source.shape[1]))
                                                -        # combine the two masks into one boolean mask where False is masked (N, 1, max_length, max_length)
                                                -        target_mask = tf.math.logical_and(
                                                -            tf.cast(target_sub_mask, dtype=tf.bool), tf.cast(target_pad_mask, dtype=tf.bool)
                                                -        )
                                                -        return source_mask, target_mask
                                                +    def make_mask(self, target: tf.Tensor) -> tf.Tensor:
                                                +        look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1])
                                                +        target_padding_mask = create_padding_mask(target, self.vocab_size + 2)  # Pad symbol
                                                +        combined_mask = tf.maximum(target_padding_mask, look_ahead_mask)
                                                +        return combined_mask
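The mask combination above can be reproduced with plain TensorFlow ops; a minimal, self-contained approximation (the semantics of `create_look_ahead_mask` and `create_padding_mask` are inferred from how they are called above, so this is a sketch, not the library's helpers):

import tensorflow as tf

def look_ahead_mask(size: int) -> tf.Tensor:
    # 1 above the diagonal = masked (a position cannot attend to future positions)
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)

def padding_mask(seq: tf.Tensor, pad_value: int) -> tf.Tensor:
    # 1 where the token is the PAD symbol, broadcastable to (N, 1, 1, T)
    mask = tf.cast(tf.math.equal(seq, pad_value), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

target = tf.constant([[5, 2, 7, 9, 9]])  # 9 acting as the PAD index in this toy example
combined = tf.maximum(padding_mask(target, 9), look_ahead_mask(target.shape[1]))
print(combined.shape)  # (1, 1, 5, 5), where 1 marks a masked position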
                                                 
                                                -    @staticmethod
                                                     def compute_loss(
                                                +        self,
                                                         model_output: tf.Tensor,
                                                         gt: tf.Tensor,
                                                         seq_len: List[int],
                                                @@ -427,7 +533,7 @@ 

                                                Source code for doctr.models.recognition.master.tensorflow

        mask_values = tf.zeros_like(cce)
        mask_2d = tf.sequence_mask(seq_len, input_len - 1)  # delete the last mask timestep as well
        masked_loss = tf.where(mask_2d, cce, mask_values)
-       ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype))
+       ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
        return tf.expand_dims(ce_loss, axis=1)
@@ -452,45 +558,42 @@
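A quick, purely illustrative example of the masking scheme used in `compute_loss` (dummy per-timestep losses; `seq_len` counts characters up to and including EOS, as in the surrounding code):

import tensorflow as tf

# Per-timestep cross-entropy for a batch of 2 sequences, 4 timesteps each (dummy values).
cce = tf.constant([[0.5, 0.2, 0.1, 0.9],
                   [0.3, 0.4, 0.8, 0.6]])
seq_len = tf.constant([2, 3])
mask_2d = tf.sequence_mask(seq_len, 4)            # True for valid timesteps
masked = tf.where(mask_2d, cce, tf.zeros_like(cce))
ce_loss = tf.reduce_sum(masked, axis=1) / tf.cast(seq_len, tf.float32)
print(ce_loss.numpy())  # [0.35, 0.5]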

                                                Source code for doctr.models.recognition.master.tensorflow

                                                """ # Encode - feature = self.feat_extractor(x, **kwargs) - b, h, w, c = feature.get_shape() - # (N, H, W, C) --> (N, H * W, C) + feature = self.feature_extractor(x, **kwargs) + b, h, w, c = (tf.shape(feature)[i] for i in range(4)) feature = tf.reshape(feature, shape=(b, h * w, c)) - # add positional encoding to features - encoded = self.positional_encoding(feature, **kwargs) + encoded = feature + self.feature_pe[:, :h * w, :] out: Dict[str, tf.Tensor] = {} - if kwargs.get("training", False) and target is None: - raise ValueError("Need to provide labels during training") - if target is not None: # Compute target: tensor of gts and sequence lengths - gt, seq_len = self.build_target(target) - # Compute decoder masks - source_mask, target_mask = self.make_source_and_target_mask(encoded, gt) + gt, seq_len = self.compute_target(target) + + if kwargs.get('training', False): + if target is None: + raise AssertionError("In training mode, you need to pass a value to 'target'") + tgt_mask = self.make_mask(gt) # Compute logits - output = self.decoder(gt, encoded, source_mask, target_mask, **kwargs) + output = self.decoder(gt, encoded, tgt_mask, None, **kwargs) logits = self.linear(output, **kwargs) + else: + # When not training, we want to compute logits in with the decoder, although + # we have access to gts (we need gts to compute the loss, but not in the decoder) logits = self.decode(encoded, **kwargs) - if self.exportable: - out["logits"] = logits - return out - if target is not None: - out["loss"] = self.compute_loss(logits, gt, seq_len) + out['loss'] = self.compute_loss(logits, gt, seq_len) if return_model_output: - out["out_map"] = logits + out['out_map'] = logits if return_preds: - out["preds"] = self.postprocessor(logits) + predictions = self.postprocessor(logits) + out['preds'] = predictions return out - @tf.function def decode(self, encoded: tf.Tensor, **kwargs: Any) -> tf.Tensor: """Decode function for prediction @@ -500,38 +603,39 @@

                                                Source code for doctr.models.recognition.master.tensorflow

        Return:
            A Tuple of tf.Tensor: predictions, logits
        """
-       b = encoded.shape[0]
-
+       b = tf.shape(encoded)[0]
+       max_len = tf.constant(self.max_length, dtype=tf.int32)
        start_symbol = tf.constant(self.vocab_size + 1, dtype=tf.int32)  # SOS
        padding_symbol = tf.constant(self.vocab_size + 2, dtype=tf.int32)  # PAD

-       ys = tf.fill(dims=(b, self.max_length - 1), value=padding_symbol)
+       ys = tf.fill(dims=(b, max_len - 1), value=padding_symbol)
        start_vector = tf.fill(dims=(b, 1), value=start_symbol)
        ys = tf.concat([start_vector, ys], axis=-1)
-       # Final dimension include EOS/SOS/PAD
+       logits = tf.zeros(shape=(b, max_len - 1, self.vocab_size + 3), dtype=tf.float32)  # 3 symbols
+       # max_len = len + 2 (sos + eos)
        for i in range(self.max_length - 1):
-
-           source_mask, target_mask = self.make_source_and_target_mask(encoded, ys)
-           output = self.decoder(ys, encoded, source_mask, target_mask, **kwargs)
+           ys_mask = self.make_mask(ys)
+           output = self.decoder(ys, encoded, ys_mask, None, **kwargs)
            logits = self.linear(output, **kwargs)
            prob = tf.nn.softmax(logits, axis=-1)
-           next_token = tf.argmax(prob, axis=-1, output_type=ys.dtype)
-           # update ys with the next token and ignore the first token (SOS)
-           i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(self.max_length), indexing="ij")
+           next_word = tf.argmax(prob, axis=-1, output_type=ys.dtype)
+           # ys.shape = B, T
+           i_mesh, j_mesh = tf.meshgrid(tf.range(b), tf.range(max_len), indexing='ij')
            indices = tf.stack([i_mesh[:, i + 1], j_mesh[:, i + 1]], axis=1)
-           ys = tf.tensor_scatter_nd_update(ys, indices, next_token[:, i])
+           ys = tf.tensor_scatter_nd_update(ys, indices, next_word[:, i + 1])

-       # Shape (N, max_length, vocab_size + 1)
+       # final_logits of shape (N, max_length - 1, vocab_size + 1) (whithout sos)
        return logits


class MASTERPostProcessor(_MASTERPostProcessor):
    """Post processor for MASTER architectures
-
    Args:
        vocab: string containing the ordered sequence of supported characters
+       ignore_case: if True, ignore case of letters
+       ignore_accents: if True, ignore accents of letters
    """

    def __call__(
@@ -546,60 +650,51 @@
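The `ys` update inside the greedy decoding loop relies on `tf.tensor_scatter_nd_update`; a tiny standalone sketch of that pattern with a simplified index construction (arbitrary values, not the exact meshgrid indexing used above):

import tensorflow as tf

ys = tf.fill((2, 5), 99)                        # (N, T) filled with the PAD index (dummy)
next_word = tf.constant([[7, 3, 1, 4, 2],
                         [5, 6, 8, 9, 0]])
i = 0                                           # current decoding step
b = tf.shape(ys)[0]
indices = tf.stack([tf.range(b), tf.fill((2,), i + 1)], axis=1)   # rows (n, i + 1)
ys = tf.tensor_scatter_nd_update(ys, indices, next_word[:, i])
print(ys.numpy())  # column i + 1 now holds the predicted tokens for step i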

                                                Source code for doctr.models.recognition.master.tensorflow

                                                probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype="int32") + out_idxs = tf.cast(out_idxs, dtype='int32') embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] return list(zip(word_values, probs.numpy().tolist())) -def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool = True, **kwargs: Any) -> MASTER: - - pretrained_backbone = pretrained_backbone and not pretrained +def _master(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> MASTER: # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg["input_shape"] = kwargs.get("input_shape", _cfg["input_shape"]) - _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) + _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) - kwargs["vocab"] = _cfg["vocab"] - kwargs["input_shape"] = _cfg["input_shape"] + kwargs['vocab'] = _cfg['vocab'] # Build the model - model = MASTER( - backbone_fn(pretrained=pretrained_backbone, input_shape=_cfg["input_shape"], include_top=False), - cfg=_cfg, - **kwargs, - ) + model = MASTER(cfg=_cfg, **kwargs) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]["url"]) + load_pretrained_params(model, default_cfgs[arch]['url']) return model
-[docs]
+[docs]
 def master(pretrained: bool = False, **kwargs: Any) -> MASTER:
     """MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.
-
-    >>> import tensorflow as tf
-    >>> from doctr.models import master
-    >>> model = master(pretrained=False)
-    >>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
-    >>> out = model(input_tensor)
-
+    Example::
+        >>> import tensorflow as tf
+        >>> from doctr.models import master
+        >>> model = master(pretrained=False)
+        >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
+        >>> out = model(input_tensor)
     Args:
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
-
     Returns:
         text recognition architecture
     """
-    return _master("master", pretrained, magc_resnet31, **kwargs)
                                                + return _master('master', pretrained, **kwargs)
                                                @@ -633,7 +728,7 @@

                                                Source code for doctr.models.recognition.master.tensorflow

                                                +
                                                diff --git a/v0.6.0/_modules/doctr/models/recognition/parseq/tensorflow.html b/v0.6.0/_modules/doctr/models/recognition/parseq/tensorflow.html index 1bbbf829b1..93a3b2ea81 100644 --- a/v0.6.0/_modules/doctr/models/recognition/parseq/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/recognition/parseq/tensorflow.html @@ -305,7 +305,7 @@

                                                Source code for doctr.models.recognition.parseq.tensorflow

 import numpy as np
 import tensorflow as tf
-from keras import Model, layers
+from tensorflow.keras import Model, layers

 from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
@@ -462,7 +462,6 @@

                                                Source code for doctr.models.recognition.parseq.tensorflow

        self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)

-   @tf.function
    def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor:
        # Generates permutations of the target sequence.
        # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
@@ -509,7 +508,6 @@

                                                Source code for doctr.models.recognition.parseq.tensorflow

        )
        return combined

-   @tf.function
    def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
        # Generate source and target mask for the decoder attention.
        sz = permutation.shape[0]
@@ -529,7 +527,6 @@

                                                Source code for doctr.models.recognition.parseq.tensorflow

        target_mask = mask[1:, :-1]
        return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool)

-   @tf.function
    def decode(
        self,
        target: tf.Tensor,
diff --git a/v0.6.0/_modules/doctr/models/recognition/sar/tensorflow.html b/v0.6.0/_modules/doctr/models/recognition/sar/tensorflow.html
index f20a567cef..3a9989ef30 100644
--- a/v0.6.0/_modules/doctr/models/recognition/sar/tensorflow.html
+++ b/v0.6.0/_modules/doctr/models/recognition/sar/tensorflow.html
@@ -226,35 +226,20 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021, Mindee.
                                                 
                                                -# This program is licensed under the Apache License 2.0.
                                                -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                +# This program is licensed under the Apache License version 2.
                                                +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                 
                                                 from copy import deepcopy
                                                -from typing import Any, Dict, List, Optional, Tuple
                                                -
                                                 import tensorflow as tf
                                                -from tensorflow.keras import Model, Sequential, layers
                                                -
                                                -from doctr.datasets import VOCABS
                                                -from doctr.utils.repr import NestedObject
                                                +from tensorflow.keras import Sequential, layers, Model
                                                +from typing import Tuple, Dict, List, Any, Optional
                                                 
                                                -from ...classification import resnet31
                                                -from ...utils.tensorflow import load_pretrained_params
                                                +from ... import backbones
                                                +from ...utils import load_pretrained_params
                                                 from ..core import RecognitionModel, RecognitionPostProcessor
                                                +from doctr.utils.repr import NestedObject
                                                 
                                                -__all__ = ["SAR", "sar_resnet31"]
                                                +__all__ = ['SAR', 'SARPostProcessor', 'sar_vgg16_bn', 'sar_resnet31']
                                                 
                                                 default_cfgs: Dict[str, Dict[str, Any]] = {
                                                -    "sar_resnet31": {
                                                -        "mean": (0.694, 0.695, 0.693),
                                                -        "std": (0.299, 0.296, 0.301),
                                                -        "input_shape": (32, 128, 3),
                                                -        "vocab": VOCABS["french"],
                                                -        "url": None,
                                                +    'sar_vgg16_bn': {
                                                +        'mean': (.5, .5, .5),
                                                +        'std': (1., 1., 1.),
                                                +        'backbone': 'vgg16_bn', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
                                                +        'input_shape': (32, 128, 3),
                                                +        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
                                                +                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
                                                +        'url': 'https://github.com/mindee/doctr/releases/download/v0.1-models/sar_vgg16bn-0d7e2c26.zip',
                                                +    },
                                                +    'sar_resnet31': {
                                                +        'mean': (.5, .5, .5),
                                                +        'std': (1., 1., 1.),
                                                +        'backbone': 'resnet31', 'rnn_units': 512, 'max_length': 30, 'num_decoders': 2,
                                                +        'input_shape': (32, 128, 3),
                                                +        'vocab': ('3K}7eé;5àÎYho]QwV6qU~W"XnbBvcADfËmy.9ÔpÛ*{CôïE%M4#ÈR:g@T$x?0î£|za1ù8,OG€P-'
                                                +                  'kçHëÀÂ2É/ûIJ\'j(LNÙFut[)èZs+&°Sd=Ï!<â_Ç>rêi`l'),
                                                +        'url': 'https://github.com/mindee/doctr/releases/download/v0.1.0/sar_resnet31-ea202587.zip',
                                                     },
                                                 }
                                                 
                                                 
                                                -class SAREncoder(layers.Layer, NestedObject):
                                                -    """Implements encoder module of the SAR model
                                                -
                                                -    Args:
                                                -        rnn_units: number of hidden rnn units
                                                -        dropout_prob: dropout probability
                                                -    """
                                                -
                                                -    def __init__(self, rnn_units: int, dropout_prob: float = 0.0) -> None:
                                                -
                                                -        super().__init__()
                                                -        self.rnn = Sequential(
                                                -            [
                                                -                layers.LSTM(units=rnn_units, return_sequences=True, recurrent_dropout=dropout_prob),
                                                -                layers.LSTM(units=rnn_units, return_sequences=False, recurrent_dropout=dropout_prob),
                                                -            ]
                                                -        )
                                                -
                                                -    def call(
                                                -        self,
                                                -        x: tf.Tensor,
                                                -        **kwargs: Any,
                                                -    ) -> tf.Tensor:
                                                -        # (N, C)
                                                -        return self.rnn(x, **kwargs)
                                                -
                                                -
                                                 class AttentionModule(layers.Layer, NestedObject):
                                                     """Implements attention module of the SAR model
                                                 
                                                @@ -355,33 +321,20 @@ 

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                attention_units: number of hidden attention units """ - - def __init__(self, attention_units: int) -> None: + def __init__( + self, + attention_units: int + ) -> None: super().__init__() self.hidden_state_projector = layers.Conv2D( - attention_units, - 1, - strides=1, - use_bias=False, - padding="same", - kernel_initializer="he_normal", + attention_units, 1, strides=1, use_bias=False, padding='same', kernel_initializer='he_normal', ) self.features_projector = layers.Conv2D( - attention_units, - 3, - strides=1, - use_bias=True, - padding="same", - kernel_initializer="he_normal", + attention_units, 3, strides=1, use_bias=True, padding='same', kernel_initializer='he_normal', ) self.attention_projector = layers.Conv2D( - 1, - 1, - strides=1, - use_bias=False, - padding="same", - kernel_initializer="he_normal", + 1, 1, strides=1, use_bias=False, padding="same", kernel_initializer='he_normal', ) self.flatten = layers.Flatten() @@ -393,11 +346,10 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                ) -> tf.Tensor: [H, W] = features.get_shape().as_list()[1:3] - # shape (N, H, W, vgg_units) -> (N, H, W, attention_units) - features_projection = self.features_projector(features, **kwargs) # shape (N, 1, 1, rnn_units) -> (N, 1, 1, attention_units) - hidden_state = tf.expand_dims(tf.expand_dims(hidden_state, axis=1), axis=1) hidden_state_projection = self.hidden_state_projector(hidden_state, **kwargs) + # shape (N, H, W, vgg_units) -> (N, H, W, attention_units) + features_projection = self.features_projector(features, **kwargs) projection = tf.math.tanh(hidden_state_projection + features_projection) # shape (N, H, W, attention_units) -> (N, H, W, 1) attention = self.attention_projector(projection, **kwargs) @@ -407,8 +359,9 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                # shape (N, H * W) -> (N, H, W, 1) attention_map = tf.reshape(attention, [-1, H, W, 1]) glimpse = tf.math.multiply(features, attention_map) - # shape (N, H * W) -> (N, C) - return tf.reduce_sum(glimpse, axis=[1, 2]) + # shape (N, H * W) -> (N, 1) + glimpse = tf.reduce_sum(glimpse, axis=[1, 2]) + return glimpse class SARDecoder(layers.Layer, NestedObject): @@ -420,11 +373,9 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                vocab_size: number of classes in the model alphabet embedding_units: number of hidden embedding units attention_units: number of hidden attention units - num_decoder_cells: number of LSTMCell layers to stack - dropout_prob: dropout probability + num_decoder_layers: number of LSTM layers to stack """ - def __init__( self, rnn_units: int, @@ -432,23 +383,23 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                vocab_size: int, embedding_units: int, attention_units: int, - num_decoder_cells: int = 2, - dropout_prob: float = 0.0, + num_decoder_layers: int = 2, + input_shape: Optional[List[Tuple[Optional[int]]]] = None, ) -> None: super().__init__() self.vocab_size = vocab_size - self.max_length = max_length - - self.embed = layers.Dense(embedding_units, use_bias=False) - self.embed_tgt = layers.Embedding(embedding_units, self.vocab_size + 1) - - self.lstm_cells = layers.StackedRNNCells( - [layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_cells)] + self.lstm_decoder = layers.StackedRNNCells( + [layers.LSTMCell(rnn_units, dtype=tf.float32, implementation=1) for _ in range(num_decoder_layers)] ) + self.embed = layers.Dense(embedding_units, use_bias=False, input_shape=(None, self.vocab_size + 1)) self.attention_module = AttentionModule(attention_units) - self.output_dense = layers.Dense(self.vocab_size + 1, use_bias=True) - self.dropout = layers.Dropout(dropout_prob) + self.output_dense = layers.Dense(vocab_size + 1, use_bias=True, input_shape=(None, 2 * rnn_units)) + self.max_length = max_length + + # Initialize kernels + if input_shape is not None: + self.attention_module.call(layers.Input(input_shape[0][1:]), layers.Input((1, 1, rnn_units))) def call( self, @@ -458,53 +409,39 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                **kwargs: Any, ) -> tf.Tensor: - if gt is not None: - gt_embedding = self.embed_tgt(gt, **kwargs) - - logits_list: List[tf.Tensor] = [] - - for t in range(self.max_length + 1): # 32 - if t == 0: - # step to init the first states of the LSTMCell - states = self.lstm_cells.get_initial_state( - inputs=None, batch_size=features.shape[0], dtype=features.dtype - ) - prev_symbol = holistic - elif t == 1: - # step to init a 'blank' sequence of length vocab_size + 1 filled with zeros - # (N, vocab_size + 1) --> (N, embedding_units) - prev_symbol = tf.zeros([features.shape[0], self.vocab_size + 1]) - prev_symbol = self.embed(prev_symbol, **kwargs) + # initialize states (each of shape (N, rnn_units)) + states = self.lstm_decoder.get_initial_state( + inputs=None, batch_size=features.shape[0], dtype=tf.float32 + ) + # run first step of lstm + # holistic: shape (N, rnn_units) + _, states = self.lstm_decoder(holistic, states, **kwargs) + # Initialize with the index of virtual START symbol (placed after <eos>) + symbol = tf.fill(features.shape[0], self.vocab_size + 1) + logits_list = [] + if kwargs.get('training') and gt is None: + raise ValueError('Need to provide labels during training for teacher forcing') + for t in range(self.max_length + 1): # keep 1 step for <eos> + # one-hot symbol with depth vocab_size + 1 + # embeded_symbol: shape (N, embedding_units) + embeded_symbol = self.embed(tf.one_hot(symbol, depth=self.vocab_size + 1), **kwargs) + logits, states = self.lstm_decoder(embeded_symbol, states, **kwargs) + glimpse = self.attention_module( + features, tf.expand_dims(tf.expand_dims(logits, axis=1), axis=1), **kwargs, + ) + # logits: shape (N, rnn_units), glimpse: shape (N, 1) + logits = tf.concat([logits, glimpse], axis=-1) + # shape (N, rnn_units + 1) -> (N, vocab_size + 1) + logits = self.output_dense(logits, **kwargs) + # update symbol with predicted logits for t+1 step + if kwargs.get('training'): + symbol = gt[:, t] # type: ignore[index] else: - if gt is not None: - # (N, embedding_units) -2 because of <bos> and <eos> (same) - prev_symbol = self.embed(gt_embedding[:, t - 2], **kwargs) - else: - # -1 to start at timestep where prev_symbol was initialized - index = tf.argmax(logits_list[t - 1], axis=-1) - # update prev_symbol with ones at the index of the previous logit vector - # (N, embedding_units) - index = tf.ones_like(index) - prev_symbol = tf.scatter_nd( - tf.expand_dims(index, axis=1), - prev_symbol, - tf.constant([features.shape[0], features.shape[-1]], dtype=tf.int64), - ) - - # (N, C), (N, C) take the last hidden state and cell state from current timestep - _, states = self.lstm_cells(prev_symbol, states, **kwargs) - # states = (hidden_state, cell_state) - hidden_state = states[0][0] - # (N, H, W, C), (N, C) --> (N, C) - glimpse = self.attention_module(features, hidden_state, **kwargs) - # (N, C), (N, C) --> (N, 2 * C) - logits = tf.concat([hidden_state, glimpse], axis=1) - logits = self.dropout(logits, **kwargs) - # (N, vocab_size + 1) - logits_list.append(self.output_dense(logits, **kwargs)) - - # (max_length + 1, N, vocab_size + 1) --> (N, max_length + 1, vocab_size + 1) - return tf.transpose(tf.stack(logits_list[1:]), (1, 0, 2)) + symbol = tf.argmax(logits, axis=-1) + logits_list.append(logits) + outputs = tf.stack(logits_list, axis=1) # shape (N, max_length + 1, vocab_size + 1) + + return outputs class SAR(Model, RecognitionModel): @@ -518,13 +455,11 @@
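A compressed sketch of the symbol-update rule in the SAR decoder loop above: teacher forcing during training, greedy argmax otherwise (the tensors below are placeholders, not the model's real logits or targets):

import tensorflow as tf

logits = tf.random.uniform((4, 101))                          # (N, vocab_size + 1), dummy
gt = tf.random.uniform((4, 31), maxval=100, dtype=tf.int32)   # encoded targets, dummy
t, training = 3, False

if training:
    symbol = gt[:, t]                     # teacher forcing: feed the ground-truth character
else:
    symbol = tf.argmax(logits, axis=-1)   # greedy decoding: feed the model's own prediction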

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                embedding_units: number of embedding units attention_units: number of hidden units in attention module max_length: maximum word length handled by the model - num_decoder_cells: number of LSTMCell layers to stack - dropout_prob: dropout probability for the encoder and decoder - exportable: onnx exportable returns only logits - cfg: dictionary containing information about the model + num_decoders: number of LSTM to stack in decoder layer + """ - _children_names: List[str] = ["feat_extractor", "encoder", "decoder", "postprocessor"] + _children_names: List[str] = ['feat_extractor', 'encoder', 'decoder', 'postprocessor'] def __init__( self, @@ -534,35 +469,36 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                embedding_units: int = 512, attention_units: int = 512, max_length: int = 30, - num_decoder_cells: int = 2, - dropout_prob: float = 0.0, - exportable: bool = False, + num_decoders: int = 2, cfg: Optional[Dict[str, Any]] = None, ) -> None: super().__init__() self.vocab = vocab - self.exportable = exportable self.cfg = cfg + self.max_length = max_length + 1 # Add 1 timestep for EOS after the longest word self.feat_extractor = feature_extractor - self.encoder = SAREncoder(rnn_units, dropout_prob) + self.encoder = Sequential( + [ + layers.LSTM(units=rnn_units, return_sequences=True), + layers.LSTM(units=rnn_units, return_sequences=False) + ] + ) + # Initialize the kernels (watch out for reduce_max) + self.encoder.build(input_shape=(None,) + self.feat_extractor.output_shape[2:]) + self.decoder = SARDecoder( - rnn_units, - self.max_length, - len(vocab), - embedding_units, - attention_units, - num_decoder_cells, - dropout_prob, + rnn_units, max_length, len(vocab), embedding_units, attention_units, num_decoders, + input_shape=[self.feat_extractor.output_shape, self.encoder.output_shape] ) self.postprocessor = SARPostProcessor(vocab=vocab) - @staticmethod def compute_loss( + self, model_output: tf.Tensor, gt: tf.Tensor, seq_len: tf.Tensor, @@ -590,7 +526,7 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

        mask_values = tf.zeros_like(cce)
        mask_2d = tf.sequence_mask(seq_len, input_len)
        masked_loss = tf.where(mask_2d, cce, mask_values)
-       ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype))
+       ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, tf.float32))
        return tf.expand_dims(ce_loss, axis=1)

    def call(
@@ -603,25 +539,14 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

    ) -> Dict[str, Any]:

        features = self.feat_extractor(x, **kwargs)
-       # vertical max pooling --> (N, C, W)
-       pooled_features = tf.reduce_max(features, axis=1)
-       # holistic (N, C)
+       pooled_features = tf.reduce_max(features, axis=1)  # vertical max pooling
        encoded = self.encoder(pooled_features, **kwargs)
-       if target is not None:
-           gt, seq_len = self.build_target(target)
+       gt, seq_len = self.compute_target(target)
        seq_len = tf.cast(seq_len, tf.int32)
-
-       if kwargs.get("training", False) and target is None:
-           raise ValueError("Need to provide labels during training for teacher forcing")
-
        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)

        out: Dict[str, tf.Tensor] = {}
-       if self.exportable:
-           out["logits"] = decoded_features
-           return out
-
        if return_model_output:
            out["out_map"] = decoded_features
@@ -630,7 +555,7 @@
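`tf.reduce_max(features, axis=1)` collapses the height dimension of the feature map; a one-liner to illustrate the resulting shape (the feature-map size is a made-up placeholder):

import tensorflow as tf

features = tf.random.uniform((2, 4, 40, 512))   # (N, H, W, C) from the backbone, dummy shape
pooled = tf.reduce_max(features, axis=1)        # (N, W, C): vertical max pooling
print(pooled.shape)                             # (2, 40, 512)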

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                out["preds"] = self.postprocessor(decoded_features) if target is not None: - out["loss"] = self.compute_loss(decoded_features, gt, seq_len) + out['loss'] = self.compute_loss(decoded_features, gt, seq_len) return out @@ -640,6 +565,8 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                Args: vocab: string containing the ordered sequence of supported characters + ignore_case: if True, ignore case of letters + ignore_accents: if True, ignore accents of letters """ def __call__( @@ -654,61 +581,86 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                probs = tf.math.reduce_min(probs, axis=1) # decode raw output of the model with tf_label_to_idx - out_idxs = tf.cast(out_idxs, dtype="int32") + out_idxs = tf.cast(out_idxs, dtype='int32') embedding = tf.constant(self._embedding, dtype=tf.string) decoded_strings_pred = tf.strings.reduce_join(inputs=tf.nn.embedding_lookup(embedding, out_idxs), axis=-1) decoded_strings_pred = tf.strings.split(decoded_strings_pred, "<eos>") - decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0] + decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value='not valid')[:, 0] word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()] return list(zip(word_values, probs.numpy().tolist())) -def _sar( - arch: str, - pretrained: bool, - backbone_fn, - pretrained_backbone: bool = True, - input_shape: Optional[Tuple[int, int, int]] = None, - **kwargs: Any, -) -> SAR: - - pretrained_backbone = pretrained_backbone and not pretrained +def _sar(arch: str, pretrained: bool, input_shape: Tuple[int, int, int] = None, **kwargs: Any) -> SAR: # Patch the config _cfg = deepcopy(default_cfgs[arch]) - _cfg["input_shape"] = input_shape or _cfg["input_shape"] - _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"]) + _cfg['input_shape'] = input_shape or _cfg['input_shape'] + _cfg['vocab'] = kwargs.get('vocab', _cfg['vocab']) + _cfg['rnn_units'] = kwargs.get('rnn_units', _cfg['rnn_units']) + _cfg['embedding_units'] = kwargs.get('embedding_units', _cfg['rnn_units']) + _cfg['attention_units'] = kwargs.get('attention_units', _cfg['rnn_units']) + _cfg['max_length'] = kwargs.get('max_length', _cfg['max_length']) + _cfg['num_decoders'] = kwargs.get('num_decoders', _cfg['num_decoders']) # Feature extractor - feat_extractor = backbone_fn( - pretrained=pretrained_backbone, - input_shape=_cfg["input_shape"], + feat_extractor = backbones.__dict__[default_cfgs[arch]['backbone']]( + input_shape=_cfg['input_shape'], include_top=False, ) - kwargs["vocab"] = _cfg["vocab"] + kwargs['vocab'] = _cfg['vocab'] + kwargs['rnn_units'] = _cfg['rnn_units'] + kwargs['embedding_units'] = _cfg['embedding_units'] + kwargs['attention_units'] = _cfg['attention_units'] + kwargs['max_length'] = _cfg['max_length'] + kwargs['num_decoders'] = _cfg['num_decoders'] # Build the model model = SAR(feat_extractor, cfg=_cfg, **kwargs) # Load pretrained parameters if pretrained: - load_pretrained_params(model, default_cfgs[arch]["url"]) + load_pretrained_params(model, default_cfgs[arch]['url']) return model +
                                                +[docs] +def sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) -> SAR: + """SAR with a VGG16 feature extractor as described in `"Show, Attend and Read:A Simple and Strong + Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. + + Example:: + >>> import tensorflow as tf + >>> from doctr.models import sar_vgg16_bn + >>> model = sar_vgg16_bn(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) + + Args: + pretrained (bool): If True, returns a model pre-trained on our text recognition dataset + + Returns: + text recognition architecture + """ + + return _sar('sar_vgg16_bn', pretrained, **kwargs)
                                                + + +
                                                -[docs] +[docs] def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR: """SAR with a resnet-31 feature extractor as described in `"Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_. - >>> import tensorflow as tf - >>> from doctr.models import sar_resnet31 - >>> model = sar_resnet31(pretrained=False) - >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) - >>> out = model(input_tensor) + Example: + >>> import tensorflow as tf + >>> from doctr.models import sar_resnet31 + >>> model = sar_resnet31(pretrained=False) + >>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32) + >>> out = model(input_tensor) Args: pretrained (bool): If True, returns a model pre-trained on our text recognition dataset @@ -717,7 +669,7 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                text recognition architecture """ - return _sar("sar_resnet31", pretrained, resnet31, **kwargs)
                                                + return _sar('sar_resnet31', pretrained, **kwargs)
                                                @@ -751,7 +703,7 @@

                                                Source code for doctr.models.recognition.sar.tensorflow

                                                +
                                                diff --git a/v0.6.0/_modules/doctr/models/recognition/vitstr/tensorflow.html b/v0.6.0/_modules/doctr/models/recognition/vitstr/tensorflow.html index 8ac4cc55ce..aecde3662a 100644 --- a/v0.6.0/_modules/doctr/models/recognition/vitstr/tensorflow.html +++ b/v0.6.0/_modules/doctr/models/recognition/vitstr/tensorflow.html @@ -235,12 +235,15 @@

                                                Package Reference

                                                Source code for doctr.models.recognition.vitstr.tensorflow

                                                -# Copyright (C) 2022, Mindee.
                                                +# Copyright (C) 2021-2024, Mindee.
                                                 
                                                 # This program is licensed under the Apache License 2.0.
                                                 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                @@ -304,7 +307,7 @@ 

                                                Source code for doctr.models.recognition.vitstr.tensorflow

                                                from doctr.datasets import VOCABS from ...classification import vit_b, vit_s -from ...utils.tensorflow import load_pretrained_params +from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params from .base import _ViTSTR, _ViTSTRPostProcessor __all__ = ["ViTSTR", "vitstr_small", "vitstr_base"] @@ -315,14 +318,14 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

                                                "std": (0.299, 0.296, 0.301), "input_shape": (32, 128, 3), "vocab": VOCABS["french"], - "url": None, + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vitstr_small-d28b8d92.weights.h5&src=0", }, "vitstr_base": { "mean": (0.694, 0.695, 0.693), "std": (0.299, 0.296, 0.301), "input_shape": (32, 128, 3), "vocab": VOCABS["french"], - "url": None, + "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vitstr_base-9ad6eb84.weights.h5&src=0", }, } @@ -332,6 +335,7 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

                                                Efficient Scene Text Recognition" <https://arxiv.org/pdf/2105.08582.pdf>`_. Args: + ---- feature_extractor: the backbone serving as feature extractor vocab: vocabulary used for encoding embedding_units: number of embedding units @@ -349,22 +353,20 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

                                                feature_extractor, vocab: str, embedding_units: int, - max_length: int = 25, + max_length: int = 32, dropout_prob: float = 0.0, input_shape: Tuple[int, int, int] = (32, 128, 3), # different from paper exportable: bool = False, cfg: Optional[Dict[str, Any]] = None, ) -> None: - super().__init__() self.vocab = vocab self.exportable = exportable self.cfg = cfg - # NOTE: different from paper, who uses eos also as pad token - self.max_length = max_length + 3 # Add 1 step for EOS, 1 for SOS, 1 for PAD + self.max_length = max_length + 2 # +2 for SOS and EOS self.feat_extractor = feature_extractor - self.head = layers.Dense(len(self.vocab) + 3, name="head") + self.head = layers.Dense(len(self.vocab) + 1, name="head") # +1 for EOS self.postprocessor = ViTSTRPostProcessor(vocab=self.vocab) @@ -378,11 +380,13 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

                                                Sequences are masked after the EOS character. Args: + ---- model_output: predicted logits of the model gt: the encoded tensor with gt labels seq_len: lengths of each gt word inside the batch Returns: + ------- The loss of the model on the batch """ # Input length : number of steps @@ -392,11 +396,11 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

                                                # One-hot gt labels oh_gt = tf.one_hot(gt, depth=model_output.shape[2]) # Compute loss: don't forget to shift gt! Otherwise the model learns to output the gt[t-1]! - # The "masked" first gt char is <sos>. Delete last logit of the model output. - cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt[:, 1:, :], model_output[:, :-1, :]) + # The "masked" first gt char is <sos>. + cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt[:, 1:, :], model_output) # Compute mask mask_values = tf.zeros_like(cce) - mask_2d = tf.sequence_mask(seq_len, input_len - 1) # delete the last mask timestep as well + mask_2d = tf.sequence_mask(seq_len, input_len) masked_loss = tf.where(mask_2d, cce, mask_values) ce_loss = tf.math.divide(tf.reduce_sum(masked_loss, axis=1), tf.cast(seq_len, model_output.dtype)) @@ -410,7 +414,6 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

                                                return_preds: bool = False, **kwargs: Any, ) -> Dict[str, Any]: - features = self.feat_extractor(x, **kwargs) # (batch_size, patches_seqlen, d_model) if target is not None: @@ -420,12 +423,13 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

                                                if kwargs.get("training", False) and target is None: raise ValueError("Need to provide labels during training") - features = features[:, : self.max_length + 1] # add 1 for unused cls token (ViT) - # (batch_size, max_length + 1, d_model) + features = features[:, : self.max_length] # (batch_size, max_length, d_model) B, N, E = features.shape features = tf.reshape(features, (B * N, E)) - logits = tf.reshape(self.head(features), (B, N, len(self.vocab) + 3)) # (batch_size, max_length + 1, vocab + 3) - decoded_features = logits[:, 1:] # remove cls_token + logits = tf.reshape( + self.head(features, **kwargs), (B, N, len(self.vocab) + 1) + ) # (batch_size, max_length, vocab + 1) + decoded_features = _bf16_to_float32(logits[:, 1:]) # remove cls_token out: Dict[str, tf.Tensor] = {} if self.exportable: @@ -449,6 +453,7 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

                                                """Post processor for ViTSTR architecture Args: + ---- vocab: string containing the ordered sequence of supported characters """ @@ -458,10 +463,7 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

                                                ) -> List[Tuple[str, float]]: # compute pred with argmax for attention models out_idxs = tf.math.argmax(logits, axis=2) - # N x L - probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2) - # Take the minimum confidence of the sequence - probs = tf.math.reduce_min(probs, axis=1) + preds_prob = tf.math.reduce_max(tf.nn.softmax(logits, axis=-1), axis=-1) # decode raw output of the model with tf_label_to_idx out_idxs = tf.cast(out_idxs, dtype="int32") @@ -471,39 +473,50 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

        decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0]
        word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]

-        return list(zip(word_values, probs.numpy().tolist()))
+        # compute probabilties for each word up to the EOS token
+        probs = [
+            preds_prob[i, : len(word)].numpy().clip(0, 1).mean().item() if word else 0.0
+            for i, word in enumerate(word_values)
+        ]
+
+        return list(zip(word_values, probs))


 def _vitstr(
     arch: str,
     pretrained: bool,
     backbone_fn,
-    pretrained_backbone: bool = False,  # NOTE: training from scratch without a pretrained backbone works better
     input_shape: Optional[Tuple[int, int, int]] = None,
     **kwargs: Any,
 ) -> ViTSTR:
-
-    pretrained_backbone = pretrained_backbone and not pretrained
-
     # Patch the config
     _cfg = deepcopy(default_cfgs[arch])
     _cfg["input_shape"] = input_shape or _cfg["input_shape"]
     _cfg["vocab"] = kwargs.get("vocab", _cfg["vocab"])
+    patch_size = kwargs.get("patch_size", (4, 8))

     kwargs["vocab"] = _cfg["vocab"]

     # Feature extractor
     feat_extractor = backbone_fn(
-        pretrained=pretrained_backbone,
+        # NOTE: we don't use a pretrained backbone for non-rectangular patches to avoid the pos embed mismatch
+        pretrained=False,
         input_shape=_cfg["input_shape"],
+        patch_size=patch_size,
         include_top=False,
     )

+    kwargs.pop("patch_size", None)
+    kwargs.pop("pretrained_backbone", None)
+
     # Build the model
     model = ViTSTR(feat_extractor, cfg=_cfg, **kwargs)
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]["url"])
+        # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]
+        )

     return model
@@ -521,17 +534,20 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

    >>> out = model(input_tensor)

    Args:
+    ----
        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+        **kwargs: keyword arguments of the ViTSTR architecture

    Returns:
+    -------
        text recognition architecture
    """
-
    return _vitstr(
        "vitstr_small",
        pretrained,
        vit_s,
        embedding_units=384,
+        patch_size=(4, 8),
        **kwargs,
    )
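The compute_loss hunk above stops trimming the last logit of the model output and instead masks every timestep after the EOS character. As a rough standalone sketch of that masking logic, with made-up shapes and random tensors rather than doctr's real training inputs:

    import tensorflow as tf

    # Toy dimensions only: 2 samples, 6 logit timesteps, vocab of 5 symbols
    batch_size, input_len, vocab = 2, 6, 5
    model_output = tf.random.uniform((batch_size, input_len, vocab))          # predicted logits
    gt = tf.random.uniform((batch_size, input_len + 1), 0, vocab, tf.int32)   # encoded labels, step 0 is <sos>
    seq_len = tf.constant([3, 5])                                             # word lengths inside the batch

    oh_gt = tf.one_hot(gt, depth=vocab)
    # Shift the ground truth so the model is not rewarded for copying gt[t-1]
    cce = tf.nn.softmax_cross_entropy_with_logits(oh_gt[:, 1:, :], model_output)
    # Zero out every timestep after the EOS character before averaging per word
    mask_2d = tf.sequence_mask(seq_len, input_len)
    masked_loss = tf.where(mask_2d, cce, tf.zeros_like(cce))
    ce_loss = tf.reduce_sum(masked_loss, axis=1) / tf.cast(seq_len, masked_loss.dtype)
    print(float(tf.reduce_mean(ce_loss)))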
                                                @@ -550,17 +566,20 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

    >>> out = model(input_tensor)

    Args:
+    ----
        pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+        **kwargs: keyword arguments of the ViTSTR architecture

    Returns:
+    -------
        text recognition architecture
    """
-
    return _vitstr(
        "vitstr_base",
        pretrained,
        vit_b,
        embedding_units=768,
+        patch_size=(4, 8),
        **kwargs,
    )
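Both constructors above now pass patch_size=(4, 8) explicitly. Assuming a 32x128 input resolution (an assumption from the model configs, not restated in these hunks), a quick check of what that implies for the patch grid:

    # Hypothetical input resolution, taken as an assumption: 32 x 128 (H x W)
    img_h, img_w = 32, 128
    patch_h, patch_w = 4, 8
    num_patches = (img_h // patch_h) * (img_w // patch_w)
    print(num_patches)  # 8 * 16 = 128 patch embeddings fed to the ViT encoder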
                                                @@ -596,7 +615,7 @@

                                                Source code for doctr.models.recognition.vitstr.tensorflow

                                                +
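The post-processor hunk earlier in this file replaces the gather-and-min confidence with the per-step maximum of the softmax, averaged over the decoded word. A rough standalone illustration of that computation, using dummy logits and invented words rather than real model output:

    import tensorflow as tf

    words = ["cat", "go", ""]                   # pretend decoded strings
    logits = tf.random.uniform((3, 4, 10))      # dummy (batch, max_length, vocab) logits

    preds_prob = tf.math.reduce_max(tf.nn.softmax(logits, axis=-1), axis=-1)  # (batch, max_length)
    # Average the per-character confidence over the length of each decoded word
    probs = [
        preds_prob[i, : len(word)].numpy().clip(0, 1).mean().item() if word else 0.0
        for i, word in enumerate(words)
    ]
    print(list(zip(words, probs)))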
diff --git a/v0.6.0/_modules/doctr/models/recognition/zoo.html b/v0.6.0/_modules/doctr/models/recognition/zoo.html
index d1dec9eb03..0f1bff8861 100644
--- a/v0.6.0/_modules/doctr/models/recognition/zoo.html
+++ b/v0.6.0/_modules/doctr/models/recognition/zoo.html
@@ -226,35 +226,20 @@

                                                Source code for doctr.models.recognition.zoo

                                                -# Copyright (C) 2021-2022, Mindee.
                                                -
                                                -# This program is licensed under the Apache License 2.0.
                                                -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                +# Copyright (C) 2021, Mindee.
                                                 
                                                -from typing import Any, List
                                                +# This program is licensed under the Apache License version 2.
                                                +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                 
                                                -from doctr.file_utils import is_tf_available
                                                -from doctr.models.preprocessor import PreProcessor
                                                +from typing import Any
                                                 
                                                +from doctr.file_utils import is_tf_available, is_torch_available
                                                +from .core import RecognitionPredictor
                                                +from ..preprocessor import PreProcessor
                                                 from .. import recognition
                                                -from .predictor import RecognitionPredictor
                                                -
                                                -__all__ = ["recognition_predictor"]
                                                 
                                                 
                                                -ARCHS: List[str] = [
                                                -    "crnn_vgg16_bn",
                                                -    "crnn_mobilenet_v3_small",
                                                -    "crnn_mobilenet_v3_large",
                                                -    "sar_resnet31",
                                                -    "master",
                                                -    "vitstr_small",
                                                -    "vitstr_base",
                                                -]
                                                +__all__ = ["recognition_predictor"]
                                                 
                                                 
                                                -def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
                                                +if is_tf_available():
                                                +    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31', 'master']
                                                +elif is_torch_available():
                                                +    ARCHS = ['crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31']
                                                 
                                                -    if isinstance(arch, str):
                                                -        if arch not in ARCHS:
                                                -            raise ValueError(f"unknown architecture '{arch}'")
                                                 
                                                -        _model = recognition.__dict__[arch](
                                                -            pretrained=pretrained, pretrained_backbone=kwargs.get("pretrained_backbone", True)
                                                -        )
                                                -    else:
                                                -        if not isinstance(arch, (recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR)):
                                                -            raise ValueError(f"unknown architecture: {type(arch)}")
                                                -        _model = arch
                                                +def _predictor(arch: str, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
                                                 
                                                -    kwargs.pop("pretrained_backbone", None)
                                                +    if arch not in ARCHS:
                                                +        raise ValueError(f"unknown architecture '{arch}'")
                                                 
                                                -    kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
                                                -    kwargs["std"] = kwargs.get("std", _model.cfg["std"])
                                                -    kwargs["batch_size"] = kwargs.get("batch_size", 32)
                                                -    input_shape = _model.cfg["input_shape"][:2] if is_tf_available() else _model.cfg["input_shape"][-2:]
                                                -    predictor = RecognitionPredictor(PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs), _model)
                                                +    _model = recognition.__dict__[arch](pretrained=pretrained)
                                                +    kwargs['mean'] = kwargs.get('mean', _model.cfg['mean'])
                                                +    kwargs['std'] = kwargs.get('std', _model.cfg['std'])
                                                +    kwargs['batch_size'] = kwargs.get('batch_size', 32)
                                                +    predictor = RecognitionPredictor(
                                                +        PreProcessor(_model.cfg['input_shape'][:2], preserve_aspect_ratio=True, **kwargs),
                                                +        _model
                                                +    )
                                                 
                                                     return predictor
                                                 
                                                 
                                                 
-[docs]
-def recognition_predictor(arch: Any = "crnn_vgg16_bn", pretrained: bool = False, **kwargs: Any) -> RecognitionPredictor:
+[docs]
+def recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) -> RecognitionPredictor:
     """Text recognition architecture.

     Example::
@@ -355,7 +327,7 @@

                                                Source code for doctr.models.recognition.zoo

                                                        >>> out = model([input_page])
                                                 
                                                     Args:
                                                -        arch: name of the architecture or model itself to use (e.g. 'crnn_vgg16_bn')
                                                +        arch: name of the architecture to use ('crnn_vgg16_bn', 'crnn_resnet31', 'sar_vgg16_bn', 'sar_resnet31')
                                                         pretrained: If True, returns a model pre-trained on our text recognition dataset
                                                 
                                                     Returns:
                                                @@ -396,7 +368,7 @@ 

                                                Source code for doctr.models.recognition.zoo

                                                   
                                                -
                                                +
diff --git a/v0.6.0/_modules/doctr/models/zoo.html b/v0.6.0/_modules/doctr/models/zoo.html
index 7965effc9f..bfa5a6fdf4 100644
--- a/v0.6.0/_modules/doctr/models/zoo.html
+++ b/v0.6.0/_modules/doctr/models/zoo.html
@@ -226,35 +226,15 @@

                                                Source code for doctr.models.zoo

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021, Mindee.
                                                 
                                                -# This program is licensed under the Apache License 2.0.
                                                -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                +# This program is licensed under the Apache License version 2.
                                                +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                 
                                                 from typing import Any
                                                -
                                                +from .core import OCRPredictor
                                                 from .detection.zoo import detection_predictor
                                                -from .predictor import OCRPredictor
                                                 from .recognition.zoo import recognition_predictor
                                                 
                                                +
                                                 __all__ = ["ocr_predictor"]
                                                 
                                                 
                                                -def _predictor(
                                                -    det_arch: Any,
                                                -    reco_arch: Any,
                                                -    pretrained: bool,
                                                -    pretrained_backbone: bool = True,
                                                -    assume_straight_pages: bool = True,
                                                -    preserve_aspect_ratio: bool = False,
                                                -    symmetric_pad: bool = True,
                                                -    det_bs: int = 2,
                                                -    reco_bs: int = 128,
                                                -    detect_orientation: bool = False,
                                                -    detect_language: bool = False,
                                                -    **kwargs,
                                                -) -> OCRPredictor:
                                                +def _predictor(det_arch: str, reco_arch: str, pretrained: bool, det_bs=2, reco_bs=128) -> OCRPredictor:
                                                 
                                                     # Detection
                                                -    det_predictor = detection_predictor(
                                                -        det_arch,
                                                -        pretrained=pretrained,
                                                -        pretrained_backbone=pretrained_backbone,
                                                -        batch_size=det_bs,
                                                -        assume_straight_pages=assume_straight_pages,
                                                -        preserve_aspect_ratio=preserve_aspect_ratio,
                                                -        symmetric_pad=symmetric_pad,
                                                -    )
                                                +    det_predictor = detection_predictor(det_arch, pretrained=pretrained, batch_size=det_bs)
                                                 
                                                     # Recognition
                                                -    reco_predictor = recognition_predictor(
                                                -        reco_arch, pretrained=pretrained, pretrained_backbone=pretrained_backbone, batch_size=reco_bs
                                                -    )
                                                +    reco_predictor = recognition_predictor(reco_arch, pretrained=pretrained, batch_size=reco_bs)
                                                 
                                                -    return OCRPredictor(
                                                -        det_predictor,
                                                -        reco_predictor,
                                                -        assume_straight_pages=assume_straight_pages,
                                                -        preserve_aspect_ratio=preserve_aspect_ratio,
                                                -        symmetric_pad=symmetric_pad,
                                                -        detect_orientation=detect_orientation,
                                                -        detect_language=detect_language,
                                                -        **kwargs,
                                                -    )
                                                +    return OCRPredictor(det_predictor, reco_predictor)
                                                 
                                                 
                                                 
-[docs]
+[docs]
 def ocr_predictor(
-    det_arch: Any = "db_resnet50",
-    reco_arch: Any = "crnn_vgg16_bn",
+    det_arch: str = 'db_resnet50',
+    reco_arch: str = 'crnn_vgg16_bn',
     pretrained: bool = False,
-    pretrained_backbone: bool = True,
-    assume_straight_pages: bool = True,
-    preserve_aspect_ratio: bool = False,
-    symmetric_pad: bool = True,
-    export_as_straight_boxes: bool = False,
-    detect_orientation: bool = False,
-    detect_language: bool = False,
-    **kwargs: Any,
+    **kwargs: Any
 ) -> OCRPredictor:
     """End-to-end OCR architecture using one model for localization, and another for text recognition.

-    >>> import numpy as np
-    >>> from doctr.models import ocr_predictor
-    >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
-    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
-    >>> out = model([input_page])
+    Example::
+        >>> import numpy as np
+        >>> from doctr.models import ocr_predictor
+        >>> model = ocr_predictor(pretrained=True)
+        >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
+        >>> out = model([input_page])

     Args:
-        det_arch: name of the detection architecture or the model itself to use
-            (e.g. 'db_resnet50', 'db_mobilenet_v3_large')
-        reco_arch: name of the recognition architecture or the model itself to use
-            (e.g. 'crnn_vgg16_bn', 'sar_resnet31')
+        arch: name of the architecture to use ('db_sar_vgg', 'db_sar_resnet', 'db_crnn_vgg', 'db_crnn_resnet')
         pretrained: If True, returns a model pre-trained on our OCR dataset
-        pretrained_backbone: If True, returns a model with a pretrained backbone
-        assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages
-            without rotated textual elements.
-        preserve_aspect_ratio: If True, pad the input document image to preserve the aspect ratio before
-            running the detection model on it.
-        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right.
-        export_as_straight_boxes: when assume_straight_pages is set to False, export final predictions
-            (potentially rotated) as straight bounding boxes.
-        detect_orientation: if True, the estimated general page orientation will be added to the predictions for each
-            page. Doing so will slightly deteriorate the overall latency.
-        detect_language: if True, the language prediction will be added to the predictions for each
-            page. Doing so will slightly deteriorate the overall latency.
-        kwargs: keyword args of `OCRPredictor`

     Returns:
         OCR predictor
     """
-    return _predictor(
-        det_arch,
-        reco_arch,
-        pretrained,
-        pretrained_backbone=pretrained_backbone,
-        assume_straight_pages=assume_straight_pages,
-        preserve_aspect_ratio=preserve_aspect_ratio,
-        symmetric_pad=symmetric_pad,
-        export_as_straight_boxes=export_as_straight_boxes,
-        detect_orientation=detect_orientation,
-        detect_language=detect_language,
-        **kwargs,
-    )
                                                + return _predictor(det_arch, reco_arch, pretrained, **kwargs)
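For reference, the usage shown in the docstring above can be exercised end to end roughly as follows (a sketch assuming doctr is installed; pretrained weights are fetched on first use):

    import numpy as np
    from doctr.models import ocr_predictor

    model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
    # A fake 600x800 RGB page; any uint8 numpy page image works here
    input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
    out = model([input_page])
    print(out)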
                                                @@ -440,7 +354,7 @@

                                                Source code for doctr.models.zoo

                                                       
                                                     
                                                   
                                                -
                                                +
diff --git a/v0.6.0/_modules/doctr/transforms/modules/base.html b/v0.6.0/_modules/doctr/transforms/modules/base.html
index e3d55174af..e7b5ea10d9 100644
--- a/v0.6.0/_modules/doctr/transforms/modules/base.html
+++ b/v0.6.0/_modules/doctr/transforms/modules/base.html
@@ -226,35 +226,20 @@

                                                Source code for doctr.transforms.modules.base

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021, Mindee.
                                                 
                                                -# This program is licensed under the Apache License 2.0.
                                                -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                +# This program is licensed under the Apache License version 2.
                                                +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                 
                                                -import math
                                                 import random
                                                -from typing import Any, Callable, Dict, List, Tuple
                                                -
                                                -import numpy as np
                                                +from typing import List, Any, Callable
                                                 
                                                 from doctr.utils.repr import NestedObject
                                                -
                                                 from .. import functional as F
                                                 
                                                -__all__ = ["SampleCompose", "ImageTransform", "ColorInversion", "OneOf", "RandomApply", "RandomRotate", "RandomCrop"]
                                                -
                                                -
                                                -class SampleCompose(NestedObject):
                                                -    """Implements a wrapper that will apply transformations sequentially on both image and target
                                                -
                                                -    .. tabs::
                                                -
                                                -        .. tab:: TensorFlow
                                                -
                                                -            .. code:: python
                                                -
                                                -                >>> import numpy as np
                                                -                >>> import tensorflow as tf
                                                -                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
                                                -                >>> transfo = SampleCompose([ImageTransform(ColorInversion((32, 32))), RandomRotate(30)])
                                                -                >>> out, out_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.zeros((2, 4)))
                                                -
                                                -        .. tab:: PyTorch
                                                -
                                                -            .. code:: python
                                                -
                                                -                >>> import numpy as np
                                                -                >>> import torch
                                                -                >>> from doctr.transforms import SampleCompose, ImageTransform, ColorInversion, RandomRotate
                                                -                >>> transfos = SampleCompose([ImageTransform(ColorInversion((32, 32))), RandomRotate(30)])
                                                -                >>> out, out_boxes = transfos(torch.rand(8, 64, 64, 3), np.zeros((2, 4)))
                                                -
                                                -    Args:
                                                -        transforms: list of transformation modules
                                                -    """
                                                -
                                                -    _children_names: List[str] = ["sample_transforms"]
                                                -
                                                -    def __init__(self, transforms: List[Callable[[Any, Any], Tuple[Any, Any]]]) -> None:
                                                -        self.sample_transforms = transforms
                                                -
                                                -    def __call__(self, x: Any, target: Any) -> Tuple[Any, Any]:
                                                -        for t in self.sample_transforms:
                                                -            x, target = t(x, target)
                                                -
                                                -        return x, target
                                                -
                                                -
                                                -class ImageTransform(NestedObject):
                                                -    """Implements a transform wrapper to turn an image-only transformation into an image+target transform
                                                -
                                                -    .. tabs::
                                                -
                                                -        .. tab:: TensorFlow
                                                -
                                                -            .. code:: python
                                                -
                                                -                >>> import tensorflow as tf
                                                -                >>> from doctr.transforms import ImageTransform, ColorInversion
                                                -                >>> transfo = ImageTransform(ColorInversion((32, 32)))
                                                -                >>> out, _ = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), None)
                                                -
                                                -        .. tab:: PyTorch
                                                -
                                                -            .. code:: python
                                                -
                                                -                >>> import torch
                                                -                >>> from doctr.transforms import ImageTransform, ColorInversion
                                                -                >>> transfo = ImageTransform(ColorInversion((32, 32)))
                                                -                >>> out, _ = transfo(torch.rand(8, 64, 64, 3), None)
                                                -
                                                -    Args:
                                                -        transform: the image transformation module to wrap
                                                -    """
                                                -
                                                -    _children_names: List[str] = ["img_transform"]
                                                -
                                                -    def __init__(self, transform: Callable[[Any], Any]) -> None:
                                                -        self.img_transform = transform
                                                 
                                                -    def __call__(self, img: Any, target: Any) -> Tuple[Any, Any]:
                                                -        img = self.img_transform(img)
                                                -        return img, target
                                                +__all__ = ['ColorInversion', 'OneOf', 'RandomApply']
                                                 
                                                 
                                                 
-[docs]
+[docs]
 class ColorInversion(NestedObject):
     """Applies the following tranformation to a tensor (image or batch of images):
     convert to grayscale, colorize (shift 0-values randomly), and then invert colors

-    .. tabs::
-
-        .. tab:: TensorFlow
-
-            .. code:: python
-
-                >>> import tensorflow as tf
-                >>> from doctr.transforms import ColorInversion
-                >>> transfo = ColorInversion(min_val=0.6)
-                >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
-
-        .. tab:: PyTorch
-
-            .. code:: python
-
-                >>> import torch
-                >>> from doctr.transforms import ColorInversion
-                >>> transfo = ColorInversion(min_val=0.6)
-                >>> out = transfo(torch.rand(8, 64, 64, 3))
+    Example::
+        >>> from doctr.transforms import Normalize
+        >>> import tensorflow as tf
+        >>> transfo = ColorInversion(min_val=0.6)
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))

     Args:
         min_val: range [min_val, 1] to colorize RGB pixels
     """
-
     def __init__(self, min_val: float = 0.5) -> None:
         self.min_val = min_val
@@ -428,35 +317,21 @@

                                                Source code for doctr.transforms.modules.base

-[docs]
+[docs]
 class OneOf(NestedObject):
     """Randomly apply one of the input transformations

-    .. tabs::
-
-        .. tab:: TensorFlow
-
-            .. code:: python
-
-                >>> import tensorflow as tf
-                >>> from doctr.transforms import OneOf
-                >>> transfo = OneOf([JpegQuality(), Gamma()])
-                >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-        .. tab:: PyTorch
-
-            .. code:: python
-
-                >>> import torch
-                >>> from doctr.transforms import OneOf
-                >>> transfo = OneOf([JpegQuality(), Gamma()])
-                >>> out = transfo(torch.rand(1, 64, 64, 3))
+    Example::
+        >>> from doctr.transforms import Normalize
+        >>> import tensorflow as tf
+        >>> transfo = OneOf([JpegQuality(), Gamma()])
+        >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))

     Args:
         transforms: list of transformations, one only will be picked
     """
-    _children_names: List[str] = ["transforms"]
+    _children_names: List[str] = ['transforms']

     def __init__(self, transforms: List[Callable[[Any], Any]]) -> None:
         self.transforms = transforms
@@ -470,36 +345,21 @@

                                                Source code for doctr.transforms.modules.base

-[docs]
+[docs]
 class RandomApply(NestedObject):
     """Apply with a probability p the input transformation

-    .. tabs::
-
-        .. tab:: TensorFlow
-
-            .. code:: python
-
-                >>> import tensorflow as tf
-                >>> from doctr.transforms import RandomApply
-                >>> transfo = RandomApply(Gamma(), p=.5)
-                >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-        .. tab:: PyTorch
-
-            .. code:: python
-
-                >>> import torch
-                >>> from doctr.transforms import RandomApply
-                >>> transfo = RandomApply(Gamma(), p=.5)
-                >>> out = transfo(torch.rand(1, 64, 64, 3))
+    Example::
+        >>> from doctr.transforms import Normalize
+        >>> import tensorflow as tf
+        >>> transfo = RandomApply(Gamma(), p=.5)
+        >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))

     Args:
         transform: transformation to apply
         p: probability to apply
     """
-
-    def __init__(self, transform: Callable[[Any], Any], p: float = 0.5) -> None:
+    def __init__(self, transform: Callable[[Any], Any], p: float = .5) -> None:
         self.transform = transform
         self.p = p
@@ -511,70 +371,6 @@

                                                Source code for doctr.transforms.modules.base

            return self.transform(img)
        return img
-
-
-
-[docs]
-class RandomRotate(NestedObject):
-    """Randomly rotate a tensor image and its boxes
-
-    .. image:: https://doctr-static.mindee.com/models?id=v0.4.0/rotation_illustration.png&src=0
-        :align: center
-
-    Args:
-        max_angle: maximum angle for rotation, in degrees. Angles will be uniformly picked in
-            [-max_angle, max_angle]
-        expand: whether the image should be padded before the rotation
-    """
-
-    def __init__(self, max_angle: float = 5.0, expand: bool = False) -> None:
-        self.max_angle = max_angle
-        self.expand = expand
-
-    def extra_repr(self) -> str:
-        return f"max_angle={self.max_angle}, expand={self.expand}"
-
-    def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]:
-        angle = random.uniform(-self.max_angle, self.max_angle)
-        r_img, r_polys = F.rotate_sample(img, target, angle, self.expand)
-        # Removes deleted boxes
-        is_kept = (r_polys.max(1) > r_polys.min(1)).sum(1) == 2
-        return r_img, r_polys[is_kept]
-
-
-
-
-[docs]
-class RandomCrop(NestedObject):
-    """Randomly crop a tensor image and its boxes
-
-    Args:
-        scale: tuple of floats, relative (min_area, max_area) of the crop
-        ratio: tuple of float, relative (min_ratio, max_ratio) where ratio = h/w
-    """
-
-    def __init__(self, scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33)) -> None:
-        self.scale = scale
-        self.ratio = ratio
-
-    def extra_repr(self) -> str:
-        return f"scale={self.scale}, ratio={self.ratio}"
-
-    def __call__(self, img: Any, target: Dict[str, np.ndarray]) -> Tuple[Any, Dict[str, np.ndarray]]:
-        scale = random.uniform(self.scale[0], self.scale[1])
-        ratio = random.uniform(self.ratio[0], self.ratio[1])
-        # Those might overflow
-        crop_h = math.sqrt(scale * ratio)
-        crop_w = math.sqrt(scale / ratio)
-        xmin, ymin = random.uniform(0, 1 - crop_w), random.uniform(0, 1 - crop_h)
-        xmax, ymax = xmin + crop_w, ymin + crop_h
-        # Clip them
-        xmin, ymin = max(xmin, 0), max(ymin, 0)
-        xmax, ymax = min(xmax, 1), min(ymax, 1)
-
-        croped_img, crop_boxes = F.crop_detection(img, target["boxes"], (xmin, ymin, xmax, ymax))
-        return croped_img, dict(boxes=crop_boxes)
                                                -
                                                @@ -607,7 +403,7 @@

                                                Source code for doctr.transforms.modules.base

                                                -
                                                +
diff --git a/v0.6.0/_modules/doctr/transforms/modules/tensorflow.html b/v0.6.0/_modules/doctr/transforms/modules/tensorflow.html
index 0e4ebeb632..51b31b4fc4 100644
--- a/v0.6.0/_modules/doctr/transforms/modules/tensorflow.html
+++ b/v0.6.0/_modules/doctr/transforms/modules/tensorflow.html
@@ -226,35 +226,20 @@

                                                Source code for doctr.transforms.modules.tensorflow

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021, Mindee.
                                                 
                                                -# This program is licensed under the Apache License 2.0.
                                                -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                +# This program is licensed under the Apache License version 2.
                                                +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                 
                                                 import random
                                                -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
                                                -
                                                -import numpy as np
                                                 import tensorflow as tf
                                                -import tensorflow_addons as tfa
                                                +from typing import List, Any, Tuple, Callable
                                                 
                                                 from doctr.utils.repr import NestedObject
                                                 
                                                -from ..functional.tensorflow import random_shadow
                                                -
                                                -__all__ = [
                                                -    "Compose",
                                                -    "Resize",
                                                -    "Normalize",
                                                -    "LambdaTransformation",
                                                -    "ToGray",
                                                -    "RandomBrightness",
                                                -    "RandomContrast",
                                                -    "RandomSaturation",
                                                -    "RandomHue",
                                                -    "RandomGamma",
                                                -    "RandomJpegQuality",
                                                -    "GaussianBlur",
                                                -    "ChannelShuffle",
                                                -    "GaussianNoise",
                                                -    "RandomHorizontalFlip",
                                                -    "RandomShadow",
                                                -]
                                                +
                                                +__all__ = ['Compose', 'Resize', 'Normalize', 'LambdaTransformation', 'ToGray', 'RandomBrightness',
                                                +           'RandomContrast', 'RandomSaturation', 'RandomHue', 'RandomGamma', 'RandomJpegQuality']
                                                 
                                                 
                                                 
-[docs]
+[docs]
 class Compose(NestedObject):
     """Implements a wrapper that will apply transformations sequentially

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import Compose, Resize
-    >>> transfos = Compose([Resize((32, 32))])
-    >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
+    Example::
+        >>> from doctr.transforms import Compose, Resize
+        >>> import tensorflow as tf
+        >>> transfos = Compose([Resize((32, 32))])
+        >>> out = transfos(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))

     Args:
         transforms: list of transformation modules
     """
-    _children_names: List[str] = ["transforms"]
+    _children_names: List[str] = ['transforms']

     def __init__(self, transforms: List[Callable[[Any], Any]]) -> None:
         self.transforms = transforms
@@ -354,14 +320,15 @@

                                                Source code for doctr.transforms.modules.tensorflow

-[docs]
+[docs]
 class Resize(NestedObject):
     """Resizes a tensor to a target size

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import Resize
-    >>> transfo = Resize((32, 32))
-    >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
+    Example::
+        >>> from doctr.transforms import Resize
+        >>> import tensorflow as tf
+        >>> transfo = Resize((32, 32))
+        >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))

     Args:
         output_size: expected output size
@@ -369,11 +336,10 @@

                                                Source code for doctr.transforms.modules.tensorflow

        preserve_aspect_ratio: if `True`, preserve aspect ratio and pad the rest with zeros
        symmetric_pad: if `True` while preserving aspect ratio, the padding will be done symmetrically
    """
-
    def __init__(
        self,
-        output_size: Union[int, Tuple[int, int]],
-        method: str = "bilinear",
+        output_size: Tuple[int, int],
+        method: str = 'bilinear',
        preserve_aspect_ratio: bool = False,
        symmetric_pad: bool = False,
    ) -> None:
@@ -382,114 +348,70 @@

                                                Source code for doctr.transforms.modules.tensorflow

        self.preserve_aspect_ratio = preserve_aspect_ratio
        self.symmetric_pad = symmetric_pad
-        if isinstance(self.output_size, int):
-            self.wanted_size = (self.output_size, self.output_size)
-        elif isinstance(self.output_size, (tuple, list)):
-            self.wanted_size = self.output_size
-        else:
-            raise AssertionError("Output size should be either a list, a tuple or an int")
-
    def extra_repr(self) -> str:
        _repr = f"output_size={self.output_size}, method='{self.method}'"
        if self.preserve_aspect_ratio:
            _repr += f", preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}"
        return _repr
-
-    def __call__(
-        self,
-        img: tf.Tensor,
-        target: Optional[np.ndarray] = None,
-    ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]:
-
-        input_dtype = img.dtype
-
-        img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio)
-        # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio
-        raw_shape = img.shape[:2]
+    def __call__(self, img: tf.Tensor) -> tf.Tensor:
+        img = tf.image.resize(img, self.output_size, self.method, self.preserve_aspect_ratio)
        if self.preserve_aspect_ratio:
-            if isinstance(self.output_size, (tuple, list)):
-                # In that case we need to pad because we want to enforce both width and height
-                if not self.symmetric_pad:
-                    offset = (0, 0)
-                elif self.output_size[0] == img.shape[0]:
-                    offset = (0, int((self.output_size[1] - img.shape[1]) / 2))
-                else:
-                    offset = (int((self.output_size[0] - img.shape[0]) / 2), 0)
-                img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size)
-
-        # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio)
-        if target is not None:
-            if self.preserve_aspect_ratio:
-                # Get absolute coords
-                if target.shape[1:] == (4,):
-                    if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad:
-                        if np.max(target) <= 1:
-                            offset = offset[0] / img.shape[0], offset[1] / img.shape[1]
-                        target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1]
-                        target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0]
-                    else:
-                        target[:, [0, 2]] *= raw_shape[1] / img.shape[1]
-                        target[:, [1, 3]] *= raw_shape[0] / img.shape[0]
-                elif target.shape[1:] == (4, 2):
-                    if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad:
-                        if np.max(target) <= 1:
-                            offset = offset[0] / img.shape[0], offset[1] / img.shape[1]
-                        target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / img.shape[1]
-                        target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0]
-                    else:
-                        target[..., 0] *= raw_shape[1] / img.shape[1]
-                        target[..., 1] *= raw_shape[0] / img.shape[0]
-                else:
-                    raise AssertionError
-            return tf.cast(img, dtype=input_dtype), target
-
-        return tf.cast(img, dtype=input_dtype)
+            # pad width
+            if not self.symmetric_pad:
+                offset = (0, 0)
+            elif self.output_size[0] == img.shape[0]:
+                offset = (0, int((self.output_size[1] - img.shape[1]) / 2))
+            else:
+                offset = (int((self.output_size[0] - img.shape[0]) / 2), 0)
+            img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size)
+        return img
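The padding branch kept above computes a single top/left offset so that the aspect-ratio-preserving resize ends up centred along whichever axis came up short. A small sketch of that offset arithmetic with illustrative numbers (not the transform itself):

    # Target canvas and the shape produced by an aspect-ratio-preserving resize
    output_size = (32, 128)      # (height, width) wanted
    resized_shape = (32, 86)     # the resized image is narrower than the canvas

    symmetric_pad = True
    if not symmetric_pad:
        offset = (0, 0)
    elif output_size[0] == resized_shape[0]:
        # Height already matches: centre the image horizontally
        offset = (0, int((output_size[1] - resized_shape[1]) / 2))
    else:
        # Width already matches: centre the image vertically
        offset = (int((output_size[0] - resized_shape[0]) / 2), 0)
    print(offset)  # (0, 21) -> 21 px of left padding, the rest on the right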
-[docs]
+[docs]
 class Normalize(NestedObject):
     """Normalize a tensor to a Gaussian distribution for each channel

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import Normalize
-    >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-    >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
+    Example::
+        >>> from doctr.transforms import Normalize
+        >>> import tensorflow as tf
+        >>> transfo = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))

     Args:
         mean: average value per channel
         std: standard deviation per channel
     """
-
     def __init__(self, mean: Tuple[float, float, float], std: Tuple[float, float, float]) -> None:
-        self.mean = tf.constant(mean)
-        self.std = tf.constant(std)
+        self.mean = tf.constant(mean, dtype=tf.float32)
+        self.std = tf.constant(std, dtype=tf.float32)

     def extra_repr(self) -> str:
         return f"mean={self.mean.numpy().tolist()}, std={self.std.numpy().tolist()}"

     def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        img -= tf.cast(self.mean, dtype=img.dtype)
-        img /= tf.cast(self.std, dtype=img.dtype)
+        img -= self.mean
+        img /= self.std
         return img
-[docs]
+[docs]
 class LambdaTransformation(NestedObject):
     """Normalize a tensor to a Gaussian distribution for each channel

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import LambdaTransformation
-    >>> transfo = LambdaTransformation(lambda x: x/ 255.)
-    >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
+    Example::
+        >>> from doctr.transforms import LambdaTransformation
+        >>> import tensorflow as tf
+        >>> transfo = LambdaTransformation(lambda x: x/ 255.)
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))

     Args:
         fn: the function to be applied to the input tensor
     """
-
     def __init__(self, fn: Callable[[tf.Tensor], tf.Tensor]) -> None:
         self.fn = fn
@@ -499,41 +421,37 @@

                                                Source code for doctr.transforms.modules.tensorflow

-[docs]
+[docs]
 class ToGray(NestedObject):
     """Convert a RGB tensor (batch of images or image) to a 3-channels grayscale tensor

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import ToGray
-    >>> transfo = ToGray()
-    >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
+    Example::
+        >>> from doctr.transforms import Normalize
+        >>> import tensorflow as tf
+        >>> transfo = ToGray()
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
     """
-
-    def __init__(self, num_output_channels: int = 1):
-        self.num_output_channels = num_output_channels
-
     def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        img = tf.image.rgb_to_grayscale(img)
-        return img if self.num_output_channels == 1 else tf.repeat(img, self.num_output_channels, axis=-1)
                                                + return tf.image.rgb_to_grayscale(img)
-[docs]
+[docs]
 class RandomBrightness(NestedObject):
     """Randomly adjust brightness of a tensor (batch of images or image) by adding a delta to all pixels

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomBrightness
-    >>> transfo = RandomBrightness()
-    >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
+    Example:
+        >>> from doctr.transforms import Normalize
+        >>> import tensorflow as tf
+        >>> transfo = Brightness()
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))

     Args:
         max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta]
         p: probability to apply transformation
     """
-
     def __init__(self, max_delta: float = 0.3) -> None:
         self.max_delta = max_delta
@@ -546,21 +464,21 @@

                                                Source code for doctr.transforms.modules.tensorflow

-[docs]
+[docs]
 class RandomContrast(NestedObject):
     """Randomly adjust contrast of a tensor (batch of images or image) by adjusting each pixel:
     (img - mean) * contrast_factor + mean.

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomContrast
-    >>> transfo = RandomContrast()
-    >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
+    Example:
+        >>> from doctr.transforms import Normalize
+        >>> import tensorflow as tf
+        >>> transfo = Contrast()
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))

     Args:
         delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce contrast if factor<1)
     """
-
-    def __init__(self, delta: float = 0.3) -> None:
+    def __init__(self, delta: float = .3) -> None:
         self.delta = delta

     def extra_repr(self) -> str:
@@ -572,21 +490,21 @@

                                                Source code for doctr.transforms.modules.tensorflow

-[docs]
+[docs]
 class RandomSaturation(NestedObject):
     """Randomly adjust saturation of a tensor (batch of images or image) by converting to HSV and
     increasing saturation by a factor.

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomSaturation
-    >>> transfo = RandomSaturation()
-    >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
+    Example:
+        >>> from doctr.transforms import Normalize
+        >>> import tensorflow as tf
+        >>> transfo = Saturation()
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))

     Args:
         delta: multiplicative factor is picked in [1-delta, 1+delta] (reduce saturation if factor<1)
     """
-
-    def __init__(self, delta: float = 0.5) -> None:
+    def __init__(self, delta: float = .5) -> None:
         self.delta = delta

     def extra_repr(self) -> str:
@@ -598,19 +516,19 @@

                                                Source code for doctr.transforms.modules.tensorflow

-[docs]
+[docs]
 class RandomHue(NestedObject):
     """Randomly adjust hue of a tensor (batch of images or image) by converting to HSV and adding a delta

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomHue
-    >>> transfo = RandomHue()
-    >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
+    Example::
+        >>> from doctr.transforms import Normalize
+        >>> import tensorflow as tf
+        >>> transfo = Hue()
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))

     Args:
         max_delta: offset to add to each pixel is randomly picked in [-max_delta, max_delta]
     """
-
     def __init__(self, max_delta: float = 0.3) -> None:
         self.max_delta = max_delta
@@ -623,14 +541,15 @@

                                                Source code for doctr.transforms.modules.tensorflow

-[docs]
+[docs]
 class RandomGamma(NestedObject):
     """randomly performs gamma correction for a tensor (batch of images or image)

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomGamma
-    >>> transfo = RandomGamma()
-    >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))
+    Example:
+        >>> from doctr.transforms import Normalize
+        >>> import tensorflow as tf
+        >>> transfo = Gamma()
+        >>> out = transfo(tf.random.uniform(shape=[8, 64, 64, 3], minval=0, maxval=1))

     Args:
         min_gamma: non-negative real number, lower bound for gamma param
@@ -638,7 +557,6 @@

                                                Source code for doctr.transforms.modules.tensorflow

        min_gain: lower bound for constant multiplier
        max_gain: upper bound for constant multiplier
    """
-
    def __init__(
        self,
        min_gamma: float = 0.5,
@@ -663,20 +581,20 @@

                                                Source code for doctr.transforms.modules.tensorflow

-[docs]
+[docs]
 class RandomJpegQuality(NestedObject):
     """Randomly adjust jpeg quality of a 3 dimensional RGB image

-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomJpegQuality
-    >>> transfo = RandomJpegQuality()
-    >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
+    Example::
+        >>> from doctr.transforms import Normalize
+        >>> import tensorflow as tf
+        >>> transfo = JpegQuality()
+        >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))

     Args:
         min_quality: int between [0, 100]
         max_quality: int between [0, 100]
     """
-
     def __init__(self, min_quality: int = 60, max_quality: int = 100) -> None:
         self.min_quality = min_quality
         self.max_quality = max_quality
@@ -685,167 +603,10 @@

                                                Source code for doctr.transforms.modules.tensorflow

                                                return f"min_quality={self.min_quality}" def __call__(self, img: tf.Tensor) -> tf.Tensor: - return tf.image.random_jpeg_quality(img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality)
-
-
-
-
-[docs]
-class GaussianBlur(NestedObject):
-    """Randomly adjust jpeg quality of a 3 dimensional RGB image
-
-    >>> import tensorflow as tf
-    >>> from doctr.transforms import GaussianBlur
-    >>> transfo = GaussianBlur(3, (.1, 5))
-    >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-    Args:
-        kernel_shape: size of the blurring kernel
-        std: min and max value of the standard deviation
-    """
-
-    def __init__(self, kernel_shape: Union[int, Iterable[int]], std: Tuple[float, float]) -> None:
-        self.kernel_shape = kernel_shape
-        self.std = std
-
-    def extra_repr(self) -> str:
-        return f"kernel_shape={self.kernel_shape}, std={self.std}"
-
-    @tf.function
-    def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        sigma = random.uniform(self.std[0], self.std[1])
-        return tfa.image.gaussian_filter2d(
-            img,
-            filter_shape=self.kernel_shape,
-            sigma=sigma,
+        return tf.image.random_jpeg_quality(
+            img, min_jpeg_quality=self.min_quality, max_jpeg_quality=self.max_quality
        )
-
-
-
-[docs]
-class ChannelShuffle(NestedObject):
-    """Randomly shuffle channel order of a given image"""
-
-    def __init__(self):
-        pass
-
-    def __call__(self, img: tf.Tensor) -> tf.Tensor:
-        return tf.transpose(tf.random.shuffle(tf.transpose(img, perm=[2, 0, 1])), perm=[1, 2, 0])
-
-
-
-
-[docs]
-class GaussianNoise(NestedObject):
-    """Adds Gaussian Noise to the input tensor
-
-    >>> import tensorflow as tf
-    >>> from doctr.transforms import GaussianNoise
-    >>> transfo = GaussianNoise(0., 1.)
-    >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-    Args:
-        mean : mean of the gaussian distribution
-        std : std of the gaussian distribution
-    """
-
-    def __init__(self, mean: float = 0.0, std: float = 1.0) -> None:
-        super().__init__()
-        self.std = std
-        self.mean = mean
-
-    def __call__(self, x: tf.Tensor) -> tf.Tensor:
-        # Reshape the distribution
-        noise = self.mean + 2 * self.std * tf.random.uniform(x.shape) - self.std
-        if x.dtype == tf.uint8:
-            return tf.cast(
-                tf.clip_by_value(tf.math.round(tf.cast(x, dtype=tf.float32) + 255 * noise), 0, 255), dtype=tf.uint8
-            )
-        else:
-            return tf.cast(tf.clip_by_value(x + noise, 0, 1), dtype=x.dtype)
-
-    def extra_repr(self) -> str:
-        return f"mean={self.mean}, std={self.std}"
-
-
-
-
-[docs]
-class RandomHorizontalFlip(NestedObject):
-    """Adds random horizontal flip to the input tensor/np.ndarray
-
-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomHorizontalFlip
-    >>> transfo = RandomHorizontalFlip(p=0.5)
-    >>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
-    >>> target = {
-    >>>     "boxes": np.array([[0.1, 0.1, 0.4, 0.5] ], dtype= np.float32),
-    >>>     "labels": np.ones(1, dtype= np.int64)
-    >>> }
-    >>> out = transfo(image, target)
-
-    Args:
-        p : probability of Horizontal Flip
-    """
-
-    def __init__(self, p: float) -> None:
-        super().__init__()
-        self.p = p
-
-    def __call__(self, img: Union[tf.Tensor, np.ndarray], target: Dict[str, Any]) -> Tuple[tf.Tensor, Dict[str, Any]]:
-        """
-        Args:
-            img: Image to be flipped.
-            target: Dictionary with boxes (in relative coordinates of shape (N, 4)) and labels as keys
-        Returns:
-            Tuple of numpy nd-array or Tensor and target
-        """
-        if np.random.rand(1) <= self.p:
-            _img = tf.image.flip_left_right(img)
-            _target = target.copy()
-            # Changing the relative bbox coordinates
-            _target["boxes"][:, ::2] = 1 - target["boxes"][:, [2, 0]]
-            return _img, _target
-        return img, target
-
-
-
-
-[docs]
-class RandomShadow(NestedObject):
-    """Adds random shade to the input image
-
-    >>> import tensorflow as tf
-    >>> from doctr.transforms import RandomShadow
-    >>> transfo = RandomShadow(0., 1.)
-    >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
-
-    Args:
-        opacity_range : minimum and maximum opacity of the shade
-    """
-
-    def __init__(self, opacity_range: Optional[Tuple[float, float]] = None) -> None:
-        super().__init__()
-        self.opacity_range = opacity_range if isinstance(opacity_range, tuple) else (0.2, 0.8)
-
-    def __call__(self, x: tf.Tensor) -> tf.Tensor:
-        # Reshape the distribution
-        if x.dtype == tf.uint8:
-            return tf.cast(
-                tf.clip_by_value(
-                    tf.math.round(255 * random_shadow(tf.cast(x, dtype=tf.float32) / 255, self.opacity_range)),
-                    0,
-                    255,
-                ),
-                dtype=tf.uint8,
-            )
-        else:
-            return tf.clip_by_value(random_shadow(x, self.opacity_range), 0, 1)
-
-    def extra_repr(self) -> str:
-        return f"opacity_range={self.opacity_range}"
                                                -
                                                @@ -878,7 +639,7 @@

                                                Source code for doctr.transforms.modules.tensorflow

                                                +
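The transform classes touched in the hunks above all act directly on a `tf.Tensor` image, as their docstrings show. Below is a minimal usage sketch, assuming the `doctr.transforms` entry points quoted in those docstrings plus a `Compose` wrapper; the wrapper and the exact argument values are illustrative rather than part of this diff.

>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import Compose, RandomJpegQuality, GaussianNoise, RandomHorizontalFlip
>>> # a random 64x64 RGB image with values in [0, 1]
>>> img = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
>>> # photometric transforms return a tensor of the same shape
>>> out = Compose([RandomJpegQuality(min_quality=60), GaussianNoise(0., 1.)])(img)
>>> # geometric transforms also update the relative box coordinates of the target
>>> target = {"boxes": np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32), "labels": np.ones(1, dtype=np.int64)}
>>> flipped_img, flipped_target = RandomHorizontalFlip(p=0.5)(img, target)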
diff --git a/v0.6.0/_modules/doctr/utils/metrics.html b/v0.6.0/_modules/doctr/utils/metrics.html
index 49272f770b..20af9416ea 100644
--- a/v0.6.0/_modules/doctr/utils/metrics.html
+++ b/v0.6.0/_modules/doctr/utils/metrics.html
@@ -226,35 +226,20 @@

                                                Source code for doctr.utils.metrics

                                                -# Copyright (C) 2021-2022, Mindee.
                                                -
                                                -# This program is licensed under the Apache License 2.0.
                                                -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                +# Copyright (C) 2021, Mindee.
                                                 
                                                -from typing import Dict, List, Optional, Tuple
                                                +# This program is licensed under the Apache License version 2.
                                                +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                 
                                                -import cv2
                                                 import numpy as np
                                                -from scipy.optimize import linear_sum_assignment
                                                +import cv2
                                                +from typing import List, Tuple, Dict, Optional
                                                 from unidecode import unidecode
                                                +from scipy.optimize import linear_sum_assignment
                                                +from doctr.utils.geometry import rbbox_to_polygon
                                                 
                                                -__all__ = [
                                                -    "TextMatch",
                                                -    "box_iou",
                                                -    "box_ioa",
                                                -    "mask_iou",
                                                -    "polygon_iou",
                                                -    "nms",
                                                -    "LocalizationConfusion",
                                                -    "OCRMetric",
                                                -    "DetectionMetric",
                                                -]
                                                +__all__ = ['TextMatch', 'box_iou', 'box_ioa', 'mask_iou', 'rbox_to_mask',
                                                +           'nms', 'LocalizationConfusion', 'OCRMetric']
                                                 
                                                 
                                                 def string_match(word1: str, word2: str) -> Tuple[bool, bool, bool, bool]:
                                                -    """Performs string comparison with multiple levels of tolerance
                                                +    """Perform string comparison with multiple levels of tolerance
                                                 
                                                     Args:
                                                         word1: a string
                                                @@ -326,52 +302,51 @@ 

                                                Source code for doctr.utils.metrics

                                                         a tuple with booleans specifying respectively whether the raw strings, their lower-case counterparts, their
                                                             unidecode counterparts and their lower-case unidecode counterparts match
                                                     """
                                                -    raw_match = word1 == word2
                                                -    caseless_match = word1.lower() == word2.lower()
                                                -    unidecode_match = unidecode(word1) == unidecode(word2)
                                                +    raw_match = (word1 == word2)
                                                +    caseless_match = (word1.lower() == word2.lower())
                                                +    unidecode_match = (unidecode(word1) == unidecode(word2))
                                                 
                                                     # Warning: the order is important here otherwise the pair ("EUR", "€") cannot be matched
                                                -    unicase_match = unidecode(word1).lower() == unidecode(word2).lower()
                                                +    unicase_match = (unidecode(word1).lower() == unidecode(word2).lower())
                                                 
                                                     return raw_match, caseless_match, unidecode_match, unicase_match
                                                 
                                                 
                                                 
-[docs]
+[docs]
 class TextMatch:
-    r"""Implements text match metric (word-level accuracy) for recognition task.
+    """Implements text match metric (word-level accuracy) for recognition task.

     The raw aggregated metric is computed as follows:

     .. math::
-        \forall X, Y \in \mathcal{W}^N,
-        TextMatch(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N f_{Y_i}(X_i)
+        \\forall X, Y \\in \\mathcal{W}^N,
+        TextMatch(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N f_{Y_i}(X_i)

     with the indicator function :math:`f_{a}` defined as:

     .. math::
-        \forall a, x \in \mathcal{W},
-        f_a(x) = \left\{
-            \begin{array}{ll}
-                1 & \mbox{if } x = a \\
-                0 & \mbox{otherwise.}
-            \end{array}
-        \right.
-
-    where :math:`\mathcal{W}` is the set of all possible character sequences,
+        \\forall a, x \\in \\mathcal{W},
+        f_a(x) = \\left\\{
+            \\begin{array}{ll}
+                1 & \\mbox{if } x = a \\\\
+                0 & \\mbox{otherwise.}
+            \\end{array}
+        \\right.
+
+    where :math:`\\mathcal{W}` is the set of all possible character sequences,
     :math:`N` is a strictly positive integer.

-    >>> from doctr.utils import TextMatch
-    >>> metric = TextMatch()
-    >>> metric.update(['Hello', 'world'], ['hello', 'world'])
-    >>> metric.summary()
+    Example::
+        >>> from doctr.utils import TextMatch
+        >>> metric = TextMatch()
+        >>> metric.update(['Hello', 'world'], ['hello', 'world'])
+        >>> metric.summary()
     """

     def __init__(self) -> None:
         self.reset()

-
-[docs]
     def update(
         self,
         gt: List[str],
@@ -381,8 +356,7 @@

                                                Source code for doctr.utils.metrics

                                                 
                                                         Args:
                                                             gt: list of groung-truth character sequences
                                                -            pred: list of predicted character sequences
                                                -        """
                                                +            pred: list of predicted character sequences"""
                                                 
                                                         if len(gt) != len(pred):
                                                             raise AssertionError("prediction size does not match with ground-truth labels size")
                                                @@ -394,11 +368,10 @@ 

                                                Source code for doctr.utils.metrics

                                                             self.unidecode += int(_unidecode)
                                                             self.unicase += int(_unicase)
                                                 
                                                -        self.total += len(gt)
                                                - + self.total += len(gt)
-[docs]
+[docs]
     def summary(self) -> Dict[str, float]:
         """Computes the aggregated metrics
@@ -427,17 +400,16 @@

                                                Source code for doctr.utils.metrics

                                                 
                                                 
                                                 def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
                                                -    """Computes the IoU between two sets of bounding boxes
                                                +    """Compute the IoU between two sets of bounding boxes
                                                 
                                                     Args:
                                                         boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
                                                         boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
                                                -
                                                     Returns:
                                                         the IoU matrix of shape (N, M)
                                                     """
                                                 
                                                -    iou_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
                                                +    iou_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
                                                 
                                                     if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
                                                         l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
                                                @@ -456,18 +428,17 @@ 

                                                Source code for doctr.utils.metrics

                                                 
                                                 
                                                 def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
                                                -    """Computes the IoA (intersection over area) between two sets of bounding boxes:
                                                +    """Compute the IoA (intersection over area) between two sets of bounding boxes:
                                                     ioa(i, j) = inter(i, j) / area(i)
                                                 
                                                     Args:
                                                         boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax)
                                                         boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax)
                                                -
                                                     Returns:
                                                         the IoA matrix of shape (N, M)
                                                     """
                                                 
                                                -    ioa_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
                                                +    ioa_mat = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32)
                                                 
                                                     if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0:
                                                         l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1)
                                                @@ -486,7 +457,7 @@ 

                                                Source code for doctr.utils.metrics

                                                 
                                                 
                                                 def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray:
                                                -    """Computes the IoU between two sets of boolean masks
                                                +    """Compute the IoU between two sets of boolean masks
                                                 
                                                     Args:
                                                         masks_1: boolean masks of shape (N, H, W)
                                                @@ -499,110 +470,50 @@ 

                                                Source code for doctr.utils.metrics

                                                     if masks_1.shape[1:] != masks_2.shape[1:]:
                                                         raise AssertionError("both boolean masks should have the same spatial shape")
                                                 
                                                -    iou_mat: np.ndarray = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
                                                +    iou_mat = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32)
                                                 
                                                     if masks_1.shape[0] > 0 and masks_2.shape[0] > 0:
                                                +        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...])
                                                +        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...])
                                                         axes = tuple(range(2, masks_1.ndim + 1))
                                                -        intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
                                                -        union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes)
                                                -        iou_mat = intersection / union
                                                +        iou_mat = intersection.sum(axis=axes) / union.sum(axis=axes)
                                                 
                                                     return iou_mat
                                                 
                                                 
                                                -def polygon_iou(
                                                -    polys_1: np.ndarray, polys_2: np.ndarray, mask_shape: Tuple[int, int], use_broadcasting: bool = False
                                                -) -> np.ndarray:
                                                -    """Computes the IoU between two sets of rotated bounding boxes
                                                -
                                                -    Args:
                                                -        polys_1: rotated bounding boxes of shape (N, 4, 2)
                                                -        polys_2: rotated bounding boxes of shape (M, 4, 2)
                                                -        mask_shape: spatial shape of the intermediate masks
                                                -        use_broadcasting: if set to True, leverage broadcasting speedup by consuming more memory
                                                -
                                                -    Returns:
                                                -        the IoU matrix of shape (N, M)
                                                -    """
                                                -
                                                -    if polys_1.ndim != 3 or polys_2.ndim != 3:
                                                -        raise AssertionError("expects boxes to be in format (N, 4, 2)")
                                                -
                                                -    iou_mat: np.ndarray = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32)
                                                -
                                                -    if polys_1.shape[0] > 0 and polys_2.shape[0] > 0:
                                                -        if use_broadcasting:
                                                -            masks_1 = rbox_to_mask(polys_1, shape=mask_shape)
                                                -            masks_2 = rbox_to_mask(polys_2, shape=mask_shape)
                                                -            iou_mat = mask_iou(masks_1, masks_2)
                                                -        else:
                                                -            # Save memory by doing the computation for each pair
                                                -            for idx, b1 in enumerate(polys_1):
                                                -                m1 = _rbox_to_mask(b1, mask_shape)
                                                -                for _idx, b2 in enumerate(polys_2):
                                                -                    m2 = _rbox_to_mask(b2, mask_shape)
                                                -                    iou_mat[idx, _idx] = np.logical_and(m1, m2).sum() / np.logical_or(m1, m2).sum()
                                                -
                                                -    return iou_mat
                                                -
                                                -
                                                -def _rbox_to_mask(box: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
                                                -    """Converts a rotated bounding box to a boolean mask
                                                -
                                                -    Args:
                                                -        box: rotated bounding box of shape (4, 2)
                                                -        shape: spatial shapes of the output masks
                                                -
                                                -    Returns:
                                                -        the boolean mask of the specified shape
                                                -    """
                                                -
                                                -    mask: np.ndarray = np.zeros(shape, dtype=np.uint8)
                                                -    # Get absolute coords
                                                -    if box.dtype != int:
                                                -        abs_box = box.copy()
                                                -        abs_box[:, 0] = abs_box[:, 0] * shape[1]
                                                -        abs_box[:, 1] = abs_box[:, 1] * shape[0]
                                                -        abs_box = abs_box.round().astype(int)
                                                -    else:
                                                -        abs_box = box
                                                -        abs_box[2:] = abs_box[2:] + 1
                                                -    cv2.fillPoly(mask, [abs_box - 1], 1)
                                                -
                                                -    return mask.astype(bool)
                                                -
                                                -
                                                 def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray:
                                                -    """Converts rotated bounding boxes to boolean masks
                                                +    """Convert boxes to masks
                                                 
                                                     Args:
                                                -        boxes: rotated bounding boxes of shape (N, 4, 2)
                                                +        boxes: rotated bounding boxes of shape (N, 5) in format (x, y, w, h, alpha)
                                                         shape: spatial shapes of the output masks
                                                 
                                                     Returns:
                                                         the boolean masks of shape (N, H, W)
                                                     """
                                                 
                                                -    masks: np.ndarray = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
                                                +    masks = np.zeros((boxes.shape[0], *shape), dtype=np.uint8)
                                                 
                                                     if boxes.shape[0] > 0:
                                                         # Get absolute coordinates
                                                -        if boxes.dtype != int:
                                                +        if boxes.dtype != np.int:
                                                             abs_boxes = boxes.copy()
                                                -            abs_boxes[:, :, 0] = abs_boxes[:, :, 0] * shape[1]
                                                -            abs_boxes[:, :, 1] = abs_boxes[:, :, 1] * shape[0]
                                                -            abs_boxes = abs_boxes.round().astype(int)
                                                +            abs_boxes[:, [0, 2]] = abs_boxes[:, [0, 2]] * shape[1]
                                                +            abs_boxes[:, [1, 3]] = abs_boxes[:, [1, 3]] * shape[0]
                                                +            abs_boxes = abs_boxes.round().astype(np.int)
                                                         else:
                                                             abs_boxes = boxes
                                                             abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1
                                                 
                                                         # TODO: optimize slicing to improve vectorization
                                                         for idx, _box in enumerate(abs_boxes):
                                                -            cv2.fillPoly(masks[idx], [_box - 1], 1)
                                                +            box = rbbox_to_polygon(_box)
                                                +            cv2.fillPoly(masks[idx], [np.array(box, np.int32)], 1)
                                                +
                                                     return masks.astype(bool)
                                                 
                                                 
                                                -def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]:
                                                +def nms(boxes: np.ndarray, thresh: float = .5) -> List[int]:
                                                     """Perform non-max suppression, borrowed from <https://github.com/rbgirshick/fast-rcnn>`_.
                                                 
                                                     Args:
                                                @@ -641,76 +552,66 @@ 

                                                Source code for doctr.utils.metrics

                                                 
                                                 
                                                 
-[docs]
+[docs]
 class LocalizationConfusion:
-    r"""Implements common confusion metrics and mean IoU for localization evaluation.
+    """Implements common confusion metrics and mean IoU for localization evaluation.

     The aggregated metrics are computed as follows:

     .. math::
-        \forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\
-        Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\
-        Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\
-        meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)
+        \\forall Y \\in \\mathcal{B}^N, \\forall X \\in \\mathcal{B}^M, \\\\
+        Recall(X, Y) = \\frac{1}{N} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\
+        Precision(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^N g_{X}(Y_i) \\\\
+        meanIoU(X, Y) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(X_i, Y_j)

     with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and
     :math:`y`, and the function :math:`g_{X}` defined as:

     .. math::
-        \forall y \in \mathcal{B},
-        g_X(y) = \left\{
-            \begin{array}{ll}
-                1 & \mbox{if } y\mbox{ has been assigned to any }(X_i)_i\mbox{ with an }IoU \geq 0.5 \\
-                0 & \mbox{otherwise.}
-            \end{array}
-        \right.
-
-    where :math:`\mathcal{B}` is the set of possible bounding boxes,
+        \\forall y \\in \\mathcal{B},
+        g_X(y) = \\left\\{
+            \\begin{array}{ll}
+                1 & \\mbox{if } y\\mbox{ has been assigned to any }(X_i)_i\\mbox{ with an }IoU \\geq 0.5 \\\\
+                0 & \\mbox{otherwise.}
+            \\end{array}
+        \\right.
+
+    where :math:`\\mathcal{B}` is the set of possible bounding boxes,
     :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers.

-    >>> import numpy as np
-    >>> from doctr.utils import LocalizationConfusion
-    >>> metric = LocalizationConfusion(iou_thresh=0.5)
-    >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
-    >>> metric.summary()
+    Example::
+        >>> import numpy as np
+        >>> from doctr.utils import LocalizationConfusion
+        >>> metric = LocalizationConfusion(iou_thresh=0.5)
+        >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
+        >>> metric.summary()

     Args:
         iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
-        use_polygons: if set to True, predictions and targets will be expected to have rotated format
-        mask_shape: if use_polygons is True, describes the spatial shape of the image used
-        use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
     """

     def __init__(
         self,
         iou_thresh: float = 0.5,
-        use_polygons: bool = False,
+        rotated_bbox: bool = False,
         mask_shape: Tuple[int, int] = (1024, 1024),
-        use_broadcasting: bool = True,
     ) -> None:
         self.iou_thresh = iou_thresh
-        self.use_polygons = use_polygons
+        self.rotated_bbox = rotated_bbox
         self.mask_shape = mask_shape
-        self.use_broadcasting = use_broadcasting
         self.reset()

-
-[docs]
     def update(self, gts: np.ndarray, preds: np.ndarray) -> None:
-        """Updates the metric
-
-        Args:
-            gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
-            preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
-        """

         if preds.shape[0] > 0:
             # Compute IoU
-            if self.use_polygons:
-                iou_mat = polygon_iou(gts, preds, self.mask_shape, self.use_broadcasting)
+            if self.rotated_bbox:
+                mask_gts = rbox_to_mask(gts, shape=self.mask_shape)
+                mask_preds = rbox_to_mask(preds, shape=self.mask_shape)
+                iou_mat = mask_iou(mask_gts, mask_preds)
             else:
                 iou_mat = box_iou(gts, preds)
-            self.tot_iou += float(iou_mat.max(axis=0).sum())
+            self.tot_iou += float(iou_mat.max(axis=1).sum())

             # Assign pairs
             gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
@@ -718,11 +619,10 @@

                                                Source code for doctr.utils.metrics

                                                 
                                                         # Update counts
                                                         self.num_gts += gts.shape[0]
                                                -        self.num_preds += preds.shape[0]
                                                - + self.num_preds += preds.shape[0]
-[docs]
+[docs]
     def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]:
         """Computes the aggregated metrics
@@ -746,70 +646,64 @@

                                                Source code for doctr.utils.metrics

                                                         self.num_gts = 0
                                                         self.num_preds = 0
                                                         self.matches = 0
                                                -        self.tot_iou = 0.0
                                                + self.tot_iou = 0.
-[docs]
+[docs]
 class OCRMetric:
-    r"""Implements an end-to-end OCR metric.
+    """Implements end-to-end OCR metric.

     The aggregated metrics are computed as follows:

     .. math::
-        \forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N,
-        \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\
-        Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\
-        Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\
-        meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)
+        \\forall (B, L) \\in \\mathcal{B}^N \\times \\mathcal{L}^N,
+        \\forall (\\hat{B}, \\hat{L}) \\in \\mathcal{B}^M \\times \\mathcal{L}^M, \\\\
+        Recall(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{N} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\
+        Precision(B, \\hat{B}, L, \\hat{L}) = \\frac{1}{M} \\sum\\limits_{i=1}^N h_{B,L}(\\hat{B}_i, \\hat{L}_i) \\\\
+        meanIoU(B, \\hat{B}) = \\frac{1}{M} \\sum\\limits_{i=1}^M \\max\\limits_{j \\in [1, N]} IoU(\\hat{B}_i, B_j)

     with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and
     :math:`y`, and the function :math:`h_{B, L}` defined as:

     .. math::
-        \forall (b, l) \in \mathcal{B} \times \mathcal{L},
-        h_{B,L}(b, l) = \left\{
-            \begin{array}{ll}
-                1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\
-                & IoU \geq 0.5 \mbox{ and that for this assignment, } l = L_j\\
-                0 & \mbox{otherwise.}
-            \end{array}
-        \right.
-
-    where :math:`\mathcal{B}` is the set of possible bounding boxes,
-    :math:`\mathcal{L}` is the set of possible character sequences,
+        \\forall (b, l) \\in \\mathcal{B} \\times \\mathcal{L},
+        h_{B,L}(b, l) = \\left\\{
+            \\begin{array}{ll}
+                1 & \\mbox{if } b\\mbox{ has been assigned to a given }B_j\\mbox{ with an } \\\\
+                & IoU \\geq 0.5 \\mbox{ and that for this assignment, } l = L_j\\\\
+                0 & \\mbox{otherwise.}
+            \\end{array}
+        \\right.
+
+    where :math:`\\mathcal{B}` is the set of possible bounding boxes,
+    :math:`\\mathcal{L}` is the set of possible character sequences,
     :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers.
-    >>> import numpy as np
-    >>> from doctr.utils import OCRMetric
-    >>> metric = OCRMetric(iou_thresh=0.5)
-    >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-    >>> ['hello'], ['hello', 'world'])
-    >>> metric.summary()
+    Example::
+        >>> import numpy as np
+        >>> from doctr.utils import OCRMetric
+        >>> metric = OCRMetric(iou_thresh=0.5)
+        >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
+                          ['hello'], ['hello', 'world'])
+        >>> metric.summary()

     Args:
         iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
-        use_polygons: if set to True, predictions and targets will be expected to have rotated format
-        mask_shape: if use_polygons is True, describes the spatial shape of the image used
-        use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
     """

     def __init__(
         self,
         iou_thresh: float = 0.5,
-        use_polygons: bool = False,
+        rotated_bbox: bool = False,
         mask_shape: Tuple[int, int] = (1024, 1024),
-        use_broadcasting: bool = True,
     ) -> None:
         self.iou_thresh = iou_thresh
-        self.use_polygons = use_polygons
+        self.rotated_bbox = rotated_bbox
         self.mask_shape = mask_shape
-        self.use_broadcasting = use_broadcasting
         self.reset()

-
-[docs]
     def update(
         self,
         gt_boxes: np.ndarray,
         pred_boxes: np.ndarray,
@@ -817,28 +711,21 @@

                                                Source code for doctr.utils.metrics

                                                         gt_labels: List[str],
                                                         pred_labels: List[str],
                                                     ) -> None:
                                                -        """Updates the metric
                                                -
                                                -        Args:
                                                -            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
                                                -            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
                                                -            gt_labels: a list of N string labels
                                                -            pred_labels: a list of M string labels
                                                -        """
                                                 
                                                         if gt_boxes.shape[0] != len(gt_labels) or pred_boxes.shape[0] != len(pred_labels):
                                                -            raise AssertionError(
                                                -                "there should be the same number of boxes and string both for the ground truth " "and the predictions"
                                                -            )
                                                +            raise AssertionError("there should be the same number of boxes and string both for the ground truth "
                                                +                                 "and the predictions")
                                                 
                                                         # Compute IoU
                                                         if pred_boxes.shape[0] > 0:
                                                -            if self.use_polygons:
                                                -                iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
                                                +            if self.rotated_bbox:
                                                +                mask_gts = rbox_to_mask(gt_boxes, shape=self.mask_shape)
                                                +                mask_preds = rbox_to_mask(pred_boxes, shape=self.mask_shape)
                                                +                iou_mat = mask_iou(mask_gts, mask_preds)
                                                             else:
                                                                 iou_mat = box_iou(gt_boxes, pred_boxes)
                                                 
                                                -            self.tot_iou += float(iou_mat.max(axis=0).sum())
                                                +            self.tot_iou += float(iou_mat.max(axis=1).sum())
                                                 
                                                             # Assign pairs
                                                             gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
                                                @@ -852,16 +739,15 @@ 

                                                Source code for doctr.utils.metrics

                                                                 self.unicase_matches += int(_unicase)
                                                 
                                                         self.num_gts += gt_boxes.shape[0]
                                                -        self.num_preds += pred_boxes.shape[0]
                                                - + self.num_preds += pred_boxes.shape[0]
-[docs]
+[docs]
     def summary(self) -> Tuple[Dict[str, Optional[float]], Dict[str, Optional[float]], Optional[float]]:
         """Computes the aggregated metrics

         Returns:
-            a tuple with the recall & precision for each string comparison and the mean IoU
+            a tuple with the recall & precision for each string comparison flexibility and the mean IoU
         """

         # Recall
@@ -889,141 +775,12 @@

                                                Source code for doctr.utils.metrics

                                                     def reset(self) -> None:
                                                         self.num_gts = 0
                                                         self.num_preds = 0
                                                -        self.tot_iou = 0.0
                                                +        self.tot_iou = 0.
                                                         self.raw_matches = 0
                                                         self.caseless_matches = 0
                                                         self.unidecode_matches = 0
                                                         self.unicase_matches = 0
-
-
-
-[docs]
-class DetectionMetric:
-    r"""Implements an object detection metric.
-
-    The aggregated metrics are computed as follows:
-
-    .. math::
-        \forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N,
-        \forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\
-        Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\
-        Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\
-        meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)
-
-    with the function :math:`IoU(x, y)` being the Intersection over Union between bounding boxes :math:`x` and
-    :math:`y`, and the function :math:`h_{B, C}` defined as:
-
-    .. math::
-        \forall (b, c) \in \mathcal{B} \times \mathcal{C},
-        h_{B,C}(b, c) = \left\{
-            \begin{array}{ll}
-                1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\
-                & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\
-                0 & \mbox{otherwise.}
-            \end{array}
-        \right.
-
-    where :math:`\mathcal{B}` is the set of possible bounding boxes,
-    :math:`\mathcal{C}` is the set of possible class indices,
-    :math:`N` (number of ground truths) and :math:`M` (number of predictions) are strictly positive integers.
-
-    >>> import numpy as np
-    >>> from doctr.utils import DetectionMetric
-    >>> metric = DetectionMetric(iou_thresh=0.5)
-    >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
-    >>> np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64))
-    >>> metric.summary()
-
-    Args:
-        iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
-        use_polygons: if set to True, predictions and targets will be expected to have rotated format
-        mask_shape: if use_polygons is True, describes the spatial shape of the image used
-        use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory
-    """
-
-    def __init__(
-        self,
-        iou_thresh: float = 0.5,
-        use_polygons: bool = False,
-        mask_shape: Tuple[int, int] = (1024, 1024),
-        use_broadcasting: bool = True,
-    ) -> None:
-        self.iou_thresh = iou_thresh
-        self.use_polygons = use_polygons
-        self.mask_shape = mask_shape
-        self.use_broadcasting = use_broadcasting
-        self.reset()
-
-[docs]
-    def update(
-        self,
-        gt_boxes: np.ndarray,
-        pred_boxes: np.ndarray,
-        gt_labels: np.ndarray,
-        pred_labels: np.ndarray,
-    ) -> None:
-        """Updates the metric
-
-        Args:
-            gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
-            pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
-            gt_labels: an array of class indices of shape (N,)
-            pred_labels: an array of class indices of shape (M,)
-        """
-
-        if gt_boxes.shape[0] != gt_labels.shape[0] or pred_boxes.shape[0] != pred_labels.shape[0]:
-            raise AssertionError(
-                "there should be the same number of boxes and string both for the ground truth " "and the predictions"
-            )
-
-        # Compute IoU
-        if pred_boxes.shape[0] > 0:
-            if self.use_polygons:
-                iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting)
-            else:
-                iou_mat = box_iou(gt_boxes, pred_boxes)
-
-            self.tot_iou += float(iou_mat.max(axis=0).sum())
-
-            # Assign pairs
-            gt_indices, pred_indices = linear_sum_assignment(-iou_mat)
-            is_kept = iou_mat[gt_indices, pred_indices] >= self.iou_thresh
-            # Category comparison
-            self.num_matches += int((gt_labels[gt_indices[is_kept]] == pred_labels[pred_indices[is_kept]]).sum())
-
-        self.num_gts += gt_boxes.shape[0]
-        self.num_preds += pred_boxes.shape[0]
-
-
-
-[docs]
-    def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]:
-        """Computes the aggregated metrics
-
-        Returns:
-            a tuple with the recall & precision for each class prediction and the mean IoU
-        """
-
-        # Recall
-        recall = self.num_matches / self.num_gts if self.num_gts > 0 else None
-
-        # Precision
-        precision = self.num_matches / self.num_preds if self.num_preds > 0 else None
-
-        # mean IoU (overall detected boxes)
-        mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None
-
-        return recall, precision, mean_iou
-
-
-    def reset(self) -> None:
-        self.num_gts = 0
-        self.num_preds = 0
-        self.tot_iou = 0.0
-        self.num_matches = 0
                                                -
                                                @@ -1056,7 +813,7 @@

                                                Source code for doctr.utils.metrics

                                                       
                                                     
                                                   
                                                -
                                                +
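The metric classes re-documented above share an update/summary protocol: `update` accumulates matches between ground-truth and predicted boxes, and `summary` returns the aggregated figures. A minimal sketch of that flow, assuming the `doctr.utils.metrics` API quoted in this diff (`box_iou` and `LocalizationConfusion`); the import path is taken from the module shown here rather than verified against this particular build.

>>> import numpy as np
>>> from doctr.utils.metrics import LocalizationConfusion, box_iou
>>> gts = np.asarray([[0, 0, 100, 100]])
>>> preds = np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]])
>>> # pairwise IoU matrix of shape (N, M), boxes in (xmin, ymin, xmax, ymax) format
>>> iou = box_iou(gts, preds)
>>> # aggregated recall, precision and mean IoU over one or more update() calls
>>> metric = LocalizationConfusion(iou_thresh=0.5)
>>> metric.update(gts, preds)
>>> recall, precision, mean_iou = metric.summary()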
diff --git a/v0.6.0/_modules/doctr/utils/visualization.html b/v0.6.0/_modules/doctr/utils/visualization.html
index 0c380d2035..21743f6182 100644
--- a/v0.6.0/_modules/doctr/utils/visualization.html
+++ b/v0.6.0/_modules/doctr/utils/visualization.html
@@ -226,35 +226,20 @@

                                                Source code for doctr.utils.visualization

                                                -# Copyright (C) 2021-2022, Mindee.
                                                +# Copyright (C) 2021, Mindee.
                                                 
                                                -# This program is licensed under the Apache License 2.0.
                                                -# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
                                                +# This program is licensed under the Apache License version 2.
                                                +# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
                                                 
                                                -from copy import deepcopy
                                                -from typing import Any, Dict, List, Optional, Tuple, Union
                                                -
                                                -import cv2
                                                -import matplotlib.patches as patches
                                                 import matplotlib.pyplot as plt
                                                +from matplotlib.figure import Figure
                                                +import matplotlib.patches as patches
                                                 import mplcursors
                                                +from PIL import ImageFont, ImageDraw, Image
                                                 import numpy as np
                                                -from matplotlib.figure import Figure
                                                -from PIL import Image, ImageDraw
                                                -from unidecode import unidecode
                                                +import cv2
                                                +from typing import Tuple, List, Dict, Any, Union
                                                 
                                                -from .common_types import BoundingBox, Polygon4P
                                                -from .fonts import get_font
                                                +from .common_types import BoundingBox, RotatedBbox
                                                 
                                                -__all__ = ["visualize_page", "synthesize_page", "draw_boxes"]
                                                +__all__ = ['visualize_page', 'synthetize_page']
                                                 
                                                 
                                                -def rect_patch(
                                                -    geometry: BoundingBox,
                                                +def create_rect_patch(
                                                +    geometry: Union[BoundingBox, RotatedBbox],
                                                +    label: str,
                                                     page_dimensions: Tuple[int, int],
                                                -    label: Optional[str] = None,
                                                -    color: Tuple[float, float, float] = (0, 0, 0),
                                                +    color: Tuple[int, int, int],
                                                     alpha: float = 0.3,
                                                     linewidth: int = 2,
                                                     fill: bool = True,
                                                -    preserve_aspect_ratio: bool = False,
                                                -) -> patches.Rectangle:
                                                -    """Create a matplotlib rectangular patch for the element
                                                +) -> patches.Patch:
                                                +    """Create a matplotlib patch (rectangle) bounding the element
                                                 
                                                     Args:
                                                         geometry: bounding box of the element
                                                -        page_dimensions: dimensions of the Page in format (height, width)
                                                         label: label to display when hovered
                                                +        page_dimensions: dimensions of the Page
                                                         color: color to draw box
                                                         alpha: opacity parameter to fill the boxes, 0 = transparent
                                                         linewidth: line width
                                                -        fill: whether the patch should be filled
                                                -        preserve_aspect_ratio: pass True if you passed True to the predictor
                                                 
                                                     Returns:
                                                         a rectangular Patch
                                                     """
                                                -
                                                -    if len(geometry) != 2 or any(not isinstance(elt, tuple) or len(elt) != 2 for elt in geometry):
                                                -        raise ValueError("invalid geometry format")
                                                -
                                                -    # Unpack
                                                     height, width = page_dimensions
                                                -    (xmin, ymin), (xmax, ymax) = geometry
                                                -    # Switch to absolute coords
                                                -    if preserve_aspect_ratio:
                                                -        width = height = max(height, width)
                                                -    xmin, w = xmin * width, (xmax - xmin) * width
                                                -    ymin, h = ymin * height, (ymax - ymin) * height
                                                -
                                                -    return patches.Rectangle(
                                                -        (xmin, ymin),
                                                -        w,
                                                -        h,
                                                -        fill=fill,
                                                -        linewidth=linewidth,
                                                -        edgecolor=(*color, alpha),
                                                -        facecolor=(*color, alpha),
                                                -        label=label,
                                                -    )
                                                -
                                                -
                                                -def polygon_patch(
                                                -    geometry: np.ndarray,
                                                -    page_dimensions: Tuple[int, int],
                                                -    label: Optional[str] = None,
                                                -    color: Tuple[float, float, float] = (0, 0, 0),
                                                -    alpha: float = 0.3,
                                                -    linewidth: int = 2,
                                                -    fill: bool = True,
                                                -    preserve_aspect_ratio: bool = False,
                                                -) -> patches.Polygon:
                                                -    """Create a matplotlib polygon patch for the element
                                                -
                                                -    Args:
                                                -        geometry: bounding box of the element
                                                -        page_dimensions: dimensions of the Page in format (height, width)
                                                -        label: label to display when hovered
                                                -        color: color to draw box
                                                -        alpha: opacity parameter to fill the boxes, 0 = transparent
                                                -        linewidth: line width
                                                -        fill: whether the patch should be filled
                                                -        preserve_aspect_ratio: pass True if you passed True to the predictor
                                                -
                                                -    Returns:
                                                -        a polygon Patch
                                                -    """
                                                -
                                                -    if not geometry.shape == (4, 2):
                                                -        raise ValueError("invalid geometry format")
                                                -
                                                -    # Unpack
                                                -    height, width = page_dimensions
                                                -    geometry[:, 0] = geometry[:, 0] * (max(width, height) if preserve_aspect_ratio else width)
                                                -    geometry[:, 1] = geometry[:, 1] * (max(width, height) if preserve_aspect_ratio else height)
                                                -
                                                -    return patches.Polygon(
                                                -        geometry,
                                                -        fill=fill,
                                                -        linewidth=linewidth,
                                                -        edgecolor=(*color, alpha),
                                                -        facecolor=(*color, alpha),
                                                -        label=label,
                                                -    )
                                                -
                                                -
                                                -def create_obj_patch(
                                                -    geometry: Union[BoundingBox, Polygon4P, np.ndarray],
                                                -    page_dimensions: Tuple[int, int],
                                                -    **kwargs: Any,
                                                -) -> patches.Patch:
                                                -    """Create a matplotlib patch for the element
                                                -
                                                -    Args:
                                                -        geometry: bounding box (straight or rotated) of the element
                                                -        page_dimensions: dimensions of the page in format (height, width)
                                                -
                                                -    Returns:
                                                -        a matplotlib Patch
                                                -    """
                                                -    if isinstance(geometry, tuple):
                                                -        if len(geometry) == 2:  # straight word BB (2 pts)
                                                -            return rect_patch(geometry, page_dimensions, **kwargs)  # type: ignore[arg-type]
                                                -        elif len(geometry) == 4:  # rotated word BB (4 pts)
                                                -            return polygon_patch(np.asarray(geometry), page_dimensions, **kwargs)
                                                -    elif isinstance(geometry, np.ndarray) and geometry.shape == (4, 2):  # rotated line
                                                -        return polygon_patch(geometry, page_dimensions, **kwargs)
                                                -    raise ValueError("invalid geometry format")
                                                +    if len(geometry) == 5:
                                                +        x, y, w, h, a = geometry  # type: ignore[misc]
                                                +        x, w = x * width, w * width
                                                +        y, h = y * height, h * height
                                                +        points = cv2.boxPoints(((x, y), (w, h), a))
                                                +        return patches.Polygon(
                                                +            points,
                                                +            fill=fill,
                                                +            linewidth=linewidth,
                                                +            edgecolor=(*color, alpha),
                                                +            facecolor=(*color, alpha),
                                                +            label=label
                                                +        )
                                                +    else:
                                                +        (xmin, ymin), (xmax, ymax) = geometry  # type: ignore[misc]
                                                +        xmin, xmax = xmin * width, xmax * width
                                                +        ymin, ymax = ymin * height, ymax * height
                                                +        return patches.Rectangle(
                                                +            (xmin, ymin),
                                                +            xmax - xmin,
                                                +            ymax - ymin,
                                                +            fill=fill,
                                                +            linewidth=linewidth,
                                                +            edgecolor=(*color, alpha),
                                                +            facecolor=(*color, alpha),
                                                +            label=label
                                                +        )
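For reference, a minimal standalone sketch of the rotated-box branch in the hunk above: a relative (x, y, w, h, angle) geometry is scaled to absolute pixel coordinates and expanded into corner points with cv2.boxPoints before being wrapped in a matplotlib Polygon. The page size and geometry values are made up for illustration; only OpenCV and matplotlib are assumed.

```python
import cv2
from matplotlib import patches
import matplotlib.pyplot as plt

# Hypothetical page size (height, width) and a relative rotated box (x, y, w, h, angle in degrees)
page_dimensions = (1024, 768)
geometry = (0.5, 0.4, 0.3, 0.1, 15.0)

height, width = page_dimensions
x, y, w, h, a = geometry
# Scale relative coordinates to absolute pixel values
x, w = x * width, w * width
y, h = y * height, h * height
# cv2.boxPoints expands the (center, size, angle) description into the 4 corner points
points = cv2.boxPoints(((x, y), (w, h), a))
patch = patches.Polygon(points, fill=True, linewidth=2, edgecolor=(0, 0, 1, 0.3), facecolor=(0, 0, 1, 0.3))

fig, ax = plt.subplots()
ax.set_xlim(0, width)
ax.set_ylim(height, 0)  # invert the y-axis to mimic image coordinates
ax.add_patch(patch)
plt.show()
```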
                                                 
                                                 
                                                 
-[docs]
+[docs]
 def visualize_page(
     page: Dict[str, Any],
     image: np.ndarray,
@@ -445,15 +360,16 @@


                                                 ) -> Figure:
                                                     """Visualize a full page with predicted blocks, lines and words
                                                 
                                                -    >>> import numpy as np
                                                -    >>> import matplotlib.pyplot as plt
                                                -    >>> from doctr.utils.visualization import visualize_page
                                                -    >>> from doctr.models import ocr_db_crnn
                                                -    >>> model = ocr_db_crnn(pretrained=True)
                                                -    >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
                                                -    >>> out = model([[input_page]])
                                                -    >>> visualize_page(out[0].pages[0].export(), input_page)
                                                -    >>> plt.show()
                                                +    Example::
                                                +        >>> import numpy as np
                                                +        >>> import matplotlib.pyplot as plt
                                                +        >>> from doctr.utils.visualization import visualize_page
                                                +        >>> from doctr.models import ocr_db_crnn
                                                +        >>> model = ocr_db_crnn(pretrained=True)
                                                +        >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
                                                +        >>> out = model([[input_page]])
                                                +        >>> visualize_page(out[0].pages[0].export(), input_page)
                                                +        >>> plt.show()
                                                 
                                                     Args:
                                                         page: the exported Page of a Document
                                                @@ -471,73 +387,61 @@ 


                                                     # Display the image
                                                     ax.imshow(image)
                                                     # hide both axis
                                                -    ax.axis("off")
                                                +    ax.axis('off')
                                                 
                                                     if interactive:
                                                         artists: List[patches.Patch] = []  # instantiate an empty list of patches (to be drawn on the page)
                                                 
                                                -    for block in page["blocks"]:
                                                +    for block in page['blocks']:
                                                         if not words_only:
                                                -            rect = create_obj_patch(
                                                -                block["geometry"], page["dimensions"], label="block", color=(0, 1, 0), linewidth=1, **kwargs
                                                -            )
                                                +            rect = create_rect_patch(block['geometry'], 'block', page['dimensions'], (0, 1, 0), linewidth=1, **kwargs)
                                                             # add patch on figure
                                                             ax.add_patch(rect)
                                                             if interactive:
                                                                 # add patch to cursor's artists
                                                                 artists.append(rect)
                                                 
                                                -        for line in block["lines"]:
                                                +        for line in block['lines']:
                                                             if not words_only:
                                                -                rect = create_obj_patch(
                                                -                    line["geometry"], page["dimensions"], label="line", color=(1, 0, 0), linewidth=1, **kwargs
                                                -                )
                                                +                rect = create_rect_patch(line['geometry'], 'line', page['dimensions'], (1, 0, 0), linewidth=1, **kwargs)
                                                                 ax.add_patch(rect)
                                                                 if interactive:
                                                                     artists.append(rect)
                                                 
                                                -            for word in line["words"]:
                                                -                rect = create_obj_patch(
                                                -                    word["geometry"],
                                                -                    page["dimensions"],
                                                -                    label=f"{word['value']} (confidence: {word['confidence']:.2%})",
                                                -                    color=(0, 0, 1),
                                                -                    **kwargs,
                                                -                )
                                                +            for word in line['words']:
                                                +                rect = create_rect_patch(word['geometry'], f"{word['value']} (confidence: {word['confidence']:.2%})",
                                                +                                         page['dimensions'], (0, 0, 1), **kwargs)
                                                                 ax.add_patch(rect)
                                                                 if interactive:
                                                                     artists.append(rect)
                                                                 elif add_labels:
                                                -                    if len(word["geometry"]) == 5:
                                                +                    if len(word['geometry']) == 5:
                                                                         text_loc = (
                                                -                            int(page["dimensions"][1] * (word["geometry"][0] - word["geometry"][2] / 2)),
                                                -                            int(page["dimensions"][0] * (word["geometry"][1] - word["geometry"][3] / 2)),
                                                +                            int(page['dimensions'][1] * (word['geometry'][0] - word['geometry'][2] / 2)),
                                                +                            int(page['dimensions'][0] * (word['geometry'][1] - word['geometry'][3] / 2))
                                                                         )
                                                                     else:
                                                                         text_loc = (
                                                -                            int(page["dimensions"][1] * word["geometry"][0][0]),
                                                -                            int(page["dimensions"][0] * word["geometry"][0][1]),
                                                -                        )
                                                -
                                                -                    if len(word["geometry"]) == 2:
                                                -                        # We draw only if boxes are in straight format
                                                -                        ax.text(
                                                -                            *text_loc,
                                                -                            word["value"],
                                                -                            size=10,
                                                -                            alpha=0.5,
                                                -                            color=(0, 0, 1),
                                                +                            int(page['dimensions'][1] * word['geometry'][0][0]),
                                                +                            int(page['dimensions'][0] * word['geometry'][0][1])
                                                                         )
                                                +                    ax.text(
                                                +                        *text_loc,
                                                +                        word['value'],
                                                +                        size=10,
                                                +                        alpha=0.5,
                                                +                        color=(0, 0, 1),
                                                +                    )
                                                 
                                                         if display_artefacts:
                                                -            for artefact in block["artefacts"]:
                                                -                rect = create_obj_patch(
                                                -                    artefact["geometry"],
                                                -                    page["dimensions"],
                                                -                    label="artefact",
                                                -                    color=(0.5, 0.5, 0.5),
                                                +            for artefact in block['artefacts']:
                                                +                rect = create_rect_patch(
                                                +                    artefact['geometry'],
                                                +                    'artefact',
                                                +                    page['dimensions'],
                                                +                    (0.5, 0.5, 0.5),  # type: ignore[arg-type]
                                                                     linewidth=1,
                                                -                    **kwargs,
                                                +                    **kwargs
                                                                 )
                                                                 ax.add_patch(rect)
                                                                 if interactive:
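The loop above walks the nested export structure (blocks → lines → words) and adds one patch per element. Below is a simplified, self-contained sketch of that traversal; the page dict is fabricated to mimic the export format, and plain Rectangle patches stand in for the helper used in the source.

```python
import matplotlib.pyplot as plt
from matplotlib import patches

# Hypothetical, minimal page export mimicking the structure iterated above
page = {
    "dimensions": (200, 400),  # (height, width)
    "blocks": [{
        "lines": [{
            "words": [
                {"value": "Hello", "confidence": 0.98, "geometry": ((0.10, 0.20), (0.35, 0.40))},
                {"value": "world", "confidence": 0.87, "geometry": ((0.40, 0.20), (0.65, 0.40))},
            ],
        }],
    }],
}

height, width = page["dimensions"]
fig, ax = plt.subplots()
ax.set_xlim(0, width)
ax.set_ylim(height, 0)
ax.axis("off")

for block in page["blocks"]:
    for line in block["lines"]:
        for word in line["words"]:
            # Relative geometry ((xmin, ymin), (xmax, ymax)) scaled to absolute pixels
            (xmin, ymin), (xmax, ymax) = word["geometry"]
            rect = patches.Rectangle(
                (xmin * width, ymin * height),
                (xmax - xmin) * width,
                (ymax - ymin) * height,
                fill=False,
                linewidth=2,
                edgecolor=(0, 0, 1),
                label=f"{word['value']} ({word['confidence']:.2%})",
            )
            ax.add_patch(rect)
plt.show()
```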
                                                @@ -546,18 +450,16 @@ 


                                                     if interactive:
                                                         # Create mlp Cursor to hover patches in artists
                                                         mplcursors.Cursor(artists, hover=2).connect("add", lambda sel: sel.annotation.set_text(sel.artist.get_label()))
                                                -    fig.tight_layout(pad=0.0)
                                                +    fig.tight_layout(pad=0.)
                                                 
                                                     return fig
                                                -
-[docs]
-def synthesize_page(
+def synthetize_page(
     page: Dict[str, Any],
     draw_proba: bool = False,
-    font_family: Optional[str] = None,
+    font_size: int = 13,
 ) -> np.ndarray:
     """Draw the content of the element page (OCR response) on a blank page.
@@ -565,12 +467,10 @@


                                                         page: exported Page object to represent
                                                         draw_proba: if True, draw words in colors to represent confidence. Blue: p=1, red: p=0
                                                         font_size: size of the font, default font = 13
                                                -        font_family: family of the font
                                                 
                                                     Return:
                                                -        the synthesized page
                                                +        A np array (drawn page)
                                                     """
                                                -
                                                     # Draw template
                                                     h, w = page["dimensions"]
                                                     response = 255 * np.ones((h, w, 3), dtype=np.int32)
                                                @@ -581,25 +481,26 @@ 


                                                             for word in line["words"]:
                 # Get absolute word geometry
                                                                 (xmin, ymin), (xmax, ymax) = word["geometry"]
                                                -                xmin, xmax = int(round(w * xmin)), int(round(w * xmax))
                                                -                ymin, ymax = int(round(h * ymin)), int(round(h * ymax))
                                                +                xmin, xmax = int(w * xmin), int(w * xmax)
                                                +                ymin, ymax = int(h * ymin), int(h * ymax)
                                                 
                                                                 # White drawing context adapted to font size, 0.75 factor to convert pts --> pix
                                                -                font = get_font(font_family, int(0.75 * (ymax - ymin)))
                                                -                img = Image.new("RGB", (xmax - xmin, ymax - ymin), color=(255, 255, 255))
                                                +                h_box, w_box = ymax - ymin, xmax - xmin
                                                +                h_font, w_font = font_size, int(font_size * w_box / (h_box * 0.75))
                                                +                img = Image.new('RGB', (w_font, h_font), color=(255, 255, 255))
                                                                 d = ImageDraw.Draw(img)
                                                +
                                                                 # Draw in black the value of the word
                                                -                try:
                                                -                    d.text((0, 0), word["value"], font=font, fill=(0, 0, 0))
                                                -                except UnicodeEncodeError:
                                                -                    # When character cannot be encoded, use its unidecode version
                                                -                    d.text((0, 0), unidecode(word["value"]), font=font, fill=(0, 0, 0))
                                                +                d.text((0, 0), word["value"], font=ImageFont.load_default(), fill=(0, 0, 0))
                                                +
                                                +                # Resize back to box size
                                                +                img = img.resize((w_box, h_box), Image.NEAREST)
                                                 
                                                                 # Colorize if draw_proba
                                                                 if draw_proba:
                                                                     p = int(255 * word["confidence"])
                                                                     mask = np.where(np.array(img) == 0, 1, 0)
                                                -                    proba: np.ndarray = np.array([255 - p, 0, p])
                                                +                    proba = np.array([255 - p, 0, p])
                                                                     color = mask * proba[np.newaxis, np.newaxis, :]
                                                                     white_mask = 255 * (1 - mask)
                                                                     img = color + white_mask
                                                @@ -607,31 +508,7 @@ 


                                                                 # Write to response page
                                                                 response[ymin:ymax, xmin:xmax, :] = np.array(img)
                                                 
                                                -    return response
-
-
-
-def draw_boxes(boxes: np.ndarray, image: np.ndarray, color: Optional[Tuple[int, int, int]] = None, **kwargs) -> None:
-    """Draw an array of relative straight boxes on an image
-
-    Args:
-        boxes: array of relative boxes, of shape (*, 4)
-        image: np array, float32 or uint8
-        color: color to use for bounding box edges
-    """
-    h, w = image.shape[:2]
-    # Convert boxes to absolute coords
-    _boxes = deepcopy(boxes)
-    _boxes[:, [0, 2]] *= w
-    _boxes[:, [1, 3]] *= h
-    _boxes = _boxes.astype(np.int32)
-    for box in _boxes.tolist():
-        xmin, ymin, xmax, ymax = box
-        image = cv2.rectangle(
-            image, (xmin, ymin), (xmax, ymax), color=color if isinstance(color, tuple) else (0, 0, 255), thickness=2
-        )
-    plt.imshow(image)
-    plt.plot(**kwargs)
+    return response
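The synthetize_page hunk above renders each predicted word with PIL's default bitmap font, resizes the rendered crop to the word box, and optionally colorizes it by confidence (blue for p=1, red for p=0). A minimal sketch of that idea follows, with a fabricated word box and confidence value; only NumPy and Pillow are assumed.

```python
import numpy as np
from PIL import Image, ImageDraw, ImageFont

# Hypothetical word prediction and absolute pixel box
value, confidence = "hello", 0.9
xmin, ymin, xmax, ymax = 10, 20, 110, 44
h_box, w_box = ymax - ymin, xmax - xmin

# Render the word at a fixed font size, then resize the crop to the box dimensions
font_size = 13
img = Image.new("RGB", (int(font_size * w_box / (h_box * 0.75)), font_size), color=(255, 255, 255))
ImageDraw.Draw(img).text((0, 0), value, font=ImageFont.load_default(), fill=(0, 0, 0))
img = img.resize((w_box, h_box), Image.NEAREST)

# Colorize black (text) pixels by confidence: blue for high confidence, red for low
p = int(255 * confidence)
mask = np.where(np.array(img) == 0, 1, 0)
crop = mask * np.array([255 - p, 0, p])[np.newaxis, np.newaxis, :] + 255 * (1 - mask)

# Paste the crop onto a blank page
page = 255 * np.ones((64, 128, 3), dtype=np.int32)
page[ymin:ymax, xmin:xmax, :] = crop
```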
                                                @@ -664,7 +541,7 @@


                                                       
                                                     
                                                   
                                                -
                                                +
diff --git a/v0.6.0/_modules/index.html b/v0.6.0/_modules/index.html index 2027064f99..c887b618c2 100644 --- a/v0.6.0/_modules/index.html +++ b/v0.6.0/_modules/index.html @@ -226,35 +226,20 @@
-
                                                +
                                                diff --git a/v0.6.0/_sources/changelog.rst.txt b/v0.6.0/_sources/changelog.rst.txt index 4fbf5ffd2e..430097d6c8 100644 --- a/v0.6.0/_sources/changelog.rst.txt +++ b/v0.6.0/_sources/changelog.rst.txt @@ -1,30 +1,6 @@ Changelog ========= -v0.5.1 (2022-03-22) -------------------- -Release note: `v0.5.1 `_ - -v0.5.0 (2021-12-31) -------------------- -Release note: `v0.5.0 `_ - -v0.4.1 (2021-11-22) -------------------- -Release note: `v0.4.1 `_ - -v0.4.0 (2021-10-01) -------------------- -Release note: `v0.4.0 `_ - -v0.3.1 (2021-08-27) -------------------- -Release note: `v0.3.1 `_ - -v0.3.0 (2021-07-02) -------------------- -Release note: `v0.3.0 `_ - v0.2.1 (2021-05-28) ------------------- Release note: `v0.2.1 `_ diff --git a/v0.6.0/_sources/contributing/contributing.md.txt b/v0.6.0/_sources/contributing/contributing.md.txt index 485e9c68d4..7e2a849de3 100644 --- a/v0.6.0/_sources/contributing/contributing.md.txt +++ b/v0.6.0/_sources/contributing/contributing.md.txt @@ -2,8 +2,6 @@ Everything you need to know to contribute efficiently to the project. - - ## Codebase structure - [doctr](https://github.com/mindee/doctr/blob/main/doctr) - The package codebase @@ -11,10 +9,9 @@ Everything you need to know to contribute efficiently to the project. - [docs](https://github.com/mindee/doctr/blob/main/docs) - Library documentation building - [scripts](https://github.com/mindee/doctr/blob/main/scripts) - Example scripts - [references](https://github.com/mindee/doctr/blob/main/references) - Reference training scripts -- [demo](https://github.com/mindee/doctr/blob/main/demo) - Small demo app to showcase docTR capabilities +- [demo](https://github.com/mindee/doctr/blob/main/demo) - Small demo app to showcase docTR capabilities - [api](https://github.com/mindee/doctr/blob/main/api) - A minimal template to deploy a REST API with docTR - ## Continuous Integration This project uses the following integrations to ensure proper codebase maintenance: @@ -24,13 +21,11 @@ This project uses the following integrations to ensure proper codebase maintenan As a contributor, you will only have to ensure coverage of your code by adding appropriate unit testing of your code. - - ## Feedback ### Feature requests & bug report -Whether you encountered a problem, or you have a feature suggestion, your input has value and can be used by contributors to reference it in their developments. For this purpose, we advise you to use Github [issues](https://github.com/mindee/doctr/issues). +Whether you encountered a problem, or you have a feature suggestion, your input has value and can be used by contributors to reference it in their developments. For this purpose, we advise you to use Github [issues](https://github.com/mindee/doctr/issues). First, check whether the topic wasn't already covered in an open / closed issue. If not, feel free to open a new one! When doing so, use issue templates whenever possible and provide enough information for other contributors to jump in. @@ -38,7 +33,6 @@ First, check whether the topic wasn't already covered in an open / closed issue. If you are wondering how to do something with docTR, or a more general question, you should consider checking out Github [discussions](https://github.com/mindee/doctr/discussions). See it as a Q&A forum, or the docTR-specific StackOverflow! 
- ## Developing docTR ### Developer mode installation @@ -46,7 +40,9 @@ If you are wondering how to do something with docTR, or a more general question, Install all additional dependencies with the following command: ```shell +python -m pip install --upgrade pip pip install -e .[dev] +pre-commit install ``` ### Commits @@ -54,7 +50,6 @@ pip install -e .[dev] - **Code**: ensure to provide docstrings to your Python code. In doing so, please follow [Google-style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) so it can ease the process of documentation later. - **Commit message**: please follow [Udacity guide](http://udacity.github.io/git-styleguide/) - ### Unit tests In order to run the same unit tests as the CI workflows, you can run unittests locally: @@ -71,52 +66,27 @@ To run all quality checks together make quality ``` -#### Lint verification +#### Code style verification -To ensure that your incoming PR complies with the lint settings, you need to install [flake8](https://flake8.pycqa.org/en/latest/) and run the following command from the repository's root folder: +To run all style checks together ```shell -flake8 ./ +make style ``` -This will read the `.flake8` setting file and let you know whether your commits need some adjustments. - -#### Import order - -In order to ensure there is a common import order convention, run [isort](https://github.com/PyCQA/isort) as follows: - -```shell -isort **/*.py -``` -This will reorder the imports of your local files. - -#### Annotation typing -Additionally, to catch type-related issues and have a cleaner codebase, annotation typing are expected. After installing [mypy](https://github.com/python/mypy), you can run the verifications as follows: - -```shell -mypy --config-file mypy.ini doctr/ -``` -The `mypy.ini` file will be read to check your typing. - -#### Docstring format +### Modifying the documentation -To keep a sane docstring structure, if you install [pydocstyle](https://github.com/PyCQA/pydocstyle), you can verify your docstrings as follows: +The current documentation is built using `sphinx` thanks to our CI. +You can build the documentation locally: ```shell -pydocstyle doctr/ +make docs-single-version ``` -The `.pydocstyle` file will be read to configure this operation. +Please note that files that have not been modified will not be rebuilt. If you want to force a complete rebuild, you can delete the `_build` directory. Additionally, you may need to clear your web browser's cache to see the modifications. -### Modifying the documentation - -In order to check locally your modifications to the documentation: -```shell -make docs-single-version -``` You can now open your local version of the documentation located at `docs/_build/index.html` in your browser - ## Let's connect Should you wish to connect somewhere else than on GitHub, feel free to join us on [Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-uzgmljfl-MotFVfH~IdEZxjp~0zldww), where you will find a `#doctr` channel! diff --git a/v0.6.0/_sources/datasets.rst.txt b/v0.6.0/_sources/datasets.rst.txt index 8a00eeaedd..354122f1e5 100644 --- a/v0.6.0/_sources/datasets.rst.txt +++ b/v0.6.0/_sources/datasets.rst.txt @@ -11,42 +11,22 @@ can be a significant save of time. Available Datasets ------------------ -Here are all datasets that are available through docTR: +The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. +.. 
autoclass:: doctr.datasets.datasets.VisionDataset -Public datasets -^^^^^^^^^^^^^^^ + +Here are all datasets that are available through DocTR: .. autoclass:: FUNSD .. autoclass:: SROIE .. autoclass:: CORD -.. autoclass:: IIIT5K -.. autoclass:: SVT -.. autoclass:: SVHN -.. autoclass:: SynthText -.. autoclass:: IC03 -.. autoclass:: IC13 - -docTR synthetic datasets -^^^^^^^^^^^^^^^^^^^^^^^^ - -.. autoclass:: DocArtefacts -.. autoclass:: CharacterGenerator -.. autoclass:: WordGenerator - -docTR private datasets -^^^^^^^^^^^^^^^^^^^^^^ - -Since many documents include sensitive / personal information, we are not able to share all the data that has been used for this project. However, we provide some guidance on how to format your own dataset into the same format so that you can use all docTR tools all the same. - -.. autoclass:: DetectionDataset -.. autoclass:: RecognitionDataset .. autoclass:: OCRDataset Data Loading ------------ -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR. +Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. .. autoclass:: doctr.datasets.loader.DataLoader @@ -56,10 +36,10 @@ Each dataset has its specific way to load a sample, but handling batch aggregati Supported Vocabs ---------------- -Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets of vocabs. -.. list-table:: docTR Vocabs +.. list-table:: DocTR Vocabs :widths: 20 5 50 :header-rows: 1 @@ -79,25 +59,10 @@ of vocabs. - 5 - £€¥¢฿ * - latin - - 94 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ - * - english - - 100 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ - * - legacy_french - - 123 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ + - 96 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° * - french - - 126 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ - * - portuguese - - 131 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿ - * - spanish - - 116 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿ - * - german - - 108 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ + - 154 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ .. autofunction:: encode_sequences diff --git a/v0.6.0/_sources/getting_started/installing.rst.txt b/v0.6.0/_sources/getting_started/installing.rst.txt index 43f07bbaf3..46d4177b30 100644 --- a/v0.6.0/_sources/getting_started/installing.rst.txt +++ b/v0.6.0/_sources/getting_started/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.6 or higher. +This library requires `Python `_ 3.9 or higher. 
Prerequisites @@ -19,17 +19,6 @@ For MacBooks with M1 chip, you will need some additional packages or specific ve * `TensorFlow 2 Metal Plugin `_ * `PyTorch >= 1.12.0 `_ -If you are running another OS than Linux, you will need a few extra dependencies. - -For MacOS users, you can install them using `Homebrew `_ as follows: - -.. code:: shell - - brew install cairo pango gdk-pixbuf libffi - -For Windows users, those dependencies are included in GTK. You can find the latest installer over `here `_. - - Via Python Package ================== @@ -49,18 +38,35 @@ We strive towards reducing framework-specific dependencies to a minimum, but som .. code:: bash pip install "python-doctr[tf]" + # or with preinstalled packages for visualization & html & contrib module support + pip install "python-doctr[tf,viz,html,contib]" .. tab:: PyTorch .. code:: bash pip install "python-doctr[torch]" + # or with preinstalled packages for visualization & html & contrib module support + pip install "python-doctr[torch,viz,html,contrib]" + + + + +Via Conda (Only for Linux) +========================== + +Install the last stable release of the package using `conda `_: + +.. code:: bash + + conda config --set channel_priority strict + conda install -c techMindee -c pypdfium2-team -c bblanchon -c defaults -c conda-forge python-doctr Via Git ======= -Install the library in developper mode: +Install the library in developer mode: .. tabs:: diff --git a/v0.6.0/_sources/index.rst.txt b/v0.6.0/_sources/index.rst.txt index bf78a793f0..fc3ff89fdf 100644 --- a/v0.6.0/_sources/index.rst.txt +++ b/v0.6.0/_sources/index.rst.txt @@ -1,8 +1,7 @@ -******************************** -docTR: Document Text Recognition -******************************** +DocTR: Document Text Recognition +================================ -State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch +State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta) .. image:: https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png :align: center @@ -10,29 +9,38 @@ State-of-the-art Optical Character Recognition made seamless & accessible to any DocTR provides an easy and powerful way to extract valuable information from your documents: -* |:receipt:| **for automation**: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. +* |:receipt:| **for automation**: seemlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents. * |:woman_scientist:| **for research**: quickly compare your own architectures speed & performances with state-of-art models on public datasets. +Welcome to the documentation of `DocTR `_! 
+ + Main Features ------------- * |:robot:| Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters * |:zap:| User-friendly, 3 lines of code to load a document and extract text with a predictor -* |:rocket:| State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract +* |:rocket:| State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract * |:zap:| Optimized for inference speed on both CPU & GPU -* |:bird:| Light package, minimal dependencies -* |:tools:| Actively maintained by Mindee -* |:factory:| Easy integration (available templates for browser demo & API deployment) +* |:bird:| Light package, small dependencies +* |:tools:| Daily maintained +* |:factory:| Easy integration + +Getting Started +--------------- .. toctree:: :maxdepth: 2 - :caption: Getting started - :hidden: - getting_started/installing - notebooks + installing + + +Build & train your predictor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +* Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained) +* Fine-tune or train from scratch any detection or recognition model to specialize on your data Model zoo @@ -40,68 +48,36 @@ Model zoo Text detection models """"""""""""""""""""" -* DBNet from `"Real-time Scene Text Detection with Differentiable Binarization" `_ -* LinkNet from `"LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation" `_ + * `DBNet `_ (Differentiable Binarization) + * `LinkNet `_ Text recognition models """"""""""""""""""""""" -* SAR from `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition" `_ -* CRNN from `"An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition" `_ -* MASTER from `"MASTER: Multi-Aspect Non-local Network for Scene Text Recognition" `_ -* ViTSTR from `"Vision Transformer for Fast and Efficient Scene Text Recognition" `_ + * `SAR `_ (Show, Attend and Read) + * `CRNN `_ (Convolutional Recurrent Neural Network) + * `MASTER `_ (Multi-Aspect Non-local Network for Scene Text Recognition) Supported datasets ^^^^^^^^^^^^^^^^^^ -* FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. -* CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. -* SROIE from `ICDAR 2019 `_. -* IIIT-5k from `CVIT `_. -* Street View Text from `"End-to-End Scene Text Recognition" `_. -* SynthText from `Visual Geometry Group `_. -* SVHN from `"Reading Digits in Natural Images with Unsupervised Feature Learning" `_. -* IC03 from `ICDAR 2003 `_. -* IC13 from `ICDAR 2013 `_. -* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" `_. -* MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" `_. + * FUNSD from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents" `_. + * CORD from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing" `_. + * SROIE from `ICDAR 2019 `_. .. toctree:: :maxdepth: 2 - :caption: Using docTR - :hidden: + :caption: Notes - using_doctr/using_models - using_doctr/using_datasets - using_doctr/sharing_models - using_doctr/using_model_export - using_doctr/running_on_aws + changelog .. toctree:: :maxdepth: 2 :caption: Package Reference - :hidden: - - modules/datasets - modules/io - modules/models - modules/transforms - modules/utils - -.. 
toctree:: - :maxdepth: 2 - :caption: Contributing - :hidden: - - contributing/code_of_conduct - contributing/contributing - - -.. toctree:: - :maxdepth: 2 - :caption: Notes - :hidden: - - changelog + datasets + documents + models + transforms + utils diff --git a/v0.6.0/_sources/installing.rst.txt b/v0.6.0/_sources/installing.rst.txt index 8197df660d..5c8779dc1c 100644 --- a/v0.6.0/_sources/installing.rst.txt +++ b/v0.6.0/_sources/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.6 or higher. +This library requires Python 3.6 or higher. Prerequisites @@ -11,12 +11,12 @@ Prerequisites Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: -* `TensorFlow 2 `_ -* `PyTorch `_ +* TensorFlow: `installation page `_. +* PyTorch: `installation page `_. If you are running another OS than Linux, you will need a few extra dependencies. -For MacOS users, you can install them using `Homebrew `_ as follows: +For MacOS users, you can install them as follows: .. code:: shell @@ -28,23 +28,13 @@ For Windows users, those dependencies are included in GTK. You can find the late Via Python Package ================== -Install the last stable release of the package using `pip `_: +Install the last stable release of the package using pip: .. code:: bash pip install python-doctr -We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows: - -.. code:: bash - - # for TensorFlow - pip install "python-doctr[tf]" - # for PyTorch - pip install "python-doctr[torch]" - - Via Git ======= @@ -54,13 +44,3 @@ Install the library in developper mode: git clone https://github.com/mindee/doctr.git pip install -e doctr/. - -Again, for framework-specific builds: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - # for TensorFlow - pip install -e doctr/.[tf] - # for PyTorch - pip install -e doctr/.[torch] diff --git a/v0.6.0/_sources/io.rst.txt b/v0.6.0/_sources/io.rst.txt deleted file mode 100644 index 8fa887e9f9..0000000000 --- a/v0.6.0/_sources/io.rst.txt +++ /dev/null @@ -1,94 +0,0 @@ -doctr.io -======== - - -.. currentmodule:: doctr.io - -The io module enables users to easily access content from documents and export analysis -results to structured formats. - -.. _document_structure: - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. 
automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img_as_numpy - -.. autofunction:: read_img_as_tensor - -.. autofunction:: decode_img_as_tensor - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_lines - - .. automethod:: get_artefacts diff --git a/v0.6.0/_sources/models.rst.txt b/v0.6.0/_sources/models.rst.txt index d4f36df9bb..9830c6c153 100644 --- a/v0.6.0/_sources/models.rst.txt +++ b/v0.6.0/_sources/models.rst.txt @@ -1,62 +1,215 @@ doctr.models ============ -.. currentmodule:: doctr.models - - -doctr.models.classification ----------------------- +The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. +Either performed at once or separately, to each task corresponds a type of deep learning architecture. -.. autofunction:: doctr.models.classification.vgg16_bn_r +.. currentmodule:: doctr.models -.. autofunction:: doctr.models.classification.resnet18 +For a given task, DocTR provides a Predictor, which is composed of 2 components: -.. autofunction:: doctr.models.classification.resnet31 +* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. +* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. -.. autofunction:: doctr.models.classification.mobilenet_v3_small -.. autofunction:: doctr.models.classification.mobilenet_v3_large +Text Detection +-------------- +Localizing text elements in images -.. autofunction:: doctr.models.classification.mobilenet_v3_small_r ++---------------------------------------------------+----------------------------+----------------------------+---------+ +| | FUNSD | CORD | | ++==================+=================+==============+============+===============+============+===============+=========+ +| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ +| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -.. autofunction:: doctr.models.classification.mobilenet_v3_large_r +All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. -.. autofunction:: doctr.models.classification.mobilenet_v3_small_orientation +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* -.. autofunction:: doctr.models.classification.magc_resnet31 +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). 
+We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. -.. autofunction:: doctr.models.classification.crop_orientation_predictor +Pre-processing for detection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for detection is the following: +1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. +2. batch images together +3. normalize the batch using the training data statistics -doctr.models.detection ----------------------- -.. autofunction:: doctr.models.detection.linknet_resnet18 +Detection models +^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: .. autofunction:: doctr.models.detection.db_resnet50 +.. autofunction:: doctr.models.detection.linknet16 -.. autofunction:: doctr.models.detection.db_mobilenet_v3_large +Detection predictors +^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. .. autofunction:: doctr.models.detection.detection_predictor -doctr.models.recognition ------------------------- +Text Recognition +---------------- +Identifying strings in images + +.. list-table:: Text recognition model zoo + :widths: 20 20 15 10 10 10 + :header-rows: 1 + + * - Architecture + - Input shape + - # params + - FUNSD + - CORD + - FPS + * - crnn_vgg16_bn + - (32, 128, 3) + - 15.8M + - 86.02 + - 91.3 + - 12.8 + * - sar_vgg16_bn + - (32, 128, 3) + - 21.5M + - 86.2 + - 91.7 + - 3.3 + * - sar_resnet31 + - (32, 128, 3) + - 53.1M + - **86.3** + - **92.1** + - 2.7 + +All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. + +All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Pre-processing for recognition +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for recognition is the following: + +1. resize each input image to the target size (bilinear interpolation by default) without deformation. +2. pad the image to the target size (with zeros by default) +3. batch images together +4. normalize the batch using the training data statistics + +Recognition models +^^^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: + .. autofunction:: doctr.models.recognition.crnn_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_small -.. 
autofunction:: doctr.models.recognition.crnn_mobilenet_v3_large +Recognition predictors +^^^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage. -.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.recognition_predictor -.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.recognition_predictor +End-to-End OCR +-------------- +Predictors that localize and identify text elements in images ++-----------------------------+--------------------------------------+--------------------------------------+ +| | FUNSD | CORD | ++=============================+============+===============+=========+============+===============+=========+ +| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision doc. text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ + +All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. + +All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. 
+ +Results on private ocr datasets + ++------------------------------------+----------------------------+----------------------------+----------------------------+ +| | Receipts | Invoices | IDs | ++====================================+============+===============+============+===============+============+===============+ +| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ + + +Two-stage approaches +^^^^^^^^^^^^^^^^^^^^ +Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. + +.. autofunction:: doctr.models.zoo.ocr_predictor + + +Model export +------------ +Utility functions to make the most of document analysis models. + +.. currentmodule:: doctr.models.export + +Model compression +^^^^^^^^^^^^^^^^^ + +.. autofunction:: convert_to_tflite + +.. autofunction:: convert_to_fp16 + +.. autofunction:: quantize_model + +Using SavedModel +^^^^^^^^^^^^^^^^ + +Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to +`SavedModel `_ format as follows: + + + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> _ = model(input_t, training=False) + >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') + +And loaded just as easily: -doctr.models.zoo ----------------- -.. autofunction:: doctr.models.ocr_predictor + >>> import tensorflow as tf + >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.6.0/_sources/modules/datasets.rst.txt b/v0.6.0/_sources/modules/datasets.rst.txt index 75cb168083..872212a121 100644 --- a/v0.6.0/_sources/modules/datasets.rst.txt +++ b/v0.6.0/_sources/modules/datasets.rst.txt @@ -30,8 +30,12 @@ doctr.datasets .. autoclass:: MJSynth +.. autoclass:: IIITHWS + .. autoclass:: DocArtefacts +.. autoclass:: WILDRECEIPT + Synthetic dataset generator --------------------------- @@ -90,6 +94,9 @@ of vocabs. * - arabic_letters - 37 - ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي + * - generic_cyrillic_letters + - 58 + - абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ * - persian_letters - 5 - پچڢڤگ @@ -117,6 +124,9 @@ of vocabs. 
* - spanish - 116 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿ + * - italian + - 120 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ * - german - 108 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ @@ -126,8 +136,41 @@ of vocabs. * - czech - 130 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ + * - polish + - 118 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ąćęłńóśźżĄĆĘŁŃÓŚŹŻ + * - dutch + - 114 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ + * - norwegian + - 106 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿æøåÆØÅ + * - danish + - 106 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°£€¥¢฿æøåÆØÅ + * - finnish + - 104 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöÄÖ + * - swedish + - 106 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿åäöÅÄÖ + * - ukrainian + - 115 + - абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ0123456789!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ґіїєҐІЇЄ₴ * - vietnamese - - 234 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ + - 236 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ + * - hebrew + - 123 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿אבגדהוזחטיכלמנסעפצקרשת₪ + * - hindi + - 71 + - अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह०१२३४५६७८९।,?!:्ॐ॰॥॰ + * - bangla + - 70 + - অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ০১২৩৪৫৬৭৮৯ + * - multilingual + - 195 + - english & french & german & italian & spanish & portuguese & czech & polish & dutch & norwegian & danish & finnish & swedish & § .. autofunction:: encode_sequences diff --git a/v0.6.0/_sources/modules/models.rst.txt b/v0.6.0/_sources/modules/models.rst.txt index 79154b3c58..2baf095eed 100644 --- a/v0.6.0/_sources/modules/models.rst.txt +++ b/v0.6.0/_sources/modules/models.rst.txt @@ -25,7 +25,9 @@ doctr.models.classification .. autofunction:: doctr.models.classification.mobilenet_v3_large_r -.. autofunction:: doctr.models.classification.mobilenet_v3_small_orientation +.. autofunction:: doctr.models.classification.mobilenet_v3_small_crop_orientation + +.. autofunction:: doctr.models.classification.mobilenet_v3_small_page_orientation .. autofunction:: doctr.models.classification.magc_resnet31 @@ -33,26 +35,36 @@ doctr.models.classification .. autofunction:: doctr.models.classification.vit_b +.. autofunction:: doctr.models.classification.textnet_tiny + +.. autofunction:: doctr.models.classification.textnet_small + +.. autofunction:: doctr.models.classification.textnet_base + .. 
autofunction:: doctr.models.classification.crop_orientation_predictor +.. autofunction:: doctr.models.classification.page_orientation_predictor + doctr.models.detection ---------------------- .. autofunction:: doctr.models.detection.linknet_resnet18 -.. autofunction:: doctr.models.detection.linknet_resnet18_rotation - .. autofunction:: doctr.models.detection.linknet_resnet34 .. autofunction:: doctr.models.detection.linknet_resnet50 .. autofunction:: doctr.models.detection.db_resnet50 -.. autofunction:: doctr.models.detection.differentiable_binarization.pytorch.db_resnet50_rotation - .. autofunction:: doctr.models.detection.db_mobilenet_v3_large +.. autofunction:: doctr.models.detection.fast_tiny + +.. autofunction:: doctr.models.detection.fast_small + +.. autofunction:: doctr.models.detection.fast_base + .. autofunction:: doctr.models.detection.detection_predictor @@ -73,6 +85,8 @@ doctr.models.recognition .. autofunction:: doctr.models.recognition.vitstr_base +.. autofunction:: doctr.models.recognition.parseq + .. autofunction:: doctr.models.recognition.recognition_predictor @@ -81,6 +95,8 @@ doctr.models.zoo .. autofunction:: doctr.models.ocr_predictor +.. autofunction:: doctr.models.kie_predictor + doctr.models.factory -------------------- diff --git a/v0.6.0/_sources/modules/transforms.rst.txt b/v0.6.0/_sources/modules/transforms.rst.txt index 7f90325e4d..7fc02f4cc4 100644 --- a/v0.6.0/_sources/modules/transforms.rst.txt +++ b/v0.6.0/_sources/modules/transforms.rst.txt @@ -28,6 +28,7 @@ Here are all transformations that are available through docTR: .. autoclass:: GaussianNoise .. autoclass:: RandomHorizontalFlip .. autoclass:: RandomShadow +.. autoclass:: RandomResize Composing transformations diff --git a/v0.6.0/_sources/notebooks.md.txt b/v0.6.0/_sources/notebooks.md.txt deleted file mode 100644 index ea43ac0f39..0000000000 --- a/v0.6.0/_sources/notebooks.md.txt +++ /dev/null @@ -1,9 +0,0 @@ -# docTR Notebooks - -Here are some notebooks compiled for users to better leverage the library capabilities: - -| Notebook | Description | | -|:----------|:-------------|------:| -| [Quicktour](https://github.com/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | A presentation of the main features of docTR | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | -| [Export as PDF/A](https://github.com/mindee/notebooks/blob/main/doctr/export_as_pdfa.ipynb) | Produce searchable PDFs from docTR results | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/export_as_pdfa.ipynb) | -[Artefact detection](https://github.com/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb) | Object detection for artefacts in documents | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb) | diff --git a/v0.6.0/_sources/notebooks.rst.txt b/v0.6.0/_sources/notebooks.rst.txt index e8971fceee..96f9e80edb 100644 --- a/v0.6.0/_sources/notebooks.rst.txt +++ b/v0.6.0/_sources/notebooks.rst.txt @@ -14,4 +14,4 @@ Here are some notebooks compiled for users to better leverage the library capabi 
+--------------------------------------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------------------------+ | `[Artefact detection] `_ | Object detection for artefacts in documents | .. image:: https://colab.research.google.com/assets/colab-badge.svg | | | | :target: https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb | -+--------------------------------------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------------------------+ \ No newline at end of file ++--------------------------------------------------------------------------------------------------------+----------------------------------------------+---------------------------------------------------------------------------------------------------------------------+ diff --git a/v0.6.0/_sources/transforms.rst.txt b/v0.6.0/_sources/transforms.rst.txt index ff11a3a38e..0230fe75f5 100644 --- a/v0.6.0/_sources/transforms.rst.txt +++ b/v0.6.0/_sources/transforms.rst.txt @@ -8,7 +8,7 @@ Data transformations are part of both training and inference procedure. Drawing Supported transformations ------------------------- -Here are all transformations that are available through docTR: +Here are all transformations that are available through DocTR: .. autoclass:: Resize .. autoclass:: Normalize @@ -21,11 +21,6 @@ Here are all transformations that are available through docTR: .. autoclass:: RandomHue .. autoclass:: RandomGamma .. autoclass:: RandomJpegQuality -.. autoclass:: RandomRotate -.. autoclass:: RandomCrop -.. autoclass:: GaussianBlur -.. autoclass:: ChannelShuffle -.. autoclass:: GaussianNoise Composing transformations diff --git a/v0.6.0/_sources/using_doctr/running_on_aws.rst.txt b/v0.6.0/_sources/using_doctr/running_on_aws.rst.txt index a824f354e9..8a5e1a4cc4 100644 --- a/v0.6.0/_sources/using_doctr/running_on_aws.rst.txt +++ b/v0.6.0/_sources/using_doctr/running_on_aws.rst.txt @@ -1,7 +1,10 @@ AWS Lambda -======================== +========== -AWS Lambda's (read more about Lambda https://aws.amazon.com/lambda/) security policy does not allow you to write anywhere outside `/tmp` directory. -There are two things you need to do to make `doctr` work on lambda: -1. Disable usage of `multiprocessing` package by setting `DOCTR_MULTIPROCESSING_DISABLE` enivronment variable to `TRUE`. You need to do this, because this package uses `/dev/shm` directory for shared memory. -2. Change directory `doctr` uses for caching models. By default it's `~/.cache/doctr` which is outside of `/tmp` on AWS Lambda'. You can do this by setting `DOCTR_CACHE_DIR` enivronment variable. +The security policy of `AWS Lambda `_ restricts writing outside the ``/tmp`` directory. + +To make docTR work on Lambda, you need to perform the following two steps: + +1. Disable the usage of the ``multiprocessing`` package by setting the ``DOCTR_MULTIPROCESSING_DISABLE`` environment variable to ``TRUE``. This step is necessary because the package uses the ``/dev/shm`` directory for shared memory. + +2. Change the caching directory used by docTR for models. By default, it is set to ``~/.cache/doctr``, which is outside the ``/tmp`` directory on AWS Lambda. 
You can modify this by setting the ``DOCTR_CACHE_DIR`` environment variable. diff --git a/v0.6.0/_sources/using_doctr/sharing_models.rst.txt b/v0.6.0/_sources/using_doctr/sharing_models.rst.txt index 572bbca780..7ff09f08f3 100644 --- a/v0.6.0/_sources/using_doctr/sharing_models.rst.txt +++ b/v0.6.0/_sources/using_doctr/sharing_models.rst.txt @@ -132,3 +132,7 @@ Recognition +---------------------------------+---------------------------------------------------+---------------------+------------------------+ | crnn_vgg16_bn (dummy) | Felix92/doctr-tf-crnn-vgg16-bn-french | french | TensorFlow | +---------------------------------+---------------------------------------------------+---------------------+------------------------+ +| crnn_vgg16_bn | tilman-rassy/doctr-crnn-vgg16-bn-fascan-v1 | french + german + § | PyTorch | ++---------------------------------+---------------------------------------------------+---------------------+------------------------+ +| parseq | Felix92/doctr-torch-parseq-multilingual-v1 | multilingual | PyTorch | ++---------------------------------+---------------------------------------------------+---------------------+------------------------+ diff --git a/v0.6.0/_sources/using_doctr/using_datasets.rst.txt b/v0.6.0/_sources/using_doctr/using_datasets.rst.txt index f3f149e59b..5fd5dc2776 100644 --- a/v0.6.0/_sources/using_doctr/using_datasets.rst.txt +++ b/v0.6.0/_sources/using_doctr/using_datasets.rst.txt @@ -41,14 +41,16 @@ This datasets contains the information to train or validate a text detection mod +-----------------------------+---------------------------------+---------------------------------+----------------------------------+ | IMGUR5K | 7149 | 796 | Handwritten / external resources | +-----------------------------+---------------------------------+---------------------------------+----------------------------------+ +| WILDRECEIPT | 1268 | 472 | external resources | ++-----------------------------+---------------------------------+---------------------------------+----------------------------------+ .. code:: python3 from doctr.datasets import CORD # Load straight boxes - train_set = CORD(train=True, download=True) + train_set = CORD(train=True, download=True, detection_task=True) # Load rotated boxes - train_set = CORD(train=True, download=True, use_polygons=True) + train_set = CORD(train=True, download=True, use_polygons=True, detection_task=True) img, target = train_set[0] @@ -58,7 +60,7 @@ Recognition This datasets contains the information to train or validate a text recognition model. 
+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ -| **Dataset** | **Train Samples** | **Test Samples** | **Information** | +| **Dataset** | **Train Samples** | **Test Samples** | **Information** | +=============================+=================================+=================================+=============================================+ | FUNSD | 21888 | 8707 | english | +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ @@ -80,7 +82,11 @@ This datasets contains the information to train or validate a text recognition m +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ | IMGUR5K | 207901 | 22672 | english / handwritten / external resources | +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ -| MJSynth | 7581382 | 1337891 | english | +| MJSynth | 7581382 | 1337891 | english / external resources | ++-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ +| IIITHWS | 7141797 | 793533 | english / handwritten / external resources | ++-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ +| WILDRECEIPT | 49377 | 19598 | english / external resources | +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+ .. code:: python3 @@ -93,6 +99,21 @@ This datasets contains the information to train or validate a text recognition m img, target = train_set[0] +OCR +^^^ + +The same dataset table as for detection, but with information about the bounding boxes and labels. + +.. code:: python3 + + from doctr.datasets import CORD + # Load straight boxes + train_set = CORD(train=True, download=True) + # Load rotated boxes + train_set = CORD(train=True, download=True, use_polygons=True) + img, target = train_set[0] + + Object Detection ^^^^^^^^^^^^^^^^ diff --git a/v0.6.0/_sources/using_doctr/using_model_export.rst.txt b/v0.6.0/_sources/using_doctr/using_model_export.rst.txt index 992f4e9866..c62c36169b 100644 --- a/v0.6.0/_sources/using_doctr/using_model_export.rst.txt +++ b/v0.6.0/_sources/using_doctr/using_model_export.rst.txt @@ -3,69 +3,112 @@ Preparing your model for inference A well-trained model is a good achievement but you might want to tune a few things to make it production-ready! -.. currentmodule:: doctr.models.export +.. currentmodule:: doctr.models.utils -Model compression ------------------ +Model optimization +------------------ -This section is meant to help you perform inference with compressed versions of your model. +This section is meant to help you perform inference with optimized versions of your model. -TensorFlow Lite -^^^^^^^^^^^^^^^ +Half-precision +^^^^^^^^^^^^^^ -TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows: +**NOTE:** We support half-precision inference for PyTorch and TensorFlow models only on **GPU devices**. 
- >>> import tensorflow as tf - >>> from tensorflow.keras import Sequential - >>> from doctr.models import conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - >>> serialized_model = converter.convert() +Half-precision (or FP16) is a binary floating-point format that occupies 16 bits in computer memory. -Half-precision + +Advantages: + +- Faster inference +- Less memory usage + +.. tabs:: + + .. tab:: TensorFlow + + .. code:: python3 + + import tensorflow as tf + from tensorflow.keras import mixed_precision + from doctr.models import ocr_predictor + mixed_precision.set_global_policy('mixed_float16') + predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True) + + .. tab:: PyTorch + + .. code:: python3 + + import torch + from doctr.models import ocr_predictor + predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True).cuda().half() + res = predictor(doc)  # `doc` is your loaded document (e.g. a list of page images) + + +Export to ONNX ^^^^^^^^^^^^^^ -If you want to convert it to half-precision using your TFLite converter +ONNX (Open Neural Network Exchange) is an open and interoperable format for representing and exchanging machine learning models. +It defines a common format for representing models, including the network structure, layer types, parameters, and metadata. + +.. tabs:: + + .. tab:: TensorFlow + + .. code:: python3 + + import tensorflow as tf + from doctr.models import vitstr_small + from doctr.models.utils import export_model_to_onnx + + batch_size = 16 + input_shape = (32, 128, 3) + model = vitstr_small(pretrained=True, exportable=True) + dummy_input = [tf.TensorSpec([batch_size, *input_shape], tf.float32, name="input")] + model_path, output = export_model_to_onnx(model, model_name="vitstr.onnx", dummy_input=dummy_input) + + + .. tab:: PyTorch - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> converter.target_spec.supported_types = [tf.float16] - >>> serialized_model = converter.convert() + .. code:: python3 + import torch + from doctr.models import vitstr_small + from doctr.models.utils import export_model_to_onnx -Post-training quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^ + batch_size = 16 + input_shape = (3, 32, 128) + model = vitstr_small(pretrained=True, exportable=True) + dummy_input = torch.rand((batch_size, *input_shape), dtype=torch.float32) + model_path = export_model_to_onnx(model, model_name="vitstr.onnx", dummy_input=dummy_input) -Finally if you wish to quantize the model with your TFLite converter - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> # Float fallback for operators that do not have an integer implementation - >>> def representative_dataset(): - >>> for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)] - >>> converter.representative_dataset = representative_dataset - >>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - >>> converter.inference_input_type = tf.int8 - >>> converter.inference_output_type = tf.int8 - >>> serialized_model = converter.convert() +Using your ONNX exported model +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +To use your exported model, we have built a dedicated lightweight package called `OnnxTR `_. +The package doesn't require PyTorch or TensorFlow to be installed, as it is built on top of ONNXRuntime. +It is simple and easy to use (with the same interface you already know from docTR) and allows you to perform inference with your exported model.
-Using SavedModel ----------------- +- `Installation `_ +- `Loading custom exported model `_ -Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: +.. code:: shell + pip install onnxtr[cpu] - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') +.. code:: python3 -And loaded just as easily: + from onnxtr.io import DocumentFile + from onnxtr.models import ocr_predictor, parseq, linknet_resnet18 + # Load your documents + single_img_doc = DocumentFile.from_images("path/to/your/img.jpg") + # Load your exported model/s + reco_model = parseq("path_to_custom_model.onnx", vocab="ABC") + det_model = linknet_resnet18("path_to_custom_model.onnx") + predictor = ocr_predictor(det_arch=det_model, reco_arch=reco_model) + # Or use any of the pre-trained models + predictor = ocr_predictor(det_arch="linknet_resnet18", reco_arch="parseq") - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') + # Get your results + res = predictor(single_img_doc) diff --git a/v0.6.0/_sources/using_doctr/using_models.rst.txt b/v0.6.0/_sources/using_doctr/using_models.rst.txt index 5c2d62fceb..e6e5006f2e 100644 --- a/v0.6.0/_sources/using_doctr/using_models.rst.txt +++ b/v0.6.0/_sources/using_doctr/using_models.rst.txt @@ -23,26 +23,56 @@ Available architectures The following architectures are currently supported: * :py:meth:`linknet_resnet18 ` +* :py:meth:`linknet_resnet34 ` +* :py:meth:`linknet_resnet50 ` * :py:meth:`db_resnet50 ` * :py:meth:`db_mobilenet_v3_large ` - -We also provide 2 models working with any kind of rotated documents: - -* :py:meth:`linknet_resnet18_rotation ` -* :py:meth:`db_resnet50_rotation ` +* :py:meth:`fast_tiny ` +* :py:meth:`fast_small ` +* :py:meth:`fast_base ` For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: -+------------------------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+=================================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 79.35 | 84.03 | 81.14 | 66.85 | | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ ++------------------------------------------------------------------------------------+----------------------------+----------------------------+--------------------+ +| | FUNSD | CORD | | ++================+=================================+=================+===============+============+===============+============+===============+====================+ +| **Backend** | **Architecture** 
| **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **sec/it (B: 1)** | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | db_resnet50 | (1024, 1024, 3) | 25.2 M | 84.39 | 85.86 | 93.70 | 83.24 | 1.2 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 80.29 | 70.90 | 84.70 | 67.76 | 0.5 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | linknet_resnet18 | (1024, 1024, 3) | 11.5 M | 81.37 | 84.08 | 85.71 | 83.70 | 0.7 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | linknet_resnet34 | (1024, 1024, 3) | 21.6 M | 82.20 | 85.49 | 87.63 | 87.17 | 0.8 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | linknet_resnet50 | (1024, 1024, 3) | 28.8 M | 80.70 | 83.51 | 86.46 | 84.94 | 1.1 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | fast_tiny | (1024, 1024, 3) | 13.5 M (8.5M) | 85.29 | 85.34 | 93.46 | 75.99 | 0.7 (0.4) | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | fast_small | (1024, 1024, 3) | 14.7 M (9.7M) | 85.50 | 86.89 | 94.05 | 78.33 | 0.7 (0.5) | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | fast_base | (1024, 1024, 3) | 16.3 M (10.6M)| 85.22 | 86.97 | 94.18 | 84.74 | 0.8 (0.5) | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | db_resnet34 | (1024, 1024, 3) | 22.4 M | 82.76 | 76.75 | 89.20 | 71.74 | 0.8 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | db_resnet50 | (1024, 1024, 3) | 25.4 M | 83.56 | 86.68 | 92.61 | 86.39 | 1.1 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 82.69 | 84.63 | 94.51 | 70.28 | 0.5 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | linknet_resnet18 | (1024, 1024, 3) | 11.5 M | 81.64 | 85.52 | 88.92 | 82.74 | 0.6 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | linknet_resnet34 | (1024, 1024, 3) | 21.6 M | 81.62 | 
82.95 | 86.26 | 81.06 | 0.7 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | linknet_resnet50 | (1024, 1024, 3) | 28.8 M | 81.78 | 82.47 | 87.29 | 85.54 | 1.0 | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | fast_tiny | (1024, 1024, 3) | 13.5 M (8.5M) | 84.90 | 85.04 | 93.73 | 76.26 | 0.7 (0.4) | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | fast_small | (1024, 1024, 3) | 14.7 M (9.7M) | 85.36 | 86.68 | 94.09 | 78.53 | 0.7 (0.5) | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | fast_base | (1024, 1024, 3) | 16.3 M (10.6M)| 84.95 | 86.73 | 94.39 | 85.36 | 0.8 (0.5) | ++----------------+---------------------------------+-----------------+---------------+------------+---------------+------------+---------------+--------------------+ All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). @@ -50,7 +80,7 @@ Explanations about the metrics being used are available in :ref:`metrics`. *Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities* -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large ` AWS instance (CPU Xeon Platinum 8275L). +Seconds per iteration (with a batch size of 1) is computed after a warmup phase of 100 tensors, by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz`. Detection predictors @@ -58,11 +88,13 @@ Detection predictors :py:meth:`detection_predictor ` wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly. - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> predictor = detection_predictor('db_resnet50') - >>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([dummy_img]) +.. code:: python3 + + import numpy as np + from doctr.models import detection_predictor + predictor = detection_predictor('db_resnet50') + dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) + out = model([dummy_img]) You can pass specific boolean arguments to the predictor: @@ -72,10 +104,10 @@ You can pass specific boolean arguments to the predictor: For instance, this snippet will instantiates a detection predictor able to detect text on rotated documents while preserving the aspect ratio: - >>> from doctr.models import detection_predictor - >>> predictor = detection_predictor('db_resnet50_rotation', pretrained=True, assume_straight_pages=False, preserve_aspect_ratio=True) +.. code:: python3 -NB: for the moment, `db_resnet50_rotation` is pretrained in Pytorch only and `linknet_resnet18_rotation` in Tensorflow only. 
+ from doctr.models import detection_predictor + predictor = detection_predictor('db_resnet50', pretrained=True, assume_straight_pages=False, preserve_aspect_ratio=True) Text Recognition @@ -94,75 +126,81 @@ The following architectures are currently supported: * :py:meth:`crnn_mobilenet_v3_large ` * :py:meth:`sar_resnet31 ` * :py:meth:`master ` +* :py:meth:`vitstr_small ` +* :py:meth:`vitstr_base ` +* :py:meth:`parseq ` For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: -.. list-table:: Text recognition model zoo - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 87.18 - - 92.93 - - 12.8 - * - crnn_mobilenet_v3_small - - (32, 128, 3) - - 2.1M - - 86.21 - - 90.56 - - - * - crnn_mobilenet_v3_large - - (32, 128, 3) - - 4.5M - - 86.95 - - 92.03 - - - * - sar_resnet31 - - (32, 128, 3) - - 56.2M - - **87.70** - - **93.41** - - 2.7 - * - master - - (32, 128, 3) - - 67.7M - - 87.62 - - 93.27 - - ++-----------------------------------------------------------------------------------+----------------------------+----------------------------+--------------------+ +| | FUNSD | CORD | | ++================+=================================+=================+==============+============+===============+============+===============+====================+ +| **Backend** | **Architecture** | **Input shape** | **# params** | **Exact** | **Partial** | **Exact** | **Partial** | **sec/it (B: 64)** | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | crnn_vgg16_bn | (32, 128, 3) | 15.8 M | 88.12 | 88.85 | 94.68 | 95.10 | 0.9 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | crnn_mobilenet_v3_small | (32, 128, 3) | 2.1 M | 86.88 | 87.61 | 92.28 | 92.73 | 0.25 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | crnn_mobilenet_v3_large | (32, 128, 3) | 4.5 M | 87.44 | 88.12 | 94.14 | 94.55 | 0.34 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | master | (32, 128, 3) | 58.8 M | 87.44 | 88.21 | 93.83 | 94.25 | 22.3 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | sar_resnet31 | (32, 128, 3) | 57.2 M | 87.67 | 88.48 | 94.21 | 94.66 | 7.1 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | vitstr_small | (32, 128, 3) | 21.4 M | 83.01 | 83.84 | 86.57 | 87.00 | 2.0 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | vitstr_base | (32, 128, 3) | 85.2 M | 85.98 | 86.70 | 90.47 | 90.95 | 5.8 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| TensorFlow | 
parseq | (32, 128, 3) | 23.8 M | 81.62 | 82.29 | 79.13 | 79.52 | 3.6 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | crnn_vgg16_bn | (32, 128, 3) | 15.8 M | 86.54 | 87.41 | 94.29 | 94.69 | 0.6 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | crnn_mobilenet_v3_small | (32, 128, 3) | 2.1 M | 87.25 | 87.99 | 93.91 | 94.34 | 0.05 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | crnn_mobilenet_v3_large | (32, 128, 3) | 4.5 M | 87.38 | 88.09 | 94.46 | 94.92 | 0.08 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | master | (32, 128, 3) | 58.7 M | 88.57 | 89.39 | 95.73 | 96.21 | 17.6 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | sar_resnet31 | (32, 128, 3) | 55.4 M | 88.10 | 88.88 | 94.83 | 95.29 | 4.9 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | vitstr_small | (32, 128, 3) | 21.4 M | 88.00 | 88.82 | 95.40 | 95.78 | 1.5 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | vitstr_base | (32, 128, 3) | 85.2 M | 88.33 | 89.09 | 95.32 | 95.71 | 4.1 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ +| PyTorch | parseq | (32, 128, 3) | 23.8 M | 88.53 | 89.24 | 95.56 | 95.91 | 2.2 | ++----------------+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+--------------------+ + All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). Explanations about the metric being used (exact match) are available in :ref:`metrics`. While most of our recognition models were trained on our french vocab (cf. :ref:`vocabs`), you can easily access the vocab of any model as follows: - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> print(predictor.model.cfg['vocab']) +.. code:: python3 + + from doctr.models import recognition_predictor + predictor = recognition_predictor('crnn_vgg16_bn') + print(predictor.model.cfg['vocab']) *Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large ` AWS instance (CPU Xeon Platinum 8275L). 
+Seconds per iteration (with a batch size of 64) is computed after a warmup phase of 100 tensors, by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz`. Recognition predictors ^^^^^^^^^^^^^^^^^^^^^^ :py:meth:`recognition_predictor ` wraps your recognition model to make it easily useable with your favorite deep learning framework seamlessly. - >>> import numpy as np - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8) - >>> out = model([dummy_img]) +.. code:: python3 + + import numpy as np + from doctr.models import recognition_predictor + predictor = recognition_predictor('crnn_vgg16_bn') + dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8) + out = model([dummy_img]) End-to-End OCR @@ -173,92 +211,119 @@ The task consists of both localizing and transcribing textual elements in a give Available architectures ^^^^^^^^^^^^^^^^^^^^^^^ -You can use any combination of detection and recognition models supporte by docTR. +You can use any combination of detection and recognition models supported by docTR. For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: -+----------------------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+========================================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 71.25 | 76.02 | 0.85 | 84.00 | 81.42 | 1.6 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + master | 71.03 | 76.06 | | 84.49 | 81.94 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | 71.25 | 76.29 | 0.27 | 84.50 | **81.96** | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_small | 69.85 | 74.80 | | 80.85 | 78.42 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_large | 70.57 | 75.57 | | 82.57 | 80.08 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_mobilenet_v3_large + crnn_vgg16_bn | 67.73 | 71.73 | | 71.65 | 59.03 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. 
text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ ++---------------------------------------------------------------------------+----------------------------+----------------------------+ +| | FUNSD | CORD | ++================+==========================================================+============================+============+===============+ +| **Backend** | **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + crnn_vgg16_bn | 73.45 | 74.73 | 85.79 | 76.21 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + crnn_mobilenet_v3_small | 72.66 | 73.93 | 83.43 | 74.11 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + crnn_mobilenet_v3_large | 72.86 | 74.13 | 85.16 | 75.65 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + master | 72.73 | 74.00 | 84.13 | 75.05 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + sar_resnet31 | 73.23 | 74.51 | 85.34 | 76.03 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + vitstr_small | 68.57 | 69.77 | 78.24 | 69.51 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + vitstr_base | 70.96 | 72.20 | 82.10 | 72.94 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| TensorFlow | db_resnet50 + parseq | 68.85 | 70.05 | 72.38 | 64.30 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + crnn_vgg16_bn | 72.43 | 75.13 | 85.05 | 79.33 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + crnn_mobilenet_v3_small | 73.06 | 75.79 | 84.64 | 78.94 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + crnn_mobilenet_v3_large | 73.17 | 75.90 | 84.96 | 79.25 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + master | 73.90 | 76.66 | 85.84 | 80.07 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + sar_resnet31 | 73.58 | 76.33 | 85.64 | 79.88 | 
++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + vitstr_small | 73.06 | 75.79 | 85.95 | 80.17 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + vitstr_base | 73.70 | 76.46 | 85.76 | 79.99 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| PyTorch | db_resnet50 + parseq | 73.52 | 76.27 | 85.91 | 80.13 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| None | Gvision text detection | 59.50 | 62.50 | 75.30 | 59.03 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| None | Gvision doc. text detection | 64.00 | 53.30 | 68.90 | 61.10 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| None | AWS textract | 78.10 | 83.00 | 87.50 | 66.00 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ +| None | Azure Form Recognizer (v3.2) | 79.42 | 85.89 | 89.62 | 88.93 | ++----------------+----------------------------------------------------------+------------+---------------+------------+---------------+ + All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). Explanations about the metrics being used are available in :ref:`metrics`. *Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a `c5.x12large ` AWS instance (CPU Xeon Platinum 8275L). - -Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information. 
- - -+----------------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | US Tax Forms | Resumes | Road Fines | -+==============================================+============+===============+============+===============+============+===============+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | 78.70 | 81.12 | 65.80 | 70.70 | 50.25 | 51.78 | 79.08 | 92.83 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + master (ours) | **79.00** | **81.42** | 65.57 | 69.86 | 51.34 | 52.90 | 78.86 | 92.57 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + sar_resnet31 (ours) | 78.94 | 81.37 | 65.89 | **70.79** | **51.78** | **53.35** | 79.04 | 92.78 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_small (ours) | 76.81 | 79.15 | 64.89 | 69.61 | 45.03 | 46.38 | 78.96 | 92.11 | 85.91 | 87.20 | 84.85 | 85.86 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_large (ours) | 78.01 | 80.39 | 65.36 | 70.11 | 48.00 | 49.43 | 79.39 | 92.62 | 87.68 | 89.00 | 85.65 | 86.67 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 | 74.93 | 63.04 | 68.41 | 39.36 | 41.75 | 72.14 | 89.97 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. 
text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ - Two-stage approaches ^^^^^^^^^^^^^^^^^^^^ Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produce cropped images that will be passed into the text recognition block. Everything is wrapped up with :py:meth:`ocr_predictor `. - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) - >>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([input_page]) +.. code:: python3 + + import numpy as np + from doctr.models import ocr_predictor + model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) + out = model([input_page]) You can pass specific boolean arguments to the predictor: -* `assume_straight_pages` -* `preserve_aspect_ratio` -* `symmetric_pad` +* `assume_straight_pages`: if you work with straight documents only, it will fit straight bounding boxes to the text areas. +* `preserve_aspect_ratio`: if you want to preserve the aspect ratio of your documents while resizing before sending them to the model. +* `symmetric_pad`: if you choose to preserve the aspect ratio, it will pad the image symmetrically and not from the bottom-right. Those 3 are going straight to the detection predictor, as mentioned above (in the detection part). +Additional arguments which can be passed to the `ocr_predictor` are: + * `export_as_straight_boxes`: If you work with rotated and skewed documents but you still want to export straight bounding boxes and not polygons, set it to True. +* `straighten_pages`: If you want to straighten the pages before sending them to the detection model, set it to True. For instance, this snippet instantiates an end-to-end ocr_predictor working with rotated documents, which preserves the aspect ratio of the documents, and returns polygons: +.. code:: python3 + + from doctr.models import ocr_predictor + model = ocr_predictor('linknet_resnet18', pretrained=True, assume_straight_pages=False, preserve_aspect_ratio=True) + + +Additionally, you can change the batch size of the underlying detection and recognition predictors to optimize the performance depending on your hardware: + +* `det_bs`: batch size for the detection model (default: 2) +* `reco_bs`: batch size for the recognition model (default: 128) + +..
code:: python3 + + from doctr.models import ocr_predictor + model = ocr_predictor(pretrained=True, det_bs=4, reco_bs=1024) + +To modify the output structure you can pass the following arguments to the predictor which will be handled by the underlying `DocumentBuilder`: + +* `resolve_lines`: whether words should be automatically grouped into lines (default: True) +* `resolve_blocks`: whether lines should be automatically grouped into blocks (default: False) +* `paragraph_break`: relative length of the minimum space separating paragraphs (default: 0.035) + +For example, to disable the automatic grouping of lines into blocks: + +.. code:: python3 + + from doctr.models import ocr_predictor + model = ocr_predictor(pretrained=True, resolve_blocks=False) What should I do with the output? @@ -285,11 +350,19 @@ Here is a typical `Document` layout:: )] ) +To get only the text content of the `Document`, you can use the `render` method:: + + text_output = result.render() + +For reference, here is the output for the `Document` above:: + + No. RECEIPT DATE + You can also export them as a nested dict, more appropriate for JSON format:: json_output = result.export() -For reference, here is the JSON export for the same `Document` as above:: +For reference, here is the export for the same `Document` as above:: { 'pages': [ @@ -308,17 +381,23 @@ For reference, here is the JSON export for the same `Document` as above:: { 'value': 'No.', 'confidence': 0.914085328578949, - 'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)) + 'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)), + 'objectness_score': 0.96, + 'crop_orientation': {'value': 0, 'confidence': None}, }, { 'value': 'RECEIPT', 'confidence': 0.9949972033500671, - 'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)) + 'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)), + 'objectness_score': 0.99, + 'crop_orientation': {'value': 0, 'confidence': None}, }, { 'value': 'DATE', 'confidence': 0.9578408598899841, - 'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)) + 'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)), + 'objectness_score': 0.99, + 'crop_orientation': {'value': 0, 'confidence': None}, } ] } @@ -330,14 +409,18 @@ For reference, here is the JSON export for the same `Document` as above:: ] } -To export the outpout as XML (hocr-format) you can use the `export_as_xml` method:: +To export the output as XML (hocr-format) you can use the `export_as_xml` method: + +.. code-block:: python xml_output = result.export_as_xml() for output in xml_output: xml_bytes_string = output[0] xml_element = output[1] -For reference, here is a sample XML byte string output:: +For reference, here is a sample XML byte string output: + +.. code-block:: xml + +
+ + +Advanced options +^^^^^^^^^^^^^^^^ +We provide a few advanced options to customize the behavior of the predictor to your needs: + +* Modify the binarization threshold for the detection model. +* Modify the box threshold for the detection model. + +This is useful to detect (possibly fewer) text regions more accurately with a higher threshold, or to detect more text regions with a lower threshold. + + +.. code:: python3 + + import numpy as np + from doctr.models import ocr_predictor + predictor = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) + + # Modify the binarization threshold and the box threshold + predictor.det_predictor.model.postprocessor.bin_thresh = 0.5 + predictor.det_predictor.model.postprocessor.box_thresh = 0.2 + + input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) + out = predictor([input_page]) + + +* Disable page orientation classification + +If you deal with documents which contain only small rotations (~ -45 to 45 degrees), you can disable the page orientation classification to speed up the inference. + +This will only have an effect with `assume_straight_pages=False` and/or `straighten_pages=True` and/or `detect_orientation=True`. + +.. code:: python3 + + from doctr.models import ocr_predictor + model = ocr_predictor(pretrained=True, assume_straight_pages=False, disable_page_orientation=True) + + +* Disable crop orientation classification + +If you deal with documents which contain only horizontal text, you can disable the crop orientation classification to speed up the inference. + +This will only have an effect with `assume_straight_pages=False` and/or `straighten_pages=True`. + +.. code:: python3 + + from doctr.models import ocr_predictor + model = ocr_predictor(pretrained=True, assume_straight_pages=False, disable_crop_orientation=True) + + +* Add a hook to the `ocr_predictor` to manipulate the location predictions before the crops are passed to the recognition model. + +.. code:: python3 + + from doctr.models import ocr_predictor + + class CustomHook: + def __call__(self, loc_preds): + # Manipulate the location predictions here + # 1. The output structure needs to be the same as the input location predictions + # 2. Be aware that the coordinates are relative and need to be between 0 and 1 + return loc_preds + + my_hook = CustomHook() + + predictor = ocr_predictor(pretrained=True) + # Add a hook in the middle of the pipeline + predictor.add_hook(my_hook) + # You can also add multiple hooks which will be executed sequentially + for hook in [my_hook, my_hook, my_hook]: + predictor.add_hook(hook) diff --git a/v0.6.0/_sources/using_model_export.rst.txt b/v0.6.0/_sources/using_model_export.rst.txt deleted file mode 100644 index 992f4e9866..0000000000 --- a/v0.6.0/_sources/using_model_export.rst.txt +++ /dev/null @@ -1,71 +0,0 @@ -Preparing your model for inference -================================== - -A well-trained model is a good achievement but you might want to tune a few things to make it production-ready! - -.. currentmodule:: doctr.models.export - - -Model compression ------------------ - -This section is meant to help you perform inference with compressed versions of your model. - - -TensorFlow Lite -^^^^^^^^^^^^^^^ - -TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account.
You can easily convert any Keras model into a serialized TFLite version as follows: - - >>> import tensorflow as tf - >>> from tensorflow.keras import Sequential - >>> from doctr.models import conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - >>> serialized_model = converter.convert() - -Half-precision -^^^^^^^^^^^^^^ - -If you want to convert it to half-precision using your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> converter.target_spec.supported_types = [tf.float16] - >>> serialized_model = converter.convert() - - -Post-training quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Finally if you wish to quantize the model with your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> # Float fallback for operators that do not have an integer implementation - >>> def representative_dataset(): - >>> for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)] - >>> converter.representative_dataset = representative_dataset - >>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - >>> converter.inference_input_type = tf.int8 - >>> converter.inference_output_type = tf.int8 - >>> serialized_model = converter.convert() - - -Using SavedModel ----------------- - -Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.6.0/_sources/using_models.rst.txt b/v0.6.0/_sources/using_models.rst.txt deleted file mode 100644 index 1c0752463f..0000000000 --- a/v0.6.0/_sources/using_models.rst.txt +++ /dev/null @@ -1,329 +0,0 @@ -Choosing the right model -======================== - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, docTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the deep learning model. -* Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable. - - -Text Detection --------------- - -The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don't). 
- -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `linknet_resnet18 `_ -* `db_resnet50 `_ -* `db_mobilenet_v3_large `_ - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -+------------------------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+=================================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 79.35 | 84.03 | 81.14 | 66.85 | | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ - -`detection_predictor `_ wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> predictor = detection_predictor('db_resnet50') - >>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -Text Recognition ----------------- - -The task consists of transcribing the character sequence in a given image. - - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `crnn_vgg16_bn `_ -* `crnn_mobilenet_v3_small `_ -* `crnn_mobilenet_v3_large `_ -* `sar_resnet31 `_ -* `master `_ - - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -.. list-table:: Text recognition model zoo - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 87.18 - - 92.93 - - 12.8 - * - crnn_mobilenet_v3_small - - (32, 128, 3) - - 2.1M - - 86.21 - - 90.56 - - - * - crnn_mobilenet_v3_large - - (32, 128, 3) - - 4.5M - - 86.95 - - 92.03 - - - * - sar_resnet31 - - (32, 128, 3) - - 56.2M - - **87.70** - - **93.41** - - 2.7 - * - master - - (32, 128, 3) - - 67.7M - - 87.62 - - 93.27 - - - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metric being used (exact match) are available in :ref:`metrics`. 
- -While most of our recognition models were trained on our french vocab (cf. :ref:`vocabs`), you can easily access the vocab of any model as follows: - - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> print(predictor.model.cfg['vocab']) - - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -`recognition_predictor `_ wraps your recognition model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -End-to-End OCR --------------- - -The task consists of both localizing and transcribing textual elements in a given image. - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -You can use any combination of detection and recognition models supporte by docTR. - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - -+----------------------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+========================================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 71.25 | 76.02 | 0.85 | 84.00 | 81.42 | 1.6 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + master | 71.03 | 76.06 | | 84.49 | 81.94 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | 71.25 | 76.29 | 0.27 | 84.50 | **81.96** | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_small | 69.85 | 74.80 | | 80.85 | 78.42 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_large | 70.57 | 75.57 | | 82.57 | 80.08 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_mobilenet_v3_large + crnn_vgg16_bn | 67.73 | 71.73 | | 71.65 | 59.03 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. 
text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - -Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information. - - -+----------------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | US Tax Forms | Resumes | Road Fines | -+==============================================+============+===============+============+===============+============+===============+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | 78.70 | 81.12 | 65.80 | 70.70 | 50.25 | 51.78 | 79.08 | 92.83 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + master (ours) | **79.00** | **81.42** | 65.57 | 69.86 | 51.34 | 52.90 | 78.86 | 92.57 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + sar_resnet31 (ours) | 78.94 | 81.37 | 65.89 | **70.79** | **51.78** | **53.35** | 79.04 | 92.78 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_small (ours) | 76.81 | 79.15 | 64.89 | 69.61 | 45.03 | 46.38 | 78.96 | 92.11 | 85.91 | 87.20 | 84.85 | 85.86 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| 
db_resnet50 + crnn_mobilenet_v3_large (ours) | 78.01 | 80.39 | 65.36 | 70.11 | 48.00 | 49.43 | 79.39 | 92.62 | 87.68 | 89.00 | 85.65 | 86.67 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 | 74.93 | 63.04 | 68.41 | 39.36 | 41.75 | 72.14 | 89.97 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. Everything is wrapped up with `ocr_predictor `_. - - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) - >>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([input_page]) - - -What should I do with the output? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ocr_predictor returns a `Document` object with a nested structure (with `Page`, `Block`, `Line`, `Word`, `Artefact`). 
-To get a better understanding of our document model, check our :ref:`document_structure` section - -Here is a typical `Document` layout:: - - Document( - (pages): [Page( - dimensions=(340, 600) - (blocks): [Block( - (lines): [Line( - (words): [ - Word(value='No.', confidence=0.91), - Word(value='RECEIPT', confidence=0.99), - Word(value='DATE', confidence=0.96), - ] - )] - (artefacts): [] - )] - )] - ) - -You can also export them as a nested dict, more appropriate for JSON format:: - - json_output = result.export() - -For reference, here is the JSON export for the same `Document` as above:: - - { - 'pages': [ - { - 'page_idx': 0, - 'dimensions': (340, 600), - 'orientation': {'value': None, 'confidence': None}, - 'language': {'value': None, 'confidence': None}, - 'blocks': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'lines': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'words': [ - { - 'value': 'No.', - 'confidence': 0.914085328578949, - 'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)) - }, - { - 'value': 'RECEIPT', - 'confidence': 0.9949972033500671, - 'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)) - }, - { - 'value': 'DATE', - 'confidence': 0.9578408598899841, - 'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)) - } - ] - } - ], - 'artefacts': [] - } - ] - } - ] - } - -To export the outpout as XML (hocr-format) you can use the `export_as_xml` method:: - - xml_output = result.export_as_xml() - for output in xml_output: - xml_bytes_string = output[0] - xml_element = output[1] - -For reference, here is a sample XML byte string output:: - - - - - docTR - hOCR - - - - - -
                                                -
                                                -

                                                - - Hello - XML - World - -

                                                -
                                                - - \ No newline at end of file diff --git a/v0.6.0/_sources/utils.rst.txt b/v0.6.0/_sources/utils.rst.txt index ac0b13d9df..69c1abe0eb 100644 --- a/v0.6.0/_sources/utils.rst.txt +++ b/v0.6.0/_sources/utils.rst.txt @@ -14,8 +14,6 @@ Easy-to-use functions to make sense of your model's predictions. .. autofunction:: visualize_page -.. autofunction:: synthesize_page - .. _metrics: @@ -27,20 +25,12 @@ Implementations of task-specific metrics to easily assess your model performance .. autoclass:: TextMatch - .. automethod:: update .. automethod:: summary .. autoclass:: LocalizationConfusion - .. automethod:: update .. automethod:: summary .. autoclass:: OCRMetric - .. automethod:: update - .. automethod:: summary - -.. autoclass:: DetectionMetric - - .. automethod:: update .. automethod:: summary diff --git a/v0.6.0/_static/documentation_options.js b/v0.6.0/_static/documentation_options.js index f319e014cb..a7b5cbe04a 100644 --- a/v0.6.0/_static/documentation_options.js +++ b/v0.6.0/_static/documentation_options.js @@ -1,5 +1,5 @@ const DOCUMENTATION_OPTIONS = { - VERSION: '0.6.0a0-git', + VERSION: '0.3.0a0-git', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/v0.6.0/changelog.html b/v0.6.0/changelog.html index 0f8b810583..6ed2620fb7 100644 --- a/v0.6.0/changelog.html +++ b/v0.6.0/changelog.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + Changelog - docTR documentation @@ -227,36 +227,21 @@ @@ -298,30 +283,6 @@

                                                Changelog

                                                -
                                                -

                                                v0.5.1 (2022-03-22)

                                                -

                                                Release note: v0.5.1

                                                -
                                                -
                                                -

                                                v0.5.0 (2021-12-31)

                                                -

                                                Release note: v0.5.0

                                                -
                                                -
                                                -

                                                v0.4.1 (2021-11-22)

                                                -

                                                Release note: v0.4.1

                                                -
                                                -
                                                -

                                                v0.4.0 (2021-10-01)

                                                -

                                                Release note: v0.4.0

                                                -
                                                -
                                                -

                                                v0.3.1 (2021-08-27)

                                                -

                                                Release note: v0.3.1

                                                -
                                                -
                                                -

                                                v0.3.0 (2021-07-02)

                                                -

                                                Release note: v0.3.0

                                                -

                                                v0.2.1 (2021-05-28)

                                                Release note: v0.2.1

                                                @@ -345,15 +306,23 @@

                                                v0.1.0 (2021-03-05) - - + +
                                                +
                                                + Next +
                                                +
                                                doctr.datasets
                                                +
                                                + +
                                                +
                                                Previous
                                                -
                                                Contributing to docTR
                                                +
                                                Installation
                                                @@ -388,12 +357,6 @@

                                                v0.1.0 (2021-03-05)

                                                diff --git a/v0.6.0/contributing/code_of_conduct.html b/v0.6.0/contributing/code_of_conduct.html index 1a8e109476..7aa6177698 100644 --- a/v0.6.0/contributing/code_of_conduct.html +++ b/v0.6.0/contributing/code_of_conduct.html @@ -236,12 +236,15 @@

                                                Package Reference

                                                diff --git a/v0.6.0/contributing/contributing.html b/v0.6.0/contributing/contributing.html index 77618ca847..6594d38b68 100644 --- a/v0.6.0/contributing/contributing.html +++ b/v0.6.0/contributing/contributing.html @@ -236,12 +236,15 @@

                                                Package Reference

                                        @@ -361,45 +366,22 @@

                                        Code quality
                                        make quality
                                         
                                        -
                                        -

                                        Lint verification

                                        -

                                        To ensure that your incoming PR complies with the lint settings, you need to install flake8 and run the following command from the repository’s root folder:

                                        -
                                        flake8 ./
                                        +
                                        +

                                        Code style verification

                                        +

                                        To run all style checks together

                                        +
                                        make style
                                         
                                        -

                                        This will read the .flake8 setting file and let you know whether your commits need some adjustments.

                                        -
                                        -
                                        -

                                        Import order

                                        -

                                        In order to ensure there is a common import order convention, run isort as follows:

                                        -
                                        isort **/*.py
                                        -
                                        -
                                        -

                                        This will reorder the imports of your local files.

                                        -
                                        -
                                        -

                                        Annotation typing

                                        -

                                        Additionally, to catch type-related issues and have a cleaner codebase, annotation typing are expected. After installing mypy, you can run the verifications as follows:

                                        -
                                        mypy --config-file mypy.ini doctr/
                                        -
                                        -
                                        -

                                        The mypy.ini file will be read to check your typing.

                                        -
                                        -
                                        -

                                        Docstring format

                                        -

                                        To keep a sane docstring structure, if you install pydocstyle, you can verify your docstrings as follows:

                                        -
                                        pydocstyle doctr/
                                        -
                                        -
                                        -

                                        The .pydocstyle file will be read to configure this operation.

                                        Modifying the documentation

                                        -

                                        In order to check locally your modifications to the documentation:

                                        +

                                        The current documentation is built using sphinx thanks to our CI. +You can build the documentation locally:

                                        make docs-single-version
                                         
                                        +

                                        Please note that files that have not been modified will not be rebuilt. If you want to force a complete rebuild, you can delete the _build directory. Additionally, you may need to clear your web browser’s cache to see the modifications.

                                        You can now open your local version of the documentation located at docs/_build/index.html in your browser

                                        @@ -477,10 +459,7 @@

                                        Let’s connectCommits

                                      • Unit tests
                                      • Code quality
                                      • Modifying the documentation
                                      • @@ -498,7 +477,7 @@

                                        Let’s connect + diff --git a/v0.6.0/datasets.html b/v0.6.0/datasets.html index 1f5855cc82..640791680a 100644 --- a/v0.6.0/datasets.html +++ b/v0.6.0/datasets.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.datasets - docTR documentation @@ -227,28 +227,21 @@ @@ -294,12 +287,16 @@

                                        doctr.datasets

                                        Available Datasets

                                        -

                                        Here are all datasets that are available through docTR:

                                        -
                                        -

                                        Public datasets

                                        +

                                        The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

                                        +
                                        +
                                        +class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
                                        +
                                        + +

                                        Here are all datasets that are available through DocTR:

                                        -class doctr.datasets.FUNSD(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                                        +class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

                                        FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

                                        Example::
                                        >>> from doctr.datasets import FUNSD
                                        @@ -313,7 +310,8 @@ 

                                        Public datasetsParameters:
                                        • train – whether the subset should be the training one

                                        • -
                                        • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • +
                                        • sample_transforms – composable transformations that will be applied to each image

                                        • +
                                        • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • **kwargs – keyword arguments from VisionDataset.

                                        @@ -322,7 +320,7 @@

                                        Public datasets
                                        -class doctr.datasets.SROIE(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                                        +class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

                                        SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

                                        Example::
                                        - -
                                        -
                                        -class doctr.datasets.IIIT5K(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                                        -

                                        IIIT-5K character-level localization dataset from -“BMVC 2012 Scene Text Recognition using Higher Order Language Priors”.

                                        -
                                        -
                                        Example::
                                        >>> # NOTE: this dataset is for character-level localization
                                        ->>> from doctr.datasets import IIIT5K
                                        ->>> train_set = IIIT5K(train=True, download=True)
                                        ->>> img, target = train_set[0]
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • train – whether the subset should be the training one

                                        • -
                                        • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • -
                                        • **kwargs – keyword arguments from VisionDataset.

                                        • -
                                        -
                                        -
                                        -
                                        - -
                                        -
                                        -class doctr.datasets.SVT(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                                        -

                                        SVT dataset from “The Street View Text Dataset - UCSD Computer Vision”.

                                        -
                                        -
                                        Example::
                                        >>> from doctr.datasets import SVT
                                        ->>> train_set = SVT(train=True, download=True)
                                        ->>> img, target = train_set[0]
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • train – whether the subset should be the training one

                                        • -
                                        • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • -
                                        • **kwargs – keyword arguments from VisionDataset.

                                        • -
                                        -
                                        -
                                        -
                                        - -
                                        -
                                        -class doctr.datasets.SVHN(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                                        -

                                        SVHN dataset from “The Street View House Numbers (SVHN) Dataset”.

                                        -
                                        -
                                        Example::
                                        >>> from doctr.datasets import SVHN
                                        ->>> train_set = SVHN(train=True, download=True)
                                        ->>> img, target = train_set[0]
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • train – whether the subset should be the training one

                                        • -
                                        • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • -
                                        • **kwargs – keyword arguments from VisionDataset.

                                        • -
                                        -
                                        -
                                        -
                                        - -
                                        -
                                        -class doctr.datasets.SynthText(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                                        -

                                        SynthText dataset from “Synthetic Data for Text Localisation in Natural Images” | “repository” | -“website”.

                                        -
                                        -
                                        Example::
                                        >>> from doctr.datasets import SynthText
                                        ->>> train_set = SynthText(train=True, download=True)
                                        ->>> img, target = train_set[0]
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • train – whether the subset should be the training one

                                        • -
                                        • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • -
                                        • **kwargs – keyword arguments from VisionDataset.

                                        • -
                                        -
                                        -
                                        -
                                        - -
                                        -
                                        -class doctr.datasets.IC03(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                                        -

                                        IC03 dataset from “ICDAR 2003 Robust Reading Competitions: Entries, Results and Future Directions”.

                                        -
                                        -
                                        Example::
                                        >>> from doctr.datasets import IC03
                                        ->>> train_set = IC03(train=True, download=True)
                                        ->>> img, target = train_set[0]
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • train – whether the subset should be the training one

                                        • -
                                        • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • -
                                        • **kwargs – keyword arguments from VisionDataset.

                                        • -
                                        -
                                        -
                                        -
                                        - -
                                        -
                                        -class doctr.datasets.IC13(img_folder: str, label_folder: str, use_polygons: bool = False, **kwargs: Any)[source]
                                        -

                                        IC13 dataset from “ICDAR 2013 Robust Reading Competition”. -Example:

                                        -
                                        >>> # NOTE: You need to download both image and label parts from Focused Scene Text challenge Task2.1 2013-2015.
                                        ->>> from doctr.datasets import IC13
                                        ->>> train_set = IC13(img_folder="/path/to/Challenge2_Training_Task12_Images",
                                        ->>>                  label_folder="/path/to/Challenge2_Training_Task1_GT")
                                        ->>> img, target = train_set[0]
                                        ->>> test_set = IC13(img_folder="/path/to/Challenge2_Test_Task12_Images",
                                        ->>>                 label_folder="/path/to/Challenge2_Test_Task1_GT")
                                        ->>> img, target = test_set[0]
                                        -
                                        -
                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • img_folder – folder with all the images of the dataset

                                        • -
                                        • label_folder – folder with all annotation files for the images

                                        • -
                                        • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • -
                                        -
                                        -
                                        -
                                        - -

                                        -
                                        -

                                        docTR synthetic datasets

                                        -
                                        -
                                        -class doctr.datasets.DocArtefacts(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
                                        -

                                        Object detection dataset for non-textual elements in documents. -The dataset includes a variety of synthetic document pages with non-textual elements.

                                        -
                                        -
                                        Example::
                                        >>> from doctr.datasets import DocArtefacts
                                        ->>> train_set = DocArtefacts(download=True)
                                        ->>> img, target = train_set[0]
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • train – whether the subset should be the training one

                                        • -
                                        • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • +
                                        • sample_transforms – composable transformations that will be applied to each image

                                        • +
                                        • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • **kwargs – keyword arguments from VisionDataset.

                                        -
                                        -
                                        -class doctr.datasets.CharacterGenerator(*args, **kwargs)[source]
                                        -

                                        Implements a character image generation dataset

                                        -
                                        -
                                        Example::
                                        >>> from doctr.datasets import CharacterGenerator
                                        ->>> ds = CharacterGenerator(vocab='abdef')
                                        ->>> img, target = ds[0]
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • vocab – vocabulary to take the character from

                                        • -
                                        • num_samples – number of samples that will be generated iterating over the dataset

                                        • -
                                        • cache_samples – whether generated images should be cached firsthand

                                        • -
                                        • font_family – font to use to generate the text images

                                        • -
                                        • img_transforms – composable transformations that will be applied to each image

                                        • -
                                        • sample_transforms – composable transformations that will be applied to both the image and the target

                                        • -
                                        -
                                        -
                                        -
                                        - -
                                        -
                                        -class doctr.datasets.WordGenerator(vocab: str, min_chars: int, max_chars: int, num_samples: int, cache_samples: bool = False, font_family: str | List[str] | None = None, img_transforms: Callable[[Any], Any] | None = None, sample_transforms: Callable[[Any, Any], Tuple[Any, Any]] | None = None)[source]
                                        -

                                        Implements a character image generation dataset

                                        -
                                        -
                                        Example::
                                        >>> from doctr.datasets import WordGenerator
                                        ->>> ds = WordGenerator(vocab='abdef')
                                        ->>> img, target = ds[0]
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • vocab – vocabulary to take the character from

                                        • -
                                        • min_chars – minimum number of characters in a word

                                        • -
                                        • max_chars – maximum number of characters in a word

                                        • -
                                        • num_samples – number of samples that will be generated iterating over the dataset

                                        • -
                                        • cache_samples – whether generated images should be cached firsthand

                                        • -
                                        • font_family – font to use to generate the text images

                                        • -
                                        • img_transforms – composable transformations that will be applied to each image

                                        • -
                                        • sample_transforms – composable transformations that will be applied to both the image and the target

                                        • -
                                        -
                                        -
                                        -
                                        - -
                                        -
                                        -

                                        docTR private datasets

                                        -

                                        Since many documents include sensitive / personal information, we are not able to share all the data that has been used for this project. However, we provide some guidance on how to format your own dataset into the same format so that you can use all docTR tools all the same.

                                        -
                                        -
                                        -class doctr.datasets.DetectionDataset(img_folder: str, label_path: str, use_polygons: bool = False, **kwargs: Any)[source]
                                        -

                                        Implements a text detection dataset

                                        -
                                        -
                                        Example::
                                        >>> from doctr.datasets import DetectionDataset
                                        ->>> train_set = DetectionDataset(img_folder="/path/to/images", label_path="/path/to/labels.json")
                                        ->>> img, target = train_set[0]
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • img_folder – folder with all the images of the dataset

                                        • -
                                        • label_path – path to the annotations of each image

                                        • -
                                        • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • -
                                        -
                                        -
                                        -
                                        - -
                                        -
                                        -class doctr.datasets.RecognitionDataset(img_folder: str, labels_path: str, **kwargs: Any)[source]
                                        -

                                        Dataset implementation for text recognition tasks

                                        -
                                        -
                                        Example::
                                        >>> from doctr.datasets import RecognitionDataset
                                        ->>> train_set = RecognitionDataset(img_folder="/path/to/images", labels_path="/path/to/labels.json")
                                        ->>> img, target = train_set[0]
                                        -
                                        -
                                        -
                                        -
                                        -
                                        -
                                        Parameters:
                                        -
                                          -
                                        • img_folder – path to the images folder

                                        • -
                                        • labels_path – pathe to the json file containing all labels (character sequences)

                                        • -
                                        -
                                        -
                                        -
                                        -
                                        -class doctr.datasets.OCRDataset(img_folder: str, label_file: str, use_polygons: bool = False, **kwargs: Any)[source]
                                        +class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

                                        Implements an OCR dataset

                                        Parameters:
                                        • img_folder – local path to image folder (all jpg at the root)

                                        • label_file – local path to the label file

                                        • -
                                        • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • +
                                        • sample_transforms – composable transformations that will be applied to each image

                                        • +
                                        • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

                                        • +
                                        • **kwargs – keyword arguments from VisionDataset.

                                        -

                                        Data Loading

                                        -

                                        Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR.

                                        +

                                        Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

                                        -class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, num_workers: int | None = None, collate_fn: Callable | None = None)[source]
                                        +class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]

                                        Implements a dataset wrapper for fast data loading

                                        Example::
                                        >>> from doctr.datasets import FUNSD, DataLoader
                                        @@ -681,7 +408,7 @@ 

                                        Data Loading

                                        Supported Vocabs

                                        -

                                        Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +

                                        Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets of vocabs.

                                        - +@@ -724,39 +451,19 @@

                                        Data Loading

docTR Vocabs

                                        latin

                                        94

                                        0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~

                                        english

                                        100

                                        0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿

                                        legacy_french

                                        123

                                        0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

                                        96

                                        0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

                                        french

                                        126

                                        0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ

                                        portuguese

                                        131

                                        0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿

                                        spanish

                                        116

                                        0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿

                                        german

                                        108

                                        0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ

                                        154

                                        0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿
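These vocabularies are exposed programmatically through doctr.datasets.VOCABS; for instance, with the sizes listed in the table above:

>>> from doctr.datasets import VOCABS
>>> len(VOCABS["french"])
126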

                                        -doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, dynamic_seq_length: bool = False, **kwargs: Any) ndarray[source]
                                        +doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]

                                        Encode character sequences using a given vocab as mapping

                                        Parameters:
                                        @@ -767,7 +474,6 @@

                                        Data LoadingReturns: @@ -784,23 +490,23 @@
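A minimal sketch of how encode_sequences can be called with the signature above (the padding index is chosen arbitrarily here as the vocab length):

>>> from doctr.datasets import encode_sequences, VOCABS
>>> encoded = encode_sequences(["hello", "world"], vocab=VOCABS["english"], target_size=16, pad=len(VOCABS["english"]))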


                                        diff --git a/v0.6.0/genindex.html b/v0.6.0/genindex.html index bff7673ea9..10d0739337 100644 --- a/v0.6.0/genindex.html +++ b/v0.6.0/genindex.html @@ -225,35 +225,20 @@
                                        - + diff --git a/v0.6.0/getting_started/installing.html b/v0.6.0/getting_started/installing.html index ee38f9291f..1301e50b85 100644 --- a/v0.6.0/getting_started/installing.html +++ b/v0.6.0/getting_started/installing.html @@ -236,12 +236,15 @@

                                        Package Reference

                                        -

If you are running an OS other than Linux, you will need a few extra dependencies.

                                        -

                                        For MacOS users, you can install them using Homebrew as follows:

                                        -
                                        brew install cairo pango gdk-pixbuf libffi
                                        -
                                        -
                                        -

                                        For Windows users, those dependencies are included in GTK. You can find the latest installer over here.

                                        Via Python Package

                                        @@ -327,16 +324,28 @@

                                        Via Python Package
                                        pip install "python-doctr[tf]"
                                        +# or with preinstalled packages for visualization & html & contrib module support
+pip install "python-doctr[tf,viz,html,contrib]"
                                         

                                        +
                                        +

                                        Via Conda (Only for Linux)

                                        +

                                        Install the last stable release of the package using conda:

                                        +
                                        conda config --set channel_priority strict
                                        +conda install -c techMindee -c pypdfium2-team -c bblanchon -c defaults -c conda-forge python-doctr
                                        +
                                        +
                                        +

                                        Via Git

                                        -

                                        Install the library in developper mode:

                                        +

                                        Install the library in developer mode:

                                        git clone https://github.com/mindee/doctr.git
                                         pip install -e doctr/.[tf]
                                        @@ -408,6 +417,7 @@ 

                                        Via Git
                                      • Installation
                                      • @@ -420,7 +430,7 @@

                                        Via Git

                                        -
                                        +
                                        diff --git a/v0.6.0/index.html b/v0.6.0/index.html index c9aef45a5f..b7be51df96 100644 --- a/v0.6.0/index.html +++ b/v0.6.0/index.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + docTR documentation @@ -227,35 +227,20 @@
                                        -

                                        docTR: Document Text Recognition

                                        -

                                        State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch

                                        +

                                        DocTR: Document Text Recognition

                                        +

                                        State-of-the-art Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 (PyTorch now in beta)

                                        https://github.com/mindee/doctr/releases/download/v0.2.0/ocr.png

                                        DocTR provides an easy and powerful way to extract valuable information from your documents:

                                          -
                                        • 🧾 for automation: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

                                        • +
• 🧾 for automation: seamlessly process documents for Natural Language Understanding tasks: we provide OCR predictors to parse textual information (localize and identify each word) from your documents.

• 👩‍🔬 for research: quickly compare your own architectures' speed & performance with state-of-the-art models on public datasets.

                                        +

                                        Welcome to the documentation of DocTR!

                                        Main Features

                                        • 🤖 Robust 2-stage (detection + recognition) OCR predictors with pretrained parameters

                                        • ⚡ User-friendly, 3 lines of code to load a document and extract text with a predictor

                                        • -
                                        • 🚀 State-of-the-art performance on public document datasets, comparable with GoogleVision/AWS Textract

                                        • +
                                        • 🚀 State-of-the-art performances on public document datasets, comparable with GoogleVision/AWS Textract

                                        • ⚡ Optimized for inference speed on both CPU & GPU

                                        • -
                                        • 🐦 Light package, minimal dependencies

                                        • -
                                        • 🛠️ Actively maintained by Mindee

                                        • -
                                        • 🏭 Easy integration (available templates for browser demo & API deployment)

                                        • +
                                        • 🐦 Light package, small dependencies

                                        • +
                                        • 🛠️ Daily maintained

                                        • +
                                        • 🏭 Easy integration

                                        +
                                        +
                                        +

                                        Getting Started

                                        +
                                        +

                                        Build & train your predictor

                                        +
                                          +
                                        • Compose your own end-to-end OCR predictor: mix and match detection & recognition predictors (all-pretrained)

                                        • +
                                        • Fine-tune or train from scratch any detection or recognition model to specialize on your data
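For instance, composing such a predictor from pretrained detection and recognition models can be sketched as follows (the file path is a placeholder):

>>> from doctr.io import DocumentFile
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
>>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
>>> result = model(doc)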

                                        • +
                                        +

                                        Model zoo

                                        Text detection models

                                        -

                                        Text recognition models

                                        -

                                        Supported datasets

                                        -
                                        @@ -369,7 +407,7 @@

                                        Supported datasets - +
                                        Next @@ -409,8 +447,10 @@

                                        Supported datasets diff --git a/v0.6.0/installing.html b/v0.6.0/installing.html index b79f453bd6..8068adc0ba 100644 --- a/v0.6.0/installing.html +++ b/v0.6.0/installing.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + Installation - docTR documentation @@ -227,28 +227,21 @@ @@ -290,16 +283,16 @@

                                        Installation

                                        -

                                        This library requires Python 3.6 or higher.

                                        +

                                        This library requires Python 3.6 or higher.

                                        Prerequisites

                                        Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

If you are running an OS other than Linux, you will need a few extra dependencies.

                                        -

                                        For MacOS users, you can install them using Homebrew as follows:

                                        +

                                        For MacOS users, you can install them as follows:

                                        brew install cairo pango gdk-pixbuf libffi
                                         
                                        @@ -307,17 +300,10 @@

                                        Prerequisites

                                        Via Python Package

                                        -

                                        Install the last stable release of the package using pip:

                                        +

                                        Install the last stable release of the package using pip:

                                        pip install python-doctr
                                         
                                        -

We strive to keep framework-specific dependencies to a minimum, but some necessary features are developed by third parties for specific frameworks. To avoid missing dependencies for a specific framework, you can install framework-specific builds as follows:

                                        -
                                        # for TensorFlow
                                        -pip install "python-doctr[tf]"
                                        -# for PyTorch
                                        -pip install "python-doctr[torch]"
                                        -
                                        -

                                        Via Git

                                        @@ -326,14 +312,6 @@

pip install -e doctr/.

                                        -

                                        Again, for framework-specific builds:

                                        -
                                        git clone https://github.com/mindee/doctr.git
                                        -# for TensorFlow
                                        -pip install -e doctr/.[tf]
                                        -# for PyTorch
                                        -pip install -e doctr/.[torch]
                                        -
                                        -

                                        @@ -342,12 +320,12 @@

                                        Via Git

                                    • Synthetic dataset generator
                                        @@ -965,7 +1077,7 @@

                                        Dataloader + diff --git a/v0.6.0/modules/io.html b/v0.6.0/modules/io.html index 33641d2d89..0706457520 100644 --- a/v0.6.0/modules/io.html +++ b/v0.6.0/modules/io.html @@ -236,12 +236,15 @@

                                        Package Reference

                                      • doctr.models.zoo
                                      • doctr.models.factory
                                          @@ -1157,7 +1598,7 @@

                                          doctr.models.factory - + diff --git a/v0.6.0/modules/transforms.html b/v0.6.0/modules/transforms.html index 1e8ce4ee14..1684036838 100644 --- a/v0.6.0/modules/transforms.html +++ b/v0.6.0/modules/transforms.html @@ -236,12 +236,15 @@

                                          Package Reference

                                        • Composing transformations
                                            @@ -801,7 +830,7 @@

                                            Composing transformations + diff --git a/v0.6.0/modules/utils.html b/v0.6.0/modules/utils.html index 73048c2693..f9836a1705 100644 --- a/v0.6.0/modules/utils.html +++ b/v0.6.0/modules/utils.html @@ -236,12 +236,15 @@

                                            Package Reference

                                              +
                                            • doctr.contrib
                                            • doctr.datasets
                                            • doctr.io
                                            • doctr.models
                                            • @@ -317,38 +320,25 @@

>>> plt.show()
                                              -
                                              Parameters:
                                              -
                                                -
                                              • page – the exported Page of a Document

                                              • -
• image – np array of the page, needs to have the same shape as page[‘dimensions’]

                                              • -
                                              • words_only – whether only words should be displayed

                                              • -
                                              • display_artefacts – whether artefacts should be displayed

                                              • -
                                              • scale – figsize of the largest windows side

                                              • -
                                              • interactive – whether the plot should be interactive

                                              • -
                                              • add_labels – for static plot, adds text labels on top of bounding box

                                              • -
                                              -
                                              -
                                              - - -
                                              -
                                              -doctr.utils.visualization.synthesize_page(page: Dict[str, Any], draw_proba: bool = False, font_family: str | None = None) ndarray[source]
                                              -

Draw the content of the element page (OCR response) on a blank page.

                                              -
                                              -
                                              Parameters:
                                              -
                                                -
                                              • page – exported Page object to represent

                                              • -
                                              • draw_proba – if True, draw words in colors to represent confidence. Blue: p=1, red: p=0

                                              • -
                                              • font_size – size of the font, default font = 13

                                              • -
                                              • font_family – family of the font

                                              • -
                                              -
                                              -
                                              Returns:
                                              -

                                              the synthesized page
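As an illustrative sketch, assuming `result` is the Document returned by an OCR predictor:

>>> import matplotlib.pyplot as plt
>>> from doctr.utils.visualization import synthesize_page
>>> page_export = result.pages[0].export()  # exported dict of the first page
>>> img = synthesize_page(page_export, draw_proba=True)
>>> plt.imshow(img); plt.axis('off'); plt.show()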

                                              -
                                              -
                                              +
                                              +

                                              Args:

                                              +
                                              +

page: the exported Page of a Document
image: np array of the page, needs to have the same shape as page[‘dimensions’]
words_only: whether only words should be displayed
display_artefacts: whether artefacts should be displayed
scale: figsize of the largest windows side
interactive: whether the plot should be interactive
add_labels: for static plot, adds text labels on top of bounding box
**kwargs: keyword arguments for the polygon patch

                                              +
                                              +
                                              +
                                              +

                                              Returns:

                                              +
                                              +

                                              the matplotlib figure
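A fuller sketch of the call shown above, assuming a pretrained OCR predictor and a placeholder image path:

>>> import matplotlib.pyplot as plt
>>> from doctr.io import DocumentFile
>>> from doctr.models import ocr_predictor
>>> from doctr.utils.visualization import visualize_page
>>> model = ocr_predictor(pretrained=True)
>>> doc = DocumentFile.from_images("path/to/page.png")
>>> out = model(doc)
>>> visualize_page(out.pages[0].export(), doc[0])
>>> plt.show()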

                                              +
                                              +
                                              @@ -388,14 +378,13 @@

                                              Visualization update(gt: List[str], pred: List[str]) None[source]

                                              Update the state of the metric with new predictions

                                              -
                                              -
                                              Parameters:
                                              -
                                                -
• gt – list of ground-truth character sequences

                                              • -
                                              • pred – list of predicted character sequences

                                              • -
                                              -
                                              -
                                              +
                                              +

                                              Args:

                                              +
                                              +

gt: list of ground-truth character sequences
pred: list of predicted character sequences

                                              +
                                              +
                                              @@ -404,8 +393,11 @@

                                              Visualization
                                              Returns:
                                              -

a dictionary with the exact match score for the raw data, its lower-case counterpart, its unidecode counterpart and its lower-case unidecode counterpart

                                              +

                                                +
• a dictionary with the exact match score for the raw data, its lower-case counterpart, its anyascii counterpart and its lower-case anyascii counterpart
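A minimal usage sketch consistent with the update/summary methods documented above:

>>> from doctr.utils.metrics import TextMatch
>>> metric = TextMatch()
>>> metric.update(["Hello", "world"], ["hello", "world"])
>>> metric.summary()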

                                              • +
                                              +

                                              @@ -414,7 +406,7 @@

                                              Visualization
                                              -class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
                                              +class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, use_polygons: bool = False)[source]

                                              Implements common confusion metrics and mean IoU for localization evaluation.

                                              The aggregated metrics are computed as follows:

                                              @@ -445,28 +437,23 @@

                                              Visualization>>> metric.summary()

                                              -
                                              -
                                              Parameters:
                                              -
                                                -
                                              • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

                                              • -
                                              • use_polygons – if set to True, predictions and targets will be expected to have rotated format

                                              • -
                                              • mask_shape – if use_polygons is True, describes the spatial shape of the image used

                                              • -
                                              • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

                                              • -
                                              -
                                              -
                                              +
                                              +

                                              Args:

                                              +
                                              +

iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
use_polygons: if set to True, predictions and targets will be expected to have rotated format
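A minimal usage sketch with relative boxes, consistent with the parameters above:

>>> import numpy as np
>>> from doctr.utils.metrics import LocalizationConfusion
>>> metric = LocalizationConfusion(iou_thresh=0.5)
>>> metric.update(np.asarray([[0.1, 0.1, 0.5, 0.5]]), np.asarray([[0.1, 0.1, 0.5, 0.55]]))
>>> recall, precision, mean_iou = metric.summary()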

                                              +
                                              update(gts: ndarray, preds: ndarray) None[source]

                                              Updates the metric

                                              -
                                              -
                                              Parameters:
                                              -
                                                -
                                              • gts – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

                                              • -
                                              • preds – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

                                              • -
                                              -
                                              -
                                              +
                                              +

                                              Args:

                                              +
                                              +

gts: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
preds: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

                                              +
                                              +
                                              @@ -474,17 +461,18 @@

                                              Visualizationsummary() Tuple[float | None, float | None, float | None][source]

                                              Computes the aggregated metrics

                                              -
                                              Returns:
                                              +
                                              Return type:

                                              a tuple with the recall, precision and meanIoU scores

                                              +
                                              -class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
                                              +class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, use_polygons: bool = False)[source]

                                              Implements an end-to-end OCR metric.

                                              The aggregated metrics are computed as follows:

                                              @@ -519,30 +507,25 @@

                                              Visualization>>> metric.summary()

                                              -
                                              -
                                              Parameters:
                                              -
                                                -
                                              • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

                                              • -
                                              • use_polygons – if set to True, predictions and targets will be expected to have rotated format

                                              • -
                                              • mask_shape – if use_polygons is True, describes the spatial shape of the image used

                                              • -
                                              • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

                                              • -
                                              -
                                              -
                                              +
                                              +

                                              Args:

                                              +
                                              +

iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
use_polygons: if set to True, predictions and targets will be expected to have rotated format
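A minimal usage sketch, pairing relative boxes with their string labels as described above:

>>> import numpy as np
>>> from doctr.utils.metrics import OCRMetric
>>> metric = OCRMetric(iou_thresh=0.5)
>>> metric.update(np.asarray([[0.1, 0.1, 0.5, 0.5]]), np.asarray([[0.1, 0.1, 0.5, 0.5]]), ["hello"], ["hello"])
>>> metric.summary()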

                                              +
                                              update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: List[str], pred_labels: List[str]) None[source]

                                              Updates the metric

                                              -
                                              -
                                              Parameters:
                                              -
                                                -
                                              • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

                                              • -
                                              • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

                                              • -
                                              • gt_labels – a list of N string labels

                                              • -
                                              • pred_labels – a list of M string labels

                                              • -
                                              -
                                              -
                                              +
                                              +

                                              Args:

                                              +
                                              +

gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
gt_labels: a list of N string labels
pred_labels: a list of M string labels

                                              +
                                              +
                                              @@ -550,17 +533,18 @@

                                              Visualizationsummary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]

                                              Computes the aggregated metrics

                                              -
                                              Returns:
                                              +
                                              Return type:

                                              a tuple with the recall & precision for each string comparison and the mean IoU

                                              +
                                              -class doctr.utils.metrics.DetectionMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
                                              +class doctr.utils.metrics.DetectionMetric(iou_thresh: float = 0.5, use_polygons: bool = False)[source]

                                              Implements an object detection metric.

                                              The aggregated metrics are computed as follows:

                                              @@ -595,30 +579,25 @@

                                              Visualization>>> metric.summary()

                                              -
                                              -
                                              Parameters:
                                              -
                                                -
                                              • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

                                              • -
                                              • use_polygons – if set to True, predictions and targets will be expected to have rotated format

                                              • -
                                              • mask_shape – if use_polygons is True, describes the spatial shape of the image used

                                              • -
                                              • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

                                              • -
                                              -
                                              -
                                              +
                                              +

                                              Args:

                                              +
                                              +

iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match
use_polygons: if set to True, predictions and targets will be expected to have rotated format
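A minimal usage sketch, pairing relative boxes with integer class indices as described above:

>>> import numpy as np
>>> from doctr.utils.metrics import DetectionMetric
>>> metric = DetectionMetric(iou_thresh=0.5)
>>> metric.update(np.asarray([[0.1, 0.1, 0.5, 0.5]]), np.asarray([[0.1, 0.1, 0.5, 0.5]]), np.zeros(1, dtype=np.int64), np.zeros(1, dtype=np.int64))
>>> metric.summary()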

                                              +
                                              update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: ndarray, pred_labels: ndarray) None[source]

                                              Updates the metric

                                              -
                                              -
                                              Parameters:
                                              -
                                                -
                                              • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

                                              • -
                                              • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

                                              • -
                                              • gt_labels – an array of class indices of shape (N,)

                                              • -
                                              • pred_labels – an array of class indices of shape (M,)

                                              • -
                                              -
                                              -
                                              +
                                              +

                                              Args:

                                              +
                                              +

gt_boxes: a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones
pred_boxes: a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones
gt_labels: an array of class indices of shape (N,)
pred_labels: an array of class indices of shape (M,)

                                              +
                                              +
                                              @@ -626,12 +605,13 @@

                                              Visualizationsummary() Tuple[float | None, float | None, float | None][source]

                                              Computes the aggregated metrics

                                              -
                                              Returns:
                                              +
                                              Return type:

                                              a tuple with the recall & precision for each class prediction and the mean IoU

                                              +
                                              @@ -695,7 +675,6 @@

                                              Visualizationdoctr.utils
                                              • Visualization
                                              • Task evaluation
                                                  @@ -732,7 +711,7 @@

                                                  Visualization + diff --git a/v0.6.0/notebooks.html b/v0.6.0/notebooks.html index 63b0d24528..42abaa6cfd 100644 --- a/v0.6.0/notebooks.html +++ b/v0.6.0/notebooks.html @@ -236,12 +236,15 @@

                                                  Package Reference

                                                    +
                                                  • doctr.contrib
                                                  • doctr.datasets
                                                  • doctr.io
                                                  • doctr.models
                                                  • @@ -378,7 +381,7 @@

                                                    docTR Notebooks + diff --git a/v0.6.0/objects.inv b/v0.6.0/objects.inv index 064f7bc917..a22d2ce821 100644 Binary files a/v0.6.0/objects.inv and b/v0.6.0/objects.inv differ diff --git a/v0.6.0/search.html b/v0.6.0/search.html index 05dffca387..fea94ac955 100644 --- a/v0.6.0/search.html +++ b/v0.6.0/search.html @@ -227,35 +227,20 @@ - + diff --git a/v0.6.0/searchindex.js b/v0.6.0/searchindex.js index 560c8e6a3f..231483d7a6 100644 --- a/v0.6.0/searchindex.js +++ b/v0.6.0/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[11, null]], "Annotation typing": [[2, "annotation-typing"]], "Artefact": [[6, "artefact"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[13, "available-datasets"]], "Available architectures": [[15, "available-architectures"], [15, "id1"], [15, "id2"]], "Block": [[6, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[13, null]], "Choosing the right model": [[15, null]], "Classification": [[12, "classification"]], "Code quality": [[2, "code-quality"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[8, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[5, "custom-dataset-loader"]], "Data Loading": [[13, "data-loading"]], "Dataloader": [[5, "dataloader"]], "Detection": [[12, "detection"], [13, "detection"]], "Detection predictors": [[15, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Docstring format": [[2, "docstring-format"]], "Document": [[6, "document"]], "Document structure": [[6, "document-structure"]], "End-to-End OCR": [[15, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[6, "file-reading"]], "Half-precision": [[14, "half-precision"]], "Import order": [[2, "import-order"]], "Installation": [[3, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[6, "line"]], "Lint verification": [[2, "lint-verification"]], "Loading from Huggingface Hub": [[12, "loading-from-huggingface-hub"]], "Main Features": [[4, "main-features"]], "Model compression": [[14, "model-compression"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[12, "naming-conventions"]], "Object Detection": [[13, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[6, "page"]], "Post-training quantization": [[14, "post-training-quantization"]], "Preparing your model for inference": [[14, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[12, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[12, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[12, "recognition"], [13, "recognition"]], "Recognition predictors": [[15, "recognition-predictors"]], "Scope": [[1, 
"scope"]], "Share your model with the community": [[12, null]], "Supported Vocabs": [[5, "supported-vocabs"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[8, "supported-transformations"]], "Synthetic dataset generator": [[5, "synthetic-dataset-generator"], [13, "synthetic-dataset-generator"]], "Task evaluation": [[9, "task-evaluation"]], "TensorFlow Lite": [[14, "tensorflow-lite"]], "Text Detection": [[15, "text-detection"]], "Text Recognition": [[15, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition model zoo": [[15, "id3"]], "Text recognition models": [[4, "text-recognition-models"]], "Two-stage approaches": [[15, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[13, "use-your-own-datasets"]], "Using SavedModel": [[14, "using-savedmodel"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[15, "what-should-i-do-with-the-output"]], "Word": [[6, "word"]], "docTR Notebooks": [[10, null]], "docTR Vocabs": [[5, "id3"]], "docTR: Document Text Recognition": [[4, null]], "doctr.datasets": [[5, null], [5, "datasets"]], "doctr.io": [[6, null]], "doctr.models": [[7, null]], "doctr.models.classification": [[7, "doctr-models-classification"]], "doctr.models.detection": [[7, "doctr-models-detection"]], "doctr.models.factory": [[7, "doctr-models-factory"]], "doctr.models.recognition": [[7, "doctr-models-recognition"]], "doctr.models.zoo": [[7, "doctr-models-zoo"]], "doctr.transforms": [[8, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[6, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[6, "doctr.io.Block", false]], "channelshuffle (class in 
doctr.transforms)": [[8, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[5, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[8, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[8, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[5, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[7, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[5, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[7, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[5, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[5, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[6, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[6, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[5, "doctr.datasets.encode_sequences", false]], "from_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[5, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[8, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[8, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[5, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[5, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[5, "doctr.datasets.IIIT5K", false]], "imgur5k (class in doctr.datasets)": [[5, "doctr.datasets.IMGUR5K", false]], "lambdatransformation (class in doctr.transforms)": [[8, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[6, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet18_rotation() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18_rotation", false]], "linknet_resnet34() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[7, 
"doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[5, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_orientation() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[8, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[7, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[5, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[8, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[6, "doctr.io.Page", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[8, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[8, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[8, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[8, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[8, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[8, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[8, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[8, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[8, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[8, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[8, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[6, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[6, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[6, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[5, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[8, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[7, 
"doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[6, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[6, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[5, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[5, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[5, "doctr.datasets.SVT", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "synthtext (class in doctr.datasets)": [[5, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[8, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_small", false]], "word (class in doctr.io)": [[6, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[5, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[5, 0, 1, "", "CORD"], [5, 0, 1, "", "CharacterGenerator"], [5, 0, 1, "", "DetectionDataset"], [5, 0, 1, "", "DocArtefacts"], [5, 0, 1, "", "FUNSD"], [5, 0, 1, "", "IC03"], [5, 0, 1, "", "IC13"], [5, 0, 1, "", "IIIT5K"], [5, 0, 1, "", "IMGUR5K"], [5, 0, 1, "", "MJSynth"], [5, 0, 1, "", "OCRDataset"], [5, 0, 1, "", "RecognitionDataset"], [5, 0, 1, "", "SROIE"], [5, 0, 1, "", "SVHN"], [5, 0, 1, "", "SVT"], [5, 0, 1, "", "SynthText"], [5, 0, 1, "", "WordGenerator"], [5, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[5, 0, 1, "", "DataLoader"]], "doctr.io": [[6, 0, 1, "", 
"Artefact"], [6, 0, 1, "", "Block"], [6, 0, 1, "", "Document"], [6, 0, 1, "", "DocumentFile"], [6, 0, 1, "", "Line"], [6, 0, 1, "", "Page"], [6, 0, 1, "", "Word"], [6, 1, 1, "", "decode_img_as_tensor"], [6, 1, 1, "", "read_html"], [6, 1, 1, "", "read_img_as_numpy"], [6, 1, 1, "", "read_img_as_tensor"], [6, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[6, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[6, 2, 1, "", "from_images"], [6, 2, 1, "", "from_pdf"], [6, 2, 1, "", "from_url"]], "doctr.io.Page": [[6, 2, 1, "", "show"]], "doctr.models": [[7, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[7, 1, 1, "", "crop_orientation_predictor"], [7, 1, 1, "", "magc_resnet31"], [7, 1, 1, "", "mobilenet_v3_large"], [7, 1, 1, "", "mobilenet_v3_large_r"], [7, 1, 1, "", "mobilenet_v3_small"], [7, 1, 1, "", "mobilenet_v3_small_orientation"], [7, 1, 1, "", "mobilenet_v3_small_r"], [7, 1, 1, "", "resnet18"], [7, 1, 1, "", "resnet31"], [7, 1, 1, "", "resnet34"], [7, 1, 1, "", "resnet50"], [7, 1, 1, "", "vgg16_bn_r"], [7, 1, 1, "", "vit_b"], [7, 1, 1, "", "vit_s"]], "doctr.models.detection": [[7, 1, 1, "", "db_mobilenet_v3_large"], [7, 1, 1, "", "db_resnet50"], [7, 1, 1, "", "detection_predictor"], [7, 1, 1, "", "linknet_resnet18"], [7, 1, 1, "", "linknet_resnet18_rotation"], [7, 1, 1, "", "linknet_resnet34"], [7, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[7, 1, 1, "", "from_hub"], [7, 1, 1, "", "login_to_hub"], [7, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[7, 1, 1, "", "crnn_mobilenet_v3_large"], [7, 1, 1, "", "crnn_mobilenet_v3_small"], [7, 1, 1, "", "crnn_vgg16_bn"], [7, 1, 1, "", "master"], [7, 1, 1, "", "recognition_predictor"], [7, 1, 1, "", "sar_resnet31"], [7, 1, 1, "", "vitstr_base"], [7, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[8, 0, 1, "", "ChannelShuffle"], [8, 0, 1, "", "ColorInversion"], [8, 0, 1, "", "Compose"], [8, 0, 1, "", "GaussianBlur"], [8, 0, 1, "", "GaussianNoise"], [8, 0, 1, "", "LambdaTransformation"], [8, 0, 1, "", "Normalize"], [8, 0, 1, "", "OneOf"], [8, 0, 1, "", "RandomApply"], [8, 0, 1, "", "RandomBrightness"], [8, 0, 1, "", "RandomContrast"], [8, 0, 1, "", "RandomCrop"], [8, 0, 1, "", "RandomGamma"], [8, 0, 1, "", "RandomHorizontalFlip"], [8, 0, 1, "", "RandomHue"], [8, 0, 1, "", "RandomJpegQuality"], [8, 0, 1, "", "RandomRotate"], [8, 0, 1, "", "RandomSaturation"], [8, 0, 1, "", "RandomShadow"], [8, 0, 1, "", "Resize"], [8, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "DetectionMetric"], [9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 6, 7, 9, 11, 12], "0": [1, 3, 5, 8, 9, 13, 15], "00": 15, "01": 15, "0123456789": 5, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "02": 15, "02562": 7, "03": 
15, "035": [], "0361328125": 15, "04": 15, "05": [], "06": 15, "06640625": 15, "07": [], "08": [8, 15], "09": [], "0966796875": 15, "1": [3, 5, 6, 7, 8, 9, 11, 13, 14, 15], "10": [5, 9, 15], "100": [5, 8, 9, 13, 14, 15], "1000": 15, "101": 5, "1024": [7, 9, 14, 15], "104": [], "106": [], "108": 5, "1095": 13, "11": 15, "110": 9, "1107": 13, "114": [], "115": [], "1156": 13, "116": 5, "118": [], "11800h": [], "11th": [], "12": [3, 15], "120": [], "123": 5, "126": 5, "1268": [], "128": [7, 15], "13": [9, 15], "130": 5, "13068": 13, "131": 5, "1337891": 13, "1357421875": 15, "1396484375": 15, "14": 15, "1420": 15, "14470v1": [], "149": 13, "15": 15, "150": [9, 15], "154": [], "1552": 15, "16": 7, "160": [], "1630859375": 15, "1684": 15, "16x16": 7, "17": [], "1778": 15, "1782": 15, "18": [7, 15], "185546875": 15, "19": [], "1900": 15, "1910": 7, "19342": 13, "19370": 13, "195": [], "19598": [], "199": 15, "1999": 15, "1m": 15, "2": [3, 4, 5, 6, 8, 11, 14, 15], "20": 15, "200": 9, "2000": 13, "2003": [4, 5], "2012": 5, "2013": [4, 5], "2015": 5, "2019": 4, "2021": [], "207901": 13, "21": 15, "2103": [], "2186": 13, "21888": 13, "22": [], "224": [7, 8, 14], "225": 8, "22672": 13, "229": [8, 13], "23": [], "233": 13, "234": 5, "236": [], "24": [], "246": 13, "249": 13, "25": 15, "2504": 15, "255": [6, 7, 8, 9, 15], "256": 7, "257": 13, "26": [], "26032": 13, "264": [], "27": 15, "2700": 13, "2710": 15, "2749": [], "28": [], "287": [], "29": 15, "296": [], "299": [], "2d": 15, "2m": 15, "3": [3, 4, 6, 7, 8, 9, 14, 15], "30": 15, "300": 13, "3000": 13, "301": [], "30595": 15, "30ghz": [], "31": [7, 15], "32": [5, 7, 8, 13, 14, 15], "3232421875": 15, "33": 8, "33402": 13, "33608": 13, "34": [7, 15], "340": 15, "3456": 15, "35": 15, "3515625": 15, "36": 15, "360": 13, "37": [5, 15], "38": 15, "39": 15, "4": [7, 8, 9, 15], "40": [], "406": 8, "41": 15, "42": 15, "43": 15, "44": [], "45": 15, "456": 8, "46": 15, "47": 15, "472": [], "48": [5, 15], "485": 8, "49": 15, "49377": [], "5": [5, 8, 9, 15], "50": [7, 13, 15], "51": 15, "51171875": 15, "512": 7, "52": [5, 15], "529": 15, "53": 15, "533": [], "54": [], "540": 15, "5478515625": 15, "55": [], "56": 15, "57": 15, "58": [], "580": 15, "5810546875": 15, "583": 15, "59": 15, "595": [], "597": 15, "5k": [4, 5], "5m": 15, "6": [3, 8, 15], "60": 8, "600": [7, 9, 15], "61": 15, "611": [], "62": 15, "625": [], "626": 13, "629": [], "63": 15, "630": [], "64": [7, 8, 15], "640": [], "641": 15, "647": 13, "65": 15, "66": 15, "660": [], "664": [], "666": [], "67": 15, "672": [], "68": 15, "689": [], "69": 15, "693": [], "694": [], "695": [], "6m": [], "7": 15, "70": [9, 15], "700": [], "701": [], "702": [], "707470": 13, "71": 15, "7100000": 13, "713": [], "7141797": [], "7149": 13, "72": 15, "72dpi": 6, "73": 15, "73257": 13, "733": [], "74": 15, "745": [], "75": [8, 15], "753": [], "7581382": 13, "76": 15, "77": 15, "772": [], "772875": 13, "78": 15, "780": [], "781": [], "783": [], "785": [], "789": [], "79": 15, "793533": [], "796": 13, "798": [], "7m": 15, "8": [7, 8, 15], "80": 15, "800": [7, 9, 13, 15], "81": 15, "817": [], "82": 15, "8275l": 15, "83": 15, "830": [], "84": 15, "849": 13, "85": 15, "8564453125": 15, "857": 15, "85875": 13, "86": 15, "860": [], "8603515625": 15, "862": [], "863": [], "87": 15, "8707": 13, "875": [], "88": [], "89": 15, "8m": 15, "9": [], "90": 15, "90k": [], "90kdict32px": 5, "91": 15, "913": [], "914085328578949": 15, "917": [], "92": 15, "921": [], "93": 15, "94": [5, 15], "95": [9, 15], "9578408598899841": 15, "96": 
15, "97": 15, "98": 15, "99": 15, "9949972033500671": 15, "A": [1, 2, 4, 5, 6, 7, 10, 14], "And": 14, "As": 2, "Be": [], "Being": 1, "By": 11, "For": [1, 2, 3, 15], "If": [2, 3, 6, 7, 14, 15], "In": [2, 5, 13], "It": [8, 12], "Its": [4, 7], "No": [1, 15], "Of": 5, "Or": [], "The": [1, 2, 5, 6, 9, 15], "Then": [], "There": 11, "To": [2, 3, 12, 15], "_": [1, 5, 7, 14], "__call__": [], "_build": 2, "_helper": [], "_i": 9, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "abdef": [5, 13], "abl": [13, 15], "about": [1, 11, 13, 15], "abov": 15, "abstract": [], "abstractdataset": 5, "abus": 1, "accent": [], "accept": 1, "access": [4, 6, 13, 15], "account": [1, 12, 14], "accur": [], "accuraci": 9, "achiev": 14, "act": 1, "action": 1, "activ": 4, "ad": [2, 7, 8], "adapt": 1, "add": [8, 9, 12], "add_hook": [], "add_label": 9, "addit": [2, 3, 6], "addition": [2, 14, 15], "address": [1, 6], "adjust": [2, 8], "advanc": 1, "advantag": [], "advis": 2, "aesthet": [4, 5], "affect": 1, "after": [2, 12, 15], "ag": 1, "again": [], "aggreg": [9, 13], "aggress": 1, "align": [1, 6], "all": [1, 2, 5, 6, 8, 9, 13, 15], "allow": [1, 11], "along": 15, "alreadi": 2, "also": [1, 7, 12, 13, 15], "alwai": 13, "amazon": [11, 15], "an": [1, 2, 4, 5, 6, 7, 9, 14, 15], "analysi": 6, "ancient_greek": 5, "angl": [6, 8], "ani": [1, 5, 6, 7, 8, 9, 14, 15], "annot": 5, "anot": 13, "anoth": [3, 7, 13], "answer": 1, "anyascii": [], "anyon": 4, "anyth": [], "anywher": 11, "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 5, 8], "applic": [4, 7], "appoint": 1, "appreci": 12, "appropri": [1, 2, 15], "ar": [1, 2, 3, 5, 6, 8, 9, 10, 11, 13, 15], "arab": 5, "arabic_diacrit": 5, "arabic_lett": 5, "arabic_punctu": 5, "arbitrarili": [], "arch": [7, 12], "architectur": [4, 7, 12], "archiv": [], "area": 15, "arg": [5, 7], "argument": [5, 7, 15], "around": 1, "arrai": [6, 8, 9], "art": 4, "artefact": [9, 10, 15], "artefact_typ": 6, "artifici": [4, 5], "arxiv": 7, "as_imag": [], "asarrai": 9, "ascii_lett": 5, "aspect": [4, 7, 8, 15], "assess": 9, "assign": 9, "associ": 6, "assum": 7, "assume_straight_pag": [7, 15], "astyp": [7, 9, 14, 15], "attack": 1, "attend": [4, 7], "attent": [1, 7], "autoclass": [], "autom": 4, "automat": [], "autoregress": [], "avail": [1, 4, 8], "averag": [8, 15], "avoid": [1, 3], "aw": [4, 15], "awar": [], "azur": [], "b": [7, 9], "b_j": 9, "back": 2, "backbon": 7, "backend": 15, "background": 13, "bangla": [], "bar": [], "bar_cod": 13, "base": [4, 7], "baselin": [4, 7, 15], "bash": [], "batch": [5, 7, 8, 13, 15], "batch_siz": [5, 13], "bblanchon": [], "bbox": 15, "becaus": 11, "been": [9, 13, 15], "befor": [5, 7, 8, 15], "begin": 9, "behavior": 1, "being": [9, 15], "belong": 15, "below": 15, "benchmark": 15, "best": 1, "beta": [], "better": [10, 15], "between": [8, 9], "bgr": 6, "bilinear": 8, "bin_thresh": [], "binar": [4, 7], "binari": [6, 15], "bit": [], "blank": 9, "block": [9, 15], "block_1_1": 15, "blue": 9, "blur": 8, "bmvc": 5, "bn": 12, "bodi": [1, 15], "bool": [5, 6, 7, 8, 9], "boolean": [7, 15], "both": [4, 5, 8, 13, 15], "bottom": [7, 15], "bound": [5, 6, 7, 8, 9, 15], "box": [5, 6, 7, 8, 9, 13, 15], "box_thresh": [], "brew": 3, "bright": 8, "broadcast": 9, "browser": [2, 4], "build": [2, 3], "built": [], "byte": [6, 15], "c": [6, 9], "c5": 15, "c_j": 9, "cach": [5, 11], "cache_sampl": 5, "cairo": 3, "call": [], "callabl": [5, 8], "can": [2, 3, 11, 12, 13, 14, 15], "capabl": [2, 10, 15], "case": [5, 9, 15], "catch": 2, "cf": 15, "cfg": 15, 
"challeng": 5, "challenge2_test_task12_imag": 5, "challenge2_test_task1_gt": 5, "challenge2_training_task12_imag": 5, "challenge2_training_task1_gt": 5, "chang": 11, "changelog": [], "channel": [1, 2, 6, 8], "channel_prior": [], "channelshuffl": 8, "charact": [4, 5, 6, 9, 13, 15], "charactergener": [5, 13], "characterist": 1, "charg": 15, "charset": 15, "chart": 6, "check": [2, 12, 15], "checkpoint": 7, "chip": 3, "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 5, 6, 8, 9, 15], "class_nam": [], "classif": 13, "classif_mobilenet_v3_smal": 7, "classmethod": 6, "cleaner": 2, "clear": [], "clone": 3, "close": 2, "co": 12, "code": [4, 6], "codecov": 2, "colab": 10, "collate_fn": 5, "collect": 6, "color": [8, 9], "colorinvers": 8, "column": 6, "com": [1, 3, 6, 11, 12, 15], "combin": 15, "command": 2, "comment": 1, "commit": 1, "common": [1, 2, 8, 9], "commun": 1, "compar": 4, "comparison": [9, 15], "competit": 5, "compil": [10, 15], "complaint": 1, "complementari": 9, "complet": [], "compli": 2, "compon": 15, "compos": [5, 15], "comprehens": 15, "comput": [5, 9, 15], "conf_threshold": [], "confid": [6, 9, 15], "config": [2, 7], "configur": [2, 7], "confus": 9, "consecut": [8, 15], "consequ": 1, "consid": [1, 2, 5, 6, 9, 15], "consist": 15, "consolid": [4, 5], "constant": 8, "constraint": 14, "construct": 1, "consum": 9, "contact": 1, "contain": [5, 13, 15], "content": [5, 6, 9, 15], "context": 7, "contib": [], "continu": 1, "contrast": 8, "contrast_factor": 8, "contrib": [], "contribut": 1, "contributor": 2, "conv_sequ": 14, "convent": 2, "convers": 6, "convert": [6, 8, 14], "convert_page_to_numpi": [], "convert_to_fp16": [], "convert_to_tflit": [], "convolut": 7, "coordin": [6, 15], "cord": [4, 5, 13, 15], "core": 9, "corner": 15, "correct": 8, "correspond": [3, 6, 15], "could": 1, "counterpart": 9, "cover": 2, "coverag": 2, "cpu": [4, 15], "creat": 12, "crnn": [4, 7, 12], "crnn_mobilenet_v3_larg": [7, 12, 15], "crnn_mobilenet_v3_smal": [7, 15], "crnn_resnet31": [], "crnn_vgg16_bn": [7, 12, 15], "crop": [7, 8, 13, 15], "crop_orient": [], "crop_orientation_predictor": 7, "crop_param": [], "croporientationpredictor": 7, "cuda": [], "currenc": 5, "current": 15, "custom": 12, "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": 4, "czczup": [], "czech": 5, "d": [5, 13], "daili": [], "danish": [], "data": [4, 5, 6, 8, 9, 12], "dataload": 13, "dataset": [7, 15], "dataset_info": 5, "date": 15, "db": 12, "db_crnn_resnet": [], "db_crnn_vgg": [], "db_mobilenet_v3_larg": [7, 12, 15], "db_resnet34": [], "db_resnet50": [7, 12, 14, 15], "db_resnet50_rot": 15, "db_sar_resnet": [], "db_sar_vgg": [], "dbnet": [4, 7], "deal": [], "decis": 1, "decod": 6, "decode_img_as_tensor": 6, "dedic": [], "deem": 1, "deep": [7, 15], "def": 14, "default": [6, 9, 11, 14], "defer": 13, "defin": 9, "deform": [], "degre": 8, "degress": 6, "delet": [], "delimit": 15, "delta": 8, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": [7, 9], "descript": 10, "design": 8, "desir": 6, "det_arch": [7, 12], "det_b": [], "det_model": 12, "det_param": [], "det_predictor": [], "detail": 15, "detect": [5, 9, 10], "detect_languag": 7, "detect_orient": 7, "detection_predictor": [7, 15], "detection_task": [], "detectiondataset": [5, 13], "detectionmetr": 9, "detectionpredictor": 7, "detector": [], "deterior": 7, "determin": 1, "dev": [2, 11], "develop": 3, "developp": 3, "deviat": 8, "devic": [], "dict": [6, 9, 15], "dictionari": [6, 9], 
"differ": 1, "differenti": [4, 7], "digit": [4, 5, 13], "dimens": [6, 9, 15], "dimension": 8, "direct": 5, "directli": [12, 15], "directori": 11, "disabl": [1, 11], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 15, "discuss": 2, "disk": [], "disparag": 1, "displai": [6, 9], "display_artefact": 9, "distanc": [], "distribut": 8, "div": 15, "divers": 1, "divid": 6, "do": [2, 3, 7, 11, 14], "doc": [2, 6, 15], "docartefact": [5, 13], "docstr": [], "doctr": [3, 11, 12, 13, 14, 15], "doctr_cache_dir": 11, "doctr_multiprocessing_dis": 11, "document": [5, 7, 9, 10, 13, 15], "documentbuild": [], "documentfil": [6, 12], "doe": 11, "doesn": [], "don": 15, "done": 8, "download": [5, 13], "downsiz": 7, "draw": [8, 9], "draw_proba": 9, "drop": 5, "drop_last": 5, "dtype": [6, 7, 8, 9, 14], "dual": [], "dummi": 12, "dummy_img": 15, "dummy_input": [], "dure": 1, "dutch": [], "dynam": 5, "dynamic_seq_length": 5, "e": [1, 2, 3, 6, 7], "each": [4, 5, 6, 7, 8, 9, 13, 15], "eas": 2, "easi": [4, 9, 12], "easier": [], "easili": [6, 9, 12, 13, 14, 15], "ec2": 15, "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 5, 7], "either": [9, 15], "element": [5, 6, 7, 9, 15], "els": 2, "email": 1, "empathi": 1, "en": 15, "enabl": [5, 6], "enclos": 6, "encod": [4, 5, 6, 7, 15], "encode_sequ": 5, "encount": 2, "encrypt": 6, "end": [4, 5, 7, 9], "english": [5, 13], "enivron": 11, "enough": [2, 15], "ensur": 2, "entir": [], "entri": 5, "environ": 1, "eo": 5, "equiv": 15, "error": [], "estim": 7, "etc": 6, "ethnic": 1, "evalu": [13, 15], "event": 1, "everyon": 1, "everyth": [2, 15], "exact": [9, 15], "exactmatch": [], "exampl": [1, 2, 4, 5, 7, 12], "exchang": [], "exclud": [], "execut": [], "exist": 12, "expand": 8, "expect": [2, 6, 8, 9], "experi": 1, "explan": [1, 15], "explicit": 1, "exploit": [4, 7], "export": [6, 7, 9, 10, 14, 15], "export_as_straight_box": [7, 15], "export_as_xml": 15, "export_model_to_onnx": [], "express": [1, 8], "extens": 6, "extern": [1, 13], "extra": 3, "extract": [4, 5], "extract_arch": [], "extractor": 7, "f_": 9, "f_a": 9, "factor": 8, "fair": 1, "fairli": 1, "fallback": 14, "fals": [5, 6, 7, 8, 9, 14, 15], "famili": 9, "faq": 1, "fascan": [], "fast": [4, 5, 7], "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": 7, "favorit": 15, "featur": [3, 7, 9, 10], "feed": [], "feedback": 1, "feel": [2, 12], "felix92": 12, "few": [3, 14], "figsiz": 9, "figur": 9, "file": [2, 5], "file_hash": [], "file_nam": [], "final": [7, 14], "find": [2, 3, 13], "fine": 15, "finnish": [], "first": 2, "firsthand": 5, "fit": [7, 15], "fitz": [], "flag": 15, "flake8": 2, "flexibl": [], "flip": 8, "float": [6, 8, 9, 14], "float16": 14, "float32": [6, 7, 8, 14], "fn": 8, "focu": 12, "focus": [1, 5], "folder": [2, 5, 14], "follow": [1, 2, 3, 5, 8, 9, 12, 14, 15], "font": [5, 9], "font_famili": [5, 9], "font_siz": 9, "foral": 9, "forc": [], "forg": [], "form": [4, 5, 15], "format": [6, 9, 13, 14, 15], "forpost": [4, 5], "forum": 2, "fp": 15, "fp16": [], "frac": 9, "frame": 15, "framework": [3, 12, 13, 15], "free": [1, 2, 12], "french": [5, 12, 15], "friendli": 4, "from": [1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15], "from_hub": [7, 12], "from_imag": [6, 12], "from_keras_model": 14, "from_pdf": 6, "from_url": 6, "full": [5, 9, 15], "fulli": [], "function": [5, 8, 9], "funsd": [4, 5, 13, 15], "further": 13, "futur": 5, "g": [6, 7], "g_": 9, "g_x": 9, "gamma": 8, "gaussian": 8, "gaussianblur": 8, "gaussiannois": 8, "gdk": 3, "gen": [], "gender": 1, 
"gener": [2, 7], "generic_cyrillic_lett": [], "geometri": [4, 6, 15], "geq": 9, "german": 5, "get": 15, "get_artefact": [], "get_lin": [], "get_text_word": [], "get_word": [], "gettextword": [], "git": 12, "github": [2, 3, 12], "give": 1, "given": [5, 6, 8, 9, 15], "global": 7, "go": 15, "good": 14, "googl": 2, "googlevis": 4, "gpu": 4, "gracefulli": 1, "graph": 6, "grayscal": 8, "ground": 9, "groung": 9, "group": 4, "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": 2, "guidanc": 13, "gvision": 15, "h": [6, 7, 8], "h_": 9, "ha": [2, 5, 9, 13], "half": [], "handl": 13, "handwrit": 5, "handwritten": 13, "harass": 1, "hardwar": [], "harm": 1, "hat": 9, "have": [1, 2, 9, 12, 13, 14, 15], "head": [7, 15], "healthi": 1, "hebrew": [], "height": 6, "hello": [9, 15], "help": 14, "here": [3, 8, 10, 13, 15], "hf": 7, "hf_hub_download": 7, "high": 6, "higher": [3, 5], "hindi": [], "hindi_digit": 5, "hocr": 15, "homebrew": 3, "hook": [], "horizont": [6, 8], "hous": 5, "how": [2, 12, 13], "howev": 13, "hsv": 8, "html": [1, 2, 15], "http": [1, 3, 6, 7, 11, 12, 15], "hub": 7, "hue": 8, "huggingfac": 7, "hw": [], "i": [1, 2, 5, 6, 7, 8, 9, 11, 12, 13, 14], "i7": [], "ic03": [4, 5, 13], "ic13": [4, 5, 13], "icdar": [4, 5], "icdar2019": 5, "id": 15, "ident": 1, "identifi": 4, "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [4, 5], "iiit5k": [5, 13], "iiithw": [], "imag": [4, 5, 6, 7, 8, 9, 12, 13, 15], "imagenet": 7, "imageri": 1, "images_90k_norm": [], "img": [5, 8, 13], "img_cont": 6, "img_fold": [5, 13], "img_path": 6, "img_transform": 5, "imgur5k": [4, 5, 13], "imgur5k_annot": 5, "imlist": 5, "impact": 1, "implement": [5, 6, 8, 9, 14, 15], "import": [5, 6, 7, 8, 9, 12, 13, 14, 15], "improv": [], "inappropri": 1, "incid": 1, "includ": [1, 3, 5, 13], "inclus": 1, "incom": 2, "increas": 8, "independ": [], "index": [2, 6], "indic": 9, "individu": 1, "infer": [4, 7, 8], "inference_input_typ": 14, "inference_output_typ": 14, "inform": [1, 2, 4, 5, 13, 15], "inherit": 14, "ini": 2, "input": [2, 6, 7, 8, 15], "input_crop": 7, "input_pag": [7, 9, 15], "input_shap": 14, "input_t": 14, "input_tensor": 7, "inspir": [1, 8], "instal": 12, "instanc": [1, 15], "instanti": [7, 15], "instead": [5, 6, 7], "insult": 1, "int": [5, 6, 8, 9], "int64": [8, 9], "int8": 14, "integ": [9, 14], "integr": [4, 12, 13], "intel": [], "interact": [1, 6, 9], "interfac": 12, "interoper": [], "interpol": 8, "interpret": [5, 6], "intersect": 9, "invert": 8, "investig": 1, "invis": 1, "invoic": 15, "involv": [1, 15], "io": 12, "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [4, 7, 13], "isn": 5, "isort": 2, "issu": [1, 2, 12], "italian": [], "iter": [5, 8, 13], "its": [6, 7, 8, 9, 13, 15], "itself": [7, 12], "j": 9, "job": 2, "join": 2, "jpeg": 8, "jpegqual": 8, "jpg": [5, 6, 12], "json": [5, 13, 15], "json_output": 15, "jump": 2, "just": [1, 14], "keep": 2, "kei": [], "kera": [7, 14], "kernel": 8, "kernel_s": 14, "kernel_shap": 8, "keywoard": [], "keyword": [5, 7], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [1, 15], "know": 2, "kwarg": [5, 6, 7, 9], "l": 9, "l_j": 9, "label": [5, 8, 9, 13], "label_fil": [5, 13], "label_fold": 5, "label_path": [5, 13], "labels_path": [5, 13], "ladder": 1, "lambda": 8, "lambdatransform": 8, "lang": 15, "languag": [1, 4, 5, 6, 7, 12, 15], "larg": [7, 12], "largest": 9, "last": [3, 5], "latenc": 7, "later": 2, "latest": [3, 15], "latin": 5, "layer": [], "layout": 15, "lead": 1, "leader": 1, "learn": [1, 4, 7, 15], "least": 3, "left": [9, 15], "legacy_french": 5, 
"length": 5, "less": [], "let": [], "letter": [], "level": [1, 5, 9, 15], "levenshtein": [], "leverag": 10, "lf": 12, "libffi": 3, "librari": [2, 3, 10], "light": 4, "lightweight": [], "like": 1, "limits_": 9, "line": [4, 9, 15], "line_1_1": 15, "link": [], "linknet": [4, 7], "linknet16": [], "linknet_resnet18": [7, 15], "linknet_resnet18_rot": [7, 15], "linknet_resnet34": 7, "linknet_resnet50": 7, "linux": 3, "list": [5, 6, 8, 9, 12], "ll": 9, "load": [4, 5, 7, 14], "load_state_dict": [], "load_weight": [], "loader": [], "loc_pr": [], "local": [2, 4, 5, 7, 9, 13, 15], "localis": 5, "localizationconfus": 9, "locat": [2, 6], "login": 7, "login_to_hub": [7, 12], "logo": [6, 13], "look": 15, "love": 12, "lower": [8, 9], "m": [9, 15], "m1": 3, "macbook": 3, "machin": [], "maco": 3, "made": 4, "magc_resnet31": 7, "mai": [1, 15], "mail": 1, "main": 10, "maintain": 4, "mainten": 2, "make": [1, 2, 9, 11, 12, 14, 15], "mani": [13, 15], "manipul": [], "map": 5, "map_loc": [], "mask_shap": 9, "master": [4, 7, 15], "match": [9, 15], "mathcal": 9, "matplotlib": 9, "max": [5, 8, 9], "max_angl": 8, "max_area": 8, "max_char": [5, 13], "max_delta": 8, "max_dist": [], "max_gain": 8, "max_gamma": 8, "max_qual": 8, "max_ratio": 8, "maximum": [5, 8], "maxval": [7, 8, 14], "mbox": 9, "mean": [8, 9], "meaniou": 9, "meant": [6, 14], "measur": 15, "media": 1, "median": [], "meet": [], "member": 1, "memori": [9, 11], "mention": 15, "merg": 5, "messag": 2, "meta": 15, "metadata": [], "metal": 3, "method": [8, 15], "metric": [9, 15], "middl": [], "might": [14, 15], "min": 8, "min_area": 8, "min_char": [5, 13], "min_gain": 8, "min_gamma": 8, "min_qual": 8, "min_ratio": 8, "min_val": 8, "minde": [1, 3, 4, 7], "minim": [2, 4], "minimalist": [], "minimum": [3, 5, 8, 9], "minval": 8, "miss": 3, "mistak": 1, "mix": [], "mixed_float16": [], "mixed_precis": [], "mjsynth": [4, 5, 13], "mnt": 5, "mobilenet": [7, 12], "mobilenet_v3_larg": 7, "mobilenet_v3_large_r": 7, "mobilenet_v3_smal": 7, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_orient": 7, "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 7, "mobilenetv3": 7, "mobilenetv3_larg": [], "mobilenetv3_smal": [], "modal": [], "mode": 3, "model": [5, 9, 11, 13], "model_nam": [7, 12], "model_path": [], "moder": 1, "modif": 2, "modifi": 7, "modul": [6, 8, 9, 15], "moment": 15, "more": [2, 9, 11, 13, 15], "most": 15, "mozilla": 1, "multi": [4, 7], "multilingu": [], "multipl": [5, 6, 8], "multipli": 8, "multiprocess": 11, "my": 7, "my_awesome_model": 12, "my_hook": [], "mypi": 2, "n": [5, 9], "na": [], "name": [5, 7, 15], "nation": 1, "natur": [1, 4, 5], "nb": 15, "ndarrai": [5, 6, 8, 9], "necessari": 3, "need": [2, 3, 5, 9, 11, 12], "neg": 8, "nest": 15, "nestedobject": [], "network": [4, 5, 7], "neural": [4, 5, 7], "new": [2, 9], "newer": [], "next": [5, 13], "nois": 8, "noisi": [4, 5], "non": [4, 5, 6, 7, 8, 9], "none": [5, 6, 7, 8, 9, 15], "normal": [7, 8], "norwegian": [], "note": [0, 5, 7, 12], "now": 2, "np": [7, 8, 9, 14, 15], "num_output_channel": 8, "num_sampl": [5, 13], "num_work": 5, "number": [5, 8, 9, 15], "numpi": [6, 7, 9, 15], "o": 3, "obb": [], "obj_detect": 12, "object": [5, 9, 10, 15], "objectness_scor": [], "oblig": 1, "obtain": 15, "occupi": [], "ocr": [4, 5, 7, 9, 12, 13], "ocr_carea": 15, "ocr_db_crnn": 9, "ocr_lin": 15, "ocr_pag": 15, "ocr_par": 15, "ocr_predictor": [7, 12, 15], "ocrdataset": [5, 13], "ocrmetr": 9, "ocrpredictor": 7, "ocrx_word": 15, "offens": 1, "offici": 1, "offlin": 1, "offset": 8, "onc": 15, "one": [2, 5, 7, 8, 
12, 15], "oneof": 8, "ones": [5, 8, 9], "onli": [2, 7, 8, 9, 12, 13, 15], "onlin": 1, "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": 8, "opacity_rang": 8, "open": [1, 2, 12], "oper": [2, 14], "opinion": 1, "opsset": 14, "optic": [4, 15], "optim": [4, 14], "option": 5, "order": [5, 6, 8], "org": [1, 7, 15], "organ": 6, "orient": [1, 6, 7, 15], "orientationpredictor": [], "other": [1, 2], "otherwis": [1, 6, 9], "our": [7, 15], "out": [2, 7, 8, 9, 15], "outpout": 15, "output": [6, 8], "output_s": [6, 8], "outsid": 11, "over": [3, 5, 9, 15], "overal": [1, 7], "overlai": 6, "overview": [], "overwrit": [], "overwritten": 12, "own": 4, "p": [8, 9, 15], "packag": [2, 4, 9, 11, 13, 14], "pad": [5, 7, 8, 15], "page": [3, 5, 7, 9, 15], "page1": 6, "page2": 6, "page_1": 15, "page_idx": [6, 15], "page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 7, "par_1_1": 15, "paragraph": [], "paragraph_break": [], "param": [8, 15], "paramet": [4, 5, 6, 7, 8, 9], "pars": [4, 5], "parseq": 7, "part": [5, 8, 15], "parti": 3, "partial": [], "particip": 1, "pass": [5, 6, 7, 15], "password": 6, "patch": 7, "path": [5, 6, 13, 14], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "pattern": 1, "pdf": [6, 7, 10], "pdf_render": [], "pdfdocument": 6, "pdfpage": [], "peopl": 1, "per": [8, 15], "perform": [4, 6, 8, 9, 14, 15], "period": 1, "permiss": 1, "permut": [], "persian_lett": 5, "person": [1, 13], "phase": 15, "photo": 13, "physic": [1, 6], "pick": 8, "pictur": 6, "pip": [2, 3], "pipelin": [], "pixbuf": 3, "pixel": [6, 8, 15], "platinum": 15, "pleas": 2, "plot": 9, "plt": 9, "plug": 12, "plugin": 3, "png": 6, "point": [], "polici": 11, "polish": [], "polit": 1, "polygon": [5, 15], "pool": 7, "portugues": 5, "posit": [1, 9], "possibl": [2, 9, 12], "post": [1, 15], "postprocessor": [], "potenti": 7, "power": 4, "ppageno": 15, "pr": 2, "pre": 7, "precis": [9, 15], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 13, "predict": [6, 7, 9], "predictor": [4, 6, 7, 12], "prefer": 13, "preinstal": [], "preprocessor": 15, "prerequisit": 12, "present": 10, "preserv": [7, 8, 15], "preserve_aspect_ratio": [6, 7, 8, 15], "pretrain": [4, 7, 9, 14, 15], "pretrained_backbon": 7, "print": 15, "prior": 5, "privaci": 1, "privat": [1, 15], "probabl": 8, "problem": 2, "procedur": 8, "process": [2, 4, 6, 15], "processor": 15, "produc": [10, 15], "product": 14, "profession": 1, "project": [2, 13], "promptli": 1, "proper": 2, "properli": 5, "properti": 14, "provid": [1, 2, 4, 12, 13, 14, 15], "public": [1, 4], "publicli": 15, "publish": 1, "pull": 12, "punctuat": 5, "pure": 5, "purpos": 2, "push_to_hf_hub": [7, 12], "py": [2, 12], "pydocstyl": 2, "pypdfium2": 6, "pyplot": 9, "python": 2, "python3": 12, "pytorch": [3, 4, 7, 8, 12, 15], "q": 2, "qr": 6, "qr_code": 13, "qualiti": 8, "quantiz": [], "quantize_model": [], "question": 1, "quickli": 4, "quicktour": 10, "r": [], "race": 1, "ramdisk": 5, "rand": [7, 8, 9, 14, 15], "random": [7, 8, 9, 14, 15], "randomappli": 8, "randombright": 8, "randomcontrast": 8, "randomcrop": 8, "randomgamma": 8, "randomhorizontalflip": 8, "randomhu": 8, "randomjpegqu": 8, "randomli": 8, "randomres": [], "randomrot": 8, "randomsatur": 8, "randomshadow": 8, "rang": [8, 14], "rassi": [], "ratio": [7, 8, 15], "raw": [6, 9], "re": [], "read": [2, 4, 5, 7, 11], "read_html": 6, "read_img": 6, "read_img_as_numpi": 6, "read_img_as_tensor": 6, "read_pdf": 6, "readi": 14, "real": [4, 7, 8], "reason": 1, "rebuild": [], "rebuilt": [], "recal": [9, 15], "receipt": 
[4, 5, 15], "reco_arch": [7, 12], "reco_b": [], "reco_model": 12, "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": [5, 9], "recognition_predictor": [7, 15], "recognition_task": [5, 13], "recognitiondataset": [5, 13], "recognitionpredictor": 7, "rectangular": 7, "recurr": [], "red": 9, "reduc": [3, 8], "refer": [2, 3, 12, 13, 15], "regardless": 1, "region": [], "regroup": 9, "regular": 13, "reject": 1, "rel": [6, 8, 9], "relat": [2, 6], "releas": [0, 3], "relev": [], "religion": 1, "relu": 14, "remov": 1, "render": 6, "render_pdf_topil": [], "render_to": 6, "reorder": 2, "repo": 7, "repo_id": [7, 12], "report": 1, "repositori": [2, 5, 7, 12], "repres": [1, 9, 15], "represent": [4, 7], "representative_dataset": 14, "request": [1, 12], "requir": [3, 8], "research": 4, "residu": 7, "resiz": [8, 15], "resnet": 7, "resnet18": [7, 12], "resnet31": 7, "resnet34": 7, "resnet50": [7, 12], "resolv": 6, "resolve_block": [], "resolve_lin": [], "resourc": [13, 14], "respect": 1, "respons": 9, "rest": [2, 8, 9], "restrict": [], "result": [2, 5, 6, 10, 12, 15], "resum": 15, "return": [5, 6, 7, 9, 15], "reusabl": 15, "review": 1, "rgb": [6, 8], "rgb_mode": 6, "rgb_output": 6, "right": [1, 7, 9], "road": 15, "robust": [4, 5], "root": [2, 5], "rotat": [5, 6, 7, 8, 9, 13, 15], "rotated_bbox": [], "run": [2, 3, 7], "same": [2, 6, 9, 13, 15], "sampl": [5, 13, 15], "sample_transform": 5, "sane": 2, "sar": [4, 7], "sar_resnet31": [7, 15], "sar_vgg16_bn": [], "satur": 8, "save": [7, 13, 14], "saved_model": 14, "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "scheme": [], "score": 9, "scratch": [], "script": [2, 13], "seamless": 4, "seamlessli": [4, 15], "search": 7, "searchabl": 10, "sec": [], "second": 15, "section": [12, 14, 15], "secur": [1, 11], "see": [1, 2], "seemlessli": [], "seen": 15, "segment": [4, 7, 15], "self": [], "semant": [4, 7], "send": 15, "sens": 9, "sensit": [13, 15], "separ": 15, "sequenc": [4, 5, 6, 7, 9, 15], "sequenti": [8, 14], "seri": 1, "serial": 14, "serialized_model": 14, "seriou": 1, "set": [1, 2, 5, 7, 9, 11, 15], "set_global_polici": [], "sever": [6, 8, 15], "sex": 1, "sexual": 1, "sha256": [], "shade": 8, "shape": [6, 7, 8, 9, 14, 15], "share": [11, 13, 15], "shift": 8, "shm": 11, "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9, 12], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 13, "simpl": [4, 7], "simpler": 7, "sinc": [5, 13, 15], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 15], "skew": 15, "slack": 2, "slightli": 7, "small": [2, 7], "smallest": 6, "snapshot_download": 7, "snippet": 15, "so": [2, 3, 5, 7, 12, 13], "social": 1, "socio": 1, "some": [2, 3, 10, 12, 13], "someth": 2, "somewher": 2, "sort": 1, "sourc": [5, 6, 7, 8, 9, 12], "space": 1, "span": 15, "spanish": 5, "spatial": [6, 9], "special": [], "specif": [2, 3, 9, 13, 15], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": [], "sroie": [4, 5, 13], "stabl": 3, "stackoverflow": 2, "stage": 4, "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": 8, "step": [], "still": 15, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 13, 15], "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": [], "strictli": 9, "string": [5, 6, 9, 15], "strive": 3, "strong": [4, 7], "structur": 15, "style": 2, "subset": [5, 15], "suggest": [2, 12], "sum": 9, "summari": 9, "support": 15, "supported_op": 14, "supported_typ": 14, "sustain": 1, "svhn": [4, 5, 13], "svt": [5, 13], 
"swedish": [], "symbol": [], "symmetr": [7, 8, 15], "symmetric_pad": [7, 8, 15], "synthes": 9, "synthesize_pag": 9, "synthet": 4, "synthtext": [4, 5, 13], "system": 15, "t": [2, 5, 15], "tabl": 12, "take": [1, 5, 14, 15], "target": [5, 6, 8, 9, 13], "target_s": 5, "target_spec": 14, "task": [4, 5, 7, 12, 13, 15], "task2": 5, "tax": 15, "team": [], "techminde": [], "templat": [2, 4], "tensor": [5, 6, 8, 15], "tensorflow": [3, 4, 6, 7, 8, 12, 15], "tensorspec": [], "term": 1, "test": 13, "test_set": 5, "text": [5, 6, 7, 9, 13], "text_output": [], "textmatch": 9, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [4, 15], "textstylebrush": [4, 5], "textual": [4, 5, 6, 7, 15], "tf": [3, 6, 7, 8, 12, 14], "tf_model": 14, "tflite": 14, "tflite_builtins_int8": 14, "tfliteconvert": 14, "than": [2, 3, 9, 12], "thank": [], "thei": [1, 9, 15], "them": [3, 5, 15], "thi": [1, 2, 3, 5, 9, 11, 12, 13, 14, 15], "thing": [11, 14, 15], "third": 3, "those": [1, 3, 6, 15], "threaten": 1, "threshold": [], "through": [1, 8, 13], "tilman": [], "time": [1, 4, 7, 9, 13], "tini": [], "titl": [6, 15], "tm": [], "tmp": 11, "togeth": [2, 6], "tograi": 8, "tool": 13, "top": [9, 15], "topic": 2, "torch": [3, 8, 12], "torchvis": 8, "total": [], "toward": [1, 3], "train": [2, 5, 7, 8, 12, 13, 15], "train_it": [5, 13], "train_load": [5, 13], "train_pytorch": 12, "train_set": [5, 13], "train_tensorflow": 12, "trainabl": [4, 7], "tranform": 8, "transcrib": 15, "transfer": [4, 5], "transfo": 8, "transform": [4, 5, 7], "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12, 13, 14, 15], "truth": 9, "tune": 14, "tupl": [5, 6, 8, 9], "turn": [], "two": [6, 11], "txt": 5, "type": [6, 12, 15], "typic": 15, "u": [1, 2, 15], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 15], "ukrainian": [], "unaccept": 1, "underli": 13, "underneath": 6, "understand": [4, 5, 15], "unfortun": 15, "unidecod": 9, "uniform": [7, 8, 14], "uniformli": 8, "uninterrupt": [6, 15], "union": 9, "unittest": 2, "unlock": 6, "unoffici": 7, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 15], "updat": 9, "upgrad": [], "upper": [5, 8], "uppercas": 13, "url": 6, "us": [1, 2, 3, 5, 7, 9, 11, 12, 15], "usabl": 15, "usag": 11, "use_broadcast": 9, "use_polygon": [5, 9, 13], "useabl": 15, "user": [3, 4, 6, 10], "utf": 15, "util": 14, "v0": [], "v1": 12, "v3": [7, 12], "valid": 13, "valu": [2, 6, 8, 15], "valuabl": 4, "variabl": 11, "varieti": 5, "variou": 15, "veri": 7, "verifi": 2, "version": [1, 2, 3, 14, 15], "vgg": 7, "vgg16": 12, "vgg16_bn": [], "vgg16_bn_r": 7, "via": 1, "vietnames": 5, "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 5, 7], "visiondataset": 5, "visiontransform": 7, "visual": 4, "visualize_pag": 9, "vit_": 7, "vit_b": 7, "vitstr": [4, 7], "vitstr_bas": 7, "vitstr_smal": 7, "viz": [], "vocab": [12, 13, 15], "vocabulari": [5, 12], "w": [6, 7, 8, 9], "w3": 15, "wa": 1, "wai": [1, 4, 13], "want": [14, 15], "warm": [], "warmup": 15, "wasn": 2, "we": [1, 2, 3, 4, 6, 8, 12, 13, 15], "weasyprint": [], "web": 6, "websit": 5, "welcom": 1, "well": [1, 14], "were": [1, 6, 15], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9, 15], "whether": [2, 5, 6, 8, 9, 13], "which": [1, 7, 11, 13, 15], "whichev": 3, "while": [8, 15], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [], "window": [3, 7, 9], "wish": [2, 14], "within": 1, "without": [1, 5, 7], "wonder": 2, "word": [4, 5, 7, 9, 15], "word_1_1": 15, "word_1_2": 15, "word_1_3": 15, "wordgener": [5, 13], 
"words_onli": 9, "work": [11, 15], "worker": 5, "workflow": 2, "worklow": 2, "world": [9, 15], "worth": 7, "wrap": 15, "wrapper": [5, 8], "write": 11, "written": [1, 6], "www": [1, 6, 15], "x": [6, 8, 9], "x12larg": 15, "x_ascend": 15, "x_descend": 15, "x_i": 9, "x_size": 15, "x_wconf": 15, "xeon": 15, "xhtml": 15, "xmax": 6, "xmin": 6, "xml": 15, "xml_bytes_str": 15, "xml_element": 15, "xml_output": 15, "xmln": 15, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "yield": 14, "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12, 13, 14, 15], "your": [2, 4, 6, 9, 15], "yoursit": 6, "zero": [8, 9], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 5, "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], 
"\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": [], "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": [], "2024": [], "22": 0, "27": 0, "28": 0, "29": [], "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": [], "7": [], "8": [], "9": [], "advanc": [], "annot": 2, "approach": 15, "architectur": 15, "arg": [], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [13, 15], "aw": 11, "backbon": [], "ban": 1, "block": 6, "bug": 2, "build": [], "changelog": 0, "choos": [13, 15], "classif": [7, 12], 
"code": [1, 2], "codebas": 2, "commit": 2, "commun": 12, "compos": 8, "compress": 14, "conda": [], "conduct": 1, "connect": 2, "content": [], "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 12, "correct": 1, "coven": 1, "custom": 5, "data": 13, "dataload": 5, "dataset": [4, 5, 13], "detect": [4, 7, 12, 13, 15], "develop": 2, "do": 15, "docstr": 2, "doctr": [2, 4, 5, 6, 7, 8, 9, 10], "document": [2, 4, 6], "end": 15, "enforc": 1, "evalu": 9, "export": [], "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "format": 2, "from": 12, "gener": [5, 13], "get": [], "git": 3, "guidelin": 1, "half": 14, "hub": 12, "huggingfac": 12, "i": 15, "implement": [], "import": 2, "infer": 14, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 11, "let": 2, "line": 6, "lint": 2, "linux": [], "lite": 14, "load": [12, 13], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 12, 14, 15], "modifi": 2, "modul": [], "name": 12, "note": [], "notebook": 10, "object": 13, "ocr": 15, "onli": [], "onnx": [], "optim": [], "option": [], "order": 2, "orient": [], "our": 1, "output": 15, "own": 13, "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "post": 14, "pre": [], "precis": 14, "predictor": 15, "prepar": 14, "prerequisit": 3, "pretrain": 12, "privat": [], "process": [], "public": [], "push": 12, "python": 3, "qualiti": 2, "quantiz": 14, "question": 2, "read": 6, "readi": 13, "recognit": [4, 7, 12, 13, 15], "refer": [], "report": 2, "request": 2, "respons": 1, "return": [], "right": 15, "savedmodel": 14, "scope": 1, "share": 12, "should": 15, "stage": 15, "standard": 1, "start": [], "structur": [2, 6], "style": [], "support": [4, 5, 8], "synthet": [5, 13], "task": 9, "temporari": 1, "tensorflow": 14, "test": 2, "text": [4, 15], "train": 14, "transform": 8, "two": 15, "type": 2, "unit": 2, "us": [13, 14], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 15, "word": 6, "your": [12, 13, 14], "zoo": [4, 7, 15]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"Artefact": [[2, "artefact"]], "Available Datasets": [[1, "available-datasets"]], "Block": [[2, "block"]], "Build & train your predictor": [[3, "build-train-your-predictor"]], "Changelog": [[0, null]], "Composing transformations": [[6, "composing-transformations"]], "Data Loading": [[1, "data-loading"]], "Detection models": [[5, "detection-models"]], "Detection predictors": [[5, "detection-predictors"]], "DocTR Vocabs": [[1, "id1"]], "DocTR: Document Text Recognition": [[3, null]], "Document": [[2, "document"]], "Document structure": [[2, "document-structure"]], "End-to-End OCR": [[5, "end-to-end-ocr"]], "File reading": [[2, "file-reading"]], "Getting Started": [[3, "getting-started"]], "Installation": [[4, null]], "Line": [[2, "line"]], "Main Features": [[3, "main-features"]], "Model compression": [[5, "model-compression"]], "Model export": [[5, "model-export"]], "Model zoo": [[3, "model-zoo"]], "Notes": [[3, null]], "Package Reference": [[3, null]], "Page": [[2, "page"]], "Pre-processing for detection": [[5, "pre-processing-for-detection"]], "Pre-processing for recognition": [[5, "pre-processing-for-recognition"]], "Prerequisites": [[4, "prerequisites"]], "Recognition models": [[5, "recognition-models"]], "Recognition predictors": [[5, "recognition-predictors"]], "Supported Vocabs": [[1, "supported-vocabs"]], "Supported datasets": [[3, "supported-datasets"]], "Supported transformations": [[6, "supported-transformations"]], "Task evaluation": [[7, "task-evaluation"]], "Text 
Detection": [[5, "text-detection"]], "Text Recognition": [[5, "text-recognition"]], "Text detection models": [[3, "text-detection-models"]], "Text recognition model zoo": [[5, "id2"]], "Text recognition models": [[3, "text-recognition-models"]], "Two-stage approaches": [[5, "two-stage-approaches"]], "Using SavedModel": [[5, "using-savedmodel"]], "Via Git": [[4, "via-git"]], "Via Python Package": [[4, "via-python-package"]], "Visualization": [[7, "visualization"]], "Word": [[2, "word"]], "doctr.datasets": [[1, null]], "doctr.documents": [[2, null]], "doctr.models": [[5, null]], "doctr.transforms": [[6, null]], "doctr.utils": [[7, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]]}, "docnames": ["changelog", "datasets", "documents", "index", "installing", "models", "transforms", "utils"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "datasets.rst", "documents.rst", "index.rst", "installing.rst", "models.rst", "transforms.rst", "utils.rst"], "indexentries": {"artefact (class in doctr.documents)": [[2, "doctr.documents.Artefact", false]], "as_images() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.as_images", false]], "block (class in doctr.documents)": [[2, "doctr.documents.Block", false]], "colorinversion (class in doctr.transforms)": [[6, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[6, "doctr.transforms.Compose", false]], "convert_to_fp16() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_fp16", false]], "convert_to_tflite() (in module doctr.models.export)": [[5, "doctr.models.export.convert_to_tflite", false]], "cord (class in doctr.datasets)": [[1, "doctr.datasets.CORD", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.crnn_vgg16_bn", false]], "dataloader (class in doctr.datasets.loader)": [[1, "doctr.datasets.loader.DataLoader", false]], "db_resnet50() (in module doctr.models.detection)": [[5, "doctr.models.detection.db_resnet50", false]], "detection_predictor() (in module doctr.models.detection)": [[5, "doctr.models.detection.detection_predictor", false]], "document (class in doctr.documents)": [[2, "doctr.documents.Document", false]], "documentfile (class in doctr.documents)": [[2, "doctr.documents.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[1, "doctr.datasets.encode_sequences", false]], "from_images() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_images", false]], "from_pdf() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_pdf", false]], "from_url() (doctr.documents.documentfile class method)": [[2, "doctr.documents.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[1, "doctr.datasets.FUNSD", false]], "get_artefacts() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_artefacts", false]], "get_words() (doctr.documents.pdf method)": [[2, "doctr.documents.PDF.get_words", false]], "lambdatransformation (class in doctr.transforms)": [[6, 
"doctr.transforms.LambdaTransformation", false]], "line (class in doctr.documents)": [[2, "doctr.documents.Line", false]], "linknet16() (in module doctr.models.detection)": [[5, "doctr.models.detection.linknet16", false]], "localizationconfusion (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.LocalizationConfusion", false]], "master() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.master", false]], "normalize (class in doctr.transforms)": [[6, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models.zoo)": [[5, "doctr.models.zoo.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[1, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[6, "doctr.transforms.OneOf", false]], "page (class in doctr.documents)": [[2, "doctr.documents.Page", false]], "pdf (class in doctr.documents)": [[2, "doctr.documents.PDF", false]], "quantize_model() (in module doctr.models.export)": [[5, "doctr.models.export.quantize_model", false]], "randomapply (class in doctr.transforms)": [[6, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[6, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[6, "doctr.transforms.RandomContrast", false]], "randomgamma (class in doctr.transforms)": [[6, "doctr.transforms.RandomGamma", false]], "randomhue (class in doctr.transforms)": [[6, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[6, "doctr.transforms.RandomJpegQuality", false]], "randomsaturation (class in doctr.transforms)": [[6, "doctr.transforms.RandomSaturation", false]], "read_html() (in module doctr.documents)": [[2, "doctr.documents.read_html", false]], "read_img() (in module doctr.documents)": [[2, "doctr.documents.read_img", false]], "read_pdf() (in module doctr.documents)": [[2, "doctr.documents.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.recognition_predictor", false]], "resize (class in doctr.transforms)": [[6, "doctr.transforms.Resize", false]], "sar_resnet31() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_resnet31", false]], "sar_vgg16_bn() (in module doctr.models.recognition)": [[5, "doctr.models.recognition.sar_vgg16_bn", false]], "show() (doctr.documents.document method)": [[2, "doctr.documents.Document.show", false]], "show() (doctr.documents.page method)": [[2, "doctr.documents.Page.show", false]], "sroie (class in doctr.datasets)": [[1, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[7, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[7, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[7, "doctr.utils.metrics.TextMatch.summary", false]], "textmatch (class in doctr.utils.metrics)": [[7, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[6, "doctr.transforms.ToGray", false]], "visiondataset (class in doctr.datasets.datasets)": [[1, "doctr.datasets.datasets.VisionDataset", false]], "visualize_page() (in module doctr.utils.visualization)": [[7, "doctr.utils.visualization.visualize_page", false]], "word (class in doctr.documents)": [[2, "doctr.documents.Word", false]]}, "objects": {"doctr.datasets": [[1, 0, 1, 
"", "CORD"], [1, 0, 1, "", "FUNSD"], [1, 0, 1, "", "OCRDataset"], [1, 0, 1, "", "SROIE"], [1, 1, 1, "", "encode_sequences"]], "doctr.datasets.datasets": [[1, 0, 1, "", "VisionDataset"]], "doctr.datasets.loader": [[1, 0, 1, "", "DataLoader"]], "doctr.documents": [[2, 0, 1, "", "Artefact"], [2, 0, 1, "", "Block"], [2, 0, 1, "", "Document"], [2, 0, 1, "", "DocumentFile"], [2, 0, 1, "", "Line"], [2, 0, 1, "", "PDF"], [2, 0, 1, "", "Page"], [2, 0, 1, "", "Word"], [2, 1, 1, "", "read_html"], [2, 1, 1, "", "read_img"], [2, 1, 1, "", "read_pdf"]], "doctr.documents.Document": [[2, 2, 1, "", "show"]], "doctr.documents.DocumentFile": [[2, 2, 1, "", "from_images"], [2, 2, 1, "", "from_pdf"], [2, 2, 1, "", "from_url"]], "doctr.documents.PDF": [[2, 2, 1, "", "as_images"], [2, 2, 1, "", "get_artefacts"], [2, 2, 1, "", "get_words"]], "doctr.documents.Page": [[2, 2, 1, "", "show"]], "doctr.models.detection": [[5, 1, 1, "", "db_resnet50"], [5, 1, 1, "", "detection_predictor"], [5, 1, 1, "", "linknet16"]], "doctr.models.export": [[5, 1, 1, "", "convert_to_fp16"], [5, 1, 1, "", "convert_to_tflite"], [5, 1, 1, "", "quantize_model"]], "doctr.models.recognition": [[5, 1, 1, "", "crnn_vgg16_bn"], [5, 1, 1, "", "master"], [5, 1, 1, "", "recognition_predictor"], [5, 1, 1, "", "sar_resnet31"], [5, 1, 1, "", "sar_vgg16_bn"]], "doctr.models.zoo": [[5, 1, 1, "", "ocr_predictor"]], "doctr.transforms": [[6, 0, 1, "", "ColorInversion"], [6, 0, 1, "", "Compose"], [6, 0, 1, "", "LambdaTransformation"], [6, 0, 1, "", "Normalize"], [6, 0, 1, "", "OneOf"], [6, 0, 1, "", "RandomApply"], [6, 0, 1, "", "RandomBrightness"], [6, 0, 1, "", "RandomContrast"], [6, 0, 1, "", "RandomGamma"], [6, 0, 1, "", "RandomHue"], [6, 0, 1, "", "RandomJpegQuality"], [6, 0, 1, "", "RandomSaturation"], [6, 0, 1, "", "Resize"], [6, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[7, 0, 1, "", "LocalizationConfusion"], [7, 0, 1, "", "OCRMetric"], [7, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.LocalizationConfusion": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.OCRMetric": [[7, 2, 1, "", "summary"]], "doctr.utils.metrics.TextMatch": [[7, 2, 1, "", "summary"]], "doctr.utils.visualization": [[7, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [2, 7], "0": [1, 3, 5, 6, 7], "00": 5, "01": 5, "0123456789": 1, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "02": 5, "02562": 5, "03": 3, "035": [], "0361328125": [], "04": [], "05": 3, "06": [], "06640625": [], "07": [], "08": 5, "09": [], "0966796875": [], "1": [1, 3, 5, 6, 7], "10": [1, 5, 7], "100": [5, 6, 7], "1000": 5, "101": [], "1024": [5, 7], "104": [], "106": [], "108": [], "1095": [], "11": 3, "110": 7, "1107": [], "114": [], "115": [], "1156": [], "116": [], "118": [], "11800h": [], "11th": [], "12": 5, "120": [], "123": [], "126": [], "1268": [], "128": 5, "13": 5, "130": [], "13068": [], "131": [], "1337891": [], "1357421875": [], "1396484375": [], "14": 5, "1420": [], "14470v1": [], "149": [], "15": 5, "150": 7, "154": 1, "1552": [], "16": 5, "160": 5, "1630859375": [], "1684": [], "16x16": [], "17": [], "1778": [], "1782": [], "18": 3, "185546875": [], "19": 5, "1900": [], "1910": 5, "19342": [], "19370": [], "195": [], "19598": [], "199": 5, "1999": [], "1m": 5, "2": [3, 5, 6], "20": 5, "200": 
7, "2000": [], "2003": [], "2012": [], "2013": [], "2015": [], "2019": 3, "2021": 3, "207901": [], "21": 5, "2103": [], "2186": [], "21888": [], "22": [], "224": [5, 6], "225": 6, "22672": [], "229": 6, "23": [], "233": [], "236": [], "24": [], "246": [], "249": [], "25": 5, "2504": [], "255": [5, 6, 7], "256": 5, "257": [], "26": [], "26032": [], "264": [], "27": 5, "2700": [], "2710": [], "2749": [], "28": 3, "287": [], "29": 5, "296": [], "299": [], "2d": [], "3": [2, 3, 4, 5, 6, 7], "30": 5, "300": [], "3000": [], "301": [], "30595": 5, "30ghz": [], "31": 5, "32": [1, 5, 6], "3232421875": [], "33": [], "33402": [], "33608": [], "34": [], "340": [], "3456": [], "3515625": [], "36": [], "360": [], "37": [], "38": [], "39": 5, "4": [], "40": [], "406": 6, "41": [], "42": [], "43": 5, "44": [], "45": [], "456": 6, "46": 5, "47": 5, "472": [], "48": 5, "485": 6, "49": 5, "49377": [], "5": [1, 6, 7], "50": 5, "51": [], "51171875": [], "512": [], "52": [1, 5], "529": [], "53": 5, "533": [], "54": [], "540": [], "5478515625": [], "55": [], "56": [], "57": [], "58": [], "580": [], "5810546875": [], "583": [], "59": 5, "595": [], "597": [], "5k": [], "5m": 5, "6": [4, 5, 6], "60": 6, "600": [5, 7], "61": 5, "611": [], "62": 5, "625": [], "626": [], "629": [], "63": 5, "630": [], "64": [5, 6], "640": [], "641": [], "647": [], "65": 5, "66": 5, "660": [], "664": [], "666": [], "67": 5, "672": [], "68": 5, "689": [], "69": 5, "693": [], "694": [], "695": [], "6m": [], "7": 5, "70": [5, 7], "700": [], "701": [], "702": [], "707470": [], "71": [], "7100000": [], "713": [], "7141797": [], "7149": [], "72": [], "72dpi": [], "73": [], "73257": [], "733": [], "74": 5, "745": [], "75": 5, "753": [], "7581382": [], "76": [], "77": 5, "772": [], "772875": [], "78": 5, "780": [], "781": [], "783": [], "785": [], "789": [], "79": 5, "793533": [], "796": [], "798": [], "7m": [], "8": [5, 6], "80": [], "800": [5, 7], "81": 5, "817": [], "82": 5, "8275l": 5, "83": 5, "830": [], "84": [], "849": [], "85": 5, "8564453125": [], "857": [], "85875": [], "86": 5, "860": [], "8603515625": [], "862": [], "863": [], "87": 5, "8707": [], "875": [], "88": [], "89": 5, "8m": 5, "9": [], "90": 5, "90k": [], "90kdict32px": [], "91": 5, "913": [], "914085328578949": [], "917": [], "92": 5, "921": [], "93": [], "94": [], "95": 7, "9578408598899841": [], "96": 1, "97": [], "98": [], "99": [], "9949972033500671": [], "A": [1, 2, 3, 5], "And": 5, "As": [], "Be": [], "Being": [], "By": [], "For": [4, 5], "If": [2, 4, 5], "In": [1, 5], "It": 6, "Its": 5, "No": [], "Of": 1, "Or": [], "The": [1, 2, 5, 7], "Then": 5, "To": [], "_": [1, 5], "__call__": [], "_build": [], "_i": 7, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 1, "abdef": [], "abl": [], "about": 5, "abov": 5, "abstract": 1, "abstractdataset": [], "abus": [], "accent": [], "accept": [], "access": [1, 2, 3], "account": [], "accur": [], "accuraci": 7, "achiev": [], "act": [], "action": [], "activ": [], "ad": 6, "adapt": [], "add": [6, 7], "add_hook": [], "add_label": 7, "addit": [], "addition": 5, "address": 2, "adjust": 6, "advanc": [], "advantag": [], "advis": [], "aesthet": [], "affect": [], "after": [], "ag": [], "again": [], "aggreg": [1, 7], "aggress": [], "align": 2, "all": [1, 2, 3, 5, 6, 7], "allow": [], "along": 5, "alreadi": [], "also": [], "alwai": [], "an": [1, 2, 3, 5, 7], "analysi": [2, 5], "ancient_greek": [], "angl": 2, "ani": [1, 2, 3, 5, 6, 7], "annot": 2, "anot": [], "anoth": [1, 4, 5], "answer": [], "anyascii": [], "anyon": 
3, "anyth": [], "api": [], "apolog": [], "apologi": [], "app": [], "appear": [], "appli": [1, 6], "applic": 5, "appoint": [], "appreci": [], "appropri": [], "ar": [1, 2, 4, 5, 6, 7], "arab": [], "arabic_diacrit": [], "arabic_lett": [], "arabic_punctu": [], "arbitrarili": [], "arch": 5, "architectur": [3, 5], "archiv": [], "area": [], "argument": [1, 2], "around": 5, "arrai": [2, 7], "art": 3, "artefact": 7, "artefact_typ": 2, "artifici": [], "arxiv": 5, "as_imag": 2, "asarrai": 7, "ascii_lett": 1, "aspect": [3, 6], "assess": 7, "assign": 7, "associ": 2, "assum": [], "assume_straight_pag": [], "astyp": [5, 7], "attack": [], "attend": [3, 5], "attent": [], "autoclass": [], "autom": 3, "automat": [], "autoregress": [], "avail": [3, 5, 6], "averag": [5, 6], "avoid": [], "aw": [3, 5], "awar": [], "azur": [], "b": 7, "b_j": 7, "back": [], "backbon": 5, "backend": 5, "background": [], "bangla": [], "bar": [], "bar_cod": [], "base": 5, "baselin": 5, "batch": [1, 5, 6], "batch_siz": 1, "bblanchon": [], "bbox": [], "becaus": [], "been": [5, 7], "befor": 1, "begin": 7, "behavior": [], "being": [5, 7], "belong": [], "benchmark": [], "best": [], "beta": 3, "better": [], "between": [6, 7], "bgr": 2, "bilinear": [5, 6], "bin_thresh": [], "binar": [3, 5], "binari": 2, "bit": [], "block": [5, 7], "block_1_1": [], "blur": [], "bmvc": [], "bn": [], "bodi": [], "bool": [1, 2, 5, 6, 7], "boolean": [], "both": [3, 5, 6], "bottom": [], "bound": [1, 2, 6, 7], "box": [1, 2, 7], "box_thresh": [], "brew": 4, "bright": 6, "browser": [], "build": [], "built": [], "byte": [2, 5], "c": [], "c5": 5, "c_j": [], "cach": [], "cache_sampl": [], "cairo": 4, "call": [], "callabl": [1, 6], "can": [1, 4, 5], "capabl": 5, "case": [1, 7], "cf": 5, "cfg": [], "challeng": [], "challenge2_test_task12_imag": [], "challenge2_test_task1_gt": [], "challenge2_training_task12_imag": [], "challenge2_training_task1_gt": [], "chang": [], "changelog": 3, "channel": [2, 5, 6], "channel_prior": [], "channelshuffl": [], "charact": [1, 2, 3, 5, 7], "charactergener": [], "characterist": [], "charg": 5, "charset": [], "chart": 2, "check": [], "checkpoint": [], "chip": [], "ci": [], "clarifi": [], "clariti": [], "class": [1, 2, 6, 7], "class_nam": [], "classif": [], "classmethod": 2, "clear": [], "clone": 4, "close": [], "co": [], "code": [2, 3], "codecov": [], "colab": [], "collate_fn": [], "collect": 2, "color": 6, "colorinvers": 6, "column": 2, "com": [2, 4], "combin": 5, "command": [], "comment": [], "commit": [], "common": [6, 7], "commun": [], "compar": 3, "comparison": 7, "competit": 1, "compil": [], "complaint": [], "complementari": 7, "complet": [], "compon": 5, "compos": [1, 3, 5], "comprehens": [], "comput": [5, 7], "conf_threshold": [], "confid": 2, "config": [], "configur": [], "confus": 7, "consecut": [5, 6], "consequ": [], "consid": [1, 2, 7], "consist": [], "consolid": [1, 3], "constant": 6, "construct": [], "contact": [], "contain": [], "content": [1, 2], "context": [], "contib": [], "continu": [], "contrast": 6, "contrast_factor": 6, "contrib": [], "contribut": [], "contributor": [], "conv_sequ": 5, "convers": 2, "convert": [2, 5, 6], "convert_page_to_numpi": 2, "convert_to_fp16": 5, "convert_to_tflit": 5, "convolut": 3, "coordin": 2, "cord": [1, 3, 5], "core": 7, "corner": [], "correct": 6, "correspond": [4, 5], "could": [], "counterpart": 7, "cover": [], "coverag": [], "cpu": [3, 5], "creat": [], "crnn": [3, 5], "crnn_mobilenet_v3_larg": [], "crnn_mobilenet_v3_smal": [], "crnn_resnet31": 5, "crnn_vgg16_bn": 5, "crop": 5, 
"crop_orient": [], "crop_orientation_predictor": [], "crop_param": [], "cuda": [], "currenc": 1, "current": [], "custom": [], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": [], "czczup": [], "czech": [], "d": [], "daili": 3, "danish": [], "data": [2, 3, 5, 6, 7], "dataload": 1, "dataset": 5, "dataset_info": [], "date": [], "db": [], "db_crnn_resnet": 5, "db_crnn_vgg": 5, "db_mobilenet_v3_larg": [], "db_resnet34": [], "db_resnet50": 5, "db_sar_resnet": 5, "db_sar_vgg": 5, "dbnet": [3, 5], "deal": [], "decis": [], "decod": 2, "decode_img_as_tensor": [], "dedic": [], "deem": [], "deep": 5, "def": [], "default": [2, 5], "defer": 1, "defin": 7, "deform": 5, "degre": [], "degress": 2, "delet": [], "delimit": [], "delta": 6, "demo": [], "demonstr": [], "depend": [3, 4], "deploi": [], "deploy": [], "derogatori": [], "describ": 5, "descript": [], "design": 6, "desir": [], "det_arch": 5, "det_b": [], "det_model": [], "det_param": [], "det_predictor": [], "detail": [], "detect": [], "detect_languag": [], "detect_orient": [], "detection_predictor": 5, "detection_task": [], "detectiondataset": [], "detectionmetr": [], "detectionpredictor": 5, "detector": [], "deterior": [], "determin": [], "dev": [], "develop": [], "developp": 4, "deviat": 6, "devic": [], "dict": [2, 7], "dictionari": [2, 7], "differ": [], "differenti": [3, 5], "digit": 1, "dimens": [2, 5, 7], "dimension": 6, "direct": [], "directli": 5, "directori": [], "disabl": [], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 5, "discuss": [], "disk": [], "disparag": [], "displai": [2, 7], "display_artefact": 7, "distanc": [], "distribut": 6, "div": [], "divers": [], "divid": [], "do": 4, "doc": [2, 5], "docartefact": [], "docstr": [], "doctr": 4, "doctr_cache_dir": [], "doctr_multiprocessing_dis": [], "document": [1, 5, 7], "documentbuild": [], "documentfil": 2, "doesn": [], "don": [], "done": 6, "download": 1, "downsiz": [], "draw": 6, "drop": 1, "drop_last": 1, "dtype": 5, "dual": [], "dummi": [], "dummy_img": [], "dummy_input": [], "dure": [], "dutch": [], "dynam": [], "dynamic_seq_length": [], "e": [2, 4], "each": [1, 2, 3, 5, 6, 7], "eas": [], "easi": [3, 7], "easier": 5, "easili": [2, 5, 7], "econom": [], "edit": [], "educ": [], "effect": [], "effici": [1, 5], "either": 5, "element": [1, 2, 5], "els": [], "email": [], "empathi": [], "en": [], "enabl": 2, "enclos": 2, "encod": [1, 2, 5], "encode_sequ": 1, "encount": [], "encrypt": [], "end": [1, 3, 7], "english": [], "enough": 5, "ensur": [], "entir": 2, "entri": [], "environ": [], "eo": 1, "equiv": [], "error": [], "estim": [], "etc": 2, "ethnic": [], "evalu": [1, 3, 5], "event": [], "everyon": [], "everyth": [], "exact": 7, "exactmatch": [], "exampl": [1, 2, 5, 6, 7], "exchang": [], "exclud": 5, "execut": [], "exist": [], "expand": [], "expect": [2, 5, 6], "experi": 5, "explan": 5, "explicit": [], "exploit": 5, "export": [2, 3, 7], "export_as_straight_box": [], "export_as_xml": [], "export_model_to_onnx": [], "express": 6, "extens": 2, "extern": [], "extra": 4, "extract": [1, 3], "extract_arch": 1, "extractor": 5, "f_": 7, "f_a": 7, "factor": 6, "fair": [], "fairli": [], "fals": [1, 5, 6, 7], "faq": [], "fascan": [], "fast": 1, "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [], "fasterrcnn_mobilenet_v3_large_fpn": [], "favorit": [], "featur": [5, 7], "feed": 5, "feedback": [], "feel": [], "felix92": [], "few": 4, "figsiz": 7, "figur": 7, "file": [1, 3], "file_hash": 1, "file_nam": 1, "final": [], "find": 
4, "fine": 3, "finnish": [], "first": [], "firsthand": [], "fit": [], "fitz": 2, "flag": [], "flexibl": 7, "flip": [], "float": [2, 6, 7], "float32": 5, "fn": 6, "focu": [], "focus": [], "folder": [1, 5], "follow": [1, 4, 5, 6, 7], "font": [], "font_famili": [], "foral": 7, "forc": [], "forg": [], "form": [1, 3], "format": [2, 5], "forpost": [1, 3], "forum": [], "fp": 5, "fp16": 5, "frac": 7, "frame": 5, "framework": 1, "free": [], "french": [1, 5], "friendli": 3, "from": [1, 2, 3, 5, 6, 7], "from_hub": [], "from_imag": 2, "from_pdf": 2, "from_url": 2, "full": [1, 5, 7], "fulli": [], "function": [5, 6, 7], "funsd": [1, 3, 5], "further": [], "futur": [], "g": 2, "g_": 7, "g_x": 7, "gamma": 6, "gaussian": 6, "gaussianblur": [], "gaussiannois": [], "gdk": 4, "gen": [], "gender": [], "gener": [], "generic_cyrillic_lett": [], "geometri": 2, "geq": 7, "german": [], "get": 2, "get_artefact": 2, "get_word": 2, "gettextword": 2, "git": 3, "github": 4, "give": [], "given": [1, 2, 5, 7], "global": [], "go": [], "good": [], "googl": [], "googlevis": 3, "gpu": 3, "gracefulli": [], "graph": 2, "grayscal": 6, "ground": 7, "groung": [], "group": [], "gt": [], "gt_box": [], "gt_label": [], "gtk": 4, "guid": [], "guidanc": [], "gvision": 5, "h": 2, "h_": 7, "ha": [1, 7], "half": 5, "handl": 1, "handwrit": [], "handwritten": [], "harass": [], "hardwar": [], "harm": [], "hat": 7, "have": [1, 5, 7], "head": [], "healthi": [], "hebrew": [], "height": 2, "hello": 7, "help": [], "here": [1, 4, 6], "hf": [], "hf_hub_download": [], "high": 2, "higher": 4, "hindi": [], "hindi_digit": [], "hocr": [], "hook": [], "horizont": 2, "hous": [], "how": [], "howev": [], "hsv": 6, "html": [], "http": [2, 4, 5], "hub": [], "hue": 6, "huggingfac": [], "hw": [], "i": [1, 2, 5, 6, 7], "i7": [], "ic03": [], "ic13": [], "icdar": 3, "icdar2019": 1, "id": 5, "ident": [], "identifi": [3, 5], "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [], "iiit5k": [], "iiithw": [], "imag": [1, 2, 5, 6, 7], "imagenet": [], "imageri": [], "images_90k_norm": [], "img": [1, 6], "img_cont": [], "img_fold": 1, "img_path": [], "img_transform": [], "imgur5k": [], "imgur5k_annot": [], "imlist": [], "impact": [], "implement": [1, 2, 5, 6, 7], "import": [1, 2, 5, 6, 7], "improv": [], "inappropri": [], "incid": [], "includ": [4, 5], "inclus": [], "increas": 6, "independ": [], "index": 2, "indic": 7, "individu": [], "infer": [3, 6], "inform": [1, 3, 5], "inherit": [1, 5], "input": [2, 5, 6], "input_crop": [], "input_pag": [5, 7], "input_shap": 5, "input_t": 5, "input_tensor": 5, "inspir": 6, "instal": 3, "instanc": 5, "instanti": 5, "instead": [1, 2], "insult": [], "int": [1, 2, 5, 6, 7], "int64": [], "integ": 7, "integr": 3, "intel": [], "interact": [2, 7], "interfac": [], "interoper": [], "interpol": [5, 6], "interpret": [1, 2], "intersect": 7, "invert": 6, "investig": [], "invis": [], "invoic": 5, "involv": 5, "io": [], "iou": 7, "iou_thresh": 7, "iou_threshold": [], "irregular": 5, "isn": 1, "issu": [], "italian": [], "iter": 1, "its": [1, 2, 5, 7], "itself": [], "j": 7, "job": [], "join": [], "jpeg": 6, "jpegqual": 6, "jpg": [1, 2], "json": [], "json_output": [], "jump": [], "just": 5, "kei": [], "kera": 5, "kernel": [], "kernel_s": 5, "kernel_shap": [], "keywoard": [], "keyword": [1, 2], "kie": [], "kie_predictor": [], "kiepredictor": [], "kind": [], "know": [], "kwarg": [1, 2, 5, 7], "l": 7, "l_j": 7, "label": [1, 7], "label_fil": 1, "label_fold": [], "label_path": [], "labels_path": [], "ladder": [], "lambda": 6, "lambdatransform": 6, "lang": 
[], "languag": [2, 3], "larg": [], "largest": 7, "last": [1, 4, 5], "latenc": [], "later": [], "latest": 4, "latin": 1, "layer": [], "layout": [], "lead": [], "leader": [], "learn": 5, "least": 4, "left": 7, "legacy_french": [], "length": 1, "less": [], "let": 5, "letter": [], "level": [5, 7], "levenshtein": [], "leverag": [], "lf": [], "libffi": 4, "librari": 4, "light": 3, "lightweight": [], "like": [], "limits_": 7, "line": [3, 7], "line_1_1": [], "link": [], "linknet": [3, 5], "linknet16": 5, "linknet_resnet18": [], "linknet_resnet34": [], "linknet_resnet50": [], "linux": 4, "list": [1, 2, 6], "ll": 7, "load": [3, 5], "load_state_dict": [], "load_weight": [], "loader": 1, "loc_pr": [], "local": [1, 3, 5, 7], "localis": [], "localizationconfus": 7, "locat": [], "login": [], "login_to_hub": [], "logo": 2, "love": [], "lower": [6, 7], "m": [5, 7], "m1": [], "macbook": [], "machin": [], "maco": 4, "made": 3, "magc_resnet31": [], "mai": [], "mail": [], "main": [], "maintain": 3, "mainten": [], "make": [5, 7], "mani": [], "manipul": [], "map": 1, "map_loc": [], "mask_shap": 7, "master": [3, 5], "match": [3, 7], "mathcal": 7, "matplotlib": 7, "max": 7, "max_angl": [], "max_area": [], "max_char": [], "max_delta": 6, "max_dist": [], "max_gain": 6, "max_gamma": 6, "max_qual": 6, "max_ratio": [], "maximum": 1, "maxval": [5, 6], "mbox": 7, "mean": [6, 7], "meaniou": 7, "meant": 2, "measur": 5, "media": [], "median": [], "meet": [], "member": [], "memori": [], "mention": [], "merg": [], "messag": [], "meta": [], "metadata": [], "metal": [], "method": 6, "metric": [5, 7], "middl": [], "might": 5, "min": [], "min_area": [], "min_char": [], "min_gain": 6, "min_gamma": 6, "min_qual": 6, "min_ratio": [], "min_val": 6, "minde": 4, "minim": [], "minimalist": [], "minimum": 7, "minval": 6, "miss": [], "mistak": [], "mix": 3, "mixed_float16": [], "mixed_precis": [], "mjsynth": [], "mnt": [], "mobilenet": [], "mobilenet_v3_larg": [], "mobilenet_v3_large_r": [], "mobilenet_v3_smal": [], "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": [], "mobilenetv3": [], "modal": [], "mode": 4, "model": [1, 7], "model_nam": [], "model_path": [], "moder": [], "modif": [], "modifi": [], "modul": [2, 5, 6, 7], "more": [], "most": 5, "mozilla": [], "multi": 3, "multilingu": [], "multipl": [1, 2, 6], "multipli": 6, "multiprocess": [], "my": [], "my_awesome_model": [], "my_hook": [], "n": [1, 5, 7], "na": [], "name": [1, 5], "nation": [], "natur": 3, "ndarrai": [1, 2, 7], "necessari": [], "need": [4, 7], "neg": 6, "nest": [], "nestedobject": [], "network": [3, 5], "neural": [3, 5], "new": [], "newer": [], "next": 1, "nois": [], "noisi": [1, 3], "non": [2, 3, 6, 7], "none": [1, 2, 7], "normal": [5, 6], "norwegian": [], "note": 0, "now": 3, "np": [5, 7], "num_output_channel": [], "num_sampl": [], "number": [1, 6, 7], "numpi": [2, 5, 7], "o": 4, "obb": [], "obj_detect": [], "object": 1, "objectness_scor": [], "oblig": [], "obtain": [], "occupi": [], "ocr": [1, 3, 7], "ocr_carea": [], "ocr_db_crnn": 7, "ocr_lin": [], "ocr_pag": [], "ocr_par": [], "ocr_predictor": 5, "ocrdataset": 1, "ocrmetr": 7, "ocrpredictor": 5, "ocrx_word": [], "offens": [], "offici": [], "offlin": [], "offset": 6, "onc": 5, "one": [1, 5, 6], "oneof": 6, "ones": 1, "onli": [6, 7], "onlin": [], "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": [], "opacity_rang": [], "open": [], "opinion": [], "optic": [3, 5], "optim": 3, "option": 1, "order": [1, 2, 5], "org": 5, "organ": 2, "orient": 2, 
"orientationpredictor": [], "other": [], "otherwis": 7, "our": 5, "out": [5, 6, 7], "outpout": [], "output": [2, 5, 6], "output_s": [2, 6], "outsid": [], "over": [4, 7], "overal": [], "overlai": 2, "overview": [], "overwrit": 1, "overwritten": [], "own": 3, "p": 6, "packag": 7, "pad": [1, 5, 6], "page": [4, 5, 7], "page1": 2, "page2": 2, "page_1": [], "page_idx": 2, "page_orientation_predictor": [], "page_param": [], "pair": 7, "pango": 4, "paper": 5, "par_1_1": [], "paragraph": [], "paragraph_break": [], "param": [5, 6], "paramet": [1, 2, 3, 5, 6, 7], "pars": [1, 3], "parseq": [], "part": 6, "parti": [], "partial": [], "particip": [], "pass": [1, 5], "password": [], "patch": [], "path": [1, 2, 5], "path_to_checkpoint": [], "path_to_custom_model": [], "path_to_pt": [], "pattern": [], "pdf": [2, 5], "pdfpage": [], "peopl": [], "per": [5, 6], "perform": [2, 3, 5, 6, 7], "period": [], "permiss": [], "permut": [], "persian_lett": [], "person": [], "phase": [], "photo": [], "physic": 2, "pick": 6, "pictur": 2, "pip": 4, "pipelin": [], "pixbuf": 4, "pixel": [2, 6], "platinum": 5, "pleas": [], "plot": 7, "plt": 7, "plug": [], "plugin": [], "png": 2, "point": [], "polici": [], "polish": [], "polit": [], "polygon": 1, "pool": [], "portugues": [], "posit": 7, "possibl": 7, "post": 5, "postprocessor": [], "potenti": 5, "power": 3, "ppageno": [], "pre": [], "precis": [5, 7], "pred": [], "pred_box": [], "pred_label": [], "predefin": 1, "predict": [2, 7], "predictor": [], "prefer": 1, "preinstal": [], "preprocessor": 5, "prerequisit": 3, "present": [], "preserv": 6, "preserve_aspect_ratio": 6, "pretrain": [3, 5, 7], "pretrained_backbon": [], "print": [], "prior": [], "privaci": [], "privat": 5, "probabl": 6, "problem": [], "procedur": 6, "process": [2, 3], "processor": 5, "produc": 5, "product": [], "profession": [], "project": [], "promptli": [], "proper": [], "properli": 1, "properti": 5, "provid": [3, 5], "public": 3, "publicli": [], "publish": [], "pull": [], "punctuat": 1, "pure": [], "purpos": [], "push_to_hf_hub": [], "py": [], "pypdfium2": [], "pyplot": 7, "python": 3, "python3": [], "pytorch": [3, 4], "q": [], "qr": 2, "qr_code": [], "qualiti": 6, "quantiz": 5, "quantize_model": 5, "question": [], "quickli": 3, "quicktour": [], "r": [], "race": [], "ramdisk": [], "rand": [5, 7], "random": [5, 6, 7], "randomappli": 6, "randombright": 6, "randomcontrast": 6, "randomcrop": [], "randomgamma": 6, "randomhorizontalflip": [], "randomhu": 6, "randomjpegqu": 6, "randomli": 6, "randomres": [], "randomrot": [], "randomsatur": 6, "randomshadow": [], "rang": 6, "rassi": [], "ratio": 6, "raw": [2, 7], "re": [], "read": [3, 5], "read_html": 2, "read_img": 2, "read_img_as_numpi": [], "read_img_as_tensor": [], "read_pdf": 2, "readi": [], "real": [5, 6], "reason": [], "rebuild": [], "rebuilt": [], "recal": [5, 7], "receipt": [1, 3, 5], "reco_arch": 5, "reco_b": [], "reco_model": [], "reco_param": [], "reco_predictor": [], "recogn": [], "recognit": 7, "recognition_predictor": 5, "recognition_task": [], "recognitiondataset": [], "recognitionpredictor": 5, "rectangular": [], "recurr": 3, "reduc": 6, "refer": 4, "regardless": [], "region": [], "regroup": 7, "regular": [], "reject": [], "rel": 2, "relat": [], "releas": [0, 4], "relev": [], "religion": [], "relu": 5, "remov": [], "render": [], "repo": [], "repo_id": [], "report": [], "repositori": [], "repres": [2, 5], "represent": 5, "request": [], "requir": [4, 6], "research": 3, "residu": [], "resiz": [5, 6], "resnet": 5, "resnet18": [], "resnet31": [], "resnet34": 
[], "resnet50": [], "resolv": 2, "resolve_block": [], "resolve_lin": [], "resourc": [], "respect": [], "rest": [6, 7], "restrict": [], "result": [2, 5], "return": [1, 2, 5, 7], "reusabl": 5, "review": [], "rgb": [2, 6], "rgb_mode": [], "rgb_output": 2, "right": [5, 7], "robust": 3, "root": 1, "rotat": [1, 2], "rotated_bbox": [1, 7], "run": 4, "same": [2, 7], "sampl": 1, "sample_transform": 1, "sar": [3, 5], "sar_resnet31": 5, "sar_vgg16_bn": 5, "satur": 6, "save": [1, 5], "saved_model": 5, "scale": 7, "scale_rang": [], "scan": [1, 3], "scene": [3, 5], "scheme": 5, "score": 7, "scratch": 3, "script": [], "seamless": 3, "seamlessli": [], "search": [], "searchabl": [], "sec": [], "second": 5, "section": [], "secur": [], "see": [], "seemlessli": 3, "seen": 5, "segment": 5, "self": [], "semant": 5, "send": [], "sens": 7, "sensit": [], "separ": 5, "sequenc": [1, 2, 5, 7], "sequenti": [5, 6], "seri": [], "serial": 5, "serialized_model": 5, "seriou": [], "set": [1, 5, 7], "set_global_polici": [], "sever": [2, 6], "sex": [], "sexual": [], "sha256": [], "shade": [], "shape": [2, 5, 6, 7], "share": [], "shift": 6, "shm": [], "should": [1, 2, 7], "show": [2, 3, 5, 7], "showcas": [], "shuffl": 1, "side": 7, "signatur": 2, "signific": 1, "simpl": 5, "simpler": [], "sinc": 1, "singl": [], "single_img_doc": [], "size": [1, 2, 5, 6], "skew": [], "slack": [], "slightli": [], "small": 3, "smallest": 2, "snapshot_download": [], "snippet": [], "so": [1, 4], "social": [], "socio": [], "some": [], "someth": [], "somewher": [], "sort": [], "sourc": [1, 2, 5, 6, 7], "space": [], "span": [], "spanish": [], "spatial": 2, "special": 3, "specif": [1, 5, 7], "specifi": 2, "speed": [3, 5], "sphinx": [], "sroie": [1, 3], "stabl": 4, "stackoverflow": [], "stage": 3, "standard": 6, "start": 1, "state": 3, "static": 7, "statist": 5, "statu": [], "std": 6, "step": [], "still": [], "str": [1, 2, 5, 6, 7], "straight": 1, "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 2, "street": [], "strict": [], "strictli": 7, "string": [1, 2, 5, 7], "strive": [], "strong": 5, "structur": [3, 5], "subset": [1, 5], "suggest": [], "sum": 7, "summari": 7, "support": 5, "sustain": [], "svhn": [], "svt": [], "swedish": [], "symbol": [], "symmetr": 6, "symmetric_pad": 6, "synthet": [], "synthtext": [], "system": [], "t": 1, "tabl": [], "take": [], "target": [1, 2, 5, 6], "target_s": 1, "task": [1, 3, 5], "task2": [], "team": [], "techminde": [], "templat": 2, "tensor": [1, 5, 6], "tensorflow": [3, 4, 5, 6], "tensorspec": [], "term": [], "test": [], "test_set": [], "text": [2, 7], "text_output": [], "textmatch": 7, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [3, 5], "textstylebrush": [], "textual": [1, 2, 3], "tf": [5, 6], "tf_model": 5, "tflite": 5, "than": [4, 7], "thank": [], "thei": [], "them": [1, 4], "thi": [4, 5, 7], "thing": [], "third": [], "those": [2, 4, 5], "threaten": [], "threshold": [], "through": [1, 6], "tilman": [], "time": [1, 5, 7], "tini": [], "titl": 2, "tm": [], "tmp": [], "togeth": [2, 5], "tograi": 6, "tool": [], "top": 7, "topic": [], "torch": [], "torchvis": 6, "total": [], "toward": [], "train": [1, 5, 6], "train_it": 1, "train_load": 1, "train_pytorch": [], "train_set": 1, "train_tensorflow": [], "trainabl": 5, "tranform": 6, "transcrib": [], "transfer": [], "transfo": 6, "transform": [1, 3], "translat": [], "troll": [], "true": [1, 2, 5, 6, 7], "truth": 7, "tune": 3, "tupl": [2, 5, 6, 7], "turn": [], "two": 2, "txt": [], "type": [2, 5], "typic": [], "u": 
[], "ucsd": [], "udac": [], "uint8": [2, 5, 7], "ukrainian": [], "unaccept": [], "underli": 1, "underneath": 2, "understand": [1, 3], "unidecod": 7, "uniform": [5, 6], "uniformli": [], "uninterrupt": 2, "union": 7, "unittest": [], "unlock": [], "unoffici": [], "unprofession": [], "unsolicit": [], "unsupervis": [], "unwelcom": [], "up": 5, "updat": 7, "upgrad": [], "upper": 6, "uppercas": [], "url": [1, 2], "us": [1, 4, 7], "usabl": 5, "usag": 5, "use_polygon": [], "useabl": [], "user": [2, 3, 4], "utf": [], "util": [3, 5], "v0": 3, "v1": [], "v3": [], "valid": [], "valu": [2, 6], "valuabl": 3, "variabl": [], "varieti": [], "veri": [], "verifi": 1, "version": 5, "vgg": 5, "vgg16": 5, "vgg16_bn_r": [], "via": 3, "vietnames": [], "view": [], "viewpoint": [], "violat": [], "visibl": [], "vision": [], "visiondataset": 1, "visiontransform": [], "visual": 3, "visualize_pag": 7, "vit_": [], "vit_b": [], "vitstr": [], "vitstr_bas": [], "vitstr_smal": [], "viz": [], "vocab": [3, 5], "vocabulari": [], "w": [2, 7], "w3": [], "wa": [], "wai": [1, 3, 5], "want": [], "warm": 5, "warmup": [], "wasn": [], "we": [2, 3, 5, 6], "weasyprint": [], "web": 2, "websit": [], "welcom": 3, "well": [], "were": 2, "what": [], "when": [], "whenev": [], "where": [2, 7], "whether": [1, 2, 7], "which": 5, "whichev": 4, "while": 6, "why": [], "width": 2, "wiki": [], "wildreceipt": [], "window": [4, 7], "wish": [], "within": [], "without": 5, "wonder": [], "word": [3, 5, 7], "word_1_1": [], "word_1_2": [], "word_1_3": [], "wordgener": [], "words_onli": 7, "work": [], "worker": 1, "workflow": [], "worklow": [], "world": 7, "worth": [], "wrap": [], "wrapper": [1, 6], "write": [], "written": 2, "www": 2, "x": [2, 6, 7], "x12larg": 5, "x_ascend": [], "x_descend": [], "x_i": 7, "x_size": [], "x_wconf": [], "xeon": 5, "xhtml": [], "xmax": 2, "xmin": 2, "xml": [], "xml_bytes_str": [], "xml_element": [], "xml_output": [], "xmln": [], "y": 7, "y_i": 7, "y_j": 7, "yet": [], "ymax": 2, "ymin": 2, "yolov8": [], "you": [4, 5], "your": [1, 2, 5, 7], "yoursit": 2, "zero": [5, 6], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 1, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": [], "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": [], "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], 
"\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": [], "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": [], "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": [], "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": [], "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": [], "\u067e\u0686\u06a2\u06a4\u06af": [], "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "doctr.datasets", "doctr.documents", "DocTR: Document Text Recognition", "Installation", "doctr.models", "doctr.transforms", "doctr.utils"], "titleterms": {"": [], "0": 0, "01": [], "02": [], "03": 0, "04": [], "05": 0, "07": [], "08": [], "09": [], "1": 0, "10": [], "11": 0, "12": [], "18": 0, "2": 0, "2021": 0, "2022": [], "2023": [], "2024": [], "22": [], "27": [], "28": 0, "29": [], 
"3": [], "31": [], "4": [], "5": [], "6": [], "7": [], "8": [], "9": [], "advanc": [], "approach": 5, "architectur": [], "arg": [], "artefact": 2, "artefactdetect": [], "attribut": [], "avail": 1, "aw": [], "ban": [], "block": 2, "bug": [], "build": 3, "changelog": 0, "choos": [], "classif": [], "code": [], "codebas": [], "commit": [], "commun": [], "compos": 6, "compress": 5, "conda": [], "conduct": [], "connect": [], "content": [], "continu": [], "contrib": [], "contribut": [], "contributor": [], "convent": [], "correct": [], "coven": [], "custom": [], "data": 1, "dataload": [], "dataset": [1, 3], "detect": [3, 5], "develop": [], "do": [], "doctr": [1, 2, 3, 5, 6, 7], "document": [2, 3], "end": 5, "enforc": [], "evalu": 7, "export": 5, "factori": [], "featur": 3, "feedback": [], "file": 2, "from": [], "gener": [], "get": 3, "git": 4, "guidelin": [], "half": [], "hub": [], "huggingfac": [], "i": [], "implement": [], "infer": [], "instal": 4, "integr": [], "io": [], "lambda": [], "let": [], "line": 2, "linux": [], "load": 1, "loader": [], "main": 3, "mode": [], "model": [3, 5], "modifi": [], "modul": [], "name": [], "note": 3, "notebook": [], "object": [], "ocr": 5, "onli": [], "onnx": [], "optim": [], "option": [], "orient": [], "our": [], "output": [], "own": [], "packag": [3, 4], "page": 2, "perman": [], "pipelin": [], "pledg": [], "post": [], "pre": 5, "precis": [], "predictor": [3, 5], "prepar": [], "prerequisit": 4, "pretrain": [], "process": 5, "push": [], "python": 4, "qualiti": [], "question": [], "read": 2, "readi": [], "recognit": [3, 5], "refer": 3, "report": [], "request": [], "respons": [], "return": [], "right": [], "savedmodel": 5, "scope": [], "share": [], "should": [], "stage": 5, "standard": [], "start": 3, "structur": 2, "style": [], "support": [1, 3, 6], "synthet": [], "task": 7, "temporari": [], "test": [], "text": [3, 5], "train": 3, "transform": 6, "two": 5, "unit": [], "us": 5, "util": 7, "v0": 0, "verif": [], "via": 4, "visual": 7, "vocab": 1, "warn": [], "what": [], "word": 2, "your": 3, "zoo": [3, 5]}}) \ No newline at end of file diff --git a/v0.6.0/transforms.html b/v0.6.0/transforms.html index 0d1b5f7402..d42da50481 100644 --- a/v0.6.0/transforms.html +++ b/v0.6.0/transforms.html @@ -227,28 +227,21 @@ @@ -293,7 +286,7 @@

doctr.transforms
As in torchvision, we express transformations as composable modules.

                                                    Supported transformations

                                                    -

                                                    Here are all transformations that are available through docTR:

                                                    +

                                                    Here are all transformations that are available through DocTR:

                                                    class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
                                                    @@ -364,7 +357,7 @@

                                                    Supported transformations
                                                    -class doctr.transforms.ToGray(num_output_channels: int = 1)[source]
                                                    +class doctr.transforms.ToGray[source]

Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

                                                    Example::
>>> from doctr.transforms import ToGray
                                                    @@ -524,88 +517,6 @@ 

                                                    Supported transformations -
                                                    -class doctr.transforms.RandomRotate(max_angle: float = 5.0, expand: bool = False)[source]
                                                    -

                                                    Randomly rotate a tensor image and its boxes

                                                    -https://github.com/mindee/doctr/releases/download/v0.4.0/rotation_illustration.png -
                                                    -
                                                    Parameters:
                                                    -
                                                      -
                                                    • max_angle – maximum angle for rotation, in degrees. Angles will be uniformly picked in -[-max_angle, max_angle]

                                                    • -
                                                    • expand – whether the image should be padded before the rotation

                                                    • -
                                                    -
                                                    -
                                                    -

                                                    - -
                                                    -
                                                    -class doctr.transforms.RandomCrop(scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33))[source]
                                                    -

                                                    Randomly crop a tensor image and its boxes

                                                    -
                                                    -
                                                    Parameters:
                                                    -
                                                      -
                                                    • scale – tuple of floats, relative (min_area, max_area) of the crop

                                                    • -
                                                    • ratio – tuple of float, relative (min_ratio, max_ratio) where ratio = h/w

                                                    • -
                                                    -
                                                    -
                                                    -
                                                    - -
                                                    -
                                                    -class doctr.transforms.GaussianBlur(kernel_shape: int | Iterable[int], std: Tuple[float, float])[source]
                                                    -

Apply a Gaussian blur (with a randomly sampled standard deviation) to a 3 dimensional RGB image

                                                    -
                                                    -
                                                    Example::
                                                    >>> from doctr.transforms import GaussianBlur
                                                    ->>> import tensorflow as tf
                                                    ->>> transfo = GaussianBlur(3, (.1, 5))
                                                    ->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
                                                    -
                                                    -
                                                    -
                                                    -
                                                    -
                                                    -
                                                    Parameters:
                                                    -
                                                      -
                                                    • kernel_shape – size of the blurring kernel

                                                    • -
                                                    • std – min and max value of the standard deviation

                                                    • -
                                                    -
                                                    -
                                                    -
                                                    - -
                                                    -
                                                    -class doctr.transforms.ChannelShuffle[source]
                                                    -

                                                    Randomly shuffle channel order of a given image

                                                    -
                                                    - -
                                                    -
                                                    -class doctr.transforms.GaussianNoise(mean: float = 0.0, std: float = 1.0)[source]
                                                    -

                                                    Adds Gaussian Noise to the input tensor

                                                    -
                                                    -
                                                    Example::
                                                    >>> from doctr.transforms import GaussianNoise
                                                    ->>> import tensorflow as tf
                                                    ->>> transfo = GaussianNoise(0., 1.)
                                                    ->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
                                                    -
                                                    -
                                                    -
                                                    -
                                                    -
                                                    -
                                                    Parameters:
                                                    -
                                                      -
                                                    • mean – mean of the gaussian distribution

                                                    • -
                                                    • std – std of the gaussian distribution

                                                    • -
                                                    -
                                                    -
                                                    -
                                                    -

                                                    Composing transformations
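Since every transformation above is a composable module, several of them can be chained. A minimal sketch (assuming the Compose and RandomApply wrappers documented in this section and a TensorFlow tensor input):

import tensorflow as tf
from doctr.transforms import Compose, Resize, ColorInversion, RandomApply

# Resize every sample, then invert colors half of the time
transfo = Compose([
    Resize((512, 512)),
    RandomApply(ColorInversion(min_val=0.6), p=0.5),
])
out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))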

                                                    @@ -744,11 +655,6 @@

                                                    Composing transformationsRandomHue
                                                  • RandomGamma
                                                  • RandomJpegQuality
                                                  • -
                                                  • RandomRotate
                                                  • -
                                                  • RandomCrop
                                                  • -
                                                  • GaussianBlur
                                                  • -
                                                  • ChannelShuffle
                                                  • -
                                                  • GaussianNoise
                                                • Composing transformations

        crnn_vgg16_bn

        tilman-rassy/doctr-crnn-vgg16-bn-fascan-v1

        french + german + §

        PyTorch

        parseq

        Felix92/doctr-torch-parseq-multilingual-v1

        multilingual

        PyTorch

      @@ -464,14 +477,14 @@

      Recognition - +
      Previous
      -
      Choose a ready to use dataset
      +
      Integrate contributions into your pipeline
      @@ -526,7 +539,7 @@

      Recognition +

    diff --git a/v0.6.0/using_doctr/using_datasets.html b/v0.6.0/using_doctr/using_datasets.html index f357308cb2..7a1e0722d9 100644 --- a/v0.6.0/using_doctr/using_datasets.html +++ b/v0.6.0/using_doctr/using_datasets.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + Choose a ready to use dataset - docTR documentation @@ -236,12 +236,15 @@

    Package Reference

    from doctr.datasets import CORD
     # Load straight boxes
    -train_set = CORD(train=True, download=True)
    +train_set = CORD(train=True, download=True, detection_task=True)
     # Load rotated boxes
    -train_set = CORD(train=True, download=True, use_polygons=True)
    +train_set = CORD(train=True, download=True, use_polygons=True, detection_task=True)
     img, target = train_set[0]
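Once instantiated, a dataset can also be batched with the DataLoader exposed by doctr.datasets — a minimal sketch, assuming the default collate function:

from doctr.datasets import CORD, DataLoader

train_set = CORD(train=True, download=True)
# Wrap the dataset to get shuffled batches of 32 samples
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
images, targets = next(iter(train_loader))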
     
    @@ -445,7 +453,17 @@

    Recognition

    MJSynth

    7581382

    1337891

    -

    english

    +

    english / external resources

    + +

    IIITHWS

    +

    7141797

    +

    793533

    +

    english / handwritten / external resources

    + +

    WILDRECEIPT

    +

    49377

    +

    19598

    +

    english / external resources

    @@ -459,6 +477,18 @@

    Recognition +

    OCR

    +

    The same dataset table as for detection, but with information about the bounding boxes and labels.

    +
    from doctr.datasets import CORD
    +# Load straight boxes
    +train_set = CORD(train=True, download=True)
    +# Load rotated boxes
    +train_set = CORD(train=True, download=True, use_polygons=True)
    +img, target = train_set[0]
    +
    +
    +

    Object Detection

This dataset contains the information needed to train or validate an object detection model.
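For instance, a sketch using the DocArtefacts dataset shipped with docTR (names as in doctr.datasets):

from doctr.datasets import DocArtefacts

train_set = DocArtefacts(train=True, download=True)
# Each sample is an image with its artefact boxes and labels
img, target = train_set[0]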

    @@ -536,12 +566,12 @@

    Data Loading - +
    Next
    -
    Share your model with the community
    +
    Integrate contributions into your pipeline
    @@ -590,6 +620,7 @@

    Data LoadingAvailable Datasets @@ -607,7 +638,7 @@

    Data Loading +

    diff --git a/v0.6.0/using_doctr/using_model_export.html b/v0.6.0/using_doctr/using_model_export.html index cea0d92f01..75c81caa7c 100644 --- a/v0.6.0/using_doctr/using_model_export.html +++ b/v0.6.0/using_doctr/using_model_export.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + Preparing your model for inference - docTR documentation @@ -236,12 +236,15 @@

    Package Reference

      +
    • doctr.contrib
    • doctr.datasets
    • doctr.io
    • doctr.models
    • @@ -299,64 +302,91 @@

      Preparing your model for inference

      A well-trained model is a good achievement but you might want to tune a few things to make it production-ready!

      -
      -

      Model compression

      -

      This section is meant to help you perform inference with compressed versions of your model.

      -
      -

      TensorFlow Lite

      -

      TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows:

      -
      >>> import tensorflow as tf
      ->>> from tensorflow.keras import Sequential
      ->>> from doctr.models import conv_sequence
      ->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
      ->>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model)
      ->>> serialized_model = converter.convert()
      -
      -
      -
      +
      +

      Model optimization

      +

      This section is meant to help you perform inference with optimized versions of your model.

      Half-precision

      -

      If you want to convert it to half-precision using your TFLite converter

      -
      >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
      ->>> converter.target_spec.supported_types = [tf.float16]
      ->>> serialized_model = converter.convert()
      +

      NOTE: We support half-precision inference for PyTorch and TensorFlow models only on GPU devices.

      +

      Half-precision (or FP16) is a binary floating-point format that occupies 16 bits in computer memory.

      +

      Advantages:

      +
        +
      • Faster inference

      • +
      • Less memory usage

      • +
      +
      +
      import tensorflow as tf
+from tensorflow.keras import mixed_precision
+from doctr.models import ocr_predictor
      +mixed_precision.set_global_policy('mixed_float16')
      +predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True)
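For the PyTorch backend, a comparable sketch (assuming a CUDA-capable GPU and the same predictor arguments as above) runs the predictor in half precision:

from doctr.io import DocumentFile
from doctr.models import ocr_predictor

# Build the predictor, move it to the GPU and cast it to FP16
predictor = ocr_predictor(reco_arch="crnn_mobilenet_v3_small", det_arch="linknet_resnet34", pretrained=True).cuda().half()
# Run inference on a sample document (hypothetical path)
doc = DocumentFile.from_images("path/to/your/img.jpg")
res = predictor(doc)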
       
      -
      -
      -

      Post-training quantization

      -

      Finally if you wish to quantize the model with your TFLite converter

      -
      >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
      ->>> # Float fallback for operators that do not have an integer implementation
      ->>> def representative_dataset():
      ->>>     for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)]
      ->>> converter.representative_dataset = representative_dataset
      ->>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
      ->>> converter.inference_input_type = tf.int8
      ->>> converter.inference_output_type = tf.int8
      ->>> serialized_model = converter.convert()
      +
      +
      +

      Export to ONNX

      +

      ONNX (Open Neural Network Exchange) is an open and interoperable format for representing and exchanging machine learning models. +It defines a common format for representing models, including the network structure, layer types, parameters, and metadata.

      +
      +
      import tensorflow as tf
      +from doctr.models import vitstr_small
      +from doctr.models.utils import export_model_to_onnx
      +
      +batch_size = 16
+input_shape = (32, 128, 3)
+model = vitstr_small(pretrained=True, exportable=True)
+dummy_input = [tf.TensorSpec([batch_size, *input_shape], tf.float32, name="input")]
      +model_path, output = export_model_to_onnx(model, model_name="vitstr.onnx", dummy_input=dummy_input)
      +
      +
      +
      -
      -

      Using SavedModel

      -

      Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

      -
      >>> import tensorflow as tf
      ->>> from doctr.models import db_resnet50
      ->>> model = db_resnet50(pretrained=True)
      ->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
      ->>> _ = model(input_t, training=False)
      ->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
      +
      +

      Using your ONNX exported model

      +

To use your exported model, we have built a dedicated lightweight package called OnnxTR. +The package doesn’t require PyTorch or TensorFlow to be installed - it is built on top of ONNXRuntime. +It is simple and easy to use (with the same interface you already know from docTR), and allows you to perform inference with your exported model.

      + +
      pip install onnxtr[cpu]
       
      -

      And loaded just as easily:

      -
      >>> import tensorflow as tf
      ->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
      +
      from onnxtr.io import DocumentFile
      +from onnxtr.models import ocr_predictor, parseq, linknet_resnet18
      +# Load your documents
      +single_img_doc = DocumentFile.from_images("path/to/your/img.jpg")
      +
      +# Load your exported model/s
      +reco_model = parseq("path_to_custom_model.onnx", vocab="ABC")
      +det_model = linknet_resnet18("path_to_custom_model.onnx")
      +predictor = ocr_predictor(det_arch=det_model, reco_arch=reco_model)
      +# Or use any of the pre-trained models
      +predictor = ocr_predictor(det_arch="linknet_resnet18", reco_arch="parseq")
      +
      +# Get your results
      +res = predictor(single_img_doc)
       
      +
      @@ -364,12 +394,12 @@

      Using SavedModel - +
      Next
      -
      AWS Lambda
      +
      Train your own model
      @@ -415,13 +445,12 @@

      Using SavedModel

    + \ No newline at end of file diff --git a/v0.6.0/using_doctr/using_models.html b/v0.6.0/using_doctr/using_models.html index 1cf8113160..cfad7ff606 100644 --- a/v0.6.0/using_doctr/using_models.html +++ b/v0.6.0/using_doctr/using_models.html @@ -236,12 +236,15 @@

    Package Reference

      +
    • doctr.contrib
    • doctr.datasets
    • doctr.io
    • doctr.models
    • @@ -315,51 +318,204 @@

      Available architecturesThe following architectures are currently supported:

      -

      We also provide 2 models working with any kind of rotated documents:

      -

      For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

      - + - + + - + - + + - - - - - + + + + + - + + - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

      FUNSD

      CORD

      Architecture

      Backend

      Architecture

      Input shape

      # params

      Recall

      Precision

      Recall

      Precision

      FPS

      sec/it (B: 1)

      db_resnet50

      TensorFlow

      db_resnet50

      (1024, 1024, 3)

      25.2 M

      82.14

      87.64

      92.49

      89.66

      2.1

      84.39

      85.86

      93.70

      83.24

      1.2

      db_mobilenet_v3_large

      TensorFlow

      db_mobilenet_v3_large

      (1024, 1024, 3)

      4.2 M

      79.35

      84.03

      81.14

      66.85

      80.29

      70.90

      84.70

      67.76

      0.5

      TensorFlow

      linknet_resnet18

      (1024, 1024, 3)

      11.5 M

      81.37

      84.08

      85.71

      83.70

      0.7

      TensorFlow

      linknet_resnet34

      (1024, 1024, 3)

      21.6 M

      82.20

      85.49

      87.63

      87.17

      0.8

      TensorFlow

      linknet_resnet50

      (1024, 1024, 3)

      28.8 M

      80.70

      83.51

      86.46

      84.94

      1.1

      TensorFlow

      fast_tiny

      (1024, 1024, 3)

      13.5 M (8.5M)

      85.29

      85.34

      93.46

      75.99

      0.7 (0.4)

      TensorFlow

      fast_small

      (1024, 1024, 3)

      14.7 M (9.7M)

      85.50

      86.89

      94.05

      78.33

      0.7 (0.5)

      TensorFlow

      fast_base

      (1024, 1024, 3)

      16.3 M (10.6M)

      85.22

      86.97

      94.18

      84.74

      0.8 (0.5)

      PyTorch

      db_resnet34

      (1024, 1024, 3)

      22.4 M

      82.76

      76.75

      89.20

      71.74

      0.8

      PyTorch

      db_resnet50

      (1024, 1024, 3)

      25.4 M

      83.56

      86.68

      92.61

      86.39

      1.1

      PyTorch

      db_mobilenet_v3_large

      (1024, 1024, 3)

      4.2 M

      82.69

      84.63

      94.51

      70.28

      0.5

      PyTorch

      linknet_resnet18

      (1024, 1024, 3)

      11.5 M

      81.64

      85.52

      88.92

      82.74

      0.6

      PyTorch

      linknet_resnet34

      (1024, 1024, 3)

      21.6 M

      81.62

      82.95

      86.26

      81.06

      0.7

      PyTorch

      linknet_resnet50

      (1024, 1024, 3)

      28.8 M

      81.78

      82.47

      87.29

      85.54

      1.0

      PyTorch

      fast_tiny

      (1024, 1024, 3)

      13.5 M (8.5M)

      84.90

      85.04

      93.73

      76.26

      0.7 (0.4)

      PyTorch

      fast_small

      (1024, 1024, 3)

      14.7 M (9.7M)

      85.36

      86.68

      94.09

      78.53

      0.7 (0.5)

      PyTorch

      fast_base

      (1024, 1024, 3)

      16.3 M (10.6M)

      84.95

      86.73

      94.39

      85.36

      0.8 (0.5)

      @@ -367,16 +523,16 @@

Available architectures
All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. doctr.datasets). Explanations about the metrics being used are available in Task evaluation.

      Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

      -

      FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.x12large <https://aws.amazon.com/ec2/instance-types/c5/> AWS instance (CPU Xeon Platinum 8275L).

      +

      Seconds per iteration (with a batch size of 1) is computed after a warmup phase of 100 tensors, by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a 11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz.

      Detection predictors

      detection_predictor wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly.

      -
      >>> import numpy as np
      ->>> from doctr.models import detection_predictor
      ->>> predictor = detection_predictor('db_resnet50')
      ->>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
      ->>> out = model([dummy_img])
      +
      import numpy as np
      +from doctr.models import detection_predictor
      +predictor = detection_predictor('db_resnet50')
      +dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
+out = predictor([dummy_img])
       

      You can pass specific boolean arguments to the predictor:

      @@ -386,11 +542,10 @@

      Detection predictors

      symmetric_pad: if you choose to preserve the aspect ratio, it will pad the image symmetrically and not from the bottom-right.

For instance, this snippet will instantiate a detection predictor able to detect text on rotated documents while preserving the aspect ratio:
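A sketch of such a predictor, using the boolean arguments listed above:

from doctr.models import detection_predictor

predictor = detection_predictor(
    'db_resnet50',
    pretrained=True,
    assume_straight_pages=False,   # handle rotated pages
    preserve_aspect_ratio=True,    # keep the page aspect ratio when resizing
    symmetric_pad=True,            # pad symmetrically rather than bottom-right
)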

    -

For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

-
- - +
+
Text recognition model zoo
- - - - - - + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - + + + + + + - + + - - - - + + + + + + - + + - - - - + + + + + + - + + - - - - + + + + + + - + + - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Architecture

Input shape

# params

FUNSD

CORD

FPS

FUNSD

CORD

crnn_vgg16_bn

Backend

Architecture

Input shape

# params

Exact

Partial

Exact

Partial

sec/it (B: 64)

TensorFlow

crnn_vgg16_bn

(32, 128, 3)

15.8 M

88.12

88.85

94.68

95.10

0.9

TensorFlow

crnn_mobilenet_v3_small

(32, 128, 3)

2.1 M

86.88

87.61

92.28

92.73

0.25

TensorFlow

crnn_mobilenet_v3_large

(32, 128, 3)

4.5 M

87.44

88.12

94.14

94.55

0.34

TensorFlow

master

(32, 128, 3)

58.8 M

87.44

88.21

93.83

94.25

22.3

TensorFlow

sar_resnet31

(32, 128, 3)

57.2 M

87.67

88.48

94.21

94.66

7.1

TensorFlow

vitstr_small

(32, 128, 3)

21.4 M

83.01

83.84

86.57

87.00

2.0

TensorFlow

vitstr_base

(32, 128, 3)

85.2 M

85.98

86.70

90.47

90.95

5.8

TensorFlow

parseq

(32, 128, 3)

23.8 M

81.62

82.29

79.13

79.52

3.6

PyTorch

crnn_vgg16_bn

(32, 128, 3)

15.8M

87.18

92.93

12.8

15.8 M

86.54

87.41

94.29

94.69

0.6

crnn_mobilenet_v3_small

PyTorch

crnn_mobilenet_v3_small

(32, 128, 3)

2.1M

86.21

90.56

2.1 M

87.25

87.99

93.91

94.34

0.05

crnn_mobilenet_v3_large

PyTorch

crnn_mobilenet_v3_large

(32, 128, 3)

4.5M

86.95

92.03

4.5 M

87.38

88.09

94.46

94.92

0.08

sar_resnet31

PyTorch

master

(32, 128, 3)

56.2M

87.70

93.41

2.7

58.7 M

88.57

89.39

95.73

96.21

17.6

master

PyTorch

sar_resnet31

(32, 128, 3)

67.7M

87.62

93.27

55.4 M

88.10

88.88

94.83

95.29

4.9

PyTorch

vitstr_small

(32, 128, 3)

21.4 M

88.00

88.82

95.40

95.78

1.5

PyTorch

vitstr_base

(32, 128, 3)

85.2 M

88.33

89.09

95.32

95.71

4.1

PyTorch

parseq

(32, 128, 3)

23.8 M

88.53

89.24

95.56

95.91

2.2

@@ -461,22 +751,22 @@

Available architectures
All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. doctr.datasets). Explanations about the metric being used (exact match) are available in Task evaluation.

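If you want to reproduce this exact-match evaluation on your own predictions, a sketch along the lines of the doctr.utils.metrics API referenced later in this document (the class name TextMatch is an assumption here) would be:

from doctr.utils.metrics import TextMatch

metric = TextMatch()
# gt and pred are lists of ground-truth / predicted character sequences
metric.update(gt=['Hello', 'world'], pred=['Hello', 'world!'])
print(metric.summary())  # dict with the exact (and partial) match scores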
While most of our recognition models were trained on our french vocab (cf. Supported Vocabs), you can easily access the vocab of any model as follows:

from doctr.models import recognition_predictor

predictor = recognition_predictor('crnn_vgg16_bn')
print(predictor.model.cfg['vocab'])
 

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model capabilities

-

FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.x12large <https://aws.amazon.com/ec2/instance-types/c5/> AWS instance (CPU Xeon Platinum 8275L).

+

Seconds per iteration (with a batch size of 64) is computed after a warmup phase of 100 tensors, by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on an 11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz.

Recognition predictors

recognition_predictor wraps your recognition model so that it can be used seamlessly with your favorite deep learning framework.

import numpy as np
from doctr.models import recognition_predictor

predictor = recognition_predictor('crnn_vgg16_bn')
dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8)
out = predictor([dummy_img])
 
@@ -486,96 +776,162 @@

End-to-End OCR

Available architectures

You can use any combination of detection and recognition models supported by docTR.

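For example, a lighter pairing can be assembled in a single call (architecture names taken from the tables in this section; this is a sketch, not a recommendation):

from doctr.models import ocr_predictor

# Any supported detection architecture can be paired with any recognition architecture
model = ocr_predictor('db_mobilenet_v3_large', 'crnn_mobilenet_v3_small', pretrained=True)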
For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

Backend    | Architecture                          | FUNSD Recall | FUNSD Precision | CORD Recall | CORD Precision
TensorFlow | db_resnet50 + crnn_vgg16_bn           | 73.45        | 74.73           | 85.79       | 76.21
TensorFlow | db_resnet50 + crnn_mobilenet_v3_small | 72.66        | 73.93           | 83.43       | 74.11
TensorFlow | db_resnet50 + crnn_mobilenet_v3_large | 72.86        | 74.13           | 85.16       | 75.65
TensorFlow | db_resnet50 + master                  | 72.73        | 74.00           | 84.13       | 75.05
TensorFlow | db_resnet50 + sar_resnet31            | 73.23        | 74.51           | 85.34       | 76.03
TensorFlow | db_resnet50 + vitstr_small            | 68.57        | 69.77           | 78.24       | 69.51
TensorFlow | db_resnet50 + vitstr_base             | 70.96        | 72.20           | 82.10       | 72.94
TensorFlow | db_resnet50 + parseq                  | 68.85        | 70.05           | 72.38       | 64.30
PyTorch    | db_resnet50 + crnn_vgg16_bn           | 72.43        | 75.13           | 85.05       | 79.33
PyTorch    | db_resnet50 + crnn_mobilenet_v3_small | 73.06        | 75.79           | 84.64       | 78.94
PyTorch    | db_resnet50 + crnn_mobilenet_v3_large | 73.17        | 75.90           | 84.96       | 79.25
PyTorch    | db_resnet50 + master                  | 73.90        | 76.66           | 85.84       | 80.07
PyTorch    | db_resnet50 + sar_resnet31            | 73.58        | 76.33           | 85.64       | 79.88
PyTorch    | db_resnet50 + vitstr_small            | 73.06        | 75.79           | 85.95       | 80.17
PyTorch    | db_resnet50 + vitstr_base             | 73.70        | 76.46           | 85.76       | 79.99
PyTorch    | db_resnet50 + parseq                  | 73.52        | 76.27           | 85.91       | 80.13
None       | Gvision text detection                | 59.50        | 62.50           | 75.30       | 70.00
None       | Gvision doc. text detection           | 64.00        | 53.30           | 68.90       | 61.10
None       | AWS textract                          | 78.10        | 83.00           | 87.50       | 66.00
None       | Azure Form Recognizer (v3.2)          | 79.42        | 85.89           | 89.62       | 88.93

@@ -583,174 +939,52 @@

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets in doctr.datasets). Explanations about the metrics being used are available in Task evaluation.

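If you want to run the same kind of evaluation yourself, the metric classes described in the doctr.utils section later in this document can be used roughly as follows (box coordinates below are illustrative):

import numpy as np
from doctr.utils.metrics import LocalizationConfusion

metric = LocalizationConfusion(iou_thresh=0.5)
# Relative (xmin, ymin, xmax, ymax) boxes: ground truths then predictions
gt_boxes = np.array([[0.10, 0.10, 0.40, 0.20]])
pred_boxes = np.array([[0.11, 0.09, 0.41, 0.21], [0.60, 0.60, 0.80, 0.70]])
metric.update(gt_boxes, pred_boxes)
recall, precision, mean_iou = metric.summary()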
Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

-

FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a c5.x12large <https://aws.amazon.com/ec2/instance-types/c5/> AWS instance (CPU Xeon Platinum 8275L).

-

Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information.

Architecture                                 | Receipts R/P  | Invoices R/P  | IDs R/P       | US Tax Forms R/P | Resumes R/P   | Road Fines R/P
db_resnet50 + crnn_vgg16_bn (ours)           | 78.70 / 81.12 | 65.80 / 70.70 | 50.25 / 51.78 | 79.08 / 92.83    |               |
db_resnet50 + master (ours)                  | 79.00 / 81.42 | 65.57 / 69.86 | 51.34 / 52.90 | 78.86 / 92.57    |               |
db_resnet50 + sar_resnet31 (ours)            | 78.94 / 81.37 | 65.89 / 70.79 | 51.78 / 53.35 | 79.04 / 92.78    |               |
db_resnet50 + crnn_mobilenet_v3_small (ours) | 76.81 / 79.15 | 64.89 / 69.61 | 45.03 / 46.38 | 78.96 / 92.11    | 85.91 / 87.20 | 84.85 / 85.86
db_resnet50 + crnn_mobilenet_v3_large (ours) | 78.01 / 80.39 | 65.36 / 70.11 | 48.00 / 49.43 | 79.39 / 92.62    | 87.68 / 89.00 | 85.65 / 86.67
db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 / 74.93 | 63.04 / 68.41 | 39.36 / 41.75 | 72.14 / 89.97    |               |
Gvision doc. text detection                  | 68.91 / 59.89 | 63.20 / 52.85 | 43.70 / 29.21 | 69.79 / 65.68    |               |
AWS textract                                 | 75.77 / 77.70 | 70.47 / 69.13 | 46.39 / 43.32 | 84.31 / 98.11    |               |

(R/P = Recall / Precision; blank cells were not reported.)

Two-stage approaches

Those architectures involve one stage of text detection and one stage of text recognition. The text detection stage is used to produce cropped images that are then passed to the text recognition block. Everything is wrapped up with ocr_predictor.

import numpy as np
from doctr.models import ocr_predictor

model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
out = model([input_page])
 

You can pass specific boolean arguments to the predictor:

  • assume_straight_pages: if you work with straight documents only, it will fit straight bounding boxes to the text areas.

  • preserve_aspect_ratio: if you want to preserve the aspect ratio of your documents while resizing before sending them to the model.

  • symmetric_pad: if you choose to preserve the aspect ratio, it will pad the image symmetrically and not from the bottom-right.

These three arguments are passed straight to the detection predictor, as mentioned above (in the detection part).

+

Additional arguments which can be passed to the ocr_predictor are:

  • export_as_straight_boxes: If you work with rotated and skewed documents but you still want to export straight bounding boxes and not polygons, set it to True.

  • straighten_pages: If you want to straighten the pages before sending them to the detection model, set it to True.

For instance, this snippet instantiates an end-to-end ocr_predictor working with rotated documents, which preserves the aspect ratio of the documents, and returns polygons:

from doctr.models import ocr_predictor

model = ocr_predictor('linknet_resnet18', pretrained=True, assume_straight_pages=False, preserve_aspect_ratio=True)

Additionally, you can change the batch size of the underlying detection and recognition predictors to optimize the performance depending on your hardware:

  • det_bs: batch size for the detection model (default: 2)

  • reco_bs: batch size for the recognition model (default: 128)

from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True, det_bs=4, reco_bs=1024)

To modify the output structure you can pass the following arguments to the predictor which will be handled by the underlying DocumentBuilder:

  • resolve_lines: whether words should be automatically grouped into lines (default: True)

  • resolve_blocks: whether lines should be automatically grouped into blocks (default: False)

  • paragraph_break: relative length of the minimum space separating paragraphs (default: 0.035)

For example, to disable the automatic grouping of lines into blocks:

from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True, resolve_blocks=False)
 
@@ -776,11 +1010,19 @@

What should I do with the output?

+

To get only the text content of the Document, you can use the render method:

text_output = result.render()

For reference, here is the output for the Document above:

No. RECEIPT DATE

You can also export them as a nested dict, more appropriate for JSON format:

json_output = result.export()
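Since the export is a plain nested dict, you can walk it directly; for instance, a small sketch collecting every recognized word with its confidence (key names follow the sample export below):

words = [
    (word['value'], word['confidence'])
    for page in json_output['pages']
    for block in page['blocks']
    for line in block['lines']
    for word in line['words']
]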
 
For reference, here is the export for the same Document as above:

{
  'pages': [
      {
          ...,
          'blocks': [
              {
                  ...,
                  'lines': [
                      {
                          ...,
                          'words': [
                              {
                                  'value': 'No.',
                                  'confidence': 0.914085328578949,
                                  'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)),
                                  'objectness_score': 0.96,
                                  'crop_orientation': {'value': 0, 'confidence': None},
                              },
                              {
                                  'value': 'RECEIPT',
                                  'confidence': 0.9949972033500671,
                                  'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)),
                                  'objectness_score': 0.99,
                                  'crop_orientation': {'value': 0, 'confidence': None},
                              },
                              {
                                  'value': 'DATE',
                                  'confidence': 0.9578408598899841,
                                  'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)),
                                  'objectness_score': 0.99,
                                  'crop_orientation': {'value': 0, 'confidence': None},
                              }
                          ]
                      }
                  ]
              }
          ]
      }
  ]
}

To export the output as XML (hOCR format), you can use the export_as_xml method:

xml_output = result.export_as_xml()
for output in xml_output:
    xml_bytes_string = output[0]
    xml_element = output[1]
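Each tuple holds the raw bytes and the parsed XML element, so persisting one hOCR file per page is straightforward (the file naming below is illustrative):

for idx, (xml_bytes_string, xml_element) in enumerate(xml_output):
    with open(f"page_{idx}.hocr", 'wb') as f:
        f.write(xml_bytes_string)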
 

For reference, here is a sample XML byte string output:

<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
  <head>
    <title>docTR - hOCR</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <meta name="ocr-system" content="doctr 0.5.0" />
    <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
  </head>
  <body>
    <div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
    <div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
      <p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
        <span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
          <span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
          <span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
          <span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
        </span>
      </p>
    </div>
  </body>
</html>

Advanced options

+

We provide a few advanced options to customize the behavior of the predictor to your needs:

  • Modify the binarization threshold for the detection model.

  • Modify the box threshold for the detection model.

This is useful to detect (possibly fewer) text regions more accurately with a higher threshold, or to detect more text regions with a lower threshold.

import numpy as np
from doctr.models import ocr_predictor

predictor = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)

# Modify the binarization threshold and the box threshold
predictor.det_predictor.model.postprocessor.bin_thresh = 0.5
predictor.det_predictor.model.postprocessor.box_thresh = 0.2

input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
out = predictor([input_page])
  • Disable page orientation classification

If you deal with documents which contain only small rotations (~ -45 to 45 degrees), you can disable the page orientation classification to speed up the inference.

This will only have an effect with assume_straight_pages=False and/or straighten_pages=True and/or detect_orientation=True.

from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True, assume_straight_pages=False, disable_page_orientation=True)
  • Disable crop orientation classification

If you deal with documents which contain only horizontal text, you can disable the crop orientation classification to speed up the inference.

This will only have an effect with assume_straight_pages=False and/or straighten_pages=True.

from doctr.models import ocr_predictor

model = ocr_predictor(pretrained=True, assume_straight_pages=False, disable_crop_orientation=True)
  • Add a hook to the ocr_predictor to manipulate the location predictions before the crops are passed to the recognition model.

from doctr.models import ocr_predictor

class CustomHook:
    def __call__(self, loc_preds):
        # Manipulate the location predictions here
        # 1. The output structure needs to be the same as the input location predictions
        # 2. Be aware that the coordinates are relative and need to be between 0 and 1
        return loc_preds

my_hook = CustomHook()

predictor = ocr_predictor(pretrained=True)
# Add a hook in the middle of the pipeline
predictor.add_hook(my_hook)
# You can also add multiple hooks which will be executed sequentially
for hook in [my_hook, my_hook, my_hook]:
    predictor.add_hook(hook)
 

    diff --git a/v0.6.0/using_model_export.html b/v0.6.0/using_model_export.html
    deleted file mode 100644
    index 9b0acb00fe..0000000000
    --- a/v0.6.0/using_model_export.html
    +++ /dev/null
    @@ -1,436 +0,0 @@
    -Preparing your model for inference - docTR documentation

    Preparing your model for inference

    -

    A well-trained model is a good achievement but you might want to tune a few things to make it production-ready!

    -
    -

    Model compression

    -

    This section is meant to help you perform inference with compressed versions of your model.

    -
    -

    TensorFlow Lite

    -

    TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows:

    -
    >>> import tensorflow as tf
    ->>> from tensorflow.keras import Sequential
    ->>> from doctr.models import conv_sequence
    ->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
    ->>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model)
    ->>> serialized_model = converter.convert()
    -
    -
    -
    -
    -

    Half-precision

    -

    If you want to convert it to half-precision using your TFLite converter

    -
    >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
    ->>> converter.target_spec.supported_types = [tf.float16]
    ->>> serialized_model = converter.convert()
    -
    -
    -
    -
    -

    Post-training quantization

    -

    Finally if you wish to quantize the model with your TFLite converter

    -
    >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
    ->>> # Float fallback for operators that do not have an integer implementation
    ->>> def representative_dataset():
    ->>>     for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)]
    ->>> converter.representative_dataset = representative_dataset
    ->>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    ->>> converter.inference_input_type = tf.int8
    ->>> converter.inference_output_type = tf.int8
    ->>> serialized_model = converter.convert()
    -
    -
    -
    -
    -
    -

    Using SavedModel

    -

    Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

    -
    >>> import tensorflow as tf
    ->>> from doctr.models import db_resnet50
    ->>> model = db_resnet50(pretrained=True)
    ->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    ->>> _ = model(input_t, training=False)
    ->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
    -
    -
    -

    And loaded just as easily:

    -
    >>> import tensorflow as tf
    ->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
    -
    -
    -
    -
    - -
    -
    - -
    - -
    -
    \ No newline at end of file
    diff --git a/v0.6.0/using_models.html b/v0.6.0/using_models.html
    deleted file mode 100644
    index 53cad99cac..0000000000
    --- a/v0.6.0/using_models.html
    +++ /dev/null
    @@ -1,909 +0,0 @@
    -Choosing the right model - docTR documentation

    Choosing the right model

    -

    The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture.

    -

    For a given task, docTR provides a Predictor, which is composed of 2 components:

    -
      -
    • PreProcessor: a module in charge of making inputs directly usable by the deep learning model.

    • -
    • Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable.

    • -
    -
    -

    Text Detection

    -

    The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don’t).

    -
    -

    Available architectures

    -

    The following architectures are currently supported:

    - -

    For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    FUNSD

    CORD

    Architecture

    Input shape

    # params

    Recall

    Precision

    Recall

    Precision

    FPS

    db_resnet50

    (1024, 1024, 3)

    25.2 M

    82.14

    87.64

    92.49

    89.66

    2.1

    db_mobilenet_v3_large

    (1024, 1024, 3)

    4.2 M

    79.35

    84.03

    81.14

    66.85

    -
    -

    All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

    -

    Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

    -

    FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.x12large AWS instance (CPU Xeon Platinum 8275L).

    -
    -
    -

    Detection predictors

    -

    detection_predictor wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly.

    -
    >>> import numpy as np
    ->>> from doctr.models import detection_predictor
    ->>> predictor = detection_predictor('db_resnet50')
    ->>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
    ->>> out = model([dummy_img])
    -
    -
    -
    -
    -
    -

    Text Recognition

    -

    The task consists of transcribing the character sequence in a given image.

    -
    -

    Available architectures

    -

    The following architectures are currently supported:

    - -

    For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    Text recognition model zoo

    Architecture

    Input shape

    # params

    FUNSD

    CORD

    FPS

    crnn_vgg16_bn

    (32, 128, 3)

    15.8M

    87.18

    92.93

    12.8

    crnn_mobilenet_v3_small

    (32, 128, 3)

    2.1M

    86.21

    90.56

    crnn_mobilenet_v3_large

    (32, 128, 3)

    4.5M

    86.95

    92.03

    sar_resnet31

    (32, 128, 3)

    56.2M

    87.70

    93.41

    2.7

    master

    (32, 128, 3)

    67.7M

    87.62

    93.27

    -
    -

    All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metric being used (exact match) are available in Task evaluation.

    -

    While most of our recognition models were trained on our french vocab (cf. Supported Vocabs), you can easily access the vocab of any model as follows:

    -
    >>> from doctr.models import recognition_predictor
    ->>> predictor = recognition_predictor('crnn_vgg16_bn')
    ->>> print(predictor.model.cfg['vocab'])
    -
    -
    -

    Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities

    -

    FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.x12large AWS instance (CPU Xeon Platinum 8275L).

    -
    -
    -

    Recognition predictors

    -

    recognition_predictor wraps your recognition model to make it easily useable with your favorite deep learning framework seamlessly.

    -
    >>> import numpy as np
    ->>> from doctr.models import recognition_predictor
    ->>> predictor = recognition_predictor('crnn_vgg16_bn')
    ->>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8)
    ->>> out = model([dummy_img])
    -
    -
    -
    -
    -
    -

    End-to-End OCR

    -

    The task consists of both localizing and transcribing textual elements in a given image.

    -
    -

    Available architectures

    -

    You can use any combination of detection and recognition models supporte by docTR.

    -

    For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    FUNSD

    CORD

    Architecture

    Recall

    Precision

    FPS

    Recall

    Precision

    FPS

    db_resnet50 + crnn_vgg16_bn

    71.25

    76.02

    0.85

    84.00

    81.42

    1.6

    db_resnet50 + master

    71.03

    76.06

    84.49

    81.94

    db_resnet50 + sar_resnet31

    71.25

    76.29

    0.27

    84.50

    81.96

    0.83

    db_resnet50 + crnn_mobilenet_v3_small

    69.85

    74.80

    80.85

    78.42

    0.83

    db_resnet50 + crnn_mobilenet_v3_large

    70.57

    75.57

    82.57

    80.08

    0.83

    db_mobilenet_v3_large + crnn_vgg16_bn

    67.73

    71.73

    71.65

    59.03

    Gvision text detection

    59.50

    62.50

    75.30

    70.00

    Gvision doc. text detection

    64.00

    53.30

    68.90

    61.10

    AWS textract

    78.10

    83.00

    87.50

    66.00

    -
    -

    All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

    -

    Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities

    -

    FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a c5.x12large AWS instance (CPU Xeon Platinum 8275L).

    -

    Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information.

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Receipts

    Invoices

    IDs

    US Tax Forms

    Resumes

    Road Fines

    Architecture

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    db_resnet50 + crnn_vgg16_bn (ours)

    78.70

    81.12

    65.80

    70.70

    50.25

    51.78

    79.08

    92.83

    db_resnet50 + master (ours)

    79.00

    81.42

    65.57

    69.86

    51.34

    52.90

    78.86

    92.57

    db_resnet50 + sar_resnet31 (ours)

    78.94

    81.37

    65.89

    70.79

    51.78

    53.35

    79.04

    92.78

    db_resnet50 + crnn_mobilenet_v3_small (ours)

    76.81

    79.15

    64.89

    69.61

    45.03

    46.38

    78.96

    92.11

    85.91

    87.20

    84.85

    85.86

    db_resnet50 + crnn_mobilenet_v3_large (ours)

    78.01

    80.39

    65.36

    70.11

    48.00

    49.43

    79.39

    92.62

    87.68

    89.00

    85.65

    86.67

    db_mobilenet_v3_large + crnn_vgg16_bn (ours)

    78.36

    74.93

    63.04

    68.41

    39.36

    41.75

    72.14

    89.97

    Gvision doc. text detection

    68.91

    59.89

    63.20

    52.85

    43.70

    29.21

    69.79

    65.68

    AWS textract

    75.77

    77.70

    70.47

    69.13

    46.39

    43.32

    84.31

    98.11

    -
    -
    -
    -

    Two-stage approaches

    -

    Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. Everything is wrapped up with ocr_predictor.

    -
    >>> import numpy as np
    ->>> from doctr.models import ocr_predictor
    ->>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
    ->>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
    ->>> out = model([input_page])
    -
    -
    -
    -
    -

    What should I do with the output?

    -

    The ocr_predictor returns a Document object with a nested structure (with Page, Block, Line, Word, Artefact). -To get a better understanding of our document model, check our Document structure section

    -

    Here is a typical Document layout:

    -
    Document(
    -  (pages): [Page(
    -    dimensions=(340, 600)
    -    (blocks): [Block(
    -      (lines): [Line(
    -        (words): [
    -          Word(value='No.', confidence=0.91),
    -          Word(value='RECEIPT', confidence=0.99),
    -          Word(value='DATE', confidence=0.96),
    -        ]
    -      )]
    -      (artefacts): []
    -    )]
    -  )]
    -)
    -
    -
    -

    You can also export them as a nested dict, more appropriate for JSON format:

    -
    json_output = result.export()
    -
    -
    -

    For reference, here is the JSON export for the same Document as above:

    -
    {
    -  'pages': [
    -      {
    -          'page_idx': 0,
    -          'dimensions': (340, 600),
    -          'orientation': {'value': None, 'confidence': None},
    -          'language': {'value': None, 'confidence': None},
    -          'blocks': [
    -              {
    -                  'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
    -                  'lines': [
    -                      {
    -                          'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
    -                          'words': [
    -                              {
    -                                  'value': 'No.',
    -                                  'confidence': 0.914085328578949,
    -                                  'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875))
    -                              },
    -                              {
    -                                  'value': 'RECEIPT',
    -                                  'confidence': 0.9949972033500671,
    -                                  'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375))
    -                              },
    -                              {
    -                                  'value': 'DATE',
    -                                  'confidence': 0.9578408598899841,
    -                                  'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625))
    -                              }
    -                          ]
    -                      }
    -                  ],
    -                  'artefacts': []
    -              }
    -          ]
    -      }
    -  ]
    -}
    -
    -
    -

    To export the outpout as XML (hocr-format) you can use the export_as_xml method:

    -
    xml_output = result.export_as_xml()
    -for output in xml_output:
    -  xml_bytes_string = output[0]
    -  xml_element = output[1]
    -
    -
    -

    For reference, here is a sample XML byte string output:

    -
    <?xml version="1.0" encoding="UTF-8"?>
    -<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
    -  <head>
    -    <title>docTR - hOCR</title>
    -    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    -    <meta name="ocr-system" content="doctr 0.5.0" />
    -    <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
    -  </head>
    -  <body>
    -    <div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
    -    <div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
    -      <p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
    -        <span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
    -          <span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
    -          <span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
    -          <span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
    -        </span>
    -      </p>
    -    </div>
    -  </body>
    -</html>
    -
    -
    -
    -
    -
    - -
    -
    - -
    - -
    -
    - - - - - - - - \ No newline at end of file diff --git a/v0.6.0/utils.html b/v0.6.0/utils.html index 21f708c953..1908ef4ff4 100644 --- a/v0.6.0/utils.html +++ b/v0.6.0/utils.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.utils - docTR documentation @@ -227,28 +227,21 @@ @@ -327,25 +320,6 @@

    Visualization -
    -doctr.utils.visualization.synthesize_page(page: Dict[str, Any], draw_proba: bool = False, font_size: int = 13, font_family: str | None = None) ndarray[source]
    -

    Draw a the content of the element page (OCR response) on a blank page.

    -
    -
    Parameters:
    -
      -
    • page – exported Page object to represent

    • -
    • draw_proba – if True, draw words in colors to represent confidence. Blue: p=1, red: p=0

    • -
    • font_size – size of the font, default font = 13

    • -
    • font_family – family of the font

    • -
    -
    -
    Returns:
    -

    the synthesized page

    -
    -
    -
    -

    Task evaluation

    @@ -382,20 +356,6 @@

    Visualization -
    -update(gt: List[str], pred: List[str]) None[source]
    -

    Update the state of the metric with new predictions

    -
    -
    Parameters:
    -
      -
    • gt – list of groung-truth character sequences

    • -
    • pred – list of predicted character sequences

    • -
    -
    -
    -
    -
    summary() Dict[str, float][source]
    @@ -412,14 +372,14 @@

    Visualization
    -class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
    +class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]

    Implements common confusion metrics and mean IoU for localization evaluation.

    The aggregated metrics are computed as follows:

    \[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ +Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]

    with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and @@ -448,28 +408,9 @@

    Visualization
    Parameters:
    -
      -
    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    • -
    • use_polygons – if set to True, predictions and targets will be expected to have rotated format

    • -
    • mask_shape – if use_polygons is True, describes the spatial shape of the image used

    • -
    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

    • -
    +

    iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    -
    -
    -update(gts: ndarray, preds: ndarray) None[source]
    -

    Updates the metric

    -
    -
    Parameters:
    -
      -
    • gts – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

    • -
    • preds – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

    • -
    -
    -
    -
    -
    summary() Tuple[float | None, float | None, float | None][source]
    @@ -485,15 +426,15 @@

    Visualization
    -class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
    -

    Implements an end-to-end OCR metric.

    +class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source] +

    Implements end-to-end OCR metric.

    The aggregated metrics are computed as follows:

    \[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ +Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]

    with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and @@ -525,116 +466,16 @@

    Visualization
    Parameters:
    -
      -
    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    • -
    • use_polygons – if set to True, predictions and targets will be expected to have rotated format

    • -
    • mask_shape – if use_polygons is True, describes the spatial shape of the image used

    • -
    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

    • -
    -
    -

    -
    -
    -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: List[str], pred_labels: List[str]) None[source]
    -

    Updates the metric

    -
    -
    Parameters:
    -
      -
    • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

    • -
    • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

    • -
    • gt_labels – a list of N string labels

    • -
    • pred_labels – a list of M string labels

    • -
    +

    iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    -
    -
    summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]

    Computes the aggregated metrics

    Returns:
    -

    a tuple with the recall & precision for each string comparison and the mean IoU

    -
    -
    -
    - - - -
    -
    -class doctr.utils.metrics.DetectionMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
    -

    Implements an object detection metric.

    -

    The aggregated metrics are computed as follows:

    -
    -
    -\[\begin{split}\forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, -\forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ -Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ -Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
    -
    -

    with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, C}\) defined as:

    -
    -
    -\[\begin{split}\forall (b, c) \in \mathcal{B} \times \mathcal{C}, -h_{B,C}(b, c) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
    -
    -

    where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{C}\) is the set of possible class indices, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

    -
    -
    Example::
    >>> import numpy as np
    ->>> from doctr.utils import DetectionMetric
    ->>> metric = DetectionMetric(iou_thresh=0.5)
    ->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
    -np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64))
    ->>> metric.summary()
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    • -
    • use_polygons – if set to True, predictions and targets will be expected to have rotated format

    • -
    • mask_shape – if use_polygons is True, describes the spatial shape of the image used

    • -
    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

    • -
    -
    -
    -
    -
    -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: ndarray, pred_labels: ndarray) None[source]
    -

    Updates the metric

    -
    -
    Parameters:
    -
      -
    • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

    • -
    • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

    • -
    • gt_labels – an array of class indices of shape (N,)

    • -
    • pred_labels – an array of class indices of shape (M,)

    • -
    -
    -
    -
    - -
    -
    -summary() Tuple[float | None, float | None, float | None][source]
    -

    Computes the aggregated metrics

    -
    -
    Returns:
    -

    a tuple with the recall & precision for each class prediction and the mean IoU

    +

    a tuple with the recall & precision for each string comparison flexibility and the mean IoU

    @@ -649,15 +490,7 @@

    Visualization - -
    -
    - Next -
    -
    Changelog
    -
    - -
    + diff --git a/v0.7.0/_modules/doctr/datasets/classification/tensorflow.html b/v0.7.0/_modules/doctr/datasets/classification/tensorflow.html deleted file mode 100644 index 829b6efb9d..0000000000 --- a/v0.7.0/_modules/doctr/datasets/classification/tensorflow.html +++ /dev/null @@ -1,366 +0,0 @@ - - - - - - - - - - - - doctr.datasets.classification.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
    -
    -
    - -
    - -
    -
    - -
    - -
    -
    - -
    -
    -
    - - - - - Back to top - -
    -
    - -
    - -
    -
    -

    Source code for doctr.datasets.classification.tensorflow

    -# Copyright (C) 2021, Mindee.
    -
    -# This program is licensed under the Apache License version 2.
    -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
    -
    -import tensorflow as tf
    -
    -from .base import _CharacterGenerator
    -
    -__all__ = ['CharacterGenerator']
    -
    -
    -
    -[docs] -class CharacterGenerator(_CharacterGenerator): - """Implements a character image generation dataset - - Example:: - >>> from doctr.datasets import CharacterGenerator - >>> ds = CharacterGenerator(vocab='abdef') - >>> img, target = ds[0] - - Args: - vocab: vocabulary to take the character from - num_samples: number of samples that will be generated iterating over the dataset - cache_samples: whether generated images should be cached firsthand - sample_transforms: composable transformations that will be applied to each image - """ - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - @staticmethod - def collate_fn(samples): - - images, targets = zip(*samples) - images = tf.stack(images, axis=0) - - return images, tf.convert_to_tensor(targets)
    - -
    -
    -
    -
    - - -
    -
    - - Made with Sphinx and @pradyunsg's - - Furo - -
    -
    - -
    -
    - -
    -
    - -
    -
    - - - - - - - - - \ No newline at end of file diff --git a/v0.7.0/_modules/doctr/datasets/datasets/tensorflow.html b/v0.7.0/_modules/doctr/datasets/datasets/tensorflow.html index 8a191ecfc7..fddca20034 100644 --- a/v0.7.0/_modules/doctr/datasets/datasets/tensorflow.html +++ b/v0.7.0/_modules/doctr/datasets/datasets/tensorflow.html @@ -236,7 +236,7 @@

    Package Reference

    • doctr.datasets
    • -
    • doctr.io
    • +
    • doctr.documents
    • doctr.models
    • doctr.transforms
    • doctr.utils
    • @@ -284,7 +284,6 @@

      Source code for doctr.datasets.datasets.tensorflow

      from typing import List, Any, Tuple import tensorflow as tf -from doctr.io import read_img_as_tensor from .base import _AbstractDataset, _VisionDataset @@ -293,14 +292,11 @@

      Source code for doctr.datasets.datasets.tensorflow

      class AbstractDataset(_AbstractDataset): - @staticmethod - def _get_img_shape(img: Any) -> Tuple[int, int]: - return img.shape[:2] - def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]: img_name, target = self.data[index] # Read image - img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float16 if self.fp16 else tf.float32) + img = tf.io.read_file(os.path.join(self.root, img_name)) + img = tf.image.decode_jpeg(img, channels=3) return img, target @@ -350,7 +346,7 @@

      Source code for doctr.datasets.datasets.tensorflow

      +
      diff --git a/v0.7.0/_modules/doctr/models/backbones/mobilenet/tensorflow.html b/v0.7.0/_modules/doctr/models/backbones/mobilenet/tensorflow.html deleted file mode 100644 index a0f857205e..0000000000 --- a/v0.7.0/_modules/doctr/models/backbones/mobilenet/tensorflow.html +++ /dev/null @@ -1,688 +0,0 @@ - - - - - - - - - - - - doctr.models.backbones.mobilenet.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
      -
      -
      - -
      - -
      -
      - -
      - -
      -
      - -
      -
      -
      - - - - - Back to top - -
      -
      - -
      - -
      -
      -

      Source code for doctr.models.backbones.mobilenet.tensorflow

      -# Copyright (C) 2021, Mindee.
      -
      -# This program is licensed under the Apache License version 2.
      -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
      -
      -# Greatly inspired by https://github.com/pytorch/vision/blob/master/torchvision/models/mobilenetv3.py
      -
      -from typing import Any, Dict, List, Optional, Tuple, Union
      -
      -import tensorflow as tf
      -from tensorflow.keras import layers
      -from tensorflow.keras.models import Sequential
      -
      -from ....datasets import VOCABS
      -from ...utils import conv_sequence, load_pretrained_params
      -
      -__all__ = ["MobileNetV3", "mobilenet_v3_small", "mobilenet_v3_small_r", "mobilenet_v3_large",
      -           "mobilenet_v3_large_r"]
      -
      -
      -default_cfgs: Dict[str, Dict[str, Any]] = {
      -    'mobilenet_v3_large': {
      -        'mean': (0.694, 0.695, 0.693),
      -        'std': (0.299, 0.296, 0.301),
      -        'input_shape': (32, 32, 3),
      -        'vocab': VOCABS['legacy_french'],
      -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/mobilenet_v3_large-d27d66f2.zip'
      -    },
      -    'mobilenet_v3_large_r': {
      -        'mean': (0.694, 0.695, 0.693),
      -        'std': (0.299, 0.296, 0.301),
      -        'input_shape': (32, 32, 3),
      -        'vocab': VOCABS['french'],
      -        'url': None,
      -    },
      -    'mobilenet_v3_small': {
      -        'mean': (0.694, 0.695, 0.693),
      -        'std': (0.299, 0.296, 0.301),
      -        'input_shape': (32, 32, 3),
      -        'vocab': VOCABS['legacy_french'],
      -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/mobilenet_v3_small-d624c4de.zip'
      -    },
      -    'mobilenet_v3_small_r': {
      -        'mean': (0.694, 0.695, 0.693),
      -        'std': (0.299, 0.296, 0.301),
      -        'input_shape': (32, 32, 3),
      -        'vocab': VOCABS['french'],
      -        'url': None,
      -    }
      -}
      -
      -
      -def hard_swish(x: tf.Tensor) -> tf.Tensor:
      -    return x * tf.nn.relu6(x + 3.) / 6.0
      -
      -
      -def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int:
      -    if min_value is None:
      -        min_value = divisor
      -    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
      -    # Make sure that round down does not go down by more than 10%.
      -    if new_v < 0.9 * v:
      -        new_v += divisor
      -    return new_v
      -
      -
      -class SqueezeExcitation(Sequential):
      -    """Squeeze and Excitation.
      -    """
      -    def __init__(self, chan: int, squeeze_factor: int = 4) -> None:
      -        super().__init__(
      -            [
      -                layers.GlobalAveragePooling2D(),
      -                layers.Dense(chan // squeeze_factor, activation='relu'),
      -                layers.Dense(chan, activation='hard_sigmoid'),
      -                layers.Reshape((1, 1, chan))
      -            ]
      -        )
      -
      -    def call(self, inputs: tf.Tensor, **kwargs: Any) -> tf.Tensor:
      -        x = super().call(inputs, **kwargs)
      -        x = tf.math.multiply(inputs, x)
      -        return x
      -
      -
      -class InvertedResidualConfig:
      -    def __init__(
      -        self,
      -        input_channels: int,
      -        kernel: int,
      -        expanded_channels: int,
      -        out_channels: int,
      -        use_se: bool,
      -        activation: str,
      -        stride: Union[int, Tuple[int, int]],
      -        width_mult: float = 1,
      -    ) -> None:
      -        self.input_channels = self.adjust_channels(input_channels, width_mult)
      -        self.kernel = kernel
      -        self.expanded_channels = self.adjust_channels(expanded_channels, width_mult)
      -        self.out_channels = self.adjust_channels(out_channels, width_mult)
      -        self.use_se = use_se
      -        self.use_hs = activation == "HS"
      -        self.stride = stride
      -
      -    @staticmethod
      -    def adjust_channels(channels: int, width_mult: float):
      -        return _make_divisible(channels * width_mult, 8)
      -
      -
      -class InvertedResidual(layers.Layer):
      -    """InvertedResidual for mobilenet
      -
      -    Args:
      -        conf: configuration object for inverted residual
      -    """
      -    def __init__(
      -        self,
      -        conf: InvertedResidualConfig,
      -        **kwargs: Any,
      -    ) -> None:
      -        _kwargs = {'input_shape': kwargs.pop('input_shape')} if isinstance(kwargs.get('input_shape'), tuple) else {}
      -        super().__init__(**kwargs)
      -
      -        act_fn = hard_swish if conf.use_hs else tf.nn.relu
      -
      -        _is_s1 = (isinstance(conf.stride, tuple) and conf.stride == (1, 1)) or conf.stride == 1
      -        self.use_res_connect = _is_s1 and conf.input_channels == conf.out_channels
      -
      -        _layers = []
      -        # expand
      -        if conf.expanded_channels != conf.input_channels:
      -            _layers.extend(conv_sequence(conf.expanded_channels, act_fn, kernel_size=1, bn=True, **_kwargs))
      -
      -        # depth-wise
      -        _layers.extend(conv_sequence(
      -            conf.expanded_channels, act_fn, kernel_size=conf.kernel, strides=conf.stride, bn=True,
      -            groups=conf.expanded_channels,
      -        ))
      -
      -        if conf.use_se:
      -            _layers.append(SqueezeExcitation(conf.expanded_channels))
      -
      -        # project
      -        _layers.extend(conv_sequence(
      -            conf.out_channels, None, kernel_size=1, bn=True,
      -        ))
      -
      -        self.block = Sequential(_layers)
      -
      -    def call(
      -        self,
      -        inputs: tf.Tensor,
      -        **kwargs: Any,
      -    ) -> tf.Tensor:
      -
      -        out = self.block(inputs, **kwargs)
      -        if self.use_res_connect:
      -            out = tf.add(out, inputs)
      -
      -        return out
      -
      -
      -class MobileNetV3(Sequential):
      -    """Implements MobileNetV3, inspired from both:
      -    <https://github.com/xiaochus/MobileNetV3/tree/master/model>`_.
      -    and <https://pytorch.org/vision/stable/_modules/torchvision/models/mobilenetv3.html>`_.
      -    """
      -
      -    def __init__(
      -        self,
      -        layout: List[InvertedResidualConfig],
      -        input_shape: Tuple[int, int, int],
      -        include_top: bool = False,
      -        head_chans: int = 1024,
      -        num_classes: int = 1000,
      -    ) -> None:
      -
      -        _layers = [
      -            Sequential(conv_sequence(layout[0].input_channels, hard_swish, True, kernel_size=3, strides=2,
      -                       input_shape=input_shape), name="stem")
      -        ]
      -
      -        for idx, conf in enumerate(layout):
      -            _layers.append(
      -                InvertedResidual(conf, name=f"inverted_{idx}"),
      -            )
      -
      -        _layers.append(
      -            Sequential(
      -                conv_sequence(6 * layout[-1].out_channels, hard_swish, True, kernel_size=1),
      -                name="final_block"
      -            )
      -        )
      -
      -        if include_top:
      -            _layers.extend([
      -                layers.GlobalAveragePooling2D(),
      -                layers.Dense(head_chans, activation=hard_swish),
      -                layers.Dropout(0.2),
      -                layers.Dense(num_classes),
      -            ])
      -
      -        super().__init__(_layers)
      -
      -
      -def _mobilenet_v3(
      -    arch: str,
      -    pretrained: bool,
      -    input_shape: Optional[Tuple[int, int, int]] = None,
      -    **kwargs: Any
      -) -> MobileNetV3:
      -    input_shape = input_shape or default_cfgs[arch]['input_shape']
      -
      -    # cf. Table 1 & 2 of the paper
      -    if arch.startswith("mobilenet_v3_small"):
      -        inverted_residual_setting = [
      -            InvertedResidualConfig(16, 3, 16, 16, True, "RE", 2),  # C1
      -            InvertedResidualConfig(16, 3, 72, 24, False, "RE", (2, 1) if arch.endswith("_r") else 2),  # C2
      -            InvertedResidualConfig(24, 3, 88, 24, False, "RE", 1),
      -            InvertedResidualConfig(24, 5, 96, 40, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C3
      -            InvertedResidualConfig(40, 5, 240, 40, True, "HS", 1),
      -            InvertedResidualConfig(40, 5, 240, 40, True, "HS", 1),
      -            InvertedResidualConfig(40, 5, 120, 48, True, "HS", 1),
      -            InvertedResidualConfig(48, 5, 144, 48, True, "HS", 1),
      -            InvertedResidualConfig(48, 5, 288, 96, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C4
      -            InvertedResidualConfig(96, 5, 576, 96, True, "HS", 1),
      -            InvertedResidualConfig(96, 5, 576, 96, True, "HS", 1),
      -        ]
      -        head_chans = 1024
      -    else:
      -        inverted_residual_setting = [
      -            InvertedResidualConfig(16, 3, 16, 16, False, "RE", 1),
      -            InvertedResidualConfig(16, 3, 64, 24, False, "RE", 2),  # C1
      -            InvertedResidualConfig(24, 3, 72, 24, False, "RE", 1),
      -            InvertedResidualConfig(24, 5, 72, 40, True, "RE", (2, 1) if arch.endswith("_r") else 2),  # C2
      -            InvertedResidualConfig(40, 5, 120, 40, True, "RE", 1),
      -            InvertedResidualConfig(40, 5, 120, 40, True, "RE", 1),
      -            InvertedResidualConfig(40, 3, 240, 80, False, "HS", (2, 1) if arch.endswith("_r") else 2),  # C3
      -            InvertedResidualConfig(80, 3, 200, 80, False, "HS", 1),
      -            InvertedResidualConfig(80, 3, 184, 80, False, "HS", 1),
      -            InvertedResidualConfig(80, 3, 184, 80, False, "HS", 1),
      -            InvertedResidualConfig(80, 3, 480, 112, True, "HS", 1),
      -            InvertedResidualConfig(112, 3, 672, 112, True, "HS", 1),
      -            InvertedResidualConfig(112, 5, 672, 160, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C4
      -            InvertedResidualConfig(160, 5, 960, 160, True, "HS", 1),
      -            InvertedResidualConfig(160, 5, 960, 160, True, "HS", 1),
      -        ]
      -        head_chans = 1280
      -
      -    kwargs['num_classes'] = kwargs.get('num_classes', len(default_cfgs[arch]['vocab']))
      -
      -    # Build the model
      -    model = MobileNetV3(
      -        inverted_residual_setting,
      -        input_shape,
      -        head_chans=head_chans,
      -        **kwargs,
      -    )
      -    # Load pretrained parameters
      -    if pretrained:
      -        load_pretrained_params(model, default_cfgs[arch]['url'])
      -
      -    return model
      -
      -
      -
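The private builder above resolves the per-architecture layout (switching the C2-C4 strides to (2, 1) for the "_r" rectangular variants), defaults num_classes to the vocabulary size, instantiates MobileNetV3 and optionally loads pretrained weights; the public helpers below simply forward the architecture name to it. A minimal sketch, assuming a matching entry exists in default_cfgs:

>>> model = _mobilenet_v3('mobilenet_v3_small_r', pretrained=False, input_shape=(512, 512, 3))
>>> isinstance(model, MobileNetV3)
True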
-
-[docs]
-def mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Small architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_small
-        >>> model = mobilenet_v3_small(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-
-    return _mobilenet_v3('mobilenet_v3_small', pretrained, **kwargs)
-
-
-[docs]
-def mobilenet_v3_small_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Small architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_, with rectangular pooling.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_small_r
-        >>> model = mobilenet_v3_small_r(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-
-    return _mobilenet_v3('mobilenet_v3_small_r', pretrained, **kwargs)
-
-
-[docs]
-def mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Large architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_large
-        >>> model = mobilenet_v3_large(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-    return _mobilenet_v3('mobilenet_v3_large', pretrained, **kwargs)
-
-
-[docs]
-def mobilenet_v3_large_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Large architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_, with rectangular pooling.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_large_r
-        >>> model = mobilenet_v3_large_r(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-    return _mobilenet_v3('mobilenet_v3_large_r', pretrained, **kwargs)
-
\ No newline at end of file
diff --git a/v0.7.0/_modules/doctr/models/backbones/resnet/tensorflow.html b/v0.7.0/_modules/doctr/models/backbones/resnet/tensorflow.html
deleted file mode 100644
index d959be9a0f..0000000000
--- a/v0.7.0/_modules/doctr/models/backbones/resnet/tensorflow.html
+++ /dev/null
@@ -1,522 +0,0 @@

      Source code for doctr.models.backbones.resnet.tensorflow

      -# Copyright (C) 2021, Mindee.
      -
      -# This program is licensed under the Apache License version 2.
      -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
      -
      -from typing import Any, Dict, List, Optional, Tuple
      -
      -import tensorflow as tf
      -from tensorflow.keras import layers
      -from tensorflow.keras.models import Sequential
      -
      -from ...utils import conv_sequence, load_pretrained_params
      -
      -__all__ = ['ResNet', 'resnet31', 'ResnetStage']
      -
      -
      -default_cfgs: Dict[str, Dict[str, Any]] = {
      -    'resnet31': {'num_blocks': (1, 2, 5, 3), 'output_channels': (256, 256, 512, 512),
      -                 'conv_seq': (True, True, True, True), 'pooling': ((2, 2), (2, 1), None, None),
      -                 'url': None},
      -}
      -
      -
      -class ResnetBlock(layers.Layer):
      -
      -    """Implements a resnet31 block with shortcut
      -
      -    Args:
      -        conv_shortcut: Use of shortcut
      -        output_channels: number of channels to use in Conv2D
      -        kernel_size: size of square kernels
      -        strides: strides to use in the first convolution of the block
      -    """
      -    def __init__(
      -        self,
      -        output_channels: int,
      -        conv_shortcut: bool,
      -        strides: int = 1,
      -        **kwargs
      -    ) -> None:
      -
      -        super().__init__(**kwargs)
      -        if conv_shortcut:
      -            self.shortcut = Sequential(
      -                [
      -                    layers.Conv2D(
      -                        filters=output_channels,
      -                        strides=strides,
      -                        padding='same',
      -                        kernel_size=1,
      -                        use_bias=False,
      -                        kernel_initializer='he_normal'
      -                    ),
      -                    layers.BatchNormalization()
      -                ]
      -            )
      -        else:
      -            self.shortcut = layers.Lambda(lambda x: x)
      -        self.conv_block = Sequential(
      -            self.conv_resnetblock(output_channels, 3, strides)
      -        )
      -        self.act = layers.Activation('relu')
      -
      -    @staticmethod
      -    def conv_resnetblock(
      -        output_channels: int,
      -        kernel_size: int,
      -        strides: int = 1,
      -    ) -> List[layers.Layer]:
      -        return [
      -            *conv_sequence(output_channels, activation='relu', bn=True, strides=strides, kernel_size=kernel_size),
      -            layers.Conv2D(output_channels, kernel_size, padding='same', use_bias=False, kernel_initializer='he_normal'),
      -            layers.BatchNormalization(),
      -        ]
      -
      -    def call(
      -        self,
      -        inputs: tf.Tensor
      -    ) -> tf.Tensor:
      -        clone = self.shortcut(inputs)
      -        conv_out = self.conv_block(inputs)
      -        out = self.act(clone + conv_out)
      -
      -        return out
      -
      -
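A quick shape sketch of the block above, assuming conv_sequence uses 'same' padding so spatial dimensions are preserved: with conv_shortcut=True the identity path is a 1x1 convolution plus batch normalization, so both branches end up with output_channels feature maps and can be summed before the final ReLU.

>>> block = ResnetBlock(output_channels=256, conv_shortcut=True)
>>> x = tf.random.uniform((1, 32, 128, 64))
>>> block(x).shape
TensorShape([1, 32, 128, 256])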
      -class ResnetStage(Sequential):
      -
      -    """Implements a resnet31 stage
      -
      -    Args:
      -        num_blocks: number of blocks inside the stage
      -        output_channels: number of channels to use in Conv2D
      -        downsample: if true, performs a /2 downsampling at the first block of the stage
      -    """
      -    def __init__(
      -        self,
      -        num_blocks: int,
      -        output_channels: int,
      -        downsample: bool = False,
      -    ) -> None:
      -
      -        super().__init__()
      -        final_blocks = [
      -            ResnetBlock(output_channels, conv_shortcut=False) for _ in range(1, num_blocks)
      -        ]
      -        if downsample is True:
      -            self.add(ResnetBlock(output_channels, conv_shortcut=True, strides=2))
      -        else:
      -            self.add(ResnetBlock(output_channels, conv_shortcut=True))
      -        for final_block in final_blocks:
      -            self.add(final_block)
      -
      -
      -class ResNet(Sequential):
      -
      -    """Resnet class with two convolutions and a maxpooling before the first stage
      -
      -    Args:
-        num_blocks: number of resnet blocks in each stage
-        output_channels: number of channels in each stage
-        conv_seq: whether to add a conv_sequence after each stage
      -        pooling: pooling to add after each stage (if None, no pooling)
      -        input_shape: shape of inputs
      -        include_top: whether the classifier head should be instantiated
      -    """
      -
      -    def __init__(
      -        self,
      -        num_blocks: Tuple[int, int, int, int],
      -        output_channels: Tuple[int, int, int, int],
      -        conv_seq: Tuple[bool, bool, bool, bool],
      -        pooling: Tuple[
      -            Optional[Tuple[int, int]],
      -            Optional[Tuple[int, int]],
      -            Optional[Tuple[int, int]],
      -            Optional[Tuple[int, int]]
      -        ],
      -        input_shape: Tuple[int, int, int] = (640, 640, 3),
      -        include_top: bool = False,
      -    ) -> None:
      -
      -        _layers = [
      -            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
      -            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
      -            layers.MaxPool2D(pool_size=2, strides=2, padding='valid'),
      -        ]
      -        for n_blocks, out_channels, conv, pool in zip(num_blocks, output_channels, conv_seq, pooling):
      -            _layers.append(ResnetStage(n_blocks, out_channels))
      -            if conv:
      -                _layers.extend(conv_sequence(out_channels, activation='relu', bn=True, kernel_size=3))
      -            if pool:
      -                _layers.append(layers.MaxPool2D(pool_size=pool, strides=pool, padding='valid'))
      -        super().__init__(_layers)
      -
      -
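With the 'resnet31' configuration above (an initial 2x2 max-pooling, a (2, 2) pooling after the first stage, a (2, 1) pooling after the second, and none afterwards), the overall downsizing works out to (H/8, W/4), which is exactly what the resnet31 docstring further down advertises. A worked example on a 224x224 input:

>>> # 224x224 -> stem MaxPool2D(2): 112x112 -> stage 1 pool (2, 2): 56x56 -> stage 2 pool (2, 1): 28x56
>>> (224 // 8, 224 // 4)
(28, 56)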
      -def _resnet(arch: str, pretrained: bool, **kwargs: Any) -> ResNet:
      -
      -    # Build the model
      -    model = ResNet(
      -        default_cfgs[arch]['num_blocks'],
      -        default_cfgs[arch]['output_channels'],
      -        default_cfgs[arch]['conv_seq'],
      -        default_cfgs[arch]['pooling'],
      -        **kwargs
      -    )
      -    # Load pretrained parameters
      -    if pretrained:
      -        load_pretrained_params(model, default_cfgs[arch]['url'])
      -
      -    return model
      -
      -
      -
-
-[docs]
-def resnet31(pretrained: bool = False, **kwargs: Any) -> ResNet:
-    """Resnet31 architecture with rectangular pooling windows as described in
-    `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition",
-    <https://arxiv.org/pdf/1811.00751.pdf>`_. Downsizing: (H, W) --> (H/8, W/4)
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import resnet31
-        >>> model = resnet31(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        A resnet31 model
-    """
-
-    return _resnet('resnet31', pretrained, **kwargs)
-
\ No newline at end of file
diff --git a/v0.7.0/_modules/doctr/models/backbones/vgg/tensorflow.html b/v0.7.0/_modules/doctr/models/backbones/vgg/tensorflow.html
deleted file mode 100644
index 48c285257a..0000000000
--- a/v0.7.0/_modules/doctr/models/backbones/vgg/tensorflow.html
+++ /dev/null
@@ -1,413 +0,0 @@

      Source code for doctr.models.backbones.vgg.tensorflow

      -# Copyright (C) 2021, Mindee.
      -
      -# This program is licensed under the Apache License version 2.
      -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
      -
      -from typing import Any, Dict, Tuple
      -
      -from tensorflow.keras import layers
      -from tensorflow.keras.models import Sequential
      -
      -from ...utils import conv_sequence, load_pretrained_params
      -
      -__all__ = ['VGG', 'vgg16_bn']
      -
      -
      -default_cfgs: Dict[str, Dict[str, Any]] = {
      -    'vgg16_bn': {'num_blocks': (2, 2, 3, 3, 3), 'planes': (64, 128, 256, 512, 512),
      -                 'rect_pools': (False, False, True, True, True),
      -                 'url': None},
      -}
      -
      -
      -class VGG(Sequential):
      -    """Implements the VGG architecture from `"Very Deep Convolutional Networks for Large-Scale Image Recognition"
      -    <https://arxiv.org/pdf/1409.1556.pdf>`_.
      -
      -    Args:
-        num_blocks: number of convolutional blocks in each stage
-        planes: number of output channels in each stage
-        rect_pools: whether square pooling kernels should be replaced with rectangular ones
-        input_shape: shape of the input tensor
      -        include_top: whether the classifier head should be instantiated
      -    """
      -    def __init__(
      -        self,
      -        num_blocks: Tuple[int, int, int, int, int],
      -        planes: Tuple[int, int, int, int, int],
      -        rect_pools: Tuple[bool, bool, bool, bool, bool],
      -        input_shape: Tuple[int, int, int] = (512, 512, 3),
      -        include_top: bool = False,
      -    ) -> None:
      -
      -        _layers = []
      -        # Specify input_shape only for the first layer
      -        kwargs = {"input_shape": input_shape}
      -        for nb_blocks, out_chan, rect_pool in zip(num_blocks, planes, rect_pools):
      -            for _ in range(nb_blocks):
      -                _layers.extend(conv_sequence(out_chan, 'relu', True, kernel_size=3, **kwargs))  # type: ignore[arg-type]
      -                kwargs = {}
      -            _layers.append(layers.MaxPooling2D((2, 1 if rect_pool else 2)))
      -        super().__init__(_layers)
      -
      -
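For the 'vgg16_bn' configuration above, rect_pools is (False, False, True, True, True), so the last three stages pool with a (2, 1) window and only halve the height. Starting from the default 512x512 input, the feature map therefore ends up 16 pixels tall and 128 pixels wide (with 512 channels from the last stage):

>>> # height is halved by all five pools, width only by the two square ones
>>> (512 // 2 ** 5, 512 // 2 ** 2)
(16, 128)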
      -def _vgg(arch: str, pretrained: bool, **kwargs: Any) -> VGG:
      -
      -    # Build the model
      -    model = VGG(default_cfgs[arch]['num_blocks'], default_cfgs[arch]['planes'],
      -                default_cfgs[arch]['rect_pools'], **kwargs)
      -    # Load pretrained parameters
      -    if pretrained:
      -        load_pretrained_params(model, default_cfgs[arch]['url'])
      -
      -    return model
      -
      -
      -
-
-[docs]
-def vgg16_bn(pretrained: bool = False, **kwargs: Any) -> VGG:
-    """VGG-16 architecture as described in `"Very Deep Convolutional Networks for Large-Scale Image Recognition"
-    <https://arxiv.org/pdf/1409.1556.pdf>`_, modified by adding batch normalization.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import vgg16_bn
-        >>> model = vgg16_bn(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-
-    Returns:
-        VGG feature extractor
-    """
-
-    return _vgg('vgg16_bn', pretrained, **kwargs)
-
\ No newline at end of file
diff --git a/v0.7.0/_modules/doctr/models/classification/textnet/tensorflow.html b/v0.7.0/_modules/doctr/models/classification/textnet/tensorflow.html
index 8f38b3470e..407e480818 100644
--- a/v0.7.0/_modules/doctr/models/classification/textnet/tensorflow.html
+++ b/v0.7.0/_modules/doctr/models/classification/textnet/tensorflow.html
@@ -302,7 +302,7 @@

      Source code for doctr.models.classification.textnet.tensorflow

 from copy import deepcopy
 from typing import Any, Dict, List, Optional, Tuple
-from keras import Sequential, layers
+from tensorflow.keras import Sequential, layers
 from doctr.datasets import VOCABS
diff --git a/v0.7.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.7.0/_modules/doctr/models/detection/fast/tensorflow.html
index dc7d8f50f2..af51a9abeb 100644
--- a/v0.7.0/_modules/doctr/models/detection/fast/tensorflow.html
+++ b/v0.7.0/_modules/doctr/models/detection/fast/tensorflow.html
@@ -305,7 +305,7 @@

      Source code for doctr.models.detection.fast.tensorflow

      import numpy as np import tensorflow as tf -from keras import Model, Sequential, layers +from tensorflow.keras import Model, Sequential, layers from doctr.file_utils import CLASS_NAME from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params diff --git a/v0.7.0/_sources/datasets.rst.txt b/v0.7.0/_sources/datasets.rst.txt index 8a00eeaedd..354122f1e5 100644 --- a/v0.7.0/_sources/datasets.rst.txt +++ b/v0.7.0/_sources/datasets.rst.txt @@ -11,42 +11,22 @@ can be a significant save of time. Available Datasets ------------------ -Here are all datasets that are available through docTR: +The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. +.. autoclass:: doctr.datasets.datasets.VisionDataset -Public datasets -^^^^^^^^^^^^^^^ + +Here are all datasets that are available through DocTR: .. autoclass:: FUNSD .. autoclass:: SROIE .. autoclass:: CORD -.. autoclass:: IIIT5K -.. autoclass:: SVT -.. autoclass:: SVHN -.. autoclass:: SynthText -.. autoclass:: IC03 -.. autoclass:: IC13 - -docTR synthetic datasets -^^^^^^^^^^^^^^^^^^^^^^^^ - -.. autoclass:: DocArtefacts -.. autoclass:: CharacterGenerator -.. autoclass:: WordGenerator - -docTR private datasets -^^^^^^^^^^^^^^^^^^^^^^ - -Since many documents include sensitive / personal information, we are not able to share all the data that has been used for this project. However, we provide some guidance on how to format your own dataset into the same format so that you can use all docTR tools all the same. - -.. autoclass:: DetectionDataset -.. autoclass:: RecognitionDataset .. autoclass:: OCRDataset Data Loading ------------ -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR. +Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. .. autoclass:: doctr.datasets.loader.DataLoader @@ -56,10 +36,10 @@ Each dataset has its specific way to load a sample, but handling batch aggregati Supported Vocabs ---------------- -Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets of vocabs. -.. list-table:: docTR Vocabs +.. list-table:: DocTR Vocabs :widths: 20 5 50 :header-rows: 1 @@ -79,25 +59,10 @@ of vocabs. 
- 5 - £€¥¢฿ * - latin - - 94 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ - * - english - - 100 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ - * - legacy_french - - 123 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ + - 96 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° * - french - - 126 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ - * - portuguese - - 131 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿ - * - spanish - - 116 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿ - * - german - - 108 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ + - 154 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ .. autofunction:: encode_sequences diff --git a/v0.7.0/_sources/installing.rst.txt b/v0.7.0/_sources/installing.rst.txt index 8197df660d..5c8779dc1c 100644 --- a/v0.7.0/_sources/installing.rst.txt +++ b/v0.7.0/_sources/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.6 or higher. +This library requires Python 3.6 or higher. Prerequisites @@ -11,12 +11,12 @@ Prerequisites Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: -* `TensorFlow 2 `_ -* `PyTorch `_ +* TensorFlow: `installation page `_. +* PyTorch: `installation page `_. If you are running another OS than Linux, you will need a few extra dependencies. -For MacOS users, you can install them using `Homebrew `_ as follows: +For MacOS users, you can install them as follows: .. code:: shell @@ -28,23 +28,13 @@ For Windows users, those dependencies are included in GTK. You can find the late Via Python Package ================== -Install the last stable release of the package using `pip `_: +Install the last stable release of the package using pip: .. code:: bash pip install python-doctr -We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows: - -.. code:: bash - - # for TensorFlow - pip install "python-doctr[tf]" - # for PyTorch - pip install "python-doctr[torch]" - - Via Git ======= @@ -54,13 +44,3 @@ Install the library in developper mode: git clone https://github.com/mindee/doctr.git pip install -e doctr/. - -Again, for framework-specific builds: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - # for TensorFlow - pip install -e doctr/.[tf] - # for PyTorch - pip install -e doctr/.[torch] diff --git a/v0.7.0/_sources/io.rst.txt b/v0.7.0/_sources/io.rst.txt deleted file mode 100644 index 8fa887e9f9..0000000000 --- a/v0.7.0/_sources/io.rst.txt +++ /dev/null @@ -1,94 +0,0 @@ -doctr.io -======== - - -.. currentmodule:: doctr.io - -The io module enables users to easily access content from documents and export analysis -results to structured formats. 
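As a quick, hedged illustration of that entry point (the exact return types differ across docTR versions, and the path below is hypothetical):

>>> from doctr.io import DocumentFile
>>> pages = DocumentFile.from_images(["path/to/your/img.jpg"])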
- -.. _document_structure: - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img_as_numpy - -.. autofunction:: read_img_as_tensor - -.. autofunction:: decode_img_as_tensor - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_lines - - .. automethod:: get_artefacts diff --git a/v0.7.0/_sources/models.rst.txt b/v0.7.0/_sources/models.rst.txt index d4f36df9bb..9830c6c153 100644 --- a/v0.7.0/_sources/models.rst.txt +++ b/v0.7.0/_sources/models.rst.txt @@ -1,62 +1,215 @@ doctr.models ============ -.. currentmodule:: doctr.models - - -doctr.models.classification ----------------------- +The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. +Either performed at once or separately, to each task corresponds a type of deep learning architecture. -.. autofunction:: doctr.models.classification.vgg16_bn_r +.. currentmodule:: doctr.models -.. autofunction:: doctr.models.classification.resnet18 +For a given task, DocTR provides a Predictor, which is composed of 2 components: -.. autofunction:: doctr.models.classification.resnet31 +* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. +* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. -.. autofunction:: doctr.models.classification.mobilenet_v3_small -.. autofunction:: doctr.models.classification.mobilenet_v3_large +Text Detection +-------------- +Localizing text elements in images -.. 
autofunction:: doctr.models.classification.mobilenet_v3_small_r ++---------------------------------------------------+----------------------------+----------------------------+---------+ +| | FUNSD | CORD | | ++==================+=================+==============+============+===============+============+===============+=========+ +| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ +| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -.. autofunction:: doctr.models.classification.mobilenet_v3_large_r +All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. -.. autofunction:: doctr.models.classification.mobilenet_v3_small_orientation +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* -.. autofunction:: doctr.models.classification.magc_resnet31 +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. -.. autofunction:: doctr.models.classification.crop_orientation_predictor +Pre-processing for detection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for detection is the following: +1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. +2. batch images together +3. normalize the batch using the training data statistics -doctr.models.detection ----------------------- -.. autofunction:: doctr.models.detection.linknet_resnet18 +Detection models +^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: .. autofunction:: doctr.models.detection.db_resnet50 +.. autofunction:: doctr.models.detection.linknet16 -.. autofunction:: doctr.models.detection.db_mobilenet_v3_large +Detection predictors +^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. .. autofunction:: doctr.models.detection.detection_predictor -doctr.models.recognition ------------------------- +Text Recognition +---------------- +Identifying strings in images + +.. list-table:: Text recognition model zoo + :widths: 20 20 15 10 10 10 + :header-rows: 1 + + * - Architecture + - Input shape + - # params + - FUNSD + - CORD + - FPS + * - crnn_vgg16_bn + - (32, 128, 3) + - 15.8M + - 86.02 + - 91.3 + - 12.8 + * - sar_vgg16_bn + - (32, 128, 3) + - 21.5M + - 86.2 + - 91.7 + - 3.3 + * - sar_resnet31 + - (32, 128, 3) + - 53.1M + - **86.3** + - **92.1** + - 2.7 + +All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). 
+Explanations about the metrics being used are available in :ref:`metrics`. + +All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Pre-processing for recognition +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for recognition is the following: + +1. resize each input image to the target size (bilinear interpolation by default) without deformation. +2. pad the image to the target size (with zeros by default) +3. batch images together +4. normalize the batch using the training data statistics + +Recognition models +^^^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: + .. autofunction:: doctr.models.recognition.crnn_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_small -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_large +Recognition predictors +^^^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage. -.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.recognition_predictor -.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.recognition_predictor +End-to-End OCR +-------------- +Predictors that localize and identify text elements in images ++-----------------------------+--------------------------------------+--------------------------------------+ +| | FUNSD | CORD | ++=============================+============+===============+=========+============+===============+=========+ +| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision doc. 
text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ + +All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. + +All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Results on private ocr datasets + ++------------------------------------+----------------------------+----------------------------+----------------------------+ +| | Receipts | Invoices | IDs | ++====================================+============+===============+============+===============+============+===============+ +| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ + + +Two-stage approaches +^^^^^^^^^^^^^^^^^^^^ +Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. + +.. autofunction:: doctr.models.zoo.ocr_predictor + + +Model export +------------ +Utility functions to make the most of document analysis models. + +.. currentmodule:: doctr.models.export + +Model compression +^^^^^^^^^^^^^^^^^ + +.. autofunction:: convert_to_tflite + +.. autofunction:: convert_to_fp16 + +.. autofunction:: quantize_model + +Using SavedModel +^^^^^^^^^^^^^^^^ + +Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to +`SavedModel `_ format as follows: + + + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> _ = model(input_t, training=False) + >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') + +And loaded just as easily: -doctr.models.zoo ----------------- -.. 
autofunction:: doctr.models.ocr_predictor + >>> import tensorflow as tf + >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.7.0/_sources/notebooks.md.txt b/v0.7.0/_sources/notebooks.md.txt deleted file mode 100644 index ea43ac0f39..0000000000 --- a/v0.7.0/_sources/notebooks.md.txt +++ /dev/null @@ -1,9 +0,0 @@ -# docTR Notebooks - -Here are some notebooks compiled for users to better leverage the library capabilities: - -| Notebook | Description | | -|:----------|:-------------|------:| -| [Quicktour](https://github.com/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | A presentation of the main features of docTR | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | -| [Export as PDF/A](https://github.com/mindee/notebooks/blob/main/doctr/export_as_pdfa.ipynb) | Produce searchable PDFs from docTR results | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/export_as_pdfa.ipynb) | -[Artefact detection](https://github.com/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb) | Object detection for artefacts in documents | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb) | diff --git a/v0.7.0/_sources/transforms.rst.txt b/v0.7.0/_sources/transforms.rst.txt index ff11a3a38e..0230fe75f5 100644 --- a/v0.7.0/_sources/transforms.rst.txt +++ b/v0.7.0/_sources/transforms.rst.txt @@ -8,7 +8,7 @@ Data transformations are part of both training and inference procedure. Drawing Supported transformations ------------------------- -Here are all transformations that are available through docTR: +Here are all transformations that are available through DocTR: .. autoclass:: Resize .. autoclass:: Normalize @@ -21,11 +21,6 @@ Here are all transformations that are available through docTR: .. autoclass:: RandomHue .. autoclass:: RandomGamma .. autoclass:: RandomJpegQuality -.. autoclass:: RandomRotate -.. autoclass:: RandomCrop -.. autoclass:: GaussianBlur -.. autoclass:: ChannelShuffle -.. autoclass:: GaussianNoise Composing transformations diff --git a/v0.7.0/_sources/using_model_export.rst.txt b/v0.7.0/_sources/using_model_export.rst.txt deleted file mode 100644 index 992f4e9866..0000000000 --- a/v0.7.0/_sources/using_model_export.rst.txt +++ /dev/null @@ -1,71 +0,0 @@ -Preparing your model for inference -================================== - -A well-trained model is a good achievement but you might want to tune a few things to make it production-ready! - -.. currentmodule:: doctr.models.export - - -Model compression ------------------ - -This section is meant to help you perform inference with compressed versions of your model. - - -TensorFlow Lite -^^^^^^^^^^^^^^^ - -TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. 
You can easily convert any Keras model into a serialized TFLite version as follows: - - >>> import tensorflow as tf - >>> from tensorflow.keras import Sequential - >>> from doctr.models import conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - >>> serialized_model = converter.convert() - -Half-precision -^^^^^^^^^^^^^^ - -If you want to convert it to half-precision using your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> converter.target_spec.supported_types = [tf.float16] - >>> serialized_model = converter.convert() - - -Post-training quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Finally if you wish to quantize the model with your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> # Float fallback for operators that do not have an integer implementation - >>> def representative_dataset(): - >>> for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)] - >>> converter.representative_dataset = representative_dataset - >>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - >>> converter.inference_input_type = tf.int8 - >>> converter.inference_output_type = tf.int8 - >>> serialized_model = converter.convert() - - -Using SavedModel ----------------- - -Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.7.0/_sources/using_models.rst.txt b/v0.7.0/_sources/using_models.rst.txt deleted file mode 100644 index 1c0752463f..0000000000 --- a/v0.7.0/_sources/using_models.rst.txt +++ /dev/null @@ -1,329 +0,0 @@ -Choosing the right model -======================== - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, docTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the deep learning model. -* Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable. - - -Text Detection --------------- - -The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don't). 
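To make the bounding-box option concrete: docTR expresses a straight box in coordinates relative to the page size, as the JSON export further down illustrates. A hedged sketch of such a geometry:

>>> # ((xmin, ymin), (xmax, ymax)), each value in [0, 1] relative to page width/height
>>> geometry = ((0.14, 0.04), (0.86, 0.86))
>>> round(geometry[1][0] - geometry[0][0], 2)  # relative width of the box
0.72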
- -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `linknet_resnet18 `_ -* `db_resnet50 `_ -* `db_mobilenet_v3_large `_ - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -+------------------------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+=================================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 79.35 | 84.03 | 81.14 | 66.85 | | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ - -`detection_predictor `_ wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> predictor = detection_predictor('db_resnet50') - >>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -Text Recognition ----------------- - -The task consists of transcribing the character sequence in a given image. - - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `crnn_vgg16_bn `_ -* `crnn_mobilenet_v3_small `_ -* `crnn_mobilenet_v3_large `_ -* `sar_resnet31 `_ -* `master `_ - - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -.. list-table:: Text recognition model zoo - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 87.18 - - 92.93 - - 12.8 - * - crnn_mobilenet_v3_small - - (32, 128, 3) - - 2.1M - - 86.21 - - 90.56 - - - * - crnn_mobilenet_v3_large - - (32, 128, 3) - - 4.5M - - 86.95 - - 92.03 - - - * - sar_resnet31 - - (32, 128, 3) - - 56.2M - - **87.70** - - **93.41** - - 2.7 - * - master - - (32, 128, 3) - - 67.7M - - 87.62 - - 93.27 - - - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metric being used (exact match) are available in :ref:`metrics`. 
- -While most of our recognition models were trained on our french vocab (cf. :ref:`vocabs`), you can easily access the vocab of any model as follows: - - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> print(predictor.model.cfg['vocab']) - - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -`recognition_predictor `_ wraps your recognition model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -End-to-End OCR --------------- - -The task consists of both localizing and transcribing textual elements in a given image. - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -You can use any combination of detection and recognition models supporte by docTR. - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - -+----------------------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+========================================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 71.25 | 76.02 | 0.85 | 84.00 | 81.42 | 1.6 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + master | 71.03 | 76.06 | | 84.49 | 81.94 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | 71.25 | 76.29 | 0.27 | 84.50 | **81.96** | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_small | 69.85 | 74.80 | | 80.85 | 78.42 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_large | 70.57 | 75.57 | | 82.57 | 80.08 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_mobilenet_v3_large + crnn_vgg16_bn | 67.73 | 71.73 | | 71.65 | 59.03 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. 
text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - -Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information. - - -+----------------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | US Tax Forms | Resumes | Road Fines | -+==============================================+============+===============+============+===============+============+===============+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | 78.70 | 81.12 | 65.80 | 70.70 | 50.25 | 51.78 | 79.08 | 92.83 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + master (ours) | **79.00** | **81.42** | 65.57 | 69.86 | 51.34 | 52.90 | 78.86 | 92.57 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + sar_resnet31 (ours) | 78.94 | 81.37 | 65.89 | **70.79** | **51.78** | **53.35** | 79.04 | 92.78 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_small (ours) | 76.81 | 79.15 | 64.89 | 69.61 | 45.03 | 46.38 | 78.96 | 92.11 | 85.91 | 87.20 | 84.85 | 85.86 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| 
db_resnet50 + crnn_mobilenet_v3_large (ours) | 78.01 | 80.39 | 65.36 | 70.11 | 48.00 | 49.43 | 79.39 | 92.62 | 87.68 | 89.00 | 85.65 | 86.67 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 | 74.93 | 63.04 | 68.41 | 39.36 | 41.75 | 72.14 | 89.97 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. Everything is wrapped up with `ocr_predictor `_. - - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) - >>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([input_page]) - - -What should I do with the output? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ocr_predictor returns a `Document` object with a nested structure (with `Page`, `Block`, `Line`, `Word`, `Artefact`). 
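For instance, a minimal sketch (reusing the ``out`` object returned by the predictor in the example above) to walk this nested structure and print every recognized word:

    >>> for page in out.pages:
    >>>     for block in page.blocks:
    >>>         for line in block.lines:
    >>>             for word in line.words:
    >>>                 print(word.value, word.confidence)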
-To get a better understanding of our document model, check our :ref:`document_structure` section - -Here is a typical `Document` layout:: - - Document( - (pages): [Page( - dimensions=(340, 600) - (blocks): [Block( - (lines): [Line( - (words): [ - Word(value='No.', confidence=0.91), - Word(value='RECEIPT', confidence=0.99), - Word(value='DATE', confidence=0.96), - ] - )] - (artefacts): [] - )] - )] - ) - -You can also export them as a nested dict, more appropriate for JSON format:: - - json_output = result.export() - -For reference, here is the JSON export for the same `Document` as above:: - - { - 'pages': [ - { - 'page_idx': 0, - 'dimensions': (340, 600), - 'orientation': {'value': None, 'confidence': None}, - 'language': {'value': None, 'confidence': None}, - 'blocks': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'lines': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'words': [ - { - 'value': 'No.', - 'confidence': 0.914085328578949, - 'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)) - }, - { - 'value': 'RECEIPT', - 'confidence': 0.9949972033500671, - 'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)) - }, - { - 'value': 'DATE', - 'confidence': 0.9578408598899841, - 'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)) - } - ] - } - ], - 'artefacts': [] - } - ] - } - ] - } - -To export the outpout as XML (hocr-format) you can use the `export_as_xml` method:: - - xml_output = result.export_as_xml() - for output in xml_output: - xml_bytes_string = output[0] - xml_element = output[1] - -For reference, here is a sample XML byte string output:: - - - - - docTR - hOCR - - - - - -
      -
      -

      - - Hello - XML - World - -

      -
      - - \ No newline at end of file diff --git a/v0.7.0/_sources/utils.rst.txt b/v0.7.0/_sources/utils.rst.txt index ac0b13d9df..69c1abe0eb 100644 --- a/v0.7.0/_sources/utils.rst.txt +++ b/v0.7.0/_sources/utils.rst.txt @@ -14,8 +14,6 @@ Easy-to-use functions to make sense of your model's predictions. .. autofunction:: visualize_page -.. autofunction:: synthesize_page - .. _metrics: @@ -27,20 +25,12 @@ Implementations of task-specific metrics to easily assess your model performance .. autoclass:: TextMatch - .. automethod:: update .. automethod:: summary .. autoclass:: LocalizationConfusion - .. automethod:: update .. automethod:: summary .. autoclass:: OCRMetric - .. automethod:: update - .. automethod:: summary - -.. autoclass:: DetectionMetric - - .. automethod:: update .. automethod:: summary diff --git a/v0.7.0/datasets.html b/v0.7.0/datasets.html index 1f5855cc82..640791680a 100644 --- a/v0.7.0/datasets.html +++ b/v0.7.0/datasets.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.datasets - docTR documentation @@ -227,28 +227,21 @@ @@ -294,12 +287,16 @@

      doctr.datasets

      Available Datasets

      -

      Here are all datasets that are available through docTR:

      -
      -

      Public datasets

      +

      The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

      +
      +
      +class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
      +
      + +

      Here are all datasets that are available through DocTR:

      -class doctr.datasets.FUNSD(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      +class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

      FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

      Example::
      >>> from doctr.datasets import FUNSD
      @@ -313,7 +310,8 @@ 

      Public datasetsParameters:
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • +
      • sample_transforms – composable transformations that will be applied to each image

      • +
      • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • **kwargs – keyword arguments from VisionDataset.

      @@ -322,7 +320,7 @@

      Public datasets
      -class doctr.datasets.SROIE(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      +class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

      SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

      Example::
      - -
      -
      -class doctr.datasets.IIIT5K(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      IIIT-5K character-level localization dataset from -“BMVC 2012 Scene Text Recognition using Higher Order Language Priors”.

      -
      -
      Example::
      >>> # NOTE: this dataset is for character-level localization
      ->>> from doctr.datasets import IIIT5K
      ->>> train_set = IIIT5K(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.SVT(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      SVT dataset from “The Street View Text Dataset - UCSD Computer Vision”.

      -
      -
      Example::
      >>> from doctr.datasets import SVT
      ->>> train_set = SVT(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.SVHN(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      SVHN dataset from “The Street View House Numbers (SVHN) Dataset”.

      -
      -
      Example::
      >>> from doctr.datasets import SVHN
      ->>> train_set = SVHN(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.SynthText(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      SynthText dataset from “Synthetic Data for Text Localisation in Natural Images” | “repository” | -“website”.

      -
      -
      Example::
      >>> from doctr.datasets import SynthText
      ->>> train_set = SynthText(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.IC03(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      IC03 dataset from “ICDAR 2003 Robust Reading Competitions: Entries, Results and Future Directions”.

      -
      -
      Example::
      >>> from doctr.datasets import IC03
      ->>> train_set = IC03(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.IC13(img_folder: str, label_folder: str, use_polygons: bool = False, **kwargs: Any)[source]
      -

      IC13 dataset from “ICDAR 2013 Robust Reading Competition”. -Example:

      -
      >>> # NOTE: You need to download both image and label parts from Focused Scene Text challenge Task2.1 2013-2015.
      ->>> from doctr.datasets import IC13
      ->>> train_set = IC13(img_folder="/path/to/Challenge2_Training_Task12_Images",
      ->>>                  label_folder="/path/to/Challenge2_Training_Task1_GT")
      ->>> img, target = train_set[0]
      ->>> test_set = IC13(img_folder="/path/to/Challenge2_Test_Task12_Images",
      ->>>                 label_folder="/path/to/Challenge2_Test_Task1_GT")
      ->>> img, target = test_set[0]
      -
      -
      -
      -
      Parameters:
      -
        -
      • img_folder – folder with all the images of the dataset

      • -
      • label_folder – folder with all annotation files for the images

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      -
      -
      -
      - -

      -
      -

      docTR synthetic datasets

      -
      -
      -class doctr.datasets.DocArtefacts(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      Object detection dataset for non-textual elements in documents. -The dataset includes a variety of synthetic document pages with non-textual elements.

      -
      -
      Example::
      >>> from doctr.datasets import DocArtefacts
      ->>> train_set = DocArtefacts(download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • +
      • sample_transforms – composable transformations that will be applied to each image

      • +
      • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • **kwargs – keyword arguments from VisionDataset.

      -
      -
      -class doctr.datasets.CharacterGenerator(*args, **kwargs)[source]
      -

      Implements a character image generation dataset

      -
      -
      Example::
      >>> from doctr.datasets import CharacterGenerator
      ->>> ds = CharacterGenerator(vocab='abdef')
      ->>> img, target = ds[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • vocab – vocabulary to take the character from

      • -
      • num_samples – number of samples that will be generated iterating over the dataset

      • -
      • cache_samples – whether generated images should be cached firsthand

      • -
      • font_family – font to use to generate the text images

      • -
      • img_transforms – composable transformations that will be applied to each image

      • -
      • sample_transforms – composable transformations that will be applied to both the image and the target

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.WordGenerator(vocab: str, min_chars: int, max_chars: int, num_samples: int, cache_samples: bool = False, font_family: str | List[str] | None = None, img_transforms: Callable[[Any], Any] | None = None, sample_transforms: Callable[[Any, Any], Tuple[Any, Any]] | None = None)[source]
      -

Implements a word image generation dataset

      -
      -
      Example::
      >>> from doctr.datasets import WordGenerator
      ->>> ds = WordGenerator(vocab='abdef')
      ->>> img, target = ds[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • vocab – vocabulary to take the character from

      • -
      • min_chars – minimum number of characters in a word

      • -
      • max_chars – maximum number of characters in a word

      • -
      • num_samples – number of samples that will be generated iterating over the dataset

      • -
      • cache_samples – whether generated images should be cached firsthand

      • -
      • font_family – font to use to generate the text images

      • -
      • img_transforms – composable transformations that will be applied to each image

      • -
      • sample_transforms – composable transformations that will be applied to both the image and the target

      • -
      -
      -
      -
      - -
      -
      -

      docTR private datasets

      -

Since many documents include sensitive / personal information, we are not able to share all the data that has been used for this project. However, we provide some guidance on how to format your own dataset in the same way, so that you can still use all the docTR tools.

      -
      -
      -class doctr.datasets.DetectionDataset(img_folder: str, label_path: str, use_polygons: bool = False, **kwargs: Any)[source]
      -

      Implements a text detection dataset

      -
      -
      Example::
      >>> from doctr.datasets import DetectionDataset
      ->>> train_set = DetectionDataset(img_folder="/path/to/images", label_path="/path/to/labels.json")
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • img_folder – folder with all the images of the dataset

      • -
      • label_path – path to the annotations of each image

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.RecognitionDataset(img_folder: str, labels_path: str, **kwargs: Any)[source]
      -

      Dataset implementation for text recognition tasks

      -
      -
      Example::
      >>> from doctr.datasets import RecognitionDataset
      ->>> train_set = RecognitionDataset(img_folder="/path/to/images", labels_path="/path/to/labels.json")
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • img_folder – path to the images folder

      • -
• labels_path – path to the json file containing all labels (character sequences)

      • -
      -
      -
      -
      -
      -class doctr.datasets.OCRDataset(img_folder: str, label_file: str, use_polygons: bool = False, **kwargs: Any)[source]
      +class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

      Implements an OCR dataset

      Parameters:
      • img_folder – local path to image folder (all jpg at the root)

      • label_file – local path to the label file

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • +
      • sample_transforms – composable transformations that will be applied to each image

      • +
      • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • +
      • **kwargs – keyword arguments from VisionDataset.

      -

    Data Loading

    -

    Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR.

    +

    Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

    -class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, num_workers: int | None = None, collate_fn: Callable | None = None)[source]
    +class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]

    Implements a dataset wrapper for fast data loading

    Example::
    >>> from doctr.datasets import FUNSD, DataLoader
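>>> # Sketch of a complete usage (hypothetical settings, reusing the FUNSD download shown in the dataset examples above)
>>> train_set = FUNSD(train=True, download=True)
>>> train_loader = DataLoader(train_set, batch_size=2)
>>> images, targets = next(iter(train_loader))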
    @@ -681,7 +408,7 @@ 

    Data Loading

    Supported Vocabs

    -

    Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +

    Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets of vocabs.

    - +@@ -724,39 +451,19 @@

    Data Loading

    - - - - - - - - - - + + - - - - - - - - - - - - - - + +
    docTR VocabsDocTR Vocabs

    latin

    94

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~

    english

    100

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿

    legacy_french

    123

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

    96

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

    french

    126

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ

    portuguese

    131

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿

    spanish

    116

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿

    german

    108

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ

    154

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

    -doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, dynamic_seq_length: bool = False, **kwargs: Any) ndarray[source]
    +doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]

    Encode character sequences using a given vocab as mapping

    Parameters:
    @@ -767,7 +474,6 @@

    Data LoadingReturns: @@ -784,23 +490,23 @@

    Data Loading - +
    Next
    -
    doctr.io
    +
    doctr.documents
    - +
    Previous
    -
    Preparing your model for inference
    +
    Changelog
    @@ -836,32 +542,13 @@

    Data Loadingdoctr.datasets

    diff --git a/v0.7.0/installing.html b/v0.7.0/installing.html index b79f453bd6..8068adc0ba 100644 --- a/v0.7.0/installing.html +++ b/v0.7.0/installing.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + Installation - docTR documentation @@ -227,28 +227,21 @@ @@ -290,16 +283,16 @@

    Installation

    -

    This library requires Python 3.6 or higher.

    +

    This library requires Python 3.6 or higher.

    Prerequisites

    Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

    If you are running another OS than Linux, you will need a few extra dependencies.

    -

    For MacOS users, you can install them using Homebrew as follows:

    +

    For MacOS users, you can install them as follows:

    brew install cairo pango gdk-pixbuf libffi
     
    @@ -307,17 +300,10 @@

    Prerequisites

    Via Python Package

    -

    Install the last stable release of the package using pip:

    +

    Install the last stable release of the package using pip:

    pip install python-doctr
     
    -

    We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows:

    -
    # for TensorFlow
    -pip install "python-doctr[tf]"
    -# for PyTorch
    -pip install "python-doctr[torch]"
    -
    -

    Via Git

    @@ -326,14 +312,6 @@

    Via Git¶ pip install -e doctr/.

    -

    Again, for framework-specific builds:

    -
    git clone https://github.com/mindee/doctr.git
    -# for TensorFlow
    -pip install -e doctr/.[tf]
    -# for PyTorch
    -pip install -e doctr/.[torch]
    -
    -
    @@ -342,12 +320,12 @@

    Via Git

    +

    diff --git a/v0.7.0/io.html b/v0.7.0/io.html deleted file mode 100644 index a61f5b20af..0000000000 --- a/v0.7.0/io.html +++ /dev/null @@ -1,839 +0,0 @@ - - - - - - - - - - - - - doctr.io - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
    -
    -
    - -
    - -
    -
    - -
    - -
    -
    - -
    -
    -
    - - - - - Back to top - -
    - -
    - -
    - -
    -
    -
    -

    doctr.io

    -

    The io module enables users to easily access content from documents and export analysis -results to structured formats.

    -
    -

    Document structure

    -

    Structural organization of the documents.

    -
    -

    Word

    -

    A Word is an uninterrupted sequence of characters.

    -
    -
    -class doctr.io.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray)[source]
    -

    Implements a word element

    -
    -
    Parameters:
    -
      -
    • value – the text string of the word

    • -
    • confidence – the confidence associated with the text prediction

    • -
    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to

    • -
    • size (the page's)

    • -
    -
    -
    -
    - -
    -
    -

    Line

    -

    A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines).

    -
    -
    -class doctr.io.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray | None = None)[source]
    -

    Implements a line element as a collection of words

    -
    -
    Parameters:
    -
      -
    • words – list of word elements

    • -
    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

    • -
    -
    -
    -
    - -
    -
    -

    Artefact

    -

    An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

    -
    -
    -class doctr.io.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
    -

    Implements a non-textual element

    -
    -
    Parameters:
    -
      -
    • artefact_type – the type of artefact

    • -
    • confidence – the confidence of the type prediction

    • -
    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

    • -
    -
    -
    -
    - -
    -
    -

    Block

    -

    A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

    -
    -
    -class doctr.io.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray | None = None)[source]
    -

    Implements a block element as a collection of lines and artefacts

    -
    -
    Parameters:
    -
      -
    • lines – list of line elements

    • -
    • artefacts – list of artefacts

    • -
    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

    • -
    -
    -
    -
    - -
    -
    -

    Page

    -

    A Page is a collection of Blocks that were on the same physical page.

    -
    -
    -class doctr.io.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
    -

    Implements a page element as a collection of blocks

    -
    -
    Parameters:
    -
      -
    • blocks – list of block elements

    • -
    • page_idx – the index of the page in the input raw document

    • -
    • dimensions – the page size in pixels in format (height, width)

    • -
• orientation – a dictionary with the value of the rotation angle in degrees and confidence of the prediction

    • -
    • language – a dictionary with the language value and confidence of the prediction

    • -
    -
    -
    -
    -
    -show(page: ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) None[source]
    -

    Overlay the result on a given image

    -
    -
    Parameters:
    -
      -
    • page – image encoded as a numpy array in uint8

    • -
    • interactive – whether the display should be interactive

    • -
    • preserve_aspect_ratio – pass True if you passed True to the predictor

    • -
    -
    -
    -
    - -
    - -
    -
    -

    Document

    -

    A Document is a collection of Pages.

    -
    -
    -class doctr.io.Document(pages: List[Page])[source]
    -

    Implements a document element as a collection of pages

    -
    -
    Parameters:
    -

    pages – list of page elements

    -
    -
    -
    -
    -show(pages: List[ndarray], **kwargs) None[source]
    -

    Overlay the result on a given image

    -
    -
    Parameters:
    -

    pages – list of images encoded as numpy arrays in uint8

    -
    -
    -
    - -
    - -
    -
    -
    -

    File reading

    -

    High-performance file reading and conversion to processable structured data.

    -
    -
    -doctr.io.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
    -

    Read a PDF file and convert it into an image in numpy format

    -
    -
    Example::
    >>> from doctr.documents import read_pdf
    ->>> doc = read_pdf("path/to/your/doc.pdf")
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    file – the path to the PDF file

    -
    -
    Returns:
    -

    the list of pages decoded as numpy ndarray of shape H x W x 3

    -
    -
    -
    - -
    -
    -doctr.io.read_img_as_numpy(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
    -

    Read an image file into numpy format

    -
    -
    Example::
    >>> from doctr.documents import read_img
    ->>> page = read_img("path/to/your/doc.jpg")
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • file – the path to the image file

    • -
    • output_size – the expected output size of each page in format H x W

    • -
    • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

    • -
    -
    -
    Returns:
    -

    the page decoded as numpy ndarray of shape H x W x 3

    -
    -
    -
    - -
    -
    -doctr.io.read_img_as_tensor(img_path: str | Path, dtype: DType = tf.float32) Tensor[source]
    -

    Read an image file as a TensorFlow tensor

    -
    -
    Parameters:
    -
      -
    • img_path – location of the image file

    • -
    • dtype – the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    • -
    -
    -
    Returns:
    -

    decoded image as a tensor

    -
    -
    -
    - -
    -
    -doctr.io.decode_img_as_tensor(img_content: bytes, dtype: DType = tf.float32) Tensor[source]
    -

    Read a byte stream as a TensorFlow tensor

    -
    -
    Parameters:
    -
      -
    • img_content – bytes of a decoded image

    • -
    • dtype – the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    • -
    -
    -
    Returns:
    -

    decoded image as a tensor

    -
    -
    -
    - -
    -
    -doctr.io.read_html(url: str, **kwargs: Any) bytes[source]
    -

Read a web page and convert it into a PDF in a bytes stream

    -
    -
    Example::
    >>> from doctr.documents import read_html
    ->>> doc = read_html("https://www.yoursite.com")
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    url – URL of the target web page

    -
    -
    Returns:
    -

    decoded PDF file as a bytes stream

    -
    -
    -
    - -
    -
    -class doctr.io.DocumentFile[source]
    -

    Read a document from multiple extensions

    -
    -
    -classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
    -

    Read a PDF file

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    file – the path to the PDF file or a binary stream

    -
    -
    Returns:
    -

    a PDF document

    -
    -
    -
    - -
    -
    -classmethod from_url(url: str, **kwargs) PDF[source]
    -

    Interpret a web page as a PDF document

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> doc = DocumentFile.from_url("https://www.yoursite.com")
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    url – the URL of the target web page

    -
    -
    Returns:
    -

    a PDF document

    -
    -
    -
    - -
    -
    -classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
    -

    Read an image file (or a collection of image files) and convert it into an image in numpy format

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    files – the path to the image file or a binary stream, or a collection of those

    -
    -
    Returns:
    -

    the list of pages decoded as numpy ndarray of shape H x W x 3

    -
    -
    -
    - -
    - -
    -
    -class doctr.io.PDF(doc: Document)[source]
    -

    PDF document template

    -
    -
    Parameters:
    -

    doc – input PDF document

    -
    -
    -
    -
    -as_images(**kwargs) List[ndarray][source]
    -

    Convert all document pages to images

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    kwargs – keyword arguments of convert_page_to_numpy

    -
    -
    Returns:
    -

    the list of pages decoded as numpy ndarray of shape H x W x 3

    -
    -
    -
    - -
    -
    -get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
    -

    Get the annotations for all words in the document

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    kwargs – keyword arguments of fitz.Page.get_text_words

    -
    -
    Returns:
    -

    the list of pages annotations, represented as a list of tuple (bounding box, value)

    -
    -
    -
    - -
    -
    -get_lines(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
    -

    Get the annotations for all lines in the document

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> lines = DocumentFile.from_pdf("path/to/your/doc.pdf").get_lines()
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    kwargs – keyword arguments of fitz.Page.get_text_words

    -
    -
    Returns:
    -

    the list of pages annotations, represented as a list of tuple (bounding box, value)

    -
    -
    -
    - -
    -
    -get_artefacts() List[List[Tuple[float, float, float, float]]][source]
    -

    Get the artefacts for the entire document

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
    -
    -
    -
    -
    -
    -
    Returns:
    -

    the list of pages artefacts, represented as a list of bounding boxes

    -
    -
    -
    - -
    - -
    -
    - -
    -
    - -
    - -
    -
    - - - - - - - - \ No newline at end of file diff --git a/v0.7.0/models.html b/v0.7.0/models.html index 04ff61d44e..270664068f 100644 --- a/v0.7.0/models.html +++ b/v0.7.0/models.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.models - docTR documentation @@ -227,28 +227,21 @@ @@ -290,286 +283,64 @@

    doctr.models

    -
    -

    doctr.models.classification

    -
    -
    -doctr.models.classification.vgg16_bn_r(pretrained: bool = False, **kwargs: Any) VGG[source]
    -

    VGG-16 architecture as described in “Very Deep Convolutional Networks for Large-Scale Image Recognition”, modified by adding batch normalization, rectangular pooling and a simpler -classification head.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import vgg16_bn_r
    ->>> model = vgg16_bn_r(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained (bool) – If True, returns a model pre-trained on ImageNet

    -
    -
    Returns:
    -

    VGG feature extractor

    -
    -
    -
    - -
    -
    -doctr.models.classification.resnet18(pretrained: bool = False, **kwargs: Any) ResNet[source]
    -

    Resnet-18 architecture as described in “Deep Residual Learning for Image Recognition”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import resnet18
    ->>> model = resnet18(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    A classification model

    -
    -
    -
    - -
    -
    -doctr.models.classification.resnet31(pretrained: bool = False, **kwargs: Any) ResNet[source]
    -

    Resnet31 architecture with rectangular pooling windows as described in -“Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition”,. Downsizing: (H, W) –> (H/8, W/4)

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import resnet31
    ->>> model = resnet31(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    A classification model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Small architecture as described in -“Searching for MobileNetV3”,.

    -
    -
    Example::
    >>> import tensorflow as tf
->>> from doctr.models import mobilenet_v3_small
->>> model = mobilenet_v3_small(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Large architecture as described in -“Searching for MobileNetV3”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenetv3_large
    ->>> model = mobilenetv3_large(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_small_r(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Small architecture as described in -“Searching for MobileNetV3”,, with rectangular pooling.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenet_v3_small_r
    ->>> model = mobilenet_v3_small_r(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_large_r(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Large architecture as described in -“Searching for MobileNetV3”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenet_v3_large_r
    ->>> model = mobilenet_v3_large_r(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_small_orientation(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Small architecture as described in -“Searching for MobileNetV3”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenet_v3_small_orientation
    ->>> model = mobilenet_v3_small_orientation(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.magc_resnet31(pretrained: bool = False, **kwargs: Any) ResNet[source]
    -

    Resnet31 architecture with Multi-Aspect Global Context Attention as described in -“MASTER: Multi-Aspect Non-local Network for Scene Text Recognition”,.

    -
    -
    Example::
    >>> import torch
    ->>> from doctr.models import magc_resnet31
    ->>> model = magc_resnet31(pretrained=False)
    ->>> input_tensor = torch.rand((1, 3, 224, 224), dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    A feature extractor model

    -
    -
    -
    - -
    -
    -doctr.models.classification.crop_orientation_predictor(arch: str = 'mobilenet_v3_small_orientation', pretrained: bool = False, **kwargs: Any) CropOrientationPredictor[source]
    -

    Orientation classification architecture.

    -
    -
    Example::
    >>> import numpy as np
    ->>> from doctr.models import crop_orientation_predictor
    ->>> model = crop_orientation_predictor(arch='classif_mobilenet_v3_small', pretrained=True)
    ->>> input_crop = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
    ->>> out = model([input_crop])
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • arch – name of the architecture to use (e.g. ‘mobilenet_v3_small’)

    • -
    • pretrained – If True, returns a model pre-trained on our recognition crops dataset

    • +

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. +Whether they are performed jointly or separately, each task corresponds to a specific type of deep learning architecture.

      +

For a given task, DocTR provides a Predictor, which is composed of 2 components (a short usage sketch follows this list):

      +
        +
      • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

      • +
      • Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable.
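A short sketch of what that composition means in practice, reusing the detection predictor documented below (pre-processing, inference and post-processing all happen inside the call):
>>> import numpy as np
>>> from doctr.models import detection_predictor
>>> predictor = detection_predictor(pretrained=True)  # PreProcessor + model + post-processor bundled together
>>> page = (255 * np.random.rand(1024, 1024, 3)).astype(np.uint8)
>>> out = predictor([page])  # accepts plain numpy images, returns structured predictions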

      -
    -
    Returns:
    -

    CropOrientationPredictor

    -
    -
    -
    - -
    -
    -

    doctr.models.detection

    -
    -
    -doctr.models.detection.linknet_resnet18(pretrained: bool = False, **kwargs: Any) LinkNet[source]
    -

    LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import linknet_resnet18
    ->>> model = linknet_resnet18(pretrained=True)
    ->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    +
    +

    Text Detection

    +

    Localizing text elements in images

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    FUNSD

    CORD

    Architecture

    Input shape

    # params

    Recall

    Precision

    Recall

    Precision

    FPS

    db_resnet50

    (1024, 1024, 3)

    25.2 M

    82.14

    87.64

    92.49

    89.66

    2.1

    -
    -
    -
    -
    Parameters:
    -

    pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

    -
    -
    Returns:
    -

    text detection architecture

    -
    -
    -
    - +

    All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). +Explanations about the metrics being used are available in Task evaluation.

    +

Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

    +

    FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments.
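A minimal sketch of that timing protocol (the exact benchmark script is not part of this page; db_resnet50 is used here only as an example):
>>> import time
>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> dummy = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> for _ in range(100):  # warm-up
>>>     _ = model(dummy)
>>> start = time.time()
>>> for _ in range(1000):  # 1000 batches of 1 frame
>>>     _ = model(tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32))
>>> fps = 1000 / (time.time() - start)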

    +
    +

    Pre-processing for detection

    +

In DocTR, the pre-processing scheme for detection is the following (a short sketch follows the list below):

    +
      +
    1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

    2. +
    3. batch images together

    4. +
    5. normalize the batch using the training data statistics
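As a rough illustration only (the bundled PreProcessor already does this for you, and the mean/std values below are placeholders rather than the actual training statistics), these steps could be sketched with plain TensorFlow as:
>>> import tensorflow as tf
>>> def preprocess_for_detection(images, target_size=(1024, 1024), mean=0.5, std=0.5):
>>>     # 1. resize each image to the target size (bilinear interpolation, deformation allowed)
>>>     resized = [tf.image.resize(img, target_size, method="bilinear") for img in images]
>>>     # 2. batch images together
>>>     batch = tf.stack(resized, axis=0)
>>>     # 3. normalize the batch with (placeholder) training data statistics
>>>     return (batch / 255.0 - mean) / std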

    6. +
    +
    +
    +

    Detection models

    +

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

    doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
    @@ -595,13 +366,13 @@

    doctr.models.detection

    -
    -doctr.models.detection.db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) DBNet[source]
    -

    DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a mobilenet v3 large backbone.

    +
    +doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
    +

    LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import db_mobilenet_v3_large
    ->>> model = db_mobilenet_v3_large(pretrained=True)
    +>>> from doctr.models import linknet16
    +>>> model = linknet16(pretrained=True)
     >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
     >>> out = model(input_tensor)
     
    @@ -618,14 +389,18 @@

    doctr.models.detection

    +
    +
    +

    Detection predictors

    +

    Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information.

    -doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, assume_straight_pages: bool = True, **kwargs: Any) DetectionPredictor[source]
    +doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]

    Text detection architecture.

    Example::
    >>> import numpy as np
     >>> from doctr.models import detection_predictor
    ->>> model = detection_predictor(arch='db_resnet50', pretrained=True)
    +>>> model = detection_predictor(pretrained=True)
     >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
     >>> out = model([input_page])
     
    @@ -635,9 +410,8 @@

    doctr.models.detection
    Parameters:
      -
    • arch – name of the architecture to use (e.g. ‘db_resnet50’)

    • +
    • arch – name of the architecture to use (‘db_resnet50’)

    • pretrained – If True, returns a model pre-trained on our text detection dataset

    • -
    • assume_straight_pages – If True, fit straight boxes to the page

    Returns:
    @@ -647,8 +421,74 @@

    doctr.models.detection

    -
    -

    doctr.models.recognition

    +
    +
    +

    Text Recognition

    +

    Identifying strings in images

    +
    + + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Text recognition model zoo

    Architecture

    Input shape

    # params

    FUNSD

    CORD

    FPS

    crnn_vgg16_bn

    (32, 128, 3)

    15.8M

    86.02

    91.3

    12.8

    sar_vgg16_bn

    (32, 128, 3)

    21.5M

    86.2

    91.7

    3.3

    sar_resnet31

    (32, 128, 3)

    53.1M

    86.3

    92.1

    2.7

    +
    +

    All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). +Explanations about the metrics being used are available in Task evaluation.

    +

    All these recognition models are trained with our french vocab (cf. Supported Vocabs).

    +

Disclaimer: both FUNSD subsets combined have 30595 word-level crops which might not be representative enough of the model capabilities

    +

    FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments.

    +
    +

    Pre-processing for recognition

    +

In DocTR, the pre-processing scheme for recognition is the following (a short sketch follows the list below):

    +
      +
    1. resize each input image to the target size (bilinear interpolation by default) without deformation.

    2. +
    3. pad the image to the target size (with zeros by default)

    4. +
    5. batch images together

    6. +
    7. normalize the batch using the training data statistics
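A similar rough sketch for recognition (again illustrative only, with placeholder statistics), where the aspect ratio is preserved and the image is zero-padded up to the target size:
>>> import tensorflow as tf
>>> def preprocess_for_recognition(images, target_size=(32, 128), mean=0.5, std=0.5):
>>>     # 1. + 2. resize without deformation, then pad with zeros up to the target size
>>>     resized = [tf.image.resize_with_pad(img, target_size[0], target_size[1]) for img in images]
>>>     # 3. batch images together
>>>     batch = tf.stack(resized, axis=0)
>>>     # 4. normalize the batch with (placeholder) training data statistics
>>>     return (batch / 255.0 - mean) / std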

    8. +
    +
    +
    +

    Recognition models

    +

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

    doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
    @@ -675,40 +515,15 @@

    doctr.models.recognition -
    -doctr.models.recognition.crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) CRNN[source]
    -

    CRNN with a MobileNet V3 Small backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import crnn_mobilenet_v3_small
    ->>> model = crnn_mobilenet_v3_small(pretrained=True)
    ->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

    -
    -
    Returns:
    -

    text recognition architecture

    -
    -
    -

    - -
    -
    -doctr.models.recognition.crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) CRNN[source]
    -

    CRNN with a MobileNet V3 Large backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

    +
    +doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) SAR[source]
    +

    SAR with a VGG16 feature extractor as described in “Show, Attend and Read:A Simple and Strong +Baseline for Irregular Text Recognition”.

    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import crnn_mobilenet_v3_large
    ->>> model = crnn_mobilenet_v3_large(pretrained=True)
    ->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
    +>>> from doctr.models import sar_vgg16_bn
    +>>> model = sar_vgg16_bn(pretrained=False)
    +>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
     >>> out = model(input_tensor)
     
    @@ -750,17 +565,15 @@

    doctr.models.recognition
    doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) MASTER[source]
    -

    MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.

    -
    -
    Example::
    >>> import tensorflow as tf
    +

    MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_. +Example:

    +
    >>> import tensorflow as tf
     >>> from doctr.models import master
     >>> model = master(pretrained=False)
     >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
     >>> out = model(input_tensor)
     
    -
    -
    Parameters:

    pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

    @@ -771,6 +584,10 @@

    doctr.models.recognition +

    Recognition predictors

    +

    Combining the right components around a given architecture for easier usage.

    doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) RecognitionPredictor[source]
    @@ -788,7 +605,7 @@

    doctr.models.recognition
    Parameters:
      -
    • arch – name of the architecture to use (e.g. ‘crnn_vgg16_bn’)

    • +
    • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

    • pretrained – If True, returns a model pre-trained on our text recognition dataset

    @@ -799,16 +616,141 @@

    doctr.models.recognition -

    doctr.models.zoo

    +

    +
    +

    End-to-End OCR

    +

    Predictors that localize and identify text elements in images

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    FUNSD

    CORD

    Architecture

    Recall

    Precision

    FPS

    Recall

    Precision

    FPS

    db_resnet50 + crnn_vgg16_bn

    70.08

    74.77

    0.85

    82.19

    79.67

    1.6

    db_resnet50 + sar_vgg16_bn

    N/A

    N/A

    0.49

    N/A

    N/A

    1.0

    db_resnet50 + sar_resnet31

    N/A

    N/A

    0.27

    N/A

    N/A

    0.83

    Gvision text detection

    59.50

    62.50

    75.30

    70.00

    Gvision doc. text detection

    64.00

    53.30

    68.90

    61.10

    AWS textract

    78.10

    83.00

    87.50

    66.00

    +
    +

    All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). +Explanations about the metrics being used are available in Task evaluation.

    +

    All recognition models of predictors are trained with our french vocab (cf. Supported Vocabs).

    +

Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

    +

    FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments.

    +

Results on private OCR datasets

| Architecture                       | Receipts Recall | Receipts Precision | Invoices Recall | Invoices Precision | IDs Recall | IDs Precision |
|------------------------------------|-----------------|--------------------|-----------------|--------------------|------------|---------------|
| db_resnet50 + crnn_vgg16_bn (ours) | 78.90           | 81.01              | 65.68           | 69.86              | 49.48      | 50.46         |
| Gvision doc. text detection        | 68.91           | 59.89              | 63.20           | 52.85              | 43.70      | 29.21         |
| AWS textract                       | 75.77           | 77.70              | 70.47           | 69.13              | 46.39      | 43.32         |

    Two-stage approaches

    +

Those architectures involve one stage of text detection and one stage of text recognition. The text detection stage is used to produce cropped images that are then passed to the text recognition block. A minimal sketch of chaining the two stages by hand is shown below.

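As an illustration of the two stages, here is a minimal sketch of chaining a detection predictor and a recognition predictor manually. It assumes the detector returns, for each page, an array of relative (xmin, ymin, xmax, ymax, score) boxes; the exact output format can differ between versions, so treat this as a sketch rather than a reference implementation:

>>> import numpy as np
>>> from doctr.models import detection_predictor, recognition_predictor
>>> det = detection_predictor('db_resnet50', pretrained=True)
>>> reco = recognition_predictor('crnn_vgg16_bn', pretrained=True)
>>> page = (255 * np.random.rand(1024, 1024, 3)).astype(np.uint8)
>>> boxes = det([page])[0]  # assumed: relative (xmin, ymin, xmax, ymax, score) per detected word
>>> h, w = page.shape[:2]
>>> # crop each detected box and feed the crops to the recognition stage
>>> crops = [page[int(ymin * h):int(ymax * h), int(xmin * w):int(xmax * w)] for (xmin, ymin, xmax, ymax, _) in boxes]
>>> words = reco(crops)  # recognized text for each crop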
-
-doctr.models.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, assume_straight_pages: bool = True, export_as_straight_boxes: bool = False, preserve_aspect_ratio: bool = False, **kwargs: Any) → OCRPredictor[source]
+
+doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) → OCRPredictor[source]

    End-to-end OCR architecture using one model for localization, and another for text recognition.

    Example::
    >>> import numpy as np
     >>> from doctr.models import ocr_predictor
    ->>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
    +>>> model = ocr_predictor(pretrained=True)
     >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
     >>> out = model([input_page])
     
    @@ -818,15 +760,8 @@

    doctr.models.zoo
    Parameters:
-• det_arch – name of the detection architecture to use (e.g. ‘db_resnet50’, ‘db_mobilenet_v3_large’)
-• reco_arch – name of the recognition architecture to use (e.g. ‘crnn_vgg16_bn’, ‘sar_resnet31’)
+• arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)
• pretrained – If True, returns a model pre-trained on our OCR dataset
-• assume_straight_pages – if True, speeds up the inference by assuming you only pass straight pages without rotated textual elements.
-• export_as_straight_boxes – when assume_straight_pages is set to False, export final predictions (potentially rotated) as straight bounding boxes.
-• preserve_aspect_ratio – If True, pad the input document image to preserve the aspect ratio before running the detection model on it.

    Returns:
    @@ -835,6 +770,113 @@

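Since the predictor returns a Document object, a common next step is to serialize its content. A minimal sketch, assuming the export() method of the returned Document (see the io documentation for the exact schema):

>>> import numpy as np
>>> from doctr.models import ocr_predictor
>>> model = ocr_predictor(pretrained=True)
>>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> result = model([input_page])
>>> json_export = result.export()  # nested dict: pages -> blocks -> lines -> words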
    doctr.models.zoo +

    Model export

    +

    Utility functions to make the most of document analysis models.

    +
    +

    Model compression

    +
    +
doctr.models.export.convert_to_tflite(tf_model: Model) → bytes[source]

Converts a model to TFLite format

Example::
>>> from tensorflow.keras import Sequential
>>> from doctr.models import convert_to_tflite, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = convert_to_tflite(model)

Parameters:
tf_model – a keras model

Returns:
the serialized TFLite model

Return type:
bytes
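For readers who want to see the underlying mechanism, here is a plain-TensorFlow sketch of such a conversion using the public tf.lite API; the toy Conv2D model is a stand-in for conv_sequence, and this is an illustration rather than the helper's exact implementation:

>>> import tensorflow as tf
>>> from tensorflow.keras import Sequential, layers
>>> model = Sequential([layers.Conv2D(32, 3, activation='relu', input_shape=(224, 224, 3))])
>>> converter = tf.lite.TFLiteConverter.from_keras_model(model)
>>> serialized_model = converter.convert()  # serialized TFLite flatbuffer as bytes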
doctr.models.export.convert_to_fp16(tf_model: Model) → bytes[source]

Converts a model to half precision

Example::
>>> from tensorflow.keras import Sequential
>>> from doctr.models import convert_to_fp16, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = convert_to_fp16(model)

Parameters:
tf_model – a keras model

Returns:
the serialized FP16 model

Return type:
bytes
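A plain-TensorFlow sketch of half-precision conversion with the tf.lite converter, shown for illustration only (the toy model is a stand-in; convert_to_fp16 above is the documented helper):

>>> import tensorflow as tf
>>> from tensorflow.keras import Sequential, layers
>>> model = Sequential([layers.Conv2D(32, 3, activation='relu', input_shape=(224, 224, 3))])
>>> converter = tf.lite.TFLiteConverter.from_keras_model(model)
>>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
>>> converter.target_spec.supported_types = [tf.float16]
>>> serialized_model = converter.convert()  # weights stored in float16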
doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) → bytes[source]

Quantize a Tensorflow model

Example::
>>> from tensorflow.keras import Sequential
>>> from doctr.models import quantize_model, conv_sequence
>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
>>> serialized_model = quantize_model(model, (224, 224, 3))

Parameters:
• tf_model – a keras model
• input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

Returns:
the serialized quantized model

Return type:
bytes
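For illustration, post-training quantization can also be expressed directly with the tf.lite converter and a representative dataset matching input_shape; this is a sketch of the general technique, not necessarily the exact strategy used by quantize_model:

>>> import tensorflow as tf
>>> from tensorflow.keras import Sequential, layers
>>> model = Sequential([layers.Conv2D(32, 3, activation='relu', input_shape=(224, 224, 3))])
>>> def representative_dataset():
...     # dummy calibration samples with the expected (224, 224, 3) input shape
...     for _ in range(100):
...         yield [tf.random.uniform([1, 224, 224, 3], maxval=1.0)]
>>> converter = tf.lite.TFLiteConverter.from_keras_model(model)
>>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
>>> converter.representative_dataset = representative_dataset
>>> serialized_model = converter.convert()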

    Using SavedModel

    +

Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to SavedModel format as follows:

    +
    >>> import tensorflow as tf
    +>>> from doctr.models import db_resnet50
    +>>> model = db_resnet50(pretrained=True)
    +>>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    +>>> _ = model(input_t, training=False)
    +>>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
    +
    +
    +

    And loaded just as easily:

    +
    >>> import tensorflow as tf
    +>>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
    +
    +
    +
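For completeness, running inference on the restored object might look like the following; this is a sketch, as the exact call signature of a restored SavedModel depends on the traced serving function:

>>> import tensorflow as tf
>>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
>>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> out = model(input_t)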

[], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": [], "2024": [], "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": [], "8": [], "9": [], "advanc": [], "annot": [], "approach": 16, "architectur": 16, "arg": [], "artefact": 6, 
"artefactdetect": [], "attribut": 1, "avail": [14, 16], "aw": 12, "backbon": [], "ban": 1, "block": 6, "bug": 2, "build": [], "changelog": 0, "choos": [14, 16], "classif": [7, 13], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 13, "compos": 8, "compress": [], "conda": [], "conduct": 1, "connect": 2, "content": [], "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 13, "correct": 1, "coven": 1, "custom": [5, 11], "data": 14, "dataload": 5, "dataset": [4, 5, 14], "detect": [4, 7, 13, 14, 16], "develop": 2, "do": 16, "docstr": [], "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 15], "document": [2, 4, 6], "end": 16, "enforc": 1, "evalu": 9, "export": 15, "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "format": [], "from": 13, "gener": [5, 14], "get": [], "git": 3, "guidelin": 1, "half": 15, "hub": 13, "huggingfac": 13, "i": 16, "implement": [], "import": [], "infer": 15, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 12, "let": 2, "line": 6, "lint": [], "linux": [], "lite": [], "load": [11, 13, 14], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 11, 13, 15, 16], "modifi": 2, "modul": [], "name": 13, "note": [], "notebook": 10, "object": 14, "ocr": 16, "onli": [], "onnx": 15, "optim": 15, "option": [], "order": [], "orient": [], "our": 1, "output": 16, "own": [11, 14], "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "post": [], "pre": [], "precis": 15, "predictor": 16, "prepar": 15, "prerequisit": 3, "pretrain": 13, "privat": [], "process": [], "public": [], "push": 13, "python": 3, "qualiti": 2, "quantiz": [], "question": 2, "read": 6, "readi": 14, "recognit": [4, 7, 13, 14, 16], "refer": [], "report": 2, "request": 2, "respons": 1, "return": [], "right": 16, "savedmodel": [], "scope": 1, "share": 13, "should": 16, "stage": 16, "standard": 1, "start": [], "structur": [2, 6], "style": 2, "support": [4, 5, 8], "synthet": [5, 14], "task": 9, "temporari": 1, "tensorflow": [], "test": 2, "text": [4, 16], "train": 11, "transform": 8, "two": 16, "type": [], "unit": 2, "us": [14, 15], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 16, "word": 6, "your": [11, 13, 14, 15], "zoo": [4, 7]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. 
Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[12, null]], "Artefact": [[6, "artefact"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[14, "available-datasets"]], "Available architectures": [[16, "available-architectures"], [16, "id1"], [16, "id2"]], "Block": [[6, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[14, null]], "Choosing the right model": [[16, null]], "Classification": [[13, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[8, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[5, "custom-dataset-loader"]], "Data Loading": [[14, "data-loading"]], "Dataloader": [[5, "dataloader"]], "Detection": [[13, "detection"], [14, "detection"]], "Detection predictors": [[16, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[6, "document"]], "Document structure": [[6, "document-structure"]], "End-to-End OCR": [[16, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[15, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[6, "file-reading"]], "Half-precision": [[15, "half-precision"]], "Installation": [[3, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[6, "line"]], "Loading from Huggingface Hub": [[13, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[11, "loading-your-custom-trained-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[15, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[13, "naming-conventions"]], "Object Detection": [[14, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[6, "page"]], "Preparing your model for inference": [[15, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[13, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[13, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[13, "recognition"], [14, "recognition"]], "Recognition predictors": [[16, "recognition-predictors"]], "Scope": [[1, "scope"]], "Share your model with the community": [[13, null]], "Supported Vocabs": [[5, "supported-vocabs"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[8, "supported-transformations"]], "Synthetic dataset generator": [[5, "synthetic-dataset-generator"], [14, "synthetic-dataset-generator"]], "Task evaluation": [[9, "task-evaluation"]], "Text Detection": [[16, "text-detection"]], "Text Recognition": [[16, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[11, null]], "Two-stage approaches": [[16, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[14, 
"use-your-own-datasets"]], "Using your ONNX exported model in docTR": [[15, "using-your-onnx-exported-model-in-doctr"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[16, "what-should-i-do-with-the-output"]], "Word": [[6, "word"]], "docTR Notebooks": [[10, null]], "docTR Vocabs": [[5, "id5"]], "docTR: Document Text Recognition": [[4, null]], "doctr.datasets": [[5, null], [5, "datasets"]], "doctr.io": [[6, null]], "doctr.models": [[7, null]], "doctr.models.classification": [[7, "doctr-models-classification"]], "doctr.models.detection": [[7, "doctr-models-detection"]], "doctr.models.factory": [[7, "doctr-models-factory"]], "doctr.models.recognition": [[7, "doctr-models-recognition"]], "doctr.models.zoo": [[7, "doctr-models-zoo"]], "doctr.transforms": [[8, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[6, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[6, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[8, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[5, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[8, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[8, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[5, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[7, 
"doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[7, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[5, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[7, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[5, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[5, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[6, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[6, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[5, "doctr.datasets.encode_sequences", false]], "from_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[5, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[8, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[8, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[5, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[5, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[5, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[5, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[5, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[7, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[8, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[6, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet18_rotation() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18_rotation", false]], "linknet_resnet34() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[7, 
"doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[5, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_orientation() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[8, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[7, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[5, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[8, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[6, "doctr.io.Page", false]], "parseq() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[8, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[8, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[8, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[8, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[8, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[8, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[8, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[8, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[8, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[8, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[8, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[6, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[6, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[6, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[5, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[8, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[7, 
"doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[6, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[6, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[5, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[5, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[5, "doctr.datasets.SVT", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "synthtext (class in doctr.datasets)": [[5, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.TextMatch", false]], "togray (class in doctr.transforms)": [[8, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_small", false]], "word (class in doctr.io)": [[6, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[5, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[5, 0, 1, "", "CORD"], [5, 0, 1, "", "CharacterGenerator"], [5, 0, 1, "", "DetectionDataset"], [5, 0, 1, "", "DocArtefacts"], [5, 0, 1, "", "FUNSD"], [5, 0, 1, "", "IC03"], [5, 0, 1, "", "IC13"], [5, 0, 1, "", "IIIT5K"], [5, 0, 1, "", "IIITHWS"], [5, 0, 1, "", "IMGUR5K"], [5, 0, 1, "", "MJSynth"], [5, 0, 1, "", "OCRDataset"], [5, 0, 1, "", "RecognitionDataset"], [5, 0, 1, "", "SROIE"], [5, 0, 1, "", "SVHN"], [5, 0, 1, "", "SVT"], [5, 0, 1, "", "SynthText"], [5, 0, 1, "", "WordGenerator"], [5, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[5, 0, 1, "", "DataLoader"]], "doctr.io": [[6, 0, 1, "", "Artefact"], [6, 0, 1, "", "Block"], [6, 0, 1, "", "Document"], [6, 0, 1, "", "DocumentFile"], [6, 0, 1, "", "Line"], [6, 0, 1, "", "Page"], [6, 0, 1, "", "Word"], [6, 1, 1, "", "decode_img_as_tensor"], [6, 1, 1, "", "read_html"], [6, 1, 1, "", "read_img_as_numpy"], [6, 1, 1, "", "read_img_as_tensor"], 
[6, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[6, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[6, 2, 1, "", "from_images"], [6, 2, 1, "", "from_pdf"], [6, 2, 1, "", "from_url"]], "doctr.io.Page": [[6, 2, 1, "", "show"]], "doctr.models": [[7, 1, 1, "", "kie_predictor"], [7, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[7, 1, 1, "", "crop_orientation_predictor"], [7, 1, 1, "", "magc_resnet31"], [7, 1, 1, "", "mobilenet_v3_large"], [7, 1, 1, "", "mobilenet_v3_large_r"], [7, 1, 1, "", "mobilenet_v3_small"], [7, 1, 1, "", "mobilenet_v3_small_orientation"], [7, 1, 1, "", "mobilenet_v3_small_r"], [7, 1, 1, "", "resnet18"], [7, 1, 1, "", "resnet31"], [7, 1, 1, "", "resnet34"], [7, 1, 1, "", "resnet50"], [7, 1, 1, "", "vgg16_bn_r"], [7, 1, 1, "", "vit_b"], [7, 1, 1, "", "vit_s"]], "doctr.models.detection": [[7, 1, 1, "", "db_mobilenet_v3_large"], [7, 1, 1, "", "db_resnet50"], [7, 1, 1, "", "detection_predictor"], [7, 1, 1, "", "linknet_resnet18"], [7, 1, 1, "", "linknet_resnet18_rotation"], [7, 1, 1, "", "linknet_resnet34"], [7, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[7, 1, 1, "", "from_hub"], [7, 1, 1, "", "login_to_hub"], [7, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[7, 1, 1, "", "crnn_mobilenet_v3_large"], [7, 1, 1, "", "crnn_mobilenet_v3_small"], [7, 1, 1, "", "crnn_vgg16_bn"], [7, 1, 1, "", "master"], [7, 1, 1, "", "parseq"], [7, 1, 1, "", "recognition_predictor"], [7, 1, 1, "", "sar_resnet31"], [7, 1, 1, "", "vitstr_base"], [7, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[8, 0, 1, "", "ChannelShuffle"], [8, 0, 1, "", "ColorInversion"], [8, 0, 1, "", "Compose"], [8, 0, 1, "", "GaussianBlur"], [8, 0, 1, "", "GaussianNoise"], [8, 0, 1, "", "LambdaTransformation"], [8, 0, 1, "", "Normalize"], [8, 0, 1, "", "OneOf"], [8, 0, 1, "", "RandomApply"], [8, 0, 1, "", "RandomBrightness"], [8, 0, 1, "", "RandomContrast"], [8, 0, 1, "", "RandomCrop"], [8, 0, 1, "", "RandomGamma"], [8, 0, 1, "", "RandomHorizontalFlip"], [8, 0, 1, "", "RandomHue"], [8, 0, 1, "", "RandomJpegQuality"], [8, 0, 1, "", "RandomRotate"], [8, 0, 1, "", "RandomSaturation"], [8, 0, 1, "", "RandomShadow"], [8, 0, 1, "", "Resize"], [8, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "DetectionMetric"], [9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 6, 7, 9, 13], "0": [1, 3, 5, 8, 9, 11, 14, 16], "00": 16, "01": 16, "0123456789": 5, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "02": 16, "02562": 7, "03": 16, "035": [], "0361328125": 16, "04": [], "05": 16, "06": 16, "06640625": 16, "07": 16, "08": [8, 16], "09": 16, "0966796875": 16, "1": [3, 5, 6, 7, 8, 9, 11, 14, 16], "10": [5, 9, 16], "100": [5, 8, 9, 14, 16], "1000": 16, "101": 5, "1024": [7, 
9, 11, 16], "104": [], "106": [], "108": 5, "1095": 14, "11": 16, "110": 9, "1107": 14, "114": [], "115": [], "1156": 14, "116": 5, "118": [], "11800h": 16, "11th": 16, "12": [3, 16], "120": [], "123": 5, "126": 5, "1268": [], "128": [7, 11, 15, 16], "13": [9, 16], "130": 5, "13068": 14, "131": 5, "1337891": 14, "1357421875": 16, "1396484375": 16, "14": 16, "1420": 16, "14470v1": [], "149": 14, "15": 16, "150": [9, 16], "154": [], "1552": 16, "16": [7, 15], "160": [], "1630859375": 16, "1684": 16, "16x16": 7, "17": 16, "1778": 16, "1782": 16, "18": [7, 16], "185546875": 16, "19": [], "1900": 16, "1910": 7, "19342": 14, "19370": 14, "195": [], "19598": [], "199": 16, "1999": 16, "1m": [], "2": [3, 4, 5, 6, 8, 16], "20": [], "200": 9, "2000": 14, "2003": [4, 5], "2012": 5, "2013": [4, 5], "2015": 5, "2019": 4, "2021": [], "207901": 14, "21": 16, "2103": [], "2186": 14, "21888": 14, "22": 16, "224": [7, 8], "225": 8, "22672": 14, "229": [8, 14], "23": 16, "233": 14, "234": 5, "236": [], "24": 16, "246": 14, "249": 14, "25": 16, "2504": 16, "255": [6, 7, 8, 9, 16], "256": 7, "257": 14, "26": [], "26032": 14, "264": 11, "27": 16, "2700": 14, "2710": 16, "2749": 11, "28": 16, "287": 11, "29": 16, "296": 11, "299": 11, "2d": 16, "3": [3, 4, 6, 7, 8, 9, 15, 16], "30": 16, "300": 14, "3000": 14, "301": 11, "30595": 16, "30ghz": 16, "31": [7, 16], "32": [5, 7, 8, 11, 14, 15, 16], "3232421875": 16, "33": 8, "33402": 14, "33608": 14, "34": [7, 16], "340": 16, "3456": 16, "35": 16, "3515625": 16, "36": [], "360": 14, "37": [5, 16], "38": 16, "39": [], "4": [7, 8, 9, 16], "40": 16, "406": 8, "41": 16, "42": 16, "43": 16, "44": 16, "45": 16, "456": 8, "46": 16, "47": 16, "472": [], "48": [5, 16], "485": 8, "49": [], "49377": [], "5": [5, 8, 9, 16], "50": [7, 14, 16], "51": 16, "51171875": 16, "512": 7, "52": [5, 16], "529": 16, "53": 16, "533": [], "54": 16, "540": 16, "5478515625": 16, "55": 16, "56": 16, "57": 16, "58": 16, "580": 16, "5810546875": 16, "583": 16, "59": 16, "595": [], "597": 16, "5k": [4, 5], "5m": [], "6": [8, 16], "60": 8, "600": [7, 9, 16], "61": 16, "611": [], "62": 16, "625": [], "626": 14, "629": [], "63": 16, "630": [], "64": [7, 8, 16], "640": [], "641": 16, "647": 14, "65": 16, "66": 16, "660": [], "664": [], "666": [], "67": 16, "672": [], "68": 16, "689": [], "69": 16, "693": 11, "694": 11, "695": 11, "6m": [], "7": 16, "70": [9, 16], "700": [], "701": [], "702": [], "707470": 14, "71": 16, "7100000": 14, "713": [], "7141797": 14, "7149": 14, "72": 16, "72dpi": 6, "73": 16, "73257": 14, "733": [], "74": 16, "745": [], "75": [8, 16], "753": [], "7581382": 14, "76": 16, "77": 16, "772": 11, "772875": 14, "78": 16, "780": [], "781": [], "783": [], "785": 11, "789": [], "79": 16, "793533": 14, "796": 14, "798": 11, "7m": [], "8": [3, 7, 8, 16], "80": 16, "800": [7, 9, 14, 16], "81": 16, "817": [], "82": 16, "8275l": [], "83": 16, "830": [], "84": 16, "849": 14, "85": 16, "8564453125": 16, "857": 16, "85875": 14, "86": 16, "860": [], "8603515625": 16, "862": [], "863": [], "87": 16, "8707": 14, "875": [], "88": 16, "89": 16, "8m": [], "9": 16, "90": 16, "90k": 5, "90kdict32px": 5, "91": 16, "913": [], "914085328578949": 16, "917": [], "92": 16, "921": [], "93": 16, "94": [5, 16], "95": [9, 16], "9578408598899841": 16, "96": 16, "97": 16, "98": 16, "99": 16, "9949972033500671": 16, "A": [1, 2, 4, 5, 6, 7, 10, 15], "And": [], "As": 2, "Be": [], "Being": 1, "By": 12, "For": [1, 2, 3, 11, 16], "If": [2, 3, 6, 7, 11, 16], "In": [2, 5, 14], "It": [8, 13, 15], "Its": [4, 7], "No": [1, 
16], "Of": 5, "Or": [], "The": [1, 2, 5, 6, 9, 12, 16], "Then": [], "To": [2, 3, 12, 13, 16], "_": [1, 5, 7], "__call__": [], "_build": 2, "_i": 9, "ab": [], "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "abdef": [5, 14], "abl": [14, 16], "about": [1, 14, 16], "abov": 16, "abstract": [], "abstractdataset": 5, "abus": 1, "accent": [], "accept": 1, "access": [4, 6, 14, 16], "account": [1, 13], "accur": [], "accuraci": 9, "achiev": 15, "act": 1, "action": 1, "activ": 4, "ad": [2, 7, 8], "adapt": 1, "add": [8, 9, 13], "add_hook": [], "add_label": 9, "addit": [2, 3, 6], "addition": [2, 16], "address": [1, 6], "adjust": 8, "advanc": 1, "advantag": 15, "advis": 2, "aesthet": [4, 5], "affect": 1, "after": [13, 16], "ag": 1, "again": [], "aggreg": [9, 14], "aggress": 1, "align": [1, 6], "all": [1, 2, 5, 6, 8, 9, 14, 16], "allow": 1, "along": 16, "alreadi": 2, "also": [1, 7, 13, 14, 16], "alwai": 14, "an": [1, 2, 4, 5, 6, 7, 9, 15, 16], "analysi": 6, "ancient_greek": 5, "angl": [6, 8], "ani": [1, 5, 6, 7, 8, 9, 16], "annot": 5, "anot": 14, "anoth": [3, 7, 11, 14], "answer": 1, "anyascii": [], "anyon": 4, "anyth": [], "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 5, 8], "applic": [4, 7], "appoint": 1, "appreci": 13, "appropri": [1, 2, 16], "ar": [1, 2, 3, 5, 6, 8, 9, 10, 14, 16], "arab": 5, "arabic_diacrit": 5, "arabic_lett": 5, "arabic_punctu": 5, "arbitrarili": [], "arch": [7, 13], "architectur": [4, 7, 13], "archiv": [], "area": 16, "arg": [5, 7], "argument": [5, 7, 16], "around": 1, "arrai": [6, 8, 9], "art": 4, "artefact": [9, 10, 16], "artefact_typ": 6, "artifici": [4, 5], "arxiv": 7, "as_imag": [], "asarrai": 9, "ascii_lett": 5, "aspect": [4, 7, 8, 16], "assess": 9, "assign": 9, "associ": 6, "assum": 7, "assume_straight_pag": [7, 16], "astyp": [7, 9, 16], "attack": 1, "attend": [4, 7], "attent": [1, 7], "autoclass": [], "autom": 4, "automat": [], "autoregress": [4, 7], "avail": [1, 4, 8], "averag": [8, 16], "avoid": [1, 3], "aw": [4, 16], "awar": [], "azur": 16, "b": [7, 9, 16], "b_j": 9, "back": 2, "backbon": 7, "backend": 16, "background": 14, "bangla": [], "bar": [], "bar_cod": 14, "base": [4, 7], "baselin": [4, 7, 16], "batch": [5, 7, 8, 14, 16], "batch_siz": [5, 11, 14, 15], "bblanchon": [], "bbox": 16, "becaus": 12, "been": [2, 9, 14, 16], "befor": [5, 7, 8, 16], "begin": 9, "behavior": 1, "being": [9, 16], "belong": 16, "benchmark": 16, "best": 1, "beta": [], "better": [10, 16], "between": [8, 9], "bgr": 6, "bilinear": 8, "bin_thresh": [], "binar": [4, 7], "binari": [6, 15, 16], "bit": 15, "blank": 9, "block": [9, 16], "block_1_1": 16, "blue": 9, "blur": 8, "bmvc": 5, "bn": 13, "bodi": [1, 16], "bool": [5, 6, 7, 8, 9], "boolean": [7, 16], "both": [4, 5, 8, 14, 16], "bottom": [7, 16], "bound": [5, 6, 7, 8, 9, 16], "box": [5, 6, 7, 8, 9, 14, 16], "box_thresh": [], "brew": 3, "bright": 8, "broadcast": 9, "browser": [2, 4], "build": [2, 3], "built": 2, "byte": [6, 16], "c": [6, 9], "c5": [], "c_j": 9, "cach": [2, 5, 12], "cache_sampl": 5, "cairo": 3, "call": [], "callabl": [5, 8], "can": [2, 3, 11, 12, 13, 14, 16], "capabl": [2, 10, 16], "case": [5, 9], "cf": 16, "cfg": 16, "challeng": 5, "challenge2_test_task12_imag": 5, "challenge2_test_task1_gt": 5, "challenge2_training_task12_imag": 5, "challenge2_training_task1_gt": 5, "chang": 12, "changelog": [], "channel": [1, 2, 6, 8], "channel_prior": [], "channelshuffl": 8, "charact": [4, 5, 6, 9, 14, 16], "charactergener": [5, 14], "characterist": 1, "charg": 16, "charset": 16, "chart": 6, 
"check": [2, 13, 16], "checkpoint": 7, "chip": 3, "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 5, 6, 8, 9, 16], "class_nam": 11, "classif": 14, "classif_mobilenet_v3_smal": 7, "classmethod": 6, "clear": 2, "clone": 3, "close": 2, "co": 13, "code": [4, 6], "codecov": 2, "colab": 10, "collate_fn": 5, "collect": 6, "color": [8, 9], "colorinvers": 8, "column": 6, "com": [1, 3, 6, 13], "combin": 16, "come": 15, "command": 2, "comment": 1, "commit": 1, "common": [1, 8, 9, 15], "commun": 1, "compar": 4, "comparison": [9, 16], "competit": 5, "compil": [10, 16], "complaint": 1, "complementari": 9, "complet": 2, "compon": 16, "compos": [5, 16], "comprehens": 16, "comput": [5, 9, 15, 16], "conf_threshold": [], "confid": [6, 9, 16], "config": 7, "configur": 7, "confus": 9, "consecut": [8, 16], "consequ": 1, "consid": [1, 2, 5, 6, 9, 16], "consist": 16, "consolid": [4, 5], "constant": 8, "construct": 1, "consum": 9, "contact": 1, "contain": [5, 14], "content": [5, 6, 9, 16], "context": 7, "contib": [], "continu": 1, "contrast": 8, "contrast_factor": 8, "contrib": [], "contribut": 1, "contributor": 2, "conv_sequ": [], "convers": 6, "convert": [6, 8], "convert_page_to_numpi": [], "convert_to_fp16": [], "convert_to_tflit": [], "convolut": 7, "coordin": [6, 16], "cord": [4, 5, 14, 16], "core": [9, 16], "corner": 16, "correct": 8, "correspond": [3, 6, 16], "could": 1, "counterpart": 9, "cover": 2, "coverag": 2, "cpu": [4, 11], "creat": 13, "crnn": [4, 7, 13], "crnn_mobilenet_v3_larg": [7, 13, 16], "crnn_mobilenet_v3_smal": [7, 15, 16], "crnn_resnet31": [], "crnn_vgg16_bn": [7, 11, 13, 16], "crop": [7, 8, 14, 16], "crop_orient": [], "crop_orientation_predictor": 7, "crop_param": [], "croporientationpredictor": 7, "cuda": 15, "currenc": 5, "current": [2, 16], "custom": 13, "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": [], "cvit": 4, "czczup": [], "czech": 5, "d": [5, 14], "daili": [], "danish": [], "data": [4, 5, 6, 8, 9, 11, 13], "dataload": 14, "dataset": [7, 11, 16], "dataset_info": 5, "date": [11, 16], "db": 13, "db_crnn_resnet": [], "db_crnn_vgg": [], "db_mobilenet_v3_larg": [7, 13, 16], "db_resnet34": 16, "db_resnet50": [7, 11, 13, 16], "db_resnet50_rot": 16, "db_sar_resnet": [], "db_sar_vgg": [], "dbnet": [4, 7], "deal": [], "decis": 1, "decod": 6, "decode_img_as_tensor": 6, "dedic": [], "deem": 1, "deep": [7, 16], "def": [], "default": [6, 9, 11, 12], "defer": 14, "defin": [9, 15], "deform": [], "degre": 8, "degress": 6, "delet": 2, "delimit": 16, "delta": 8, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": [7, 9], "descript": 10, "design": 8, "desir": 6, "det_arch": [7, 11, 13, 15], "det_b": [], "det_model": [11, 13], "det_param": 11, "det_predictor": 11, "detail": [11, 16], "detect": [5, 9, 10, 11], "detect_languag": 7, "detect_orient": 7, "detection_predictor": [7, 16], "detection_task": [], "detectiondataset": [5, 14], "detectionmetr": 9, "detectionpredictor": [7, 11], "detector": [], "deterior": 7, "determin": 1, "dev": [2, 12], "develop": 3, "developp": [], "deviat": 8, "devic": 15, "dict": [6, 9, 16], "dictionari": [6, 9], "differ": 1, "differenti": [4, 7], "digit": [4, 5, 14], "dimens": [6, 9, 16], "dimension": 8, "direct": 5, "directli": [13, 16], "directori": [2, 12], "disabl": [1, 12], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 16, "discuss": 2, "disk": [], "disparag": 1, "displai": [6, 9], "display_artefact": 9, "distanc": [], "distribut": 8, "div": 16, "divers": 
1, "divid": 6, "do": [2, 3, 7], "doc": [2, 6, 15, 16], "docartefact": [5, 14], "docstr": 2, "doctr": [3, 11, 12, 13, 14, 16], "doctr_cache_dir": 12, "doctr_multiprocessing_dis": 12, "document": [5, 7, 9, 10, 14, 16], "documentbuild": [], "documentfil": [6, 13], "doesn": [], "don": [11, 16], "done": 8, "download": [5, 14], "downsiz": 7, "draw": [8, 9], "draw_proba": 9, "drop": 5, "drop_last": 5, "dtype": [6, 7, 8, 9, 15], "dual": [], "dummi": 13, "dummy_img": 16, "dummy_input": 15, "dure": 1, "dutch": [], "dynam": 5, "dynamic_seq_length": 5, "e": [1, 2, 3, 6, 7], "each": [4, 5, 6, 7, 8, 9, 14, 16], "eas": 2, "easi": [4, 9, 13], "easier": [], "easili": [6, 9, 11, 13, 14, 16], "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 5, 7], "either": [9, 16], "element": [5, 6, 7, 9, 16], "els": 2, "email": 1, "empathi": 1, "en": 16, "enabl": [5, 6], "enclos": 6, "encod": [4, 5, 6, 7, 16], "encode_sequ": 5, "encount": 2, "encrypt": 6, "end": [4, 5, 7, 9], "english": [5, 14], "enough": [2, 16], "ensur": 2, "entir": [], "entri": 5, "environ": [1, 12], "eo": 5, "equiv": 16, "error": [], "estim": 7, "etc": 6, "ethnic": 1, "evalu": [14, 16], "event": 1, "everyon": 1, "everyth": [2, 16], "exact": [9, 16], "exactmatch": [], "exampl": [1, 2, 4, 5, 7, 13], "exchang": 15, "exclud": [], "execut": [], "exist": 13, "expand": 8, "expect": [6, 8, 9], "experi": 1, "explan": [1, 16], "explicit": 1, "exploit": [4, 7], "export": [6, 7, 9, 10, 16], "export_as_straight_box": [7, 16], "export_as_xml": 16, "export_model_to_onnx": 15, "express": [1, 8], "extens": 6, "extern": [1, 14], "extra": 3, "extract": [4, 5], "extract_arch": [], "extractor": 7, "f_": 9, "f_a": 9, "factor": 8, "fair": 1, "fairli": 1, "fals": [5, 6, 7, 8, 9, 11, 16], "famili": 9, "faq": 1, "fascan": [], "fast": [4, 5, 7], "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": 15, "fasterrcnn_mobilenet_v3_large_fpn": 7, "favorit": 16, "featur": [3, 7, 9, 10], "feed": [], "feedback": 1, "feel": [2, 13], "felix92": 13, "few": [3, 15], "figsiz": 9, "figur": 9, "file": [2, 5], "file_hash": [], "file_nam": [], "final": 7, "find": [2, 3, 14], "fine": [], "finnish": [], "first": 2, "firsthand": 5, "fit": [7, 16], "fitz": [], "flag": 16, "flexibl": [], "flip": 8, "float": [6, 8, 9, 15], "float32": [6, 7, 8, 15], "fn": 8, "focu": 13, "focus": [1, 5], "folder": 5, "follow": [1, 2, 3, 5, 8, 9, 11, 12, 13, 16], "font": [5, 9], "font_famili": [5, 9], "font_siz": 9, "foral": 9, "forc": 2, "forg": [], "form": [4, 5, 16], "format": [6, 9, 11, 14, 15, 16], "forpost": [4, 5], "forum": 2, "fp": [], "fp16": 15, "frac": 9, "frame": [], "framework": [3, 13, 14, 16], "free": [1, 2, 13], "french": [5, 11, 13, 16], "friendli": 4, "from": [1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16], "from_hub": [7, 13], "from_imag": [6, 13], "from_pdf": 6, "from_url": 6, "full": [5, 9, 16], "fulli": [], "function": [5, 8, 9], "funsd": [4, 5, 14, 16], "further": 14, "futur": 5, "g": [6, 7], "g_": 9, "g_x": 9, "gamma": 8, "gaussian": 8, "gaussianblur": 8, "gaussiannois": 8, "gdk": 3, "gen": 16, "gender": 1, "gener": [2, 4, 7], "generic_cyrillic_lett": [], "geometri": [4, 6, 16], "geq": 9, "german": [5, 11], "get": 16, "get_artefact": [], "get_word": [], "gettextword": [], "git": 13, "github": [2, 3, 13], "give": 1, "given": [5, 6, 8, 9, 16], "global": 7, "go": 16, "good": 15, "googl": 2, "googlevis": 4, "gpu": [4, 15], "gracefulli": 1, "graph": 6, "grayscal": 8, "ground": 9, "groung": 9, "group": 4, "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": 2, "guidanc": 14, 
"gvision": 16, "h": [6, 7, 8], "h_": 9, "ha": [2, 5, 9, 14], "half": [], "handl": 14, "handwrit": 5, "handwritten": 14, "harass": 1, "hardwar": [], "harm": 1, "hat": 9, "have": [1, 2, 9, 11, 13, 14, 16], "head": [7, 16], "healthi": 1, "hebrew": [], "height": 6, "hello": [9, 16], "help": 15, "here": [3, 8, 10, 14, 16], "hf": 7, "hf_hub_download": 7, "high": 6, "higher": [3, 5], "hindi": [], "hindi_digit": 5, "hocr": 16, "homebrew": 3, "hook": [], "horizont": [6, 8], "hous": 5, "how": [2, 11, 13, 14], "howev": 14, "hsv": 8, "html": [1, 2, 16], "http": [1, 3, 6, 7, 13, 16], "hub": 7, "hue": 8, "huggingfac": 7, "hw": 5, "i": [1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 15], "i7": 16, "ic03": [4, 5, 14], "ic13": [4, 5, 14], "icdar": [4, 5], "icdar2019": 5, "id": 16, "ident": 1, "identifi": 4, "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [4, 5], "iiit5k": [5, 14], "iiithw": [4, 5, 14], "imag": [4, 5, 6, 7, 8, 9, 13, 14, 16], "imagenet": 7, "imageri": 1, "images_90k_norm": 5, "img": [5, 8, 14], "img_cont": 6, "img_fold": [5, 14], "img_path": 6, "img_transform": 5, "imgur5k": [4, 5, 14], "imgur5k_annot": 5, "imlist": 5, "impact": 1, "implement": [5, 6, 8, 9, 16], "import": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16], "improv": [], "inappropri": 1, "incid": 1, "includ": [1, 3, 5, 14, 15], "inclus": 1, "increas": 8, "independ": [], "index": [2, 6], "indic": 9, "individu": 1, "infer": [4, 7, 8], "inform": [1, 2, 4, 5, 14], "inherit": [], "input": [2, 6, 7, 8, 15, 16], "input_crop": 7, "input_pag": [7, 9, 16], "input_shap": 15, "input_t": [], "input_tensor": 7, "inspir": [1, 8], "instal": 13, "instanc": [1, 16], "instanti": [7, 16], "instead": [5, 6, 7], "insult": 1, "int": [5, 6, 8, 9], "int64": [8, 9], "integ": 9, "integr": [4, 13, 14], "intel": 16, "interact": [1, 6, 9], "interfac": 13, "interoper": 15, "interpol": 8, "interpret": [5, 6], "intersect": 9, "invert": 8, "investig": 1, "invis": 1, "invoic": [], "involv": [1, 16], "io": 13, "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [4, 7, 14], "isn": 5, "issu": [1, 2, 13], "italian": [], "iter": [5, 8, 14, 16], "its": [6, 7, 8, 9, 14, 16], "itself": [7, 13], "j": 9, "job": 2, "join": 2, "jpeg": 8, "jpegqual": 8, "jpg": [5, 6, 13], "json": [5, 14, 16], "json_output": 16, "jump": 2, "just": 1, "kei": [], "kera": [7, 15], "kernel": 8, "kernel_s": [], "kernel_shap": 8, "keywoard": [], "keyword": [5, 7], "kie": [7, 11], "kie_predictor": [7, 11], "kiepredictor": 7, "kind": [1, 16], "know": 2, "kwarg": [5, 6, 7, 9], "l": 9, "l_j": 9, "label": [5, 8, 9, 14], "label_fil": [5, 14], "label_fold": 5, "label_path": [5, 14], "labels_path": [5, 14], "ladder": 1, "lambda": 8, "lambdatransform": 8, "lang": 16, "languag": [1, 4, 5, 6, 7, 13, 16], "larg": [7, 13], "largest": 9, "last": [3, 5], "latenc": 7, "later": 2, "latest": [3, 16], "latin": 5, "layer": 15, "layout": 16, "lead": 1, "leader": 1, "learn": [1, 4, 7, 15, 16], "least": 3, "left": [9, 16], "legacy_french": 5, "length": 5, "less": 15, "let": [], "letter": [], "level": [1, 5, 9, 16], "levenshtein": [], "leverag": 10, "lf": 13, "libffi": 3, "librari": [2, 3, 10, 11], "light": 4, "lightweight": [], "like": 1, "limits_": 9, "line": [4, 9, 16], "line_1_1": 16, "link": 11, "linknet": [4, 7], "linknet16": [], "linknet_resnet18": [7, 11, 16], "linknet_resnet18_rot": [7, 16], "linknet_resnet34": [7, 15, 16], "linknet_resnet50": [7, 16], "linux": 3, "list": [5, 6, 8, 9, 13], "ll": 9, "load": [4, 5, 7], "load_state_dict": 11, "load_weight": 11, "loader": [], "loc_pr": [], "local": [2, 4, 5, 7, 9, 14, 16], 
"localis": 5, "localizationconfus": 9, "locat": [2, 6], "login": 7, "login_to_hub": [7, 13], "logo": [6, 14], "love": 13, "lower": [8, 9], "m": [2, 9, 16], "m1": 3, "macbook": 3, "machin": 15, "maco": 3, "made": 4, "magc_resnet31": 7, "mai": [1, 2], "mail": 1, "main": 10, "maintain": 4, "mainten": 2, "make": [1, 2, 9, 12, 13, 15, 16], "mani": [14, 16], "manipul": [], "map": 5, "map_loc": 11, "mask_shap": 9, "master": [4, 7, 16], "match": [9, 16], "mathcal": 9, "matplotlib": 9, "max": [5, 8, 9], "max_angl": 8, "max_area": 8, "max_char": [5, 14], "max_delta": 8, "max_dist": [], "max_gain": 8, "max_gamma": 8, "max_qual": 8, "max_ratio": 8, "maximum": [5, 8], "maxval": [7, 8], "mbox": 9, "mean": [8, 9, 11], "meaniou": 9, "meant": [6, 15], "measur": 16, "media": 1, "median": [], "meet": 11, "member": 1, "memori": [9, 12, 15], "mention": 16, "merg": 5, "messag": 2, "meta": 16, "metadata": 15, "metal": 3, "method": [8, 16], "metric": [9, 16], "middl": [], "might": [15, 16], "min": 8, "min_area": 8, "min_char": [5, 14], "min_gain": 8, "min_gamma": 8, "min_qual": 8, "min_ratio": 8, "min_val": 8, "minde": [1, 3, 4, 7], "minim": [2, 4], "minimalist": [], "minimum": [3, 5, 8, 9], "minval": 8, "miss": 3, "mistak": 1, "mix": [], "mixed_float16": 15, "mixed_precis": 15, "mjsynth": [4, 5, 14], "mnt": 5, "mobilenet": [7, 13], "mobilenet_v3_larg": 7, "mobilenet_v3_large_r": 7, "mobilenet_v3_smal": 7, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_orient": 7, "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 7, "mobilenetv3": 7, "modal": [], "mode": 3, "model": [5, 9, 12, 14], "model_nam": [7, 13, 15], "model_path": 15, "moder": 1, "modif": 2, "modifi": [7, 12], "modul": [6, 8, 9, 16], "moment": 16, "more": [2, 9, 14, 16], "most": 16, "mozilla": 1, "multi": [4, 7], "multilingu": [], "multipl": [5, 6, 8], "multipli": 8, "multiprocess": 12, "my": 7, "my_awesome_model": 13, "my_hook": [], "n": [5, 9], "na": [], "name": [5, 7, 15, 16], "nation": 1, "natur": [1, 4, 5], "nb": 16, "ndarrai": [5, 6, 8, 9], "necessari": [3, 11, 12], "need": [2, 3, 5, 9, 11, 12, 13], "neg": 8, "nest": 16, "nestedobject": [], "network": [4, 5, 7, 15], "neural": [4, 5, 7, 15], "new": [2, 9], "newer": [], "next": [5, 14], "nois": 8, "noisi": [4, 5], "non": [4, 5, 6, 7, 8, 9], "none": [5, 6, 7, 8, 9, 16], "normal": [7, 8], "norwegian": [], "note": [0, 2, 5, 7, 13, 15], "now": 2, "np": [7, 8, 9, 16], "num_output_channel": 8, "num_sampl": [5, 14], "num_work": 5, "number": [5, 8, 9, 16], "numpi": [6, 7, 9, 16], "o": 3, "obb": [], "obj_detect": 13, "object": [5, 9, 10, 16], "objectness_scor": [], "oblig": 1, "obtain": 16, "occupi": 15, "ocr": [4, 5, 7, 9, 13, 14], "ocr_carea": 16, "ocr_db_crnn": 9, "ocr_lin": 16, "ocr_pag": 16, "ocr_par": 16, "ocr_predictor": [7, 11, 13, 15, 16], "ocrdataset": [5, 14], "ocrmetr": 9, "ocrpredictor": [7, 11], "ocrx_word": 16, "offens": 1, "offici": 1, "offlin": 1, "offset": 8, "onc": 16, "one": [2, 5, 7, 8, 11, 13, 16], "oneof": 8, "ones": [5, 8, 9], "onli": [2, 7, 8, 9, 13, 14, 15, 16], "onlin": 1, "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": 8, "opacity_rang": 8, "open": [1, 2, 13, 15], "opinion": 1, "optic": [4, 16], "optim": 4, "option": [5, 11], "order": [2, 5, 6, 8], "org": [1, 7, 16], "organ": 6, "orient": [1, 6, 7, 16], "orientationpredictor": [], "other": [1, 2], "otherwis": [1, 6, 9], "our": [2, 7, 16], "out": [2, 7, 8, 9, 16], "outpout": 16, "output": [6, 8, 15], "output_s": [6, 8], "outsid": 12, "over": [3, 5, 9, 16], "overal": [1, 7], "overlai": 6, "overview": [], 
"overwrit": [], "overwritten": 13, "own": 4, "p": [8, 9, 16], "packag": [2, 4, 9, 12, 14], "pad": [5, 7, 8, 16], "page": [3, 5, 7, 9, 16], "page1": 6, "page2": 6, "page_1": 16, "page_idx": [6, 16], "page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 7, "par_1_1": 16, "paragraph": [], "paragraph_break": [], "param": [8, 16], "paramet": [4, 5, 6, 7, 8, 9, 15], "pars": [4, 5], "parseq": [4, 7, 16], "part": [5, 8, 16], "parti": 3, "partial": 16, "particip": 1, "pass": [5, 6, 7, 16], "password": 6, "patch": 7, "path": [5, 6, 14], "path_to_checkpoint": 11, "path_to_custom_model": [], "path_to_pt": 11, "pattern": 1, "pdf": [6, 7, 10], "pdfpage": 6, "peopl": 1, "per": [8, 16], "perform": [4, 6, 8, 9, 12, 15, 16], "period": 1, "permiss": 1, "permut": [4, 7], "persian_lett": 5, "person": [1, 14], "phase": 16, "photo": 14, "physic": [1, 6], "pick": 8, "pictur": 6, "pip": [2, 3], "pipelin": [], "pixbuf": 3, "pixel": [6, 8, 16], "platinum": [], "pleas": 2, "plot": 9, "plt": 9, "plug": 13, "plugin": 3, "png": 6, "point": 15, "polici": 12, "polish": [], "polit": 1, "polygon": [5, 16], "pool": 7, "portugues": 5, "posit": [1, 9], "possibl": [2, 9, 13], "post": [1, 16], "postprocessor": [], "potenti": 7, "power": 4, "ppageno": 16, "pre": [2, 7], "precis": [9, 16], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 14, "predict": [6, 7, 9], "predictor": [4, 6, 7, 11, 13, 15], "prefer": 14, "preinstal": [], "preprocessor": [11, 16], "prerequisit": 13, "present": 10, "preserv": [7, 8, 16], "preserve_aspect_ratio": [6, 7, 8, 11, 16], "pretrain": [4, 7, 9, 11, 15, 16], "pretrained_backbon": [7, 11], "print": 16, "prior": 5, "privaci": 1, "privat": 1, "probabl": 8, "problem": 2, "procedur": 8, "process": [2, 4, 6, 11, 16], "processor": 16, "produc": [10, 16], "product": 15, "profession": 1, "project": [2, 14], "promptli": 1, "proper": 2, "properli": 5, "properti": [], "provid": [1, 2, 4, 13, 14, 16], "public": [1, 4], "publicli": 16, "publish": 1, "pull": 13, "punctuat": 5, "pure": 5, "purpos": 2, "push_to_hf_hub": [7, 13], "py": 13, "pypdfium2": 6, "pyplot": 9, "python": 2, "python3": 13, "pytorch": [3, 4, 7, 8, 11, 13, 15, 16], "q": 2, "qr": 6, "qr_code": 14, "qualiti": 8, "quantiz": [], "quantize_model": [], "question": 1, "quickli": 4, "quicktour": 10, "r": 16, "race": 1, "ramdisk": 5, "rand": [7, 8, 9, 15, 16], "random": [7, 8, 9, 16], "randomappli": 8, "randombright": 8, "randomcontrast": 8, "randomcrop": 8, "randomgamma": 8, "randomhorizontalflip": 8, "randomhu": 8, "randomjpegqu": 8, "randomli": 8, "randomres": [], "randomrot": 8, "randomsatur": 8, "randomshadow": 8, "rang": 8, "rassi": [], "ratio": [7, 8, 16], "raw": [6, 9], "re": 15, "read": [4, 5, 7], "read_html": 6, "read_img": 6, "read_img_as_numpi": 6, "read_img_as_tensor": 6, "read_pdf": 6, "readi": 15, "real": [4, 7, 8], "reason": 1, "rebuild": 2, "rebuilt": 2, "recal": [9, 16], "receipt": [4, 5, 16], "reco_arch": [7, 11, 13, 15], "reco_b": [], "reco_model": [11, 13], "reco_param": 11, "reco_predictor": 11, "recogn": 16, "recognit": [5, 9, 11], "recognition_predictor": [7, 16], "recognition_task": [5, 14], "recognitiondataset": [5, 14], "recognitionpredictor": [7, 11], "rectangular": 7, "recurr": [], "red": 9, "reduc": [3, 8], "refer": [2, 3, 11, 13, 14, 16], "regardless": 1, "region": [], "regroup": 9, "regular": 14, "reject": 1, "rel": [6, 8, 9], "relat": 6, "releas": [0, 3], "relev": [], "religion": 1, "relu": [], "remov": 1, "render": 6, "repo": 7, "repo_id": [7, 13], "report": 1, "repositori": [5, 7, 13], 
"repres": [1, 9, 15, 16], "represent": [4, 7], "request": [1, 13], "requir": [3, 8], "research": 4, "residu": 7, "resiz": [8, 16], "resnet": 7, "resnet18": [7, 13], "resnet31": 7, "resnet34": 7, "resnet50": [7, 13], "resolv": 6, "resolve_block": [], "resolve_lin": [], "resourc": 14, "respect": 1, "respons": 9, "rest": [2, 8, 9], "restrict": 12, "result": [2, 5, 6, 10, 13, 16], "return": [5, 6, 7, 9, 16], "reusabl": 16, "review": 1, "rgb": [6, 8], "rgb_mode": 6, "rgb_output": 6, "right": [1, 7, 9], "robust": [4, 5], "root": 5, "rotat": [5, 6, 7, 8, 9, 14, 16], "rotated_bbox": [], "run": [2, 3, 7], "same": [2, 6, 9, 14, 16], "sampl": [5, 14, 16], "sample_transform": 5, "sar": [4, 7], "sar_resnet31": [7, 16], "sar_vgg16_bn": [], "satur": 8, "save": [7, 14], "saved_model": [], "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "scheme": [], "score": 9, "scratch": [], "script": [2, 14], "seamless": 4, "seamlessli": [4, 16], "search": 7, "searchabl": 10, "sec": 16, "second": 16, "section": [11, 13, 15, 16], "secur": [1, 12], "see": [1, 2], "seemlessli": [], "seen": 16, "segment": [4, 7, 16], "self": [], "semant": [4, 7], "send": 16, "sens": 9, "sensit": 14, "separ": 16, "sequenc": [4, 5, 6, 7, 9, 16], "sequenti": 8, "seri": 1, "serial": [], "serialized_model": [], "seriou": 1, "set": [1, 5, 7, 9, 12, 16], "set_global_polici": 15, "sever": [6, 8, 16], "sex": 1, "sexual": 1, "sha256": [], "shade": 8, "shape": [6, 7, 8, 9, 16], "share": [12, 14], "shift": 8, "shm": 12, "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9, 11, 13], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 14, "simpl": [4, 7], "simpler": 7, "sinc": [5, 14], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 16], "skew": 16, "slack": 2, "slightli": 7, "small": [2, 7], "smallest": 6, "snapshot_download": 7, "snippet": 16, "so": [2, 3, 5, 7, 13, 14], "social": 1, "socio": 1, "some": [3, 10, 13, 14], "someth": 2, "somewher": 2, "soon": 15, "sort": 1, "sourc": [5, 6, 7, 8, 9, 13], "space": 1, "span": 16, "spanish": 5, "spatial": [6, 9], "special": [], "specif": [2, 3, 9, 11, 14, 16], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": 2, "sroie": [4, 5, 14], "stabl": 3, "stackoverflow": 2, "stage": 4, "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": [8, 11], "step": 12, "still": 16, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 14, 16], "straighten": [], "straighten_pag": [], "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": [], "strictli": 9, "string": [5, 6, 9, 16], "strive": 3, "strong": [4, 7], "structur": [15, 16], "subset": [5, 16], "suggest": [2, 13], "sum": 9, "summari": 9, "support": [15, 16], "sustain": 1, "svhn": [4, 5, 14], "svt": [5, 14], "swedish": [], "symbol": [], "symmetr": [7, 8, 16], "symmetric_pad": [7, 8, 16], "synthes": 9, "synthesize_pag": 9, "synthet": 4, "synthtext": [4, 5, 14], "system": 16, "t": [2, 5, 11, 16], "tabl": 13, "take": [1, 5, 16], "target": [5, 6, 8, 9, 14], "target_s": 5, "task": [4, 5, 7, 13, 14, 16], "task2": 5, "team": [], "techminde": [], "templat": [2, 4], "tensor": [5, 6, 8, 16], "tensorflow": [3, 4, 6, 7, 8, 11, 13, 15, 16], "tensorspec": 15, "term": 1, "test": 14, "test_set": 5, "text": [5, 6, 7, 9, 14], "text_output": [], "textmatch": 9, "textnet": [], "textnet_bas": [], "textnet_smal": [], "textnet_tini": [], "textract": [4, 16], "textstylebrush": [4, 5], "textual": [4, 5, 6, 7, 16], "tf": [3, 6, 7, 8, 13, 15], "tf_model": [], "tflite": [], "than": [2, 3, 9, 13], "thank": 2, 
"thei": [1, 9], "them": [3, 5, 16], "thi": [1, 2, 3, 5, 9, 11, 12, 13, 14, 15, 16], "thing": [15, 16], "third": 3, "those": [1, 3, 6, 16], "threaten": 1, "threshold": [], "through": [1, 8, 14], "tilman": [], "time": [1, 4, 7, 9, 14], "tini": [], "titl": [6, 16], "tm": 16, "tmp": 12, "togeth": [2, 6], "tograi": 8, "tool": 14, "top": [9, 16], "topic": 2, "torch": [3, 8, 11, 13, 15], "torchvis": 8, "total": 11, "toward": [1, 3], "train": [2, 5, 7, 8, 13, 14, 15, 16], "train_it": [5, 14], "train_load": [5, 14], "train_pytorch": 13, "train_set": [5, 14], "train_tensorflow": 13, "trainabl": [4, 7], "tranform": 8, "transcrib": 16, "transfer": [4, 5], "transfo": 8, "transform": [4, 5, 7], "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16], "truth": 9, "tune": 15, "tupl": [5, 6, 8, 9], "turn": [], "two": [6, 12], "txt": 5, "type": [6, 13, 15, 16], "typic": 16, "u": [1, 2], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 16], "ukrainian": [], "unaccept": 1, "underli": 14, "underneath": 6, "understand": [4, 5, 16], "unidecod": 9, "uniform": [7, 8], "uniformli": 8, "uninterrupt": [6, 16], "union": 9, "unittest": 2, "unlock": 6, "unoffici": 7, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 16], "updat": 9, "upgrad": 2, "upper": [5, 8], "uppercas": 14, "url": 6, "us": [1, 2, 3, 5, 7, 9, 11, 12, 13, 16], "usabl": 16, "usag": [12, 15], "use_broadcast": 9, "use_polygon": [5, 9, 14], "useabl": 16, "user": [3, 4, 6, 10], "utf": 16, "util": 15, "v0": [], "v1": 13, "v3": [7, 13, 16], "valid": 14, "valu": [2, 6, 8, 16], "valuabl": 4, "variabl": 12, "varieti": 5, "veri": 7, "verifi": [], "version": [1, 2, 3, 15, 16], "vgg": 7, "vgg16": 13, "vgg16_bn_r": 7, "via": 1, "vietnames": 5, "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 5, 7], "visiondataset": 5, "visiontransform": 7, "visual": 4, "visualize_pag": 9, "vit_": 7, "vit_b": 7, "vitstr": [4, 7, 15], "vitstr_bas": [7, 16], "vitstr_smal": [7, 11, 15, 16], "viz": [], "vocab": [11, 13, 14, 16], "vocabulari": [5, 11, 13], "w": [6, 7, 8, 9], "w3": 16, "wa": 1, "wai": [1, 4, 14], "want": [2, 15, 16], "warm": [], "warmup": 16, "wasn": 2, "we": [1, 2, 3, 4, 6, 8, 13, 14, 15, 16], "weasyprint": [], "web": [2, 6], "websit": 5, "weight": 11, "welcom": 1, "well": [1, 15], "were": [1, 6, 16], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9], "whether": [2, 5, 6, 8, 9, 14], "which": [1, 7, 12, 14, 16], "whichev": 3, "while": [8, 16], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [], "window": [3, 7, 9], "wish": 2, "within": 1, "without": [1, 5, 7], "wonder": 2, "word": [4, 5, 7, 9, 16], "word_1_1": 16, "word_1_2": 16, "word_1_3": 16, "wordgener": [5, 14], "words_onli": 9, "work": [12, 16], "worker": 5, "workflow": 2, "worklow": 2, "world": [9, 16], "worth": 7, "wrap": 16, "wrapper": [5, 8], "write": 12, "written": [1, 6], "www": [1, 6, 16], "x": [6, 8, 9], "x12larg": [], "x_ascend": 16, "x_descend": 16, "x_i": 9, "x_size": 16, "x_wconf": 16, "xeon": [], "xhtml": 16, "xmax": 6, "xmin": 6, "xml": 16, "xml_bytes_str": 16, "xml_element": 16, "xml_output": 16, "xmln": 16, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12, 13, 14, 15, 16], "your": [2, 4, 6, 9, 16], "yoursit": 6, "zero": [8, 9], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, 
"\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": [], "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 5, "\u00e4\u00f6\u00e4\u00f6": [], "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": [], "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": [], "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": [], "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": [], "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": [], "2024": [], "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": [], "8": [], "9": [], "advanc": [], "approach": 16, "architectur": 16, "arg": [], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [14, 16], "aw": 12, "ban": 1, "block": 6, "bug": 2, "build": [], "changelog": 0, "choos": [14, 16], "classif": [7, 13], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 13, "compos": 8, "compress": [], "conda": [], "conduct": 1, "connect": 2, "content": [], "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 13, "correct": 1, "coven": 1, "custom": [5, 11], "data": 14, "dataload": 5, "dataset": [4, 5, 14], "detect": [4, 7, 13, 14, 16], "develop": 2, "do": 16, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 15], "document": [2, 4, 6], "end": 16, "enforc": 1, "evalu": 9, "export": 15, "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "from": 13, "gener": [5, 14], "get": [], "git": 3, "guidelin": 1, "half": 15, "hub": 13, "huggingfac": 13, "i": 16, 
"implement": [], "infer": 15, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 12, "let": 2, "line": 6, "linux": [], "load": [11, 13, 14], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 11, 13, 15, 16], "modifi": 2, "modul": [], "name": 13, "note": [], "notebook": 10, "object": 14, "ocr": 16, "onli": [], "onnx": 15, "optim": 15, "option": [], "orient": [], "our": 1, "output": 16, "own": [11, 14], "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "post": [], "pre": [], "precis": 15, "predictor": 16, "prepar": 15, "prerequisit": 3, "pretrain": 13, "process": [], "push": 13, "python": 3, "qualiti": 2, "question": 2, "read": 6, "readi": 14, "recognit": [4, 7, 13, 14, 16], "refer": [], "report": 2, "request": 2, "respons": 1, "return": [], "right": 16, "savedmodel": [], "scope": 1, "share": 13, "should": 16, "stage": 16, "standard": 1, "start": [], "structur": [2, 6], "style": 2, "support": [4, 5, 8], "synthet": [5, 14], "task": 9, "temporari": 1, "test": 2, "text": [4, 16], "train": 11, "transform": 8, "two": 16, "unit": 2, "us": [14, 15], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 16, "word": 6, "your": [11, 13, 14, 15], "zoo": [4, 7]}}) \ No newline at end of file diff --git a/v0.7.0/transforms.html b/v0.7.0/transforms.html index 0d1b5f7402..d42da50481 100644 --- a/v0.7.0/transforms.html +++ b/v0.7.0/transforms.html @@ -227,28 +227,21 @@ @@ -293,7 +286,7 @@

doctr.transforms — as in torchvision, we express transformations as composable modules.

    Supported transformations

    -

    Here are all transformations that are available through docTR:

    +

    Here are all transformations that are available through DocTR:

    class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
    @@ -364,7 +357,7 @@

    Supported transformations
    -class doctr.transforms.ToGray(num_output_channels: int = 1)[source]
    +class doctr.transforms.ToGray[source]

Convert an RGB tensor (batch of images or image) to a 3-channel grayscale tensor

    Example::
>>> from doctr.transforms import ToGray
    @@ -524,88 +517,6 @@ 

    Supported transformations -
    -class doctr.transforms.RandomRotate(max_angle: float = 5.0, expand: bool = False)[source]
    -

    Randomly rotate a tensor image and its boxes

    -https://github.com/mindee/doctr/releases/download/v0.4.0/rotation_illustration.png -
    -
    Parameters:
    -
      -
• max_angle – maximum angle for rotation, in degrees. Angles will be uniformly picked in [-max_angle, max_angle]

    • -
    • expand – whether the image should be padded before the rotation

    • -
    -
    -
    -

    - -
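A minimal usage sketch (hypothetical values; it assumes the transform is called on an image tensor together with its relative target boxes, like the other sample transformations):
>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import RandomRotate
>>> transfo = RandomRotate(max_angle=10., expand=True)
>>> # boxes are assumed to be relative (xmin, ymin, xmax, ymax) coordinates
>>> img, boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.array([[0.1, 0.1, 0.4, 0.3]]))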
    -
    -class doctr.transforms.RandomCrop(scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33))[source]
    -

    Randomly crop a tensor image and its boxes

    -
    -
    Parameters:
    -
      -
    • scale – tuple of floats, relative (min_area, max_area) of the crop

    • -
    • ratio – tuple of float, relative (min_ratio, max_ratio) where ratio = h/w

    • -
    -
    -
    -
    - -
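A similar hedged sketch for RandomCrop (again assuming the image tensor and its relative boxes are passed together; the exact target format may differ between versions):
>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import RandomCrop
>>> transfo = RandomCrop(scale=(0.5, 1.), ratio=(0.75, 1.33))
>>> img, boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), np.array([[0.1, 0.1, 0.4, 0.3]]))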
    -
    -class doctr.transforms.GaussianBlur(kernel_shape: int | Iterable[int], std: Tuple[float, float])[source]
    -

Apply a random Gaussian blur to a 3-dimensional RGB image

    -
    -
    Example::
    >>> from doctr.transforms import GaussianBlur
    ->>> import tensorflow as tf
    ->>> transfo = GaussianBlur(3, (.1, 5))
    ->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • kernel_shape – size of the blurring kernel

    • -
    • std – min and max value of the standard deviation

    • -
    -
    -
    -
    - -
    -
    -class doctr.transforms.ChannelShuffle[source]
    -

    Randomly shuffle channel order of a given image

    -
    - -
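For completeness, a short sketch of ChannelShuffle, following the same single-image call convention as the surrounding examples:
>>> import tensorflow as tf
>>> from doctr.transforms import ChannelShuffle
>>> transfo = ChannelShuffle()
>>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))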
    -
    -class doctr.transforms.GaussianNoise(mean: float = 0.0, std: float = 1.0)[source]
    -

    Adds Gaussian Noise to the input tensor

    -
    -
    Example::
    >>> from doctr.transforms import GaussianNoise
    ->>> import tensorflow as tf
    ->>> transfo = GaussianNoise(0., 1.)
    ->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • mean – mean of the gaussian distribution

    • -
    • std – std of the gaussian distribution

    • -
    -
    -
    -
    -

    Composing transformations

    @@ -744,11 +655,6 @@

Composing transformations
• RandomHue
  • RandomGamma
  • RandomJpegQuality
  • -
  • RandomRotate
  • -
  • RandomCrop
  • -
  • GaussianBlur
  • -
  • ChannelShuffle
  • -
  • GaussianNoise
  • Composing transformations
      @@ -768,7 +674,7 @@

      Composing transformations +

• diff --git a/v0.7.0/using_model_export.html b/v0.7.0/using_model_export.html deleted file mode 100644 index 9b0acb00fe..0000000000 --- a/v0.7.0/using_model_export.html +++ /dev/null @@ -1,436 +0,0 @@ - Preparing your model for inference - docTR documentation
    -
    -
    - -
    - -
    -
    - -
    - -
    -
    - -
    -
    -
    - - - - - Back to top - -
    - -
    - -
    - -
    -
    -
    -

    Preparing your model for inference

    -

    A well-trained model is a good achievement but you might want to tune a few things to make it production-ready!

    -
    -

    Model compression

    -

    This section is meant to help you perform inference with compressed versions of your model.

    -
    -

    TensorFlow Lite

    -

    TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows:

    -
    >>> import tensorflow as tf
    ->>> from tensorflow.keras import Sequential
    ->>> from doctr.models import conv_sequence
    ->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> converter = tf.lite.TFLiteConverter.from_keras_model(model)
    ->>> serialized_model = converter.convert()
    -
    -
    -
    -
    -

    Half-precision

    -

If you want to convert it to half-precision using your TFLite converter:

    -
    >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
    ->>> converter.target_spec.supported_types = [tf.float16]
    ->>> serialized_model = converter.convert()
    -
    -
    -
    -
    -

    Post-training quantization

    -

Finally, if you wish to quantize the model with your TFLite converter:

    -
    >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
    ->>> # Float fallback for operators that do not have an integer implementation
    ->>> def representative_dataset():
    ->>>     for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)]
    ->>> converter.representative_dataset = representative_dataset
    ->>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    ->>> converter.inference_input_type = tf.int8
    ->>> converter.inference_output_type = tf.int8
    ->>> serialized_model = converter.convert()
    -
    -
    -
    -
    -
    -

    Using SavedModel

    -

Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to SavedModel format as follows:

    -
    >>> import tensorflow as tf
    ->>> from doctr.models import db_resnet50
    ->>> model = db_resnet50(pretrained=True)
    ->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    ->>> _ = model(input_t, training=False)
    ->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
    -
    -
    -

    And loaded just as easily:

    -
    >>> import tensorflow as tf
    ->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
    -
    -
    -
    -
    - -
    -
    - -
    - -
    -
- - - - - - - - \ No newline at end of file diff --git a/v0.7.0/using_models.html b/v0.7.0/using_models.html deleted file mode 100644 index 53cad99cac..0000000000 --- a/v0.7.0/using_models.html +++ /dev/null @@ -1,909 +0,0 @@ - Choosing the right model - docTR documentation
    -
    -
    - -
    - -
    -
    - -
    - -
    -
    - -
    -
    -
    - - - - - Back to top - -
    - -
    - -
    - -
    -
    -
    -

    Choosing the right model

    -

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. Whether they are performed at once or separately, each task corresponds to a type of deep learning architecture.

    -

    For a given task, docTR provides a Predictor, which is composed of 2 components:

    -
      -
    • PreProcessor: a module in charge of making inputs directly usable by the deep learning model.

    • -
    • Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable.

    • -
    -
    -
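Both components are instantiated for you when building a predictor; for instance, with the high-level API detailed in the sections below:
>>> from doctr.models import ocr_predictor
>>> # bundles a PreProcessor, the detection & recognition models and their post-processors
>>> predictor = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)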

    Text Detection

    -

The task consists of localizing textual elements in a given image. While those text elements can represent many things, in docTR we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corners), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don't).

    -
    -

    Available architectures

    -

    The following architectures are currently supported:

    - -

    For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    FUNSD

    CORD

    Architecture

    Input shape

    # params

    Recall

    Precision

    Recall

    Precision

    FPS

    db_resnet50

    (1024, 1024, 3)

    25.2 M

    82.14

    87.64

    92.49

    89.66

    2.1

    db_mobilenet_v3_large

    (1024, 1024, 3)

    4.2 M

    79.35

    84.03

    81.14

    66.85

    -
    -

All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

    -

    Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

    -

FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

    -
    -
    -

    Detection predictors

    -

detection_predictor wraps your detection model so you can use it seamlessly with your favorite deep learning framework.

    -
    >>> import numpy as np
    ->>> from doctr.models import detection_predictor
    ->>> predictor = detection_predictor('db_resnet50')
    ->>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
->>> out = predictor([dummy_img])
    -
    -
    -
    -
    -
    -

    Text Recognition

    -

    The task consists of transcribing the character sequence in a given image.

    -
    -

    Available architectures

    -

    The following architectures are currently supported:

    - -

    For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    Text recognition model zoo

    Architecture

    Input shape

    # params

    FUNSD

    CORD

    FPS

    crnn_vgg16_bn

    (32, 128, 3)

    15.8M

    87.18

    92.93

    12.8

    crnn_mobilenet_v3_small

    (32, 128, 3)

    2.1M

    86.21

    90.56

    crnn_mobilenet_v3_large

    (32, 128, 3)

    4.5M

    86.95

    92.03

    sar_resnet31

    (32, 128, 3)

    56.2M

    87.70

    93.41

    2.7

    master

    (32, 128, 3)

    67.7M

    87.62

    93.27

    -
    -

All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metric being used (exact match) are available in Task evaluation.

    -

    While most of our recognition models were trained on our french vocab (cf. Supported Vocabs), you can easily access the vocab of any model as follows:

    -
    >>> from doctr.models import recognition_predictor
    ->>> predictor = recognition_predictor('crnn_vgg16_bn')
    ->>> print(predictor.model.cfg['vocab'])
    -
    -
    -

Disclaimer: both FUNSD subsets combined have 30595 word-level crops which might not be representative enough of the model capabilities

    -

FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

    -
    -
    -

    Recognition predictors

    -

recognition_predictor wraps your recognition model so you can use it seamlessly with your favorite deep learning framework.

    -
    >>> import numpy as np
    ->>> from doctr.models import recognition_predictor
    ->>> predictor = recognition_predictor('crnn_vgg16_bn')
    ->>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8)
->>> out = predictor([dummy_img])
    -
    -
    -
    -
    -
    -

    End-to-End OCR

    -

    The task consists of both localizing and transcribing textual elements in a given image.

    -
    -

    Available architectures

    -

You can use any combination of detection and recognition models supported by docTR.

    -

    For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    FUNSD

    CORD

    Architecture

    Recall

    Precision

    FPS

    Recall

    Precision

    FPS

    db_resnet50 + crnn_vgg16_bn

    71.25

    76.02

    0.85

    84.00

    81.42

    1.6

    db_resnet50 + master

    71.03

    76.06

    84.49

    81.94

    db_resnet50 + sar_resnet31

    71.25

    76.29

    0.27

    84.50

    81.96

    0.83

    db_resnet50 + crnn_mobilenet_v3_small

    69.85

    74.80

    80.85

    78.42

    0.83

    db_resnet50 + crnn_mobilenet_v3_large

    70.57

    75.57

    82.57

    80.08

    0.83

    db_mobilenet_v3_large + crnn_vgg16_bn

    67.73

    71.73

    71.65

    59.03

    Gvision text detection

    59.50

    62.50

    75.30

    70.00

    Gvision doc. text detection

    64.00

    53.30

    68.90

    61.10

    AWS textract

    78.10

    83.00

    87.50

    66.00

    -
    -

All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). Explanations about the metrics being used are available in Task evaluation.

    -

Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

    -

FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

    -

    Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information.

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Receipts

    Invoices

    IDs

    US Tax Forms

    Resumes

    Road Fines

    Architecture

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    db_resnet50 + crnn_vgg16_bn (ours)

    78.70

    81.12

    65.80

    70.70

    50.25

    51.78

    79.08

    92.83

    db_resnet50 + master (ours)

    79.00

    81.42

    65.57

    69.86

    51.34

    52.90

    78.86

    92.57

    db_resnet50 + sar_resnet31 (ours)

    78.94

    81.37

    65.89

    70.79

    51.78

    53.35

    79.04

    92.78

    db_resnet50 + crnn_mobilenet_v3_small (ours)

    76.81

    79.15

    64.89

    69.61

    45.03

    46.38

    78.96

    92.11

    85.91

    87.20

    84.85

    85.86

    db_resnet50 + crnn_mobilenet_v3_large (ours)

    78.01

    80.39

    65.36

    70.11

    48.00

    49.43

    79.39

    92.62

    87.68

    89.00

    85.65

    86.67

    db_mobilenet_v3_large + crnn_vgg16_bn (ours)

    78.36

    74.93

    63.04

    68.41

    39.36

    41.75

    72.14

    89.97

    Gvision doc. text detection

    68.91

    59.89

    63.20

    52.85

    43.70

    29.21

    69.79

    65.68

    AWS textract

    75.77

    77.70

    70.47

    69.13

    46.39

    43.32

    84.31

    98.11

    -
    -
    -
    -

    Two-stage approaches

    -

Those architectures involve one stage of text detection and one stage of text recognition. The text detection stage produces cropped images that are then passed to the text recognition block. Everything is wrapped up with ocr_predictor.

    -
    >>> import numpy as np
    ->>> from doctr.models import ocr_predictor
    ->>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
    ->>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
    ->>> out = model([input_page])
    -
    -
    -
    -
    -

    What should I do with the output?

    -

The ocr_predictor returns a Document object with a nested structure (with Page, Block, Line, Word, Artefact). To get a better understanding of our document model, check our Document structure section.

    -

    Here is a typical Document layout:

    -
    Document(
    -  (pages): [Page(
    -    dimensions=(340, 600)
    -    (blocks): [Block(
    -      (lines): [Line(
    -        (words): [
    -          Word(value='No.', confidence=0.91),
    -          Word(value='RECEIPT', confidence=0.99),
    -          Word(value='DATE', confidence=0.96),
    -        ]
    -      )]
    -      (artefacts): []
    -    )]
    -  )]
    -)
    -
    -
    -
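Since the output is a regular Python object, you can also walk this nested structure directly (a short sketch, reusing the out object from the snippet above):
>>> for page in out.pages:
>>>     for block in page.blocks:
>>>         for line in block.lines:
>>>             for word in line.words:
>>>                 print(word.value, word.confidence)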

    You can also export them as a nested dict, more appropriate for JSON format:

    -
    json_output = result.export()
    -
    -
    -

    For reference, here is the JSON export for the same Document as above:

    -
    {
    -  'pages': [
    -      {
    -          'page_idx': 0,
    -          'dimensions': (340, 600),
    -          'orientation': {'value': None, 'confidence': None},
    -          'language': {'value': None, 'confidence': None},
    -          'blocks': [
    -              {
    -                  'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
    -                  'lines': [
    -                      {
    -                          'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
    -                          'words': [
    -                              {
    -                                  'value': 'No.',
    -                                  'confidence': 0.914085328578949,
    -                                  'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875))
    -                              },
    -                              {
    -                                  'value': 'RECEIPT',
    -                                  'confidence': 0.9949972033500671,
    -                                  'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375))
    -                              },
    -                              {
    -                                  'value': 'DATE',
    -                                  'confidence': 0.9578408598899841,
    -                                  'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625))
    -                              }
    -                          ]
    -                      }
    -                  ],
    -                  'artefacts': []
    -              }
    -          ]
    -      }
    -  ]
    -}
    -
    -
    -

To export the output as XML (hOCR format) you can use the export_as_xml method:

    -
    xml_output = result.export_as_xml()
    -for output in xml_output:
    -  xml_bytes_string = output[0]
    -  xml_element = output[1]
    -
    -
    -

    For reference, here is a sample XML byte string output:

    -
    <?xml version="1.0" encoding="UTF-8"?>
    -<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
    -  <head>
    -    <title>docTR - hOCR</title>
    -    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    -    <meta name="ocr-system" content="doctr 0.5.0" />
    -    <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
    -  </head>
    -  <body>
    -    <div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
    -    <div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
    -      <p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
    -        <span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
    -          <span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
    -          <span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
    -          <span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
    -        </span>
    -      </p>
    -    </div>
    -  </body>
    -</html>
    -
    -
    -
    -
    -
    - -
    -
    - -
    - -
    -
    - - - - - - - - \ No newline at end of file diff --git a/v0.7.0/utils.html b/v0.7.0/utils.html index 21f708c953..1908ef4ff4 100644 --- a/v0.7.0/utils.html +++ b/v0.7.0/utils.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.utils - docTR documentation @@ -227,28 +227,21 @@ @@ -327,25 +320,6 @@

    Visualization -
    -doctr.utils.visualization.synthesize_page(page: Dict[str, Any], draw_proba: bool = False, font_size: int = 13, font_family: str | None = None) ndarray[source]
    -

Draw the content of the element page (OCR response) on a blank page.

    -
    -
    Parameters:
    -
      -
    • page – exported Page object to represent

    • -
    • draw_proba – if True, draw words in colors to represent confidence. Blue: p=1, red: p=0

    • -
    • font_size – size of the font, default font = 13

    • -
    • font_family – family of the font

    • -
    -
    -
    Returns:
    -

    the synthesized page

    -
    -
    -
    -
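A short usage sketch (it assumes result is the Document returned by an OCR predictor, as in the usage guide):
>>> import matplotlib.pyplot as plt
>>> from doctr.utils.visualization import synthesize_page
>>> # take the first exported page and render words, colored by confidence
>>> synthesized = synthesize_page(result.export()['pages'][0], draw_proba=True)
>>> plt.imshow(synthesized)
>>> plt.show()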

    Task evaluation

    @@ -382,20 +356,6 @@

    Visualization -
    -update(gt: List[str], pred: List[str]) None[source]
    -

    Update the state of the metric with new predictions

    -
    -
    Parameters:
    -
      -
• gt – list of ground-truth character sequences

    • -
    • pred – list of predicted character sequences

    • -
    -
    -
    -
    -
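A minimal sketch of how this metric is typically fed (made-up strings):
>>> from doctr.utils.metrics import TextMatch
>>> metric = TextMatch()
>>> metric.update(['Hello', 'world'], ['hello', 'world'])
>>> metric.summary()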
    summary() Dict[str, float][source]
    @@ -412,14 +372,14 @@

    Visualization
    -class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
    +class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]

    Implements common confusion metrics and mean IoU for localization evaluation.

    The aggregated metrics are computed as follows:

    \[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ +Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]

    with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and @@ -448,28 +408,9 @@

    Visualization
    Parameters:
    -
      -
    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    • -
    • use_polygons – if set to True, predictions and targets will be expected to have rotated format

    • -
    • mask_shape – if use_polygons is True, describes the spatial shape of the image used

    • -
    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

    • -
    +

    iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    -
    -
    -update(gts: ndarray, preds: ndarray) None[source]
    -

    Updates the metric

    -
    -
    Parameters:
    -
      -
    • gts – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

    • -
    • preds – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

    • -
    -
    -
    -
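A hedged sketch of the expected call pattern (made-up relative boxes):
>>> import numpy as np
>>> from doctr.utils.metrics import LocalizationConfusion
>>> metric = LocalizationConfusion(iou_thresh=0.5)
>>> metric.update(np.asarray([[0.1, 0.1, 0.4, 0.4]]), np.asarray([[0.12, 0.11, 0.42, 0.38], [0.6, 0.6, 0.8, 0.7]]))
>>> recall, precision, mean_iou = metric.summary()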
    -
    summary() Tuple[float | None, float | None, float | None][source]
    @@ -485,15 +426,15 @@

    Visualization
    -class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
    -

    Implements an end-to-end OCR metric.

    +class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source] +

    Implements end-to-end OCR metric.

    The aggregated metrics are computed as follows:

    \[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ +Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]

    with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and @@ -525,116 +466,16 @@

    Visualization
    Parameters:
    -
      -
    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    • -
    • use_polygons – if set to True, predictions and targets will be expected to have rotated format

    • -
    • mask_shape – if use_polygons is True, describes the spatial shape of the image used

    • -
    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

    • -
    -
    -

    -
    -
    -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: List[str], pred_labels: List[str]) None[source]
    -

    Updates the metric

    -
    -
    Parameters:
    -
      -
    • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

    • -
    • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

    • -
    • gt_labels – a list of N string labels

    • -
    • pred_labels – a list of M string labels

    • -
    +

    iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    -
    -
    summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]

    Computes the aggregated metrics

    Returns:
    -

    a tuple with the recall & precision for each string comparison and the mean IoU

    -
    -
    -
    - - - -
    -
    -class doctr.utils.metrics.DetectionMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
    -

    Implements an object detection metric.

    -

    The aggregated metrics are computed as follows:

    -
    -
    -\[\begin{split}\forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, -\forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ -Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ -Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
    -
    -

    with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, C}\) defined as:

    -
    -
    -\[\begin{split}\forall (b, c) \in \mathcal{B} \times \mathcal{C}, -h_{B,C}(b, c) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
    -
    -

    where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{C}\) is the set of possible class indices, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

    -
    -
    Example::
    >>> import numpy as np
    ->>> from doctr.utils import DetectionMetric
    ->>> metric = DetectionMetric(iou_thresh=0.5)
    ->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
    -np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64))
    ->>> metric.summary()
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    • -
    • use_polygons – if set to True, predictions and targets will be expected to have rotated format

    • -
    • mask_shape – if use_polygons is True, describes the spatial shape of the image used

    • -
    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

    • -
    -
    -
    -
    -
    -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: ndarray, pred_labels: ndarray) None[source]
    -

    Updates the metric

    -
    -
    Parameters:
    -
      -
    • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

    • -
    • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

    • -
    • gt_labels – an array of class indices of shape (N,)

    • -
    • pred_labels – an array of class indices of shape (M,)

    • -
    -
    -
    -
    - -
    -
    -summary() Tuple[float | None, float | None, float | None][source]
    -

    Computes the aggregated metrics

    -
    -
    Returns:
    -

    a tuple with the recall & precision for each class prediction and the mean IoU

    +

    a tuple with the recall & precision for each string comparison flexibility and the mean IoU

    @@ -649,15 +490,7 @@

    Visualization - -
    -
    - Next -
    -
    Changelog
    -
    - -
    + diff --git a/v0.8.0/_modules/doctr/datasets/classification/tensorflow.html b/v0.8.0/_modules/doctr/datasets/classification/tensorflow.html deleted file mode 100644 index 829b6efb9d..0000000000 --- a/v0.8.0/_modules/doctr/datasets/classification/tensorflow.html +++ /dev/null @@ -1,366 +0,0 @@ - - - - - - - - - - - - doctr.datasets.classification.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
    -
    -
    - -
    - -
    -
    - -
    - -
    -
    - -
    -
    -
    - - - - - Back to top - -
    -
    - -
    - -
    -
    -

    Source code for doctr.datasets.classification.tensorflow

    -# Copyright (C) 2021, Mindee.
    -
    -# This program is licensed under the Apache License version 2.
    -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
    -
    -import tensorflow as tf
    -
    -from .base import _CharacterGenerator
    -
    -__all__ = ['CharacterGenerator']
    -
    -
    -
    -[docs] -class CharacterGenerator(_CharacterGenerator): - """Implements a character image generation dataset - - Example:: - >>> from doctr.datasets import CharacterGenerator - >>> ds = CharacterGenerator(vocab='abdef') - >>> img, target = ds[0] - - Args: - vocab: vocabulary to take the character from - num_samples: number of samples that will be generated iterating over the dataset - cache_samples: whether generated images should be cached firsthand - sample_transforms: composable transformations that will be applied to each image - """ - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - @staticmethod - def collate_fn(samples): - - images, targets = zip(*samples) - images = tf.stack(images, axis=0) - - return images, tf.convert_to_tensor(targets)
    - -
    -
    -
    -
    - - -
    -
    - - Made with Sphinx and @pradyunsg's - - Furo - -
    -
    - -
    -
    - -
    -
    - -
    -
    - - - - - - - - - \ No newline at end of file diff --git a/v0.8.0/_modules/doctr/datasets/datasets/tensorflow.html b/v0.8.0/_modules/doctr/datasets/datasets/tensorflow.html index 8a191ecfc7..fddca20034 100644 --- a/v0.8.0/_modules/doctr/datasets/datasets/tensorflow.html +++ b/v0.8.0/_modules/doctr/datasets/datasets/tensorflow.html @@ -236,7 +236,7 @@

    Package Reference

    • doctr.datasets
    • -
    • doctr.io
    • +
    • doctr.documents
    • doctr.models
    • doctr.transforms
    • doctr.utils
    • @@ -284,7 +284,6 @@

      Source code for doctr.datasets.datasets.tensorflow

      from typing import List, Any, Tuple import tensorflow as tf -from doctr.io import read_img_as_tensor from .base import _AbstractDataset, _VisionDataset @@ -293,14 +292,11 @@

      Source code for doctr.datasets.datasets.tensorflow

      class AbstractDataset(_AbstractDataset): - @staticmethod - def _get_img_shape(img: Any) -> Tuple[int, int]: - return img.shape[:2] - def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]: img_name, target = self.data[index] # Read image - img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float16 if self.fp16 else tf.float32) + img = tf.io.read_file(os.path.join(self.root, img_name)) + img = tf.image.decode_jpeg(img, channels=3) return img, target @@ -350,7 +346,7 @@

      Source code for doctr.datasets.datasets.tensorflow

      +
      diff --git a/v0.8.0/_modules/doctr/models/backbones/mobilenet/tensorflow.html b/v0.8.0/_modules/doctr/models/backbones/mobilenet/tensorflow.html deleted file mode 100644 index a0f857205e..0000000000 --- a/v0.8.0/_modules/doctr/models/backbones/mobilenet/tensorflow.html +++ /dev/null @@ -1,688 +0,0 @@ - - - - - - - - - - - - doctr.models.backbones.mobilenet.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
      -
      -
      - -
      - -
      -
      - -
      - -
      -
      - -
      -
      -
      - - - - - Back to top - -
      -
      - -
      - -
      -
      -

      Source code for doctr.models.backbones.mobilenet.tensorflow

      -# Copyright (C) 2021, Mindee.
      -
      -# This program is licensed under the Apache License version 2.
      -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
      -
      -# Greatly inspired by https://github.com/pytorch/vision/blob/master/torchvision/models/mobilenetv3.py
      -
      -from typing import Any, Dict, List, Optional, Tuple, Union
      -
      -import tensorflow as tf
      -from tensorflow.keras import layers
      -from tensorflow.keras.models import Sequential
      -
      -from ....datasets import VOCABS
      -from ...utils import conv_sequence, load_pretrained_params
      -
      -__all__ = ["MobileNetV3", "mobilenet_v3_small", "mobilenet_v3_small_r", "mobilenet_v3_large",
      -           "mobilenet_v3_large_r"]
      -
      -
      -default_cfgs: Dict[str, Dict[str, Any]] = {
      -    'mobilenet_v3_large': {
      -        'mean': (0.694, 0.695, 0.693),
      -        'std': (0.299, 0.296, 0.301),
      -        'input_shape': (32, 32, 3),
      -        'vocab': VOCABS['legacy_french'],
      -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/mobilenet_v3_large-d27d66f2.zip'
      -    },
      -    'mobilenet_v3_large_r': {
      -        'mean': (0.694, 0.695, 0.693),
      -        'std': (0.299, 0.296, 0.301),
      -        'input_shape': (32, 32, 3),
      -        'vocab': VOCABS['french'],
      -        'url': None,
      -    },
      -    'mobilenet_v3_small': {
      -        'mean': (0.694, 0.695, 0.693),
      -        'std': (0.299, 0.296, 0.301),
      -        'input_shape': (32, 32, 3),
      -        'vocab': VOCABS['legacy_french'],
      -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/mobilenet_v3_small-d624c4de.zip'
      -    },
      -    'mobilenet_v3_small_r': {
      -        'mean': (0.694, 0.695, 0.693),
      -        'std': (0.299, 0.296, 0.301),
      -        'input_shape': (32, 32, 3),
      -        'vocab': VOCABS['french'],
      -        'url': None,
      -    }
      -}
      -
      -
      -def hard_swish(x: tf.Tensor) -> tf.Tensor:
      -    return x * tf.nn.relu6(x + 3.) / 6.0
      -
      -
      -def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int:
      -    if min_value is None:
      -        min_value = divisor
      -    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
      -    # Make sure that round down does not go down by more than 10%.
      -    if new_v < 0.9 * v:
      -        new_v += divisor
      -    return new_v
      -
      -
      -class SqueezeExcitation(Sequential):
      -    """Squeeze and Excitation.
      -    """
      -    def __init__(self, chan: int, squeeze_factor: int = 4) -> None:
      -        super().__init__(
      -            [
      -                layers.GlobalAveragePooling2D(),
      -                layers.Dense(chan // squeeze_factor, activation='relu'),
      -                layers.Dense(chan, activation='hard_sigmoid'),
      -                layers.Reshape((1, 1, chan))
      -            ]
      -        )
      -
      -    def call(self, inputs: tf.Tensor, **kwargs: Any) -> tf.Tensor:
      -        x = super().call(inputs, **kwargs)
      -        x = tf.math.multiply(inputs, x)
      -        return x
      -
      -
      -class InvertedResidualConfig:
      -    def __init__(
      -        self,
      -        input_channels: int,
      -        kernel: int,
      -        expanded_channels: int,
      -        out_channels: int,
      -        use_se: bool,
      -        activation: str,
      -        stride: Union[int, Tuple[int, int]],
      -        width_mult: float = 1,
      -    ) -> None:
      -        self.input_channels = self.adjust_channels(input_channels, width_mult)
      -        self.kernel = kernel
      -        self.expanded_channels = self.adjust_channels(expanded_channels, width_mult)
      -        self.out_channels = self.adjust_channels(out_channels, width_mult)
      -        self.use_se = use_se
      -        self.use_hs = activation == "HS"
      -        self.stride = stride
      -
      -    @staticmethod
      -    def adjust_channels(channels: int, width_mult: float):
      -        return _make_divisible(channels * width_mult, 8)
      -
      -
      -class InvertedResidual(layers.Layer):
      -    """InvertedResidual for mobilenet
      -
      -    Args:
      -        conf: configuration object for inverted residual
      -    """
      -    def __init__(
      -        self,
      -        conf: InvertedResidualConfig,
      -        **kwargs: Any,
      -    ) -> None:
      -        _kwargs = {'input_shape': kwargs.pop('input_shape')} if isinstance(kwargs.get('input_shape'), tuple) else {}
      -        super().__init__(**kwargs)
      -
      -        act_fn = hard_swish if conf.use_hs else tf.nn.relu
      -
      -        _is_s1 = (isinstance(conf.stride, tuple) and conf.stride == (1, 1)) or conf.stride == 1
      -        self.use_res_connect = _is_s1 and conf.input_channels == conf.out_channels
      -
      -        _layers = []
      -        # expand
      -        if conf.expanded_channels != conf.input_channels:
      -            _layers.extend(conv_sequence(conf.expanded_channels, act_fn, kernel_size=1, bn=True, **_kwargs))
      -
      -        # depth-wise
      -        _layers.extend(conv_sequence(
      -            conf.expanded_channels, act_fn, kernel_size=conf.kernel, strides=conf.stride, bn=True,
      -            groups=conf.expanded_channels,
      -        ))
      -
      -        if conf.use_se:
      -            _layers.append(SqueezeExcitation(conf.expanded_channels))
      -
      -        # project
      -        _layers.extend(conv_sequence(
      -            conf.out_channels, None, kernel_size=1, bn=True,
      -        ))
      -
      -        self.block = Sequential(_layers)
      -
      -    def call(
      -        self,
      -        inputs: tf.Tensor,
      -        **kwargs: Any,
      -    ) -> tf.Tensor:
      -
      -        out = self.block(inputs, **kwargs)
      -        if self.use_res_connect:
      -            out = tf.add(out, inputs)
      -
      -        return out
      -
      -
      -class MobileNetV3(Sequential):
      -    """Implements MobileNetV3, inspired from both:
      -    <https://github.com/xiaochus/MobileNetV3/tree/master/model>`_.
      -    and <https://pytorch.org/vision/stable/_modules/torchvision/models/mobilenetv3.html>`_.
      -    """
      -
      -    def __init__(
      -        self,
      -        layout: List[InvertedResidualConfig],
      -        input_shape: Tuple[int, int, int],
      -        include_top: bool = False,
      -        head_chans: int = 1024,
      -        num_classes: int = 1000,
      -    ) -> None:
      -
      -        _layers = [
      -            Sequential(conv_sequence(layout[0].input_channels, hard_swish, True, kernel_size=3, strides=2,
      -                       input_shape=input_shape), name="stem")
      -        ]
      -
      -        for idx, conf in enumerate(layout):
      -            _layers.append(
      -                InvertedResidual(conf, name=f"inverted_{idx}"),
      -            )
      -
      -        _layers.append(
      -            Sequential(
      -                conv_sequence(6 * layout[-1].out_channels, hard_swish, True, kernel_size=1),
      -                name="final_block"
      -            )
      -        )
      -
      -        if include_top:
      -            _layers.extend([
      -                layers.GlobalAveragePooling2D(),
      -                layers.Dense(head_chans, activation=hard_swish),
      -                layers.Dropout(0.2),
      -                layers.Dense(num_classes),
      -            ])
      -
      -        super().__init__(_layers)
      -
      -
      -def _mobilenet_v3(
      -    arch: str,
      -    pretrained: bool,
      -    input_shape: Optional[Tuple[int, int, int]] = None,
      -    **kwargs: Any
      -) -> MobileNetV3:
      -    input_shape = input_shape or default_cfgs[arch]['input_shape']
      -
      -    # cf. Table 1 & 2 of the paper
      -    if arch.startswith("mobilenet_v3_small"):
      -        inverted_residual_setting = [
      -            InvertedResidualConfig(16, 3, 16, 16, True, "RE", 2),  # C1
      -            InvertedResidualConfig(16, 3, 72, 24, False, "RE", (2, 1) if arch.endswith("_r") else 2),  # C2
      -            InvertedResidualConfig(24, 3, 88, 24, False, "RE", 1),
      -            InvertedResidualConfig(24, 5, 96, 40, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C3
      -            InvertedResidualConfig(40, 5, 240, 40, True, "HS", 1),
      -            InvertedResidualConfig(40, 5, 240, 40, True, "HS", 1),
      -            InvertedResidualConfig(40, 5, 120, 48, True, "HS", 1),
      -            InvertedResidualConfig(48, 5, 144, 48, True, "HS", 1),
      -            InvertedResidualConfig(48, 5, 288, 96, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C4
      -            InvertedResidualConfig(96, 5, 576, 96, True, "HS", 1),
      -            InvertedResidualConfig(96, 5, 576, 96, True, "HS", 1),
      -        ]
      -        head_chans = 1024
      -    else:
      -        inverted_residual_setting = [
      -            InvertedResidualConfig(16, 3, 16, 16, False, "RE", 1),
      -            InvertedResidualConfig(16, 3, 64, 24, False, "RE", 2),  # C1
      -            InvertedResidualConfig(24, 3, 72, 24, False, "RE", 1),
      -            InvertedResidualConfig(24, 5, 72, 40, True, "RE", (2, 1) if arch.endswith("_r") else 2),  # C2
      -            InvertedResidualConfig(40, 5, 120, 40, True, "RE", 1),
      -            InvertedResidualConfig(40, 5, 120, 40, True, "RE", 1),
      -            InvertedResidualConfig(40, 3, 240, 80, False, "HS", (2, 1) if arch.endswith("_r") else 2),  # C3
      -            InvertedResidualConfig(80, 3, 200, 80, False, "HS", 1),
      -            InvertedResidualConfig(80, 3, 184, 80, False, "HS", 1),
      -            InvertedResidualConfig(80, 3, 184, 80, False, "HS", 1),
      -            InvertedResidualConfig(80, 3, 480, 112, True, "HS", 1),
      -            InvertedResidualConfig(112, 3, 672, 112, True, "HS", 1),
      -            InvertedResidualConfig(112, 5, 672, 160, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C4
      -            InvertedResidualConfig(160, 5, 960, 160, True, "HS", 1),
      -            InvertedResidualConfig(160, 5, 960, 160, True, "HS", 1),
      -        ]
      -        head_chans = 1280
      -
      -    kwargs['num_classes'] = kwargs.get('num_classes', len(default_cfgs[arch]['vocab']))
      -
      -    # Build the model
      -    model = MobileNetV3(
      -        inverted_residual_setting,
      -        input_shape,
      -        head_chans=head_chans,
      -        **kwargs,
      -    )
      -    # Load pretrained parameters
      -    if pretrained:
      -        load_pretrained_params(model, default_cfgs[arch]['url'])
      -
      -    return model
      -
      -
      -
-def mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Small architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_small
-        >>> model = mobilenet_v3_small(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-
-    return _mobilenet_v3('mobilenet_v3_small', pretrained, **kwargs)
      - - - -
-def mobilenet_v3_small_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Small architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_, with rectangular pooling.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_small_r
-        >>> model = mobilenet_v3_small_r(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-
-    return _mobilenet_v3('mobilenet_v3_small_r', pretrained, **kwargs)
      - - - -
-def mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Large architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_large
-        >>> model = mobilenet_v3_large(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-    return _mobilenet_v3('mobilenet_v3_large', pretrained, **kwargs)
      - - - -
-def mobilenet_v3_large_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Large architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_, with rectangular pooling.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_large_r
-        >>> model = mobilenet_v3_large_r(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-    return _mobilenet_v3('mobilenet_v3_large_r', pretrained, **kwargs)
      - -
      -
      -
      -
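The only difference between the plain and `_r` variants built above is the (2, 1) strides applied to the C2-C4 blocks, which keep halving the height while reducing the width less aggressively. A quick sketch to observe this, assuming the import path used in the docstring examples above (this is an illustration, not part of the original source):

    >>> import tensorflow as tf
    >>> from doctr.models import mobilenet_v3_small, mobilenet_v3_small_r
    >>> x = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    >>> mobilenet_v3_small(pretrained=False)(x).shape    # square strides: height and width shrink at the same rate
    >>> mobilenet_v3_small_r(pretrained=False)(x).shape  # (2, 1) strides in C2-C4: the width dimension stays larger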
diff --git a/v0.8.0/_modules/doctr/models/backbones/resnet/tensorflow.html b/v0.8.0/_modules/doctr/models/backbones/resnet/tensorflow.html
deleted file mode 100644
index d959be9a0f..0000000000
--- a/v0.8.0/_modules/doctr/models/backbones/resnet/tensorflow.html
+++ /dev/null
@@ -1,522 +0,0 @@

      Source code for doctr.models.backbones.resnet.tensorflow

      -# Copyright (C) 2021, Mindee.
      -
      -# This program is licensed under the Apache License version 2.
      -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
      -
      -from typing import Any, Dict, List, Optional, Tuple
      -
      -import tensorflow as tf
      -from tensorflow.keras import layers
      -from tensorflow.keras.models import Sequential
      -
      -from ...utils import conv_sequence, load_pretrained_params
      -
      -__all__ = ['ResNet', 'resnet31', 'ResnetStage']
      -
      -
      -default_cfgs: Dict[str, Dict[str, Any]] = {
      -    'resnet31': {'num_blocks': (1, 2, 5, 3), 'output_channels': (256, 256, 512, 512),
      -                 'conv_seq': (True, True, True, True), 'pooling': ((2, 2), (2, 1), None, None),
      -                 'url': None},
      -}
      -
      -
      -class ResnetBlock(layers.Layer):
      -
      -    """Implements a resnet31 block with shortcut
      -
      -    Args:
-        output_channels: number of channels to use in Conv2D
-        conv_shortcut: whether to use a 1x1 convolution + batch normalization on the shortcut branch (identity otherwise)
-        strides: strides to use in the first convolution of the block
      -    """
      -    def __init__(
      -        self,
      -        output_channels: int,
      -        conv_shortcut: bool,
      -        strides: int = 1,
      -        **kwargs
      -    ) -> None:
      -
      -        super().__init__(**kwargs)
      -        if conv_shortcut:
      -            self.shortcut = Sequential(
      -                [
      -                    layers.Conv2D(
      -                        filters=output_channels,
      -                        strides=strides,
      -                        padding='same',
      -                        kernel_size=1,
      -                        use_bias=False,
      -                        kernel_initializer='he_normal'
      -                    ),
      -                    layers.BatchNormalization()
      -                ]
      -            )
      -        else:
      -            self.shortcut = layers.Lambda(lambda x: x)
      -        self.conv_block = Sequential(
      -            self.conv_resnetblock(output_channels, 3, strides)
      -        )
      -        self.act = layers.Activation('relu')
      -
      -    @staticmethod
      -    def conv_resnetblock(
      -        output_channels: int,
      -        kernel_size: int,
      -        strides: int = 1,
      -    ) -> List[layers.Layer]:
      -        return [
      -            *conv_sequence(output_channels, activation='relu', bn=True, strides=strides, kernel_size=kernel_size),
      -            layers.Conv2D(output_channels, kernel_size, padding='same', use_bias=False, kernel_initializer='he_normal'),
      -            layers.BatchNormalization(),
      -        ]
      -
      -    def call(
      -        self,
      -        inputs: tf.Tensor
      -    ) -> tf.Tensor:
      -        clone = self.shortcut(inputs)
      -        conv_out = self.conv_block(inputs)
      -        out = self.act(clone + conv_out)
      -
      -        return out
      -
      -
      -class ResnetStage(Sequential):
      -
      -    """Implements a resnet31 stage
      -
      -    Args:
      -        num_blocks: number of blocks inside the stage
      -        output_channels: number of channels to use in Conv2D
      -        downsample: if true, performs a /2 downsampling at the first block of the stage
      -    """
      -    def __init__(
      -        self,
      -        num_blocks: int,
      -        output_channels: int,
      -        downsample: bool = False,
      -    ) -> None:
      -
      -        super().__init__()
      -        final_blocks = [
      -            ResnetBlock(output_channels, conv_shortcut=False) for _ in range(1, num_blocks)
      -        ]
      -        if downsample is True:
      -            self.add(ResnetBlock(output_channels, conv_shortcut=True, strides=2))
      -        else:
      -            self.add(ResnetBlock(output_channels, conv_shortcut=True))
      -        for final_block in final_blocks:
      -            self.add(final_block)
      -
      -
      -class ResNet(Sequential):
      -
      -    """Resnet class with two convolutions and a maxpooling before the first stage
      -
      -    Args:
      -        num_blocks: number of resnet block in each stage
      -        output_channels: number of channels in each stage
-        conv_seq: whether to add a conv_sequence after each stage
      -        pooling: pooling to add after each stage (if None, no pooling)
      -        input_shape: shape of inputs
      -        include_top: whether the classifier head should be instantiated
      -    """
      -
      -    def __init__(
      -        self,
      -        num_blocks: Tuple[int, int, int, int],
      -        output_channels: Tuple[int, int, int, int],
      -        conv_seq: Tuple[bool, bool, bool, bool],
      -        pooling: Tuple[
      -            Optional[Tuple[int, int]],
      -            Optional[Tuple[int, int]],
      -            Optional[Tuple[int, int]],
      -            Optional[Tuple[int, int]]
      -        ],
      -        input_shape: Tuple[int, int, int] = (640, 640, 3),
      -        include_top: bool = False,
      -    ) -> None:
      -
      -        _layers = [
      -            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
      -            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
      -            layers.MaxPool2D(pool_size=2, strides=2, padding='valid'),
      -        ]
      -        for n_blocks, out_channels, conv, pool in zip(num_blocks, output_channels, conv_seq, pooling):
      -            _layers.append(ResnetStage(n_blocks, out_channels))
      -            if conv:
      -                _layers.extend(conv_sequence(out_channels, activation='relu', bn=True, kernel_size=3))
      -            if pool:
      -                _layers.append(layers.MaxPool2D(pool_size=pool, strides=pool, padding='valid'))
      -        super().__init__(_layers)
      -
      -
      -def _resnet(arch: str, pretrained: bool, **kwargs: Any) -> ResNet:
      -
      -    # Build the model
      -    model = ResNet(
      -        default_cfgs[arch]['num_blocks'],
      -        default_cfgs[arch]['output_channels'],
      -        default_cfgs[arch]['conv_seq'],
      -        default_cfgs[arch]['pooling'],
      -        **kwargs
      -    )
      -    # Load pretrained parameters
      -    if pretrained:
      -        load_pretrained_params(model, default_cfgs[arch]['url'])
      -
      -    return model
      -
      -
      -
-def resnet31(pretrained: bool = False, **kwargs: Any) -> ResNet:
-    """Resnet31 architecture with rectangular pooling windows as described in
-    `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition",
-    <https://arxiv.org/pdf/1811.00751.pdf>`_. Downsizing: (H, W) --> (H/8, W/4)
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import resnet31
-        >>> model = resnet31(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        A resnet31 model
-    """
-
-    return _resnet('resnet31', pretrained, **kwargs)
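The pooling entries of default_cfgs['resnet31'] ((2, 2) after the first stage, (2, 1) after the second, none afterwards) are what produce the (H/8, W/4) downsizing mentioned in the docstring. A short check, reusing the docstring example above (a sketch assuming that import path, not part of the original source):

    >>> import tensorflow as tf
    >>> from doctr.models import resnet31
    >>> model = resnet31(pretrained=False)
    >>> x = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
    >>> model(x).shape  # expected (1, 28, 56, 512): height divided by 8, width by 4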
diff --git a/v0.8.0/_modules/doctr/models/backbones/vgg/tensorflow.html b/v0.8.0/_modules/doctr/models/backbones/vgg/tensorflow.html
deleted file mode 100644
index 48c285257a..0000000000
--- a/v0.8.0/_modules/doctr/models/backbones/vgg/tensorflow.html
+++ /dev/null
@@ -1,413 +0,0 @@

      Source code for doctr.models.backbones.vgg.tensorflow

      -# Copyright (C) 2021, Mindee.
      -
      -# This program is licensed under the Apache License version 2.
      -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
      -
      -from typing import Any, Dict, Tuple
      -
      -from tensorflow.keras import layers
      -from tensorflow.keras.models import Sequential
      -
      -from ...utils import conv_sequence, load_pretrained_params
      -
      -__all__ = ['VGG', 'vgg16_bn']
      -
      -
      -default_cfgs: Dict[str, Dict[str, Any]] = {
      -    'vgg16_bn': {'num_blocks': (2, 2, 3, 3, 3), 'planes': (64, 128, 256, 512, 512),
      -                 'rect_pools': (False, False, True, True, True),
      -                 'url': None},
      -}
      -
      -
      -class VGG(Sequential):
      -    """Implements the VGG architecture from `"Very Deep Convolutional Networks for Large-Scale Image Recognition"
      -    <https://arxiv.org/pdf/1409.1556.pdf>`_.
      -
      -    Args:
-        num_blocks: number of convolutional blocks in each stage
-        planes: number of output channels in each stage
-        rect_pools: whether square pooling kernels should be replaced with rectangular ones
-        input_shape: shape of the input tensor
      -        include_top: whether the classifier head should be instantiated
      -    """
      -    def __init__(
      -        self,
      -        num_blocks: Tuple[int, int, int, int, int],
      -        planes: Tuple[int, int, int, int, int],
      -        rect_pools: Tuple[bool, bool, bool, bool, bool],
      -        input_shape: Tuple[int, int, int] = (512, 512, 3),
      -        include_top: bool = False,
      -    ) -> None:
      -
      -        _layers = []
      -        # Specify input_shape only for the first layer
      -        kwargs = {"input_shape": input_shape}
      -        for nb_blocks, out_chan, rect_pool in zip(num_blocks, planes, rect_pools):
      -            for _ in range(nb_blocks):
      -                _layers.extend(conv_sequence(out_chan, 'relu', True, kernel_size=3, **kwargs))  # type: ignore[arg-type]
      -                kwargs = {}
      -            _layers.append(layers.MaxPooling2D((2, 1 if rect_pool else 2)))
      -        super().__init__(_layers)
      -
      -
      -def _vgg(arch: str, pretrained: bool, **kwargs: Any) -> VGG:
      -
      -    # Build the model
      -    model = VGG(default_cfgs[arch]['num_blocks'], default_cfgs[arch]['planes'],
      -                default_cfgs[arch]['rect_pools'], **kwargs)
      -    # Load pretrained parameters
      -    if pretrained:
      -        load_pretrained_params(model, default_cfgs[arch]['url'])
      -
      -    return model
      -
      -
      -
-def vgg16_bn(pretrained: bool = False, **kwargs: Any) -> VGG:
-    """VGG-16 architecture as described in `"Very Deep Convolutional Networks for Large-Scale Image Recognition"
-    <https://arxiv.org/pdf/1409.1556.pdf>`_, modified by adding batch normalization.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import vgg16_bn
-        >>> model = vgg16_bn(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-
-    Returns:
-        VGG feature extractor
-    """
-
-    return _vgg('vgg16_bn', pretrained, **kwargs)
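With rect_pools set to (False, False, True, True, True), the last three max-pooling layers use a (2, 1) window, so the height keeps being halved while the width is only reduced by the first two stages. A sketch of the resulting shape, reusing the docstring example above (import path assumed, as in that example):

    >>> import tensorflow as tf
    >>> from doctr.models import vgg16_bn
    >>> model = vgg16_bn(pretrained=False)
    >>> x = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
    >>> model(x).shape  # expected (1, 7, 56, 512): height divided by 32, width by 4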
diff --git a/v0.8.0/_modules/doctr/models/detection/fast/tensorflow.html b/v0.8.0/_modules/doctr/models/detection/fast/tensorflow.html
index dc7d8f50f2..af51a9abeb 100644
--- a/v0.8.0/_modules/doctr/models/detection/fast/tensorflow.html
+++ b/v0.8.0/_modules/doctr/models/detection/fast/tensorflow.html
@@ -305,7 +305,7 @@

      Source code for doctr.models.detection.fast.tensorflow

      import numpy as np import tensorflow as tf -from keras import Model, Sequential, layers +from tensorflow.keras import Model, Sequential, layers from doctr.file_utils import CLASS_NAME from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, load_pretrained_params diff --git a/v0.8.0/_sources/datasets.rst.txt b/v0.8.0/_sources/datasets.rst.txt index 8a00eeaedd..354122f1e5 100644 --- a/v0.8.0/_sources/datasets.rst.txt +++ b/v0.8.0/_sources/datasets.rst.txt @@ -11,42 +11,22 @@ can be a significant save of time. Available Datasets ------------------ -Here are all datasets that are available through docTR: +The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. +.. autoclass:: doctr.datasets.datasets.VisionDataset -Public datasets -^^^^^^^^^^^^^^^ + +Here are all datasets that are available through DocTR: .. autoclass:: FUNSD .. autoclass:: SROIE .. autoclass:: CORD -.. autoclass:: IIIT5K -.. autoclass:: SVT -.. autoclass:: SVHN -.. autoclass:: SynthText -.. autoclass:: IC03 -.. autoclass:: IC13 - -docTR synthetic datasets -^^^^^^^^^^^^^^^^^^^^^^^^ - -.. autoclass:: DocArtefacts -.. autoclass:: CharacterGenerator -.. autoclass:: WordGenerator - -docTR private datasets -^^^^^^^^^^^^^^^^^^^^^^ - -Since many documents include sensitive / personal information, we are not able to share all the data that has been used for this project. However, we provide some guidance on how to format your own dataset into the same format so that you can use all docTR tools all the same. - -.. autoclass:: DetectionDataset -.. autoclass:: RecognitionDataset .. autoclass:: OCRDataset Data Loading ------------ -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR. +Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. .. autoclass:: doctr.datasets.loader.DataLoader @@ -56,10 +36,10 @@ Each dataset has its specific way to load a sample, but handling batch aggregati Supported Vocabs ---------------- -Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets of vocabs. -.. list-table:: docTR Vocabs +.. list-table:: DocTR Vocabs :widths: 20 5 50 :header-rows: 1 @@ -79,25 +59,10 @@ of vocabs. 
- 5 - £€¥¢฿ * - latin - - 94 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ - * - english - - 100 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ - * - legacy_french - - 123 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ + - 96 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° * - french - - 126 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ - * - portuguese - - 131 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿ - * - spanish - - 116 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿ - * - german - - 108 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ + - 154 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ .. autofunction:: encode_sequences diff --git a/v0.8.0/_sources/installing.rst.txt b/v0.8.0/_sources/installing.rst.txt index 8197df660d..5c8779dc1c 100644 --- a/v0.8.0/_sources/installing.rst.txt +++ b/v0.8.0/_sources/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.6 or higher. +This library requires Python 3.6 or higher. Prerequisites @@ -11,12 +11,12 @@ Prerequisites Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: -* `TensorFlow 2 `_ -* `PyTorch `_ +* TensorFlow: `installation page `_. +* PyTorch: `installation page `_. If you are running another OS than Linux, you will need a few extra dependencies. -For MacOS users, you can install them using `Homebrew `_ as follows: +For MacOS users, you can install them as follows: .. code:: shell @@ -28,23 +28,13 @@ For Windows users, those dependencies are included in GTK. You can find the late Via Python Package ================== -Install the last stable release of the package using `pip `_: +Install the last stable release of the package using pip: .. code:: bash pip install python-doctr -We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows: - -.. code:: bash - - # for TensorFlow - pip install "python-doctr[tf]" - # for PyTorch - pip install "python-doctr[torch]" - - Via Git ======= @@ -54,13 +44,3 @@ Install the library in developper mode: git clone https://github.com/mindee/doctr.git pip install -e doctr/. - -Again, for framework-specific builds: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - # for TensorFlow - pip install -e doctr/.[tf] - # for PyTorch - pip install -e doctr/.[torch] diff --git a/v0.8.0/_sources/io.rst.txt b/v0.8.0/_sources/io.rst.txt deleted file mode 100644 index 8fa887e9f9..0000000000 --- a/v0.8.0/_sources/io.rst.txt +++ /dev/null @@ -1,94 +0,0 @@ -doctr.io -======== - - -.. currentmodule:: doctr.io - -The io module enables users to easily access content from documents and export analysis -results to structured formats. 
- -.. _document_structure: - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img_as_numpy - -.. autofunction:: read_img_as_tensor - -.. autofunction:: decode_img_as_tensor - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_lines - - .. automethod:: get_artefacts diff --git a/v0.8.0/_sources/models.rst.txt b/v0.8.0/_sources/models.rst.txt index d4f36df9bb..9830c6c153 100644 --- a/v0.8.0/_sources/models.rst.txt +++ b/v0.8.0/_sources/models.rst.txt @@ -1,62 +1,215 @@ doctr.models ============ -.. currentmodule:: doctr.models - - -doctr.models.classification ----------------------- +The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. +Either performed at once or separately, to each task corresponds a type of deep learning architecture. -.. autofunction:: doctr.models.classification.vgg16_bn_r +.. currentmodule:: doctr.models -.. autofunction:: doctr.models.classification.resnet18 +For a given task, DocTR provides a Predictor, which is composed of 2 components: -.. autofunction:: doctr.models.classification.resnet31 +* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. +* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. -.. autofunction:: doctr.models.classification.mobilenet_v3_small -.. autofunction:: doctr.models.classification.mobilenet_v3_large +Text Detection +-------------- +Localizing text elements in images -.. 
autofunction:: doctr.models.classification.mobilenet_v3_small_r ++---------------------------------------------------+----------------------------+----------------------------+---------+ +| | FUNSD | CORD | | ++==================+=================+==============+============+===============+============+===============+=========+ +| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ +| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -.. autofunction:: doctr.models.classification.mobilenet_v3_large_r +All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. -.. autofunction:: doctr.models.classification.mobilenet_v3_small_orientation +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* -.. autofunction:: doctr.models.classification.magc_resnet31 +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. -.. autofunction:: doctr.models.classification.crop_orientation_predictor +Pre-processing for detection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for detection is the following: +1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. +2. batch images together +3. normalize the batch using the training data statistics -doctr.models.detection ----------------------- -.. autofunction:: doctr.models.detection.linknet_resnet18 +Detection models +^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: .. autofunction:: doctr.models.detection.db_resnet50 +.. autofunction:: doctr.models.detection.linknet16 -.. autofunction:: doctr.models.detection.db_mobilenet_v3_large +Detection predictors +^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. .. autofunction:: doctr.models.detection.detection_predictor -doctr.models.recognition ------------------------- +Text Recognition +---------------- +Identifying strings in images + +.. list-table:: Text recognition model zoo + :widths: 20 20 15 10 10 10 + :header-rows: 1 + + * - Architecture + - Input shape + - # params + - FUNSD + - CORD + - FPS + * - crnn_vgg16_bn + - (32, 128, 3) + - 15.8M + - 86.02 + - 91.3 + - 12.8 + * - sar_vgg16_bn + - (32, 128, 3) + - 21.5M + - 86.2 + - 91.7 + - 3.3 + * - sar_resnet31 + - (32, 128, 3) + - 53.1M + - **86.3** + - **92.1** + - 2.7 + +All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). 
+Explanations about the metrics being used are available in :ref:`metrics`. + +All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Pre-processing for recognition +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for recognition is the following: + +1. resize each input image to the target size (bilinear interpolation by default) without deformation. +2. pad the image to the target size (with zeros by default) +3. batch images together +4. normalize the batch using the training data statistics + +Recognition models +^^^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: + .. autofunction:: doctr.models.recognition.crnn_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_small -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_large +Recognition predictors +^^^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage. -.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.recognition_predictor -.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.recognition_predictor +End-to-End OCR +-------------- +Predictors that localize and identify text elements in images ++-----------------------------+--------------------------------------+--------------------------------------+ +| | FUNSD | CORD | ++=============================+============+===============+=========+============+===============+=========+ +| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision doc. 
text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ + +All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. + +All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Results on private ocr datasets + ++------------------------------------+----------------------------+----------------------------+----------------------------+ +| | Receipts | Invoices | IDs | ++====================================+============+===============+============+===============+============+===============+ +| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ + + +Two-stage approaches +^^^^^^^^^^^^^^^^^^^^ +Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. + +.. autofunction:: doctr.models.zoo.ocr_predictor + + +Model export +------------ +Utility functions to make the most of document analysis models. + +.. currentmodule:: doctr.models.export + +Model compression +^^^^^^^^^^^^^^^^^ + +.. autofunction:: convert_to_tflite + +.. autofunction:: convert_to_fp16 + +.. autofunction:: quantize_model + +Using SavedModel +^^^^^^^^^^^^^^^^ + +Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to +`SavedModel `_ format as follows: + + + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> _ = model(input_t, training=False) + >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') + +And loaded just as easily: -doctr.models.zoo ----------------- -.. 
autofunction:: doctr.models.ocr_predictor + >>> import tensorflow as tf + >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.8.0/_sources/notebooks.md.txt b/v0.8.0/_sources/notebooks.md.txt deleted file mode 100644 index ea43ac0f39..0000000000 --- a/v0.8.0/_sources/notebooks.md.txt +++ /dev/null @@ -1,9 +0,0 @@ -# docTR Notebooks - -Here are some notebooks compiled for users to better leverage the library capabilities: - -| Notebook | Description | | -|:----------|:-------------|------:| -| [Quicktour](https://github.com/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | A presentation of the main features of docTR | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | -| [Export as PDF/A](https://github.com/mindee/notebooks/blob/main/doctr/export_as_pdfa.ipynb) | Produce searchable PDFs from docTR results | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/export_as_pdfa.ipynb) | -[Artefact detection](https://github.com/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb) | Object detection for artefacts in documents | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb) | diff --git a/v0.8.0/_sources/transforms.rst.txt b/v0.8.0/_sources/transforms.rst.txt index ff11a3a38e..0230fe75f5 100644 --- a/v0.8.0/_sources/transforms.rst.txt +++ b/v0.8.0/_sources/transforms.rst.txt @@ -8,7 +8,7 @@ Data transformations are part of both training and inference procedure. Drawing Supported transformations ------------------------- -Here are all transformations that are available through docTR: +Here are all transformations that are available through DocTR: .. autoclass:: Resize .. autoclass:: Normalize @@ -21,11 +21,6 @@ Here are all transformations that are available through docTR: .. autoclass:: RandomHue .. autoclass:: RandomGamma .. autoclass:: RandomJpegQuality -.. autoclass:: RandomRotate -.. autoclass:: RandomCrop -.. autoclass:: GaussianBlur -.. autoclass:: ChannelShuffle -.. autoclass:: GaussianNoise Composing transformations diff --git a/v0.8.0/_sources/using_model_export.rst.txt b/v0.8.0/_sources/using_model_export.rst.txt deleted file mode 100644 index 992f4e9866..0000000000 --- a/v0.8.0/_sources/using_model_export.rst.txt +++ /dev/null @@ -1,71 +0,0 @@ -Preparing your model for inference -================================== - -A well-trained model is a good achievement but you might want to tune a few things to make it production-ready! - -.. currentmodule:: doctr.models.export - - -Model compression ------------------ - -This section is meant to help you perform inference with compressed versions of your model. - - -TensorFlow Lite -^^^^^^^^^^^^^^^ - -TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. 
You can easily convert any Keras model into a serialized TFLite version as follows: - - >>> import tensorflow as tf - >>> from tensorflow.keras import Sequential - >>> from doctr.models import conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - >>> serialized_model = converter.convert() - -Half-precision -^^^^^^^^^^^^^^ - -If you want to convert it to half-precision using your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> converter.target_spec.supported_types = [tf.float16] - >>> serialized_model = converter.convert() - - -Post-training quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Finally if you wish to quantize the model with your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> # Float fallback for operators that do not have an integer implementation - >>> def representative_dataset(): - >>> for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)] - >>> converter.representative_dataset = representative_dataset - >>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - >>> converter.inference_input_type = tf.int8 - >>> converter.inference_output_type = tf.int8 - >>> serialized_model = converter.convert() - - -Using SavedModel ----------------- - -Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.8.0/_sources/using_models.rst.txt b/v0.8.0/_sources/using_models.rst.txt deleted file mode 100644 index 1c0752463f..0000000000 --- a/v0.8.0/_sources/using_models.rst.txt +++ /dev/null @@ -1,329 +0,0 @@ -Choosing the right model -======================== - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, docTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the deep learning model. -* Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable. - - -Text Detection --------------- - -The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don't). 
- -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `linknet_resnet18 `_ -* `db_resnet50 `_ -* `db_mobilenet_v3_large `_ - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -+------------------------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+=================================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 79.35 | 84.03 | 81.14 | 66.85 | | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ - -`detection_predictor `_ wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> predictor = detection_predictor('db_resnet50') - >>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -Text Recognition ----------------- - -The task consists of transcribing the character sequence in a given image. - - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `crnn_vgg16_bn `_ -* `crnn_mobilenet_v3_small `_ -* `crnn_mobilenet_v3_large `_ -* `sar_resnet31 `_ -* `master `_ - - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -.. list-table:: Text recognition model zoo - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 87.18 - - 92.93 - - 12.8 - * - crnn_mobilenet_v3_small - - (32, 128, 3) - - 2.1M - - 86.21 - - 90.56 - - - * - crnn_mobilenet_v3_large - - (32, 128, 3) - - 4.5M - - 86.95 - - 92.03 - - - * - sar_resnet31 - - (32, 128, 3) - - 56.2M - - **87.70** - - **93.41** - - 2.7 - * - master - - (32, 128, 3) - - 67.7M - - 87.62 - - 93.27 - - - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metric being used (exact match) are available in :ref:`metrics`. 
- -While most of our recognition models were trained on our french vocab (cf. :ref:`vocabs`), you can easily access the vocab of any model as follows: - - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> print(predictor.model.cfg['vocab']) - - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -`recognition_predictor `_ wraps your recognition model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -End-to-End OCR --------------- - -The task consists of both localizing and transcribing textual elements in a given image. - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -You can use any combination of detection and recognition models supporte by docTR. - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - -+----------------------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+========================================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 71.25 | 76.02 | 0.85 | 84.00 | 81.42 | 1.6 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + master | 71.03 | 76.06 | | 84.49 | 81.94 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | 71.25 | 76.29 | 0.27 | 84.50 | **81.96** | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_small | 69.85 | 74.80 | | 80.85 | 78.42 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_large | 70.57 | 75.57 | | 82.57 | 80.08 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_mobilenet_v3_large + crnn_vgg16_bn | 67.73 | 71.73 | | 71.65 | 59.03 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. 
text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - -Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information. - - -+----------------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | US Tax Forms | Resumes | Road Fines | -+==============================================+============+===============+============+===============+============+===============+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | 78.70 | 81.12 | 65.80 | 70.70 | 50.25 | 51.78 | 79.08 | 92.83 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + master (ours) | **79.00** | **81.42** | 65.57 | 69.86 | 51.34 | 52.90 | 78.86 | 92.57 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + sar_resnet31 (ours) | 78.94 | 81.37 | 65.89 | **70.79** | **51.78** | **53.35** | 79.04 | 92.78 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_small (ours) | 76.81 | 79.15 | 64.89 | 69.61 | 45.03 | 46.38 | 78.96 | 92.11 | 85.91 | 87.20 | 84.85 | 85.86 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| 
db_resnet50 + crnn_mobilenet_v3_large (ours) | 78.01 | 80.39 | 65.36 | 70.11 | 48.00 | 49.43 | 79.39 | 92.62 | 87.68 | 89.00 | 85.65 | 86.67 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 | 74.93 | 63.04 | 68.41 | 39.36 | 41.75 | 72.14 | 89.97 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. Everything is wrapped up with `ocr_predictor `_. - - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) - >>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([input_page]) - - -What should I do with the output? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ocr_predictor returns a `Document` object with a nested structure (with `Page`, `Block`, `Line`, `Word`, `Artefact`). 
-To get a better understanding of our document model, check our :ref:`document_structure` section - -Here is a typical `Document` layout:: - - Document( - (pages): [Page( - dimensions=(340, 600) - (blocks): [Block( - (lines): [Line( - (words): [ - Word(value='No.', confidence=0.91), - Word(value='RECEIPT', confidence=0.99), - Word(value='DATE', confidence=0.96), - ] - )] - (artefacts): [] - )] - )] - ) - -You can also export them as a nested dict, more appropriate for JSON format:: - - json_output = result.export() - -For reference, here is the JSON export for the same `Document` as above:: - - { - 'pages': [ - { - 'page_idx': 0, - 'dimensions': (340, 600), - 'orientation': {'value': None, 'confidence': None}, - 'language': {'value': None, 'confidence': None}, - 'blocks': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'lines': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'words': [ - { - 'value': 'No.', - 'confidence': 0.914085328578949, - 'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)) - }, - { - 'value': 'RECEIPT', - 'confidence': 0.9949972033500671, - 'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)) - }, - { - 'value': 'DATE', - 'confidence': 0.9578408598899841, - 'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)) - } - ] - } - ], - 'artefacts': [] - } - ] - } - ] - } - -To export the outpout as XML (hocr-format) you can use the `export_as_xml` method:: - - xml_output = result.export_as_xml() - for output in xml_output: - xml_bytes_string = output[0] - xml_element = output[1] - -For reference, here is a sample XML byte string output:: - - - - - docTR - hOCR - - - - - -
      -
      -

      - - Hello - XML - World - -

      -
      - - \ No newline at end of file diff --git a/v0.8.0/_sources/utils.rst.txt b/v0.8.0/_sources/utils.rst.txt index ac0b13d9df..69c1abe0eb 100644 --- a/v0.8.0/_sources/utils.rst.txt +++ b/v0.8.0/_sources/utils.rst.txt @@ -14,8 +14,6 @@ Easy-to-use functions to make sense of your model's predictions. .. autofunction:: visualize_page -.. autofunction:: synthesize_page - .. _metrics: @@ -27,20 +25,12 @@ Implementations of task-specific metrics to easily assess your model performance .. autoclass:: TextMatch - .. automethod:: update .. automethod:: summary .. autoclass:: LocalizationConfusion - .. automethod:: update .. automethod:: summary .. autoclass:: OCRMetric - .. automethod:: update - .. automethod:: summary - -.. autoclass:: DetectionMetric - - .. automethod:: update .. automethod:: summary diff --git a/v0.8.0/datasets.html b/v0.8.0/datasets.html index 1f5855cc82..640791680a 100644 --- a/v0.8.0/datasets.html +++ b/v0.8.0/datasets.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.datasets - docTR documentation @@ -227,28 +227,21 @@ @@ -294,12 +287,16 @@

      doctr.datasets

      Available Datasets

      -

      Here are all datasets that are available through docTR:

      -
      -

      Public datasets

      +

      The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

      +
      +
      +class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
      +
      + +
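As a rough illustration, a custom dataset can reuse this verified-download logic by forwarding its own URL to the base class. This is only a sketch of the constructor wiring: the URL, archive name and class name below are placeholders, and a real subclass would still have to populate its samples.
>>> from doctr.datasets.datasets import VisionDataset
>>> class MyArchiveDataset(VisionDataset):
...     def __init__(self, download: bool = False, **kwargs):
...         # hypothetical URL and file name, shown only to illustrate the arguments above
...         super().__init__(
...             url="https://example.com/my_archive.zip",
...             file_name="my_archive.zip",
...             extract_archive=True,
...             download=download,
...             **kwargs,
...         )
>>> ds = MyArchiveDataset(download=True)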

      Here are all datasets that are available through DocTR:

      -class doctr.datasets.FUNSD(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      +class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

      FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

      Example::
      >>> from doctr.datasets import FUNSD
      @@ -313,7 +310,8 @@ 

      Public datasetsParameters:
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • +
      • sample_transforms – composable transformations that will be applied to each image

      • +
      • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • **kwargs – keyword arguments from VisionDataset.

      @@ -322,7 +320,7 @@

      Public datasets
      -class doctr.datasets.SROIE(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      +class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

      SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

      Example::
      - -
      -
      -class doctr.datasets.IIIT5K(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      IIIT-5K character-level localization dataset from -“BMVC 2012 Scene Text Recognition using Higher Order Language Priors”.

      -
      -
      Example::
      >>> # NOTE: this dataset is for character-level localization
      ->>> from doctr.datasets import IIIT5K
      ->>> train_set = IIIT5K(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.SVT(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      SVT dataset from “The Street View Text Dataset - UCSD Computer Vision”.

      -
      -
      Example::
      >>> from doctr.datasets import SVT
      ->>> train_set = SVT(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.SVHN(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      SVHN dataset from “The Street View House Numbers (SVHN) Dataset”.

      -
      -
      Example::
      >>> from doctr.datasets import SVHN
      ->>> train_set = SVHN(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.SynthText(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      SynthText dataset from “Synthetic Data for Text Localisation in Natural Images” | “repository” | -“website”.

      -
      -
      Example::
      >>> from doctr.datasets import SynthText
      ->>> train_set = SynthText(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.IC03(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      IC03 dataset from “ICDAR 2003 Robust Reading Competitions: Entries, Results and Future Directions”.

      -
      -
      Example::
      >>> from doctr.datasets import IC03
      ->>> train_set = IC03(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.IC13(img_folder: str, label_folder: str, use_polygons: bool = False, **kwargs: Any)[source]
      -

      IC13 dataset from “ICDAR 2013 Robust Reading Competition”. -Example:

      -
      >>> # NOTE: You need to download both image and label parts from Focused Scene Text challenge Task2.1 2013-2015.
      ->>> from doctr.datasets import IC13
      ->>> train_set = IC13(img_folder="/path/to/Challenge2_Training_Task12_Images",
      ->>>                  label_folder="/path/to/Challenge2_Training_Task1_GT")
      ->>> img, target = train_set[0]
      ->>> test_set = IC13(img_folder="/path/to/Challenge2_Test_Task12_Images",
      ->>>                 label_folder="/path/to/Challenge2_Test_Task1_GT")
      ->>> img, target = test_set[0]
      -
      -
      -
      -
      Parameters:
      -
        -
      • img_folder – folder with all the images of the dataset

      • -
      • label_folder – folder with all annotation files for the images

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      -
      -
      -
      - -

      -
      -

      docTR synthetic datasets

      -
      -
      -class doctr.datasets.DocArtefacts(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      Object detection dataset for non-textual elements in documents. -The dataset includes a variety of synthetic document pages with non-textual elements.

      -
      -
      Example::
      >>> from doctr.datasets import DocArtefacts
      ->>> train_set = DocArtefacts(download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • +
      • sample_transforms – composable transformations that will be applied to each image

      • +
      • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • **kwargs – keyword arguments from VisionDataset.

      -
      -
      -class doctr.datasets.CharacterGenerator(*args, **kwargs)[source]
      -

      Implements a character image generation dataset

      -
      -
      Example::
      >>> from doctr.datasets import CharacterGenerator
      ->>> ds = CharacterGenerator(vocab='abdef')
      ->>> img, target = ds[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • vocab – vocabulary to take the character from

      • -
      • num_samples – number of samples that will be generated iterating over the dataset

      • -
      • cache_samples – whether generated images should be cached firsthand

      • -
      • font_family – font to use to generate the text images

      • -
      • img_transforms – composable transformations that will be applied to each image

      • -
      • sample_transforms – composable transformations that will be applied to both the image and the target

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.WordGenerator(vocab: str, min_chars: int, max_chars: int, num_samples: int, cache_samples: bool = False, font_family: str | List[str] | None = None, img_transforms: Callable[[Any], Any] | None = None, sample_transforms: Callable[[Any, Any], Tuple[Any, Any]] | None = None)[source]
      -

      Implements a character image generation dataset

      -
      -
      Example::
      >>> from doctr.datasets import WordGenerator
      ->>> ds = WordGenerator(vocab='abdef')
      ->>> img, target = ds[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • vocab – vocabulary to take the character from

      • -
      • min_chars – minimum number of characters in a word

      • -
      • max_chars – maximum number of characters in a word

      • -
      • num_samples – number of samples that will be generated iterating over the dataset

      • -
      • cache_samples – whether generated images should be cached firsthand

      • -
      • font_family – font to use to generate the text images

      • -
      • img_transforms – composable transformations that will be applied to each image

      • -
      • sample_transforms – composable transformations that will be applied to both the image and the target

      • -
      -
      -
      -
      - -
      -
      -

      docTR private datasets

      -

      Since many documents include sensitive / personal information, we are not able to share all the data that has been used for this project. However, we provide some guidance on how to format your own dataset into the same format so that you can use all docTR tools all the same.

      -
      -
      -class doctr.datasets.DetectionDataset(img_folder: str, label_path: str, use_polygons: bool = False, **kwargs: Any)[source]
      -

      Implements a text detection dataset

      -
      -
      Example::
      >>> from doctr.datasets import DetectionDataset
      ->>> train_set = DetectionDataset(img_folder="/path/to/images", label_path="/path/to/labels.json")
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • img_folder – folder with all the images of the dataset

      • -
      • label_path – path to the annotations of each image

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.RecognitionDataset(img_folder: str, labels_path: str, **kwargs: Any)[source]
      -

      Dataset implementation for text recognition tasks

      -
      -
      Example::
      >>> from doctr.datasets import RecognitionDataset
      ->>> train_set = RecognitionDataset(img_folder="/path/to/images", labels_path="/path/to/labels.json")
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • img_folder – path to the images folder

      • -
• labels_path – path to the json file containing all labels (character sequences)

      • -
      -
      -
      -
      -
      -class doctr.datasets.OCRDataset(img_folder: str, label_file: str, use_polygons: bool = False, **kwargs: Any)[source]
      +class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

      Implements an OCR dataset

      Parameters:
      • img_folder – local path to image folder (all jpg at the root)

      • label_file – local path to the label file

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • +
      • sample_transforms – composable transformations that will be applied to each image

      • +
      • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • +
      • **kwargs – keyword arguments from VisionDataset.

      -

    Data Loading

    -

    Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR.

    +

    Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

    -class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, num_workers: int | None = None, collate_fn: Callable | None = None)[source]
    +class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]

    Implements a dataset wrapper for fast data loading

    Example::
    >>> from doctr.datasets import FUNSD, DataLoader
    @@ -681,7 +408,7 @@ 
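Here is a minimal sketch of how the loader documented above might be used. It assumes the loader is iterable and yields (images, targets) batches; only arguments listed in the signature are used.
>>> from doctr.datasets import FUNSD, DataLoader
>>> train_set = FUNSD(train=True, download=True)
>>> train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
>>> images, targets = next(iter(train_loader))  # assumes (images, targets) batches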

    Data Loading

    Supported Vocabs

    -

    Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +

    Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets of vocabs.

    - +@@ -724,39 +451,19 @@

    Data Loading

    - - - - - - - - - - + + - - - - - - - - - - - - - - + +
    docTR VocabsDocTR Vocabs

    latin

    94

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~

    english

    100

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿

    legacy_french

    123

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

    96

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

    french

    126

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ

    portuguese

    131

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿

    spanish

    116

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿

    german

    108

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ

    154

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

    -doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, dynamic_seq_length: bool = False, **kwargs: Any) ndarray[source]
    +doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]

    Encode character sequences using a given vocab as mapping

    Parameters:
    @@ -767,7 +474,6 @@

    Data LoadingReturns: @@ -784,23 +490,23 @@
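A minimal sketch of calling encode_sequences with the signature above, using a toy lowercase vocab; the exact padding behaviour depends on the eos/pad arguments.
>>> import numpy as np
>>> from doctr.datasets import encode_sequences
>>> encoded = encode_sequences(sequences=["hello", "world"], vocab="abcdefghijklmnopqrstuvwxyz")
>>> assert isinstance(encoded, np.ndarray)  # one row of character indices per input string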

    Data Loading - +
    Next
    -
    doctr.io
    +
    doctr.documents
    - +
    Previous
    -
    Preparing your model for inference
    +
    Changelog
    @@ -836,32 +542,13 @@

    Data Loadingdoctr.datasets

    diff --git a/v0.8.0/installing.html b/v0.8.0/installing.html index b79f453bd6..8068adc0ba 100644 --- a/v0.8.0/installing.html +++ b/v0.8.0/installing.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + Installation - docTR documentation @@ -227,28 +227,21 @@ @@ -290,16 +283,16 @@

    Installation

    -

    This library requires Python 3.6 or higher.

    +

    This library requires Python 3.6 or higher.

    Prerequisites

    Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

    If you are running another OS than Linux, you will need a few extra dependencies.

    -

    For MacOS users, you can install them using Homebrew as follows:

    +

    For MacOS users, you can install them as follows:

    brew install cairo pango gdk-pixbuf libffi
     
    @@ -307,17 +300,10 @@

    Prerequisites

    Via Python Package

    -

    Install the last stable release of the package using pip:

    +

    Install the last stable release of the package using pip:

    pip install python-doctr
     
    -

    We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows:

    -
    # for TensorFlow
    -pip install "python-doctr[tf]"
    -# for PyTorch
    -pip install "python-doctr[torch]"
    -
    -

    Via Git

    @@ -326,14 +312,6 @@

    Via Git¶ pip install -e doctr/.

    -

    Again, for framework-specific builds:

    -
    git clone https://github.com/mindee/doctr.git
    -# for TensorFlow
    -pip install -e doctr/.[tf]
    -# for PyTorch
    -pip install -e doctr/.[torch]
    -
    -
    @@ -342,12 +320,12 @@

    Via Git

    +

    diff --git a/v0.8.0/io.html b/v0.8.0/io.html deleted file mode 100644 index a61f5b20af..0000000000 --- a/v0.8.0/io.html +++ /dev/null @@ -1,839 +0,0 @@ - - - - - - - - - - - - - doctr.io - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
    -
    -
    - -
    - -
    -
    - -
    - -
    -
    - -
    -
    -
    - - - - - Back to top - -
    - -
    - -
    - -
    -
    -
    -

    doctr.io

    -

    The io module enables users to easily access content from documents and export analysis -results to structured formats.

    -
    -

    Document structure

    -

    Structural organization of the documents.

    -
    -

    Word

    -

    A Word is an uninterrupted sequence of characters.

    -
    -
    -class doctr.io.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray)[source]
    -

    Implements a word element

    -
    -
    Parameters:
    -
      -
    • value – the text string of the word

    • -
    • confidence – the confidence associated with the text prediction

    • -
    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to

    • -
    • size (the page's)

    • -
    -
    -
    -
    - -
    -
    -

    Line

    -

    A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines).

    -
    -
    -class doctr.io.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray | None = None)[source]
    -

    Implements a line element as a collection of words

    -
    -
    Parameters:
    -
      -
    • words – list of word elements

    • -
    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

    • -
    -
    -
    -
    - -
    -
    -

    Artefact

    -

    An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

    -
    -
    -class doctr.io.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
    -

    Implements a non-textual element

    -
    -
    Parameters:
    -
      -
    • artefact_type – the type of artefact

    • -
    • confidence – the confidence of the type prediction

    • -
    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

    • -
    -
    -
    -
    - -
    -
    -

    Block

    -

    A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

    -
    -
    -class doctr.io.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray | None = None)[source]
    -

    Implements a block element as a collection of lines and artefacts

    -
    -
    Parameters:
    -
      -
    • lines – list of line elements

    • -
    • artefacts – list of artefacts

    • -
    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

    • -
    -
    -
    -
    - -
    -
    -

    Page

    -

    A Page is a collection of Blocks that were on the same physical page.

    -
    -
    -class doctr.io.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
    -

    Implements a page element as a collection of blocks

    -
    -
    Parameters:
    -
      -
    • blocks – list of block elements

    • -
    • page_idx – the index of the page in the input raw document

    • -
    • dimensions – the page size in pixels in format (height, width)

    • -
• orientation – a dictionary with the value of the rotation angle in degrees and confidence of the prediction

    • -
    • language – a dictionary with the language value and confidence of the prediction

    • -
    -
    -
    -
    -
    -show(page: ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) None[source]
    -

    Overlay the result on a given image

    -
    -
    Parameters:
    -
      -
    • page – image encoded as a numpy array in uint8

    • -
    • interactive – whether the display should be interactive

    • -
    • preserve_aspect_ratio – pass True if you passed True to the predictor

    • -
    -
    -
    -
    - -
    - -
    -
    -

    Document

    -

    A Document is a collection of Pages.

    -
    -
    -class doctr.io.Document(pages: List[Page])[source]
    -

    Implements a document element as a collection of pages

    -
    -
    Parameters:
    -

    pages – list of page elements

    -
    -
    -
    -
    -show(pages: List[ndarray], **kwargs) None[source]
    -

    Overlay the result on a given image

    -
    -
    Parameters:
    -

    pages – list of images encoded as numpy arrays in uint8

    -
    -
    -
    - -
    - -
    -
    -
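Putting the elements above together, a minimal Document can be built by hand with the constructors documented in this section; the coordinates below are arbitrary relative values, shown only to illustrate the nesting.
>>> from doctr.io import Word, Line, Block, Page, Document
>>> words = [
...     Word("No.", 0.91, ((0.54, 0.06), (0.58, 0.09))),
...     Word("RECEIPT", 0.99, ((0.13, 0.03), (0.51, 0.16))),
... ]
>>> line = Line(words)            # geometry resolved from the enclosed words
>>> block = Block(lines=[line])   # geometry resolved from lines and artefacts
>>> page = Page(blocks=[block], page_idx=0, dimensions=(340, 600))
>>> doc = Document(pages=[page])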
    -

    File reading

    -

    High-performance file reading and conversion to processable structured data.

    -
    -
    -doctr.io.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
    -

    Read a PDF file and convert it into an image in numpy format

    -
    -
    Example::
    >>> from doctr.documents import read_pdf
    ->>> doc = read_pdf("path/to/your/doc.pdf")
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    file – the path to the PDF file

    -
    -
    Returns:
    -

    the list of pages decoded as numpy ndarray of shape H x W x 3

    -
    -
    -
    - -
    -
    -doctr.io.read_img_as_numpy(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
    -

    Read an image file into numpy format

    -
    -
    Example::
    >>> from doctr.documents import read_img
    ->>> page = read_img("path/to/your/doc.jpg")
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • file – the path to the image file

    • -
    • output_size – the expected output size of each page in format H x W

    • -
    • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

    • -
    -
    -
    Returns:
    -

    the page decoded as numpy ndarray of shape H x W x 3

    -
    -
    -
    - -
    -
    -doctr.io.read_img_as_tensor(img_path: str | Path, dtype: DType = tf.float32) Tensor[source]
    -

    Read an image file as a TensorFlow tensor

    -
    -
    Parameters:
    -
      -
    • img_path – location of the image file

    • -
    • dtype – the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    • -
    -
    -
    Returns:
    -

    decoded image as a tensor

    -
    -
    -
    - -
    -
    -doctr.io.decode_img_as_tensor(img_content: bytes, dtype: DType = tf.float32) Tensor[source]
    -

    Read a byte stream as a TensorFlow tensor

    -
    -
    Parameters:
    -
      -
    • img_content – bytes of a decoded image

    • -
    • dtype – the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    • -
    -
    -
    Returns:
    -

    decoded image as a tensor

    -
    -
    -
    - -
    -
    -doctr.io.read_html(url: str, **kwargs: Any) bytes[source]
    -

    Read a PDF file and convert it into an image in numpy format

    -
    -
    Example::
    >>> from doctr.documents import read_html
    ->>> doc = read_html("https://www.yoursite.com")
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    url – URL of the target web page

    -
    -
    Returns:
    -

    decoded PDF file as a bytes stream

    -
    -
    -
    - -
    -
    -class doctr.io.DocumentFile[source]
    -

    Read a document from multiple extensions

    -
    -
    -classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
    -

    Read a PDF file

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    file – the path to the PDF file or a binary stream

    -
    -
    Returns:
    -

    a PDF document

    -
    -
    -
    - -
    -
    -classmethod from_url(url: str, **kwargs) PDF[source]
    -

    Interpret a web page as a PDF document

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> doc = DocumentFile.from_url("https://www.yoursite.com")
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    url – the URL of the target web page

    -
    -
    Returns:
    -

    a PDF document

    -
    -
    -
    - -
    -
    -classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
    -

    Read an image file (or a collection of image files) and convert it into an image in numpy format

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    files – the path to the image file or a binary stream, or a collection of those

    -
    -
    Returns:
    -

    the list of pages decoded as numpy ndarray of shape H x W x 3

    -
    -
    -
    - -
    - -
    -
    -class doctr.io.PDF(doc: Document)[source]
    -

    PDF document template

    -
    -
    Parameters:
    -

    doc – input PDF document

    -
    -
    -
    -
    -as_images(**kwargs) List[ndarray][source]
    -

    Convert all document pages to images

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    kwargs – keyword arguments of convert_page_to_numpy

    -
    -
    Returns:
    -

    the list of pages decoded as numpy ndarray of shape H x W x 3

    -
    -
    -
    - -
    -
    -get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
    -

    Get the annotations for all words in the document

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    kwargs – keyword arguments of fitz.Page.get_text_words

    -
    -
    Returns:
    -

    the list of pages annotations, represented as a list of tuple (bounding box, value)

    -
    -
    -
    - -
    -
    -get_lines(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
    -

    Get the annotations for all lines in the document

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> lines = DocumentFile.from_pdf("path/to/your/doc.pdf").get_lines()
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    kwargs – keyword arguments of fitz.Page.get_text_words

    -
    -
    Returns:
    -

    the list of pages annotations, represented as a list of tuple (bounding box, value)

    -
    -
    -
    - -
    -
    -get_artefacts() List[List[Tuple[float, float, float, float]]][source]
    -

    Get the artefacts for the entire document

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
    -
    -
    -
    -
    -
    -
    Returns:
    -

    the list of pages artefacts, represented as a list of bounding boxes

    -
    -
    -
    - -
    - -
    -
    - -
    -
    - -
    - -
    -
    - - - - - - - - \ No newline at end of file diff --git a/v0.8.0/models.html b/v0.8.0/models.html index 04ff61d44e..270664068f 100644 --- a/v0.8.0/models.html +++ b/v0.8.0/models.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.models - docTR documentation @@ -227,28 +227,21 @@ @@ -290,286 +283,64 @@

    doctr.models

    -
    -

    doctr.models.classification

    -
    -
    -doctr.models.classification.vgg16_bn_r(pretrained: bool = False, **kwargs: Any) VGG[source]
    -

    VGG-16 architecture as described in “Very Deep Convolutional Networks for Large-Scale Image Recognition”, modified by adding batch normalization, rectangular pooling and a simpler -classification head.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import vgg16_bn_r
    ->>> model = vgg16_bn_r(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained (bool) – If True, returns a model pre-trained on ImageNet

    -
    -
    Returns:
    -

    VGG feature extractor

    -
    -
    -
    - -
    -
    -doctr.models.classification.resnet18(pretrained: bool = False, **kwargs: Any) ResNet[source]
    -

    Resnet-18 architecture as described in “Deep Residual Learning for Image Recognition”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import resnet18
    ->>> model = resnet18(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    A classification model

    -
    -
    -
    - -
    -
    -doctr.models.classification.resnet31(pretrained: bool = False, **kwargs: Any) ResNet[source]
    -

    Resnet31 architecture with rectangular pooling windows as described in -“Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition”,. Downsizing: (H, W) –> (H/8, W/4)

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import resnet31
    ->>> model = resnet31(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    A classification model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Small architecture as described in -“Searching for MobileNetV3”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenetv3_large
    ->>> model = mobilenetv3_small(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Large architecture as described in -“Searching for MobileNetV3”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenetv3_large
    ->>> model = mobilenetv3_large(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_small_r(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Small architecture as described in -“Searching for MobileNetV3”,, with rectangular pooling.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenet_v3_small_r
    ->>> model = mobilenet_v3_small_r(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_large_r(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Large architecture as described in -“Searching for MobileNetV3”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenet_v3_large_r
    ->>> model = mobilenet_v3_large_r(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_small_orientation(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Small architecture as described in -“Searching for MobileNetV3”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenet_v3_small_orientation
    ->>> model = mobilenet_v3_small_orientation(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.magc_resnet31(pretrained: bool = False, **kwargs: Any) ResNet[source]
    -

    Resnet31 architecture with Multi-Aspect Global Context Attention as described in -“MASTER: Multi-Aspect Non-local Network for Scene Text Recognition”,.

    -
    -
    Example::
    >>> import torch
    ->>> from doctr.models import magc_resnet31
    ->>> model = magc_resnet31(pretrained=False)
    ->>> input_tensor = torch.rand((1, 3, 224, 224), dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    A feature extractor model

    -
    -
    -
    - -
    -
    -doctr.models.classification.crop_orientation_predictor(arch: str = 'mobilenet_v3_small_orientation', pretrained: bool = False, **kwargs: Any) CropOrientationPredictor[source]
    -

    Orientation classification architecture.

    -
    -
    Example::
    >>> import numpy as np
    ->>> from doctr.models import crop_orientation_predictor
    ->>> model = crop_orientation_predictor(arch='classif_mobilenet_v3_small', pretrained=True)
    ->>> input_crop = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
    ->>> out = model([input_crop])
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • arch – name of the architecture to use (e.g. ‘mobilenet_v3_small’)

    • -
    • pretrained – If True, returns a model pre-trained on our recognition crops dataset

    • +

      The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. +Either performed at once or separately, to each task corresponds a type of deep learning architecture.

      +

      For a given task, DocTR provides a Predictor, which is composed of 2 components:

      +
        +
      • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

      • +
      • Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable.

      -
    -
    Returns:
    -

    CropOrientationPredictor

    -
    -
    -
    - -
    -
    -

    doctr.models.detection

    -
    -
    -doctr.models.detection.linknet_resnet18(pretrained: bool = False, **kwargs: Any) LinkNet[source]
    -

    LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import linknet_resnet18
    ->>> model = linknet_resnet18(pretrained=True)
    ->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    +
    +

    Text Detection

    +

    Localizing text elements in images

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    FUNSD

    CORD

    Architecture

    Input shape

    # params

    Recall

    Precision

    Recall

    Precision

    FPS

    db_resnet50

    (1024, 1024, 3)

    25.2 M

    82.14

    87.64

    92.49

    89.66

    2.1

    -
    -
    -
    -
    Parameters:
    -

    pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

    -
    -
    Returns:
    -

    text detection architecture

    -
    -
    -
    - +

    All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). +Explanations about the metrics being used are available in Task evaluation.

    +

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

    +

    FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments.
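As a hedged sketch of that protocol (a single random tensor is reused here for brevity, whereas the benchmark above generates a new one per step):
>>> import time
>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> sample = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> for _ in range(100):  # warm-up
...     _ = model(sample, training=False)
>>> start = time.time()
>>> for _ in range(1000):
...     _ = model(sample, training=False)
>>> fps = 1000 / (time.time() - start)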

    +
    +

    Pre-processing for detection

    +

    In DocTR, the pre-processing scheme for detection is the following:

    +
      +
    1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

    2. +
    3. batch images together

    4. +
    5. normalize the batch using the training data statistics

    6. +
    +
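The three steps above can be sketched with plain TensorFlow ops as follows; the target size and the normalization statistics below are placeholders, not the values actually used by the library.
>>> import tensorflow as tf
>>> imgs = [tf.random.uniform((900, 700, 3)), tf.random.uniform((1100, 800, 3))]
>>> resized = [tf.image.resize(img, (1024, 1024), method="bilinear") for img in imgs]  # may deform
>>> batch = tf.stack(resized, axis=0)
>>> batch = (batch - 0.5) / 0.5  # placeholder mean/std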
    +
    +

    Detection models

    +

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

    doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
    @@ -595,13 +366,13 @@

    doctr.models.detection

    -
    -doctr.models.detection.db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) DBNet[source]
    -

    DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a mobilenet v3 large backbone.

    +
    +doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
    +

    LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import db_mobilenet_v3_large
    ->>> model = db_mobilenet_v3_large(pretrained=True)
    +>>> from doctr.models import linknet16
    +>>> model = linknet16(pretrained=True)
     >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
     >>> out = model(input_tensor)
     
    @@ -618,14 +389,18 @@

    doctr.models.detection

    +
    +
    +

    Detection predictors

    +

    Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information.

    -doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, assume_straight_pages: bool = True, **kwargs: Any) DetectionPredictor[source]
    +doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]

    Text detection architecture.

    Example::
    >>> import numpy as np
     >>> from doctr.models import detection_predictor
    ->>> model = detection_predictor(arch='db_resnet50', pretrained=True)
    +>>> model = detection_predictor(pretrained=True)
     >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
     >>> out = model([input_page])
     
    @@ -635,9 +410,8 @@

    doctr.models.detection
    Parameters:
      -
    • arch – name of the architecture to use (e.g. ‘db_resnet50’)

    • +
    • arch – name of the architecture to use (‘db_resnet50’)

    • pretrained – If True, returns a model pre-trained on our text detection dataset

    • -
    • assume_straight_pages – If True, fit straight boxes to the page

    Returns:
    @@ -647,8 +421,74 @@

    doctr.models.detection

    -
    -

    doctr.models.recognition

    +
    +
    +

    Text Recognition

    +

    Identifying strings in images

    +
    + + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Text recognition model zoo

    Architecture

    Input shape

    # params

    FUNSD

    CORD

    FPS

    crnn_vgg16_bn

    (32, 128, 3)

    15.8M

    86.02

    91.3

    12.8

    sar_vgg16_bn

    (32, 128, 3)

    21.5M

    86.2

    91.7

    3.3

    sar_resnet31

    (32, 128, 3)

    53.1M

    86.3

    92.1

    2.7

    +
    +

    All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). +Explanations about the metrics being used are available in Task evaluation.

    +

    All these recognition models are trained with our french vocab (cf. Supported Vocabs).

    +

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model capabilities

    +

    FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments.

    +
    +

    Pre-processing for recognition

    +

    In DocTR, the pre-processing scheme for recognition is the following:

    +
      +
    1. resize each input image to the target size (bilinear interpolation by default) without deformation.

    2. +
    3. pad the image to the target size (with zeros by default)

    4. +
    5. batch images together

    6. +
    7. normalize the batch using the training data statistics

    8. +
    +
    +
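Similarly, the recognition pre-processing above can be sketched with TensorFlow ops; resize_with_pad covers the first two steps (aspect-preserving resize plus zero padding), and the statistics below are placeholders.
>>> import tensorflow as tf
>>> crops = [tf.random.uniform((28, 90, 3)), tf.random.uniform((40, 200, 3))]
>>> padded = [tf.image.resize_with_pad(c, 32, 128) for c in crops]  # steps 1-2
>>> batch = tf.stack(padded, axis=0)  # step 3
>>> batch = (batch - 0.5) / 0.5       # step 4, placeholder mean/std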
    +

    Recognition models

    +

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

    doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
    @@ -675,40 +515,15 @@

    doctr.models.recognition -
    -doctr.models.recognition.crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) CRNN[source]
    -

    CRNN with a MobileNet V3 Small backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import crnn_mobilenet_v3_small
    ->>> model = crnn_mobilenet_v3_small(pretrained=True)
    ->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

    -
    -
    Returns:
    -

    text recognition architecture

    -
    -
    -

    - -
    -
    -doctr.models.recognition.crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) CRNN[source]
    -

    CRNN with a MobileNet V3 Large backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

    +
    +doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) SAR[source]
    +

    SAR with a VGG16 feature extractor as described in “Show, Attend and Read:A Simple and Strong +Baseline for Irregular Text Recognition”.

    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import crnn_mobilenet_v3_large
    ->>> model = crnn_mobilenet_v3_large(pretrained=True)
    ->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
    +>>> from doctr.models import sar_vgg16_bn
    +>>> model = sar_vgg16_bn(pretrained=False)
    +>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
     >>> out = model(input_tensor)
     
    @@ -750,17 +565,15 @@

    doctr.models.recognition
    doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) MASTER[source]
    -

    MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.

    -
    -
    Example::
    >>> import tensorflow as tf
    +

    MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_. +Example:

    +
    >>> import tensorflow as tf
     >>> from doctr.models import master
     >>> model = master(pretrained=False)
     >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
     >>> out = model(input_tensor)
     
    -
    -
    Parameters:

    pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

    @@ -771,6 +584,10 @@

    doctr.models.recognition +

    Recognition predictors

    +

    Combining the right components around a given architecture for easier usage.

    doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) RecognitionPredictor[source]
    @@ -788,7 +605,7 @@

    doctr.models.recognition
    Parameters:
      -
    • arch – name of the architecture to use (e.g. ‘crnn_vgg16_bn’)

    • +
    • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

    • pretrained – If True, returns a model pre-trained on our text recognition dataset

    @@ -799,16 +616,141 @@

    doctr.models.recognition -

    doctr.models.zoo

    +

    +
    +

    End-to-End OCR

    +

    Predictors that localize and identify text elements in images

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    FUNSD

    CORD

    Architecture

    Recall

    Precision

    FPS

    Recall

    Precision

    FPS

    db_resnet50 + crnn_vgg16_bn

    70.08

    74.77

    0.85

    82.19

    79.67

    1.6

    db_resnet50 + sar_vgg16_bn

    N/A

    N/A

    0.49

    N/A

    N/A

    1.0

    db_resnet50 + sar_resnet31

    N/A

    N/A

    0.27

    N/A

    N/A

    0.83

    Gvision text detection

    59.50

    62.50

    75.30

    70.00

    Gvision doc. text detection

    64.00

    53.30

    68.90

    61.10

    AWS textract

    78.10

    83.00

    87.50

    66.00

    +
    +

    All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). +Explanations about the metrics being used are available in Task evaluation.

    +

    All recognition models of predictors are trained with our french vocab (cf. Supported Vocabs).

    +

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model capabilities

    +

    FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments.

    +

    Results on private ocr datasets

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Receipts

    Invoices

    IDs

    Architecture

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    db_resnet50 + crnn_vgg16_bn (ours)

    78.90

    81.01

    65.68

    69.86

    49.48

    50.46

    Gvision doc. text detection

    68.91

    59.89

    63.20

    52.85

    43.70

    29.21

    AWS textract

    75.77

    77.70

    70.47

    69.13

    46.39

    43.32

    +
    +
    +

    Two-stage approaches

    +

Those architectures involve one stage of text detection and one stage of text recognition. The text detection output is used to produce cropped images, which are then passed to the text recognition block.

    -
    -doctr.models.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, assume_straight_pages: bool = True, export_as_straight_boxes: bool = False, preserve_aspect_ratio: bool = False, **kwargs: Any) OCRPredictor[source]
    +
    +doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) OCRPredictor[source]

    End-to-end OCR architecture using one model for localization, and another for text recognition.

    Example::
    >>> import numpy as np
     >>> from doctr.models import ocr_predictor
    ->>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
    +>>> model = ocr_predictor(pretrained=True)
     >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
     >>> out = model([input_page])
     
    @@ -818,15 +760,8 @@

    doctr.models.zoo
    Parameters:
      -
    • det_arch – name of the detection architecture to use (e.g. ‘db_resnet50’, ‘db_mobilenet_v3_large’)

    • -
    • reco_arch – name of the recognition architecture to use (e.g. ‘crnn_vgg16_bn’, ‘sar_resnet31’)

    • +
    • arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)

    • pretrained – If True, returns a model pre-trained on our OCR dataset

    • -
    • assume_straight_pages – if True, speeds up the inference by assuming you only pass straight pages -without rotated textual elements.

    • -
    • export_as_straight_boxes – when assume_straight_pages is set to False, export final predictions -(potentially rotated) as straight bounding boxes.

    • -
    • preserve_aspect_ratio – If True, pad the input document image to preserve the aspect ratio before running the detection model on it.

    Returns:
    @@ -835,6 +770,113 @@

    doctr.models.zoo +

    Model export

    +

    Utility functions to make the most of document analysis models.

    +
    +

    Model compression

    +
    +
    +doctr.models.export.convert_to_tflite(tf_model: Model) bytes[source]
    +

    Converts a model to TFLite format

    +
    +
    Example::
    >>> from tensorflow.keras import Sequential
    +>>> from doctr.models import convert_to_tflite, conv_sequence
    +>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
    +>>> serialized_model = convert_to_tflite(model)
    +
    +
    +
    +
    +
    +
    Parameters:
    +

    tf_model – a keras model

    +
    +
    Returns:
    +

    the serialized TFLite model

    +
    +
    Return type:
    +

    bytes

    +
    +
    +
    + +
    +
    +doctr.models.export.convert_to_fp16(tf_model: Model) bytes[source]
    +

    Converts a model to half precision

    +
    +
    Example::
    >>> from tensorflow.keras import Sequential
    +>>> from doctr.models import convert_to_fp16, conv_sequence
    +>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
    +>>> serialized_model = convert_to_fp16(model)
    +
    +
    +
    +
    +
    +
    Parameters:
    +

    tf_model – a keras model

    +
    +
    Returns:
    +

    the serialized FP16 model

    +
    +
    Return type:
    +

    bytes

    +
    +
    +
    + +
    +
    +doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) bytes[source]
    +

    Quantize a TensorFlow model

    +
    +
    Example::
    >>> from tensorflow.keras import Sequential
    +>>> from doctr.models import quantize_model, conv_sequence
    +>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
    +>>> serialized_model = quantize_model(model, (224, 224, 3))
    +
    +
    +
    +
    +
    +
    Parameters:
    +
      +
    • tf_model – a keras model

    • +
    • input_shape – shape of the expected input tensor (excluding batch dimension), in channels-last order

    • +
    +
    +
    Returns:
    +

    the serialized quantized model

    +
    +
    Return type:
    +

    bytes

    +
    +
    +
    + +
    +
    +
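    Each of these helpers returns the serialized model as raw bytes, which you would typically write straight to disk. A minimal sketch, reusing the serialized_model variable from the examples above (the file name is arbitrary):

    >>> with open('model.tflite', 'wb') as f:
    ...     f.write(serialized_model)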

    Using SavedModel

    +

    Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to SavedModel format as follows:

    +
    >>> import tensorflow as tf
    +>>> from doctr.models import db_resnet50
    +>>> model = db_resnet50(pretrained=True)
    +>>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    +>>> _ = model(input_t, training=False)
    +>>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
    +
    +
    +

    And loaded just as easily:

    +
    >>> import tensorflow as tf
    +>>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
    +
    +
    +
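    The restored object can then be called for inference, assuming an input with the same shape and dtype as the one used to trace the model before export (a minimal sketch):

    >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    >>> out = model(input_t, training=False)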

    @@ -852,14 +894,14 @@

    @@ -894,37 +936,49 @@

    doctr.models.zoo

    diff --git a/v0.8.0/searchindex.js b/v0.8.0/searchindex.js index 930e27aabf..9474806c95 100644 --- a/v0.8.0/searchindex.js +++ b/v0.8.0/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[12, null]], "Advanced options": [[16, "advanced-options"]], "Args:": [[5, "args"], [5, "id4"], [5, "id7"], [5, "id10"], [5, "id13"], [5, "id16"], [5, "id19"], [5, "id22"], [5, "id25"], [5, "id29"], [5, "id32"], [5, "id37"], [5, "id40"], [5, "id46"], [5, "id49"], [5, "id50"], [5, "id51"], [5, "id54"], [5, "id57"], [5, "id60"], [5, "id61"], [6, "args"], [6, "id2"], [6, "id3"], [6, "id4"], [6, "id5"], [6, "id6"], [6, "id7"], [6, "id10"], [6, "id12"], [6, "id14"], [6, "id16"], [6, "id20"], [6, "id24"], [6, "id28"], [7, "args"], [7, "id3"], [7, "id8"], [7, "id13"], [7, "id17"], [7, "id21"], [7, "id26"], [7, "id31"], [7, "id36"], [7, "id41"], [7, "id45"], [7, "id49"], [7, "id54"], [7, "id58"], [7, "id63"], [7, "id68"], [7, "id72"], [7, "id76"], [7, "id81"], [7, "id86"], [7, "id90"], [7, "id95"], [7, "id99"], [7, "id103"], [7, "id108"], [7, "id113"], [7, "id118"], [7, "id122"], [7, "id126"], [7, "id131"], [7, "id135"], [7, "id139"], [7, "id143"], [7, "id145"], [7, "id147"], [7, "id149"], [8, "args"], [8, "id1"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id8"], [8, "id9"], [8, "id10"], [8, "id11"], [8, "id12"], [8, "id13"], [8, "id14"], [8, "id15"], [8, "id16"], [8, "id17"], [8, "id18"], [9, "args"], [9, "id3"], [9, "id5"], [9, "id6"], [9, "id7"], [9, "id8"], [9, "id9"], [9, "id10"], [9, "id11"]], "Artefact": [[6, "artefact"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[14, "available-datasets"]], "Available architectures": [[16, "available-architectures"], [16, "id1"], [16, "id2"]], "Block": [[6, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[14, null]], "Choosing the right model": [[16, null]], "Classification": [[13, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[8, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[5, "custom-dataset-loader"]], "Data Loading": [[14, "data-loading"]], "Dataloader": [[5, "dataloader"]], "Detection": [[13, "detection"], [14, "detection"]], "Detection predictors": [[16, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[6, "document"]], "Document structure": [[6, "document-structure"]], "End-to-End OCR": [[16, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[15, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[6, "file-reading"]], "Half-precision": [[15, "half-precision"]], "Installation": [[3, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[6, "line"]], "Loading from Huggingface Hub": [[13, "loading-from-huggingface-hub"]], 
"Loading your custom trained model": [[11, "loading-your-custom-trained-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[15, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[13, "naming-conventions"]], "Object Detection": [[14, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[6, "page"]], "Preparing your model for inference": [[15, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[13, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[13, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[13, "recognition"], [14, "recognition"]], "Recognition predictors": [[16, "recognition-predictors"]], "Returns:": [[5, "returns"], [6, "returns"], [6, "id11"], [6, "id13"], [6, "id15"], [6, "id19"], [6, "id23"], [6, "id27"], [6, "id31"], [7, "returns"], [7, "id6"], [7, "id11"], [7, "id16"], [7, "id20"], [7, "id24"], [7, "id29"], [7, "id34"], [7, "id39"], [7, "id44"], [7, "id48"], [7, "id52"], [7, "id57"], [7, "id61"], [7, "id66"], [7, "id71"], [7, "id75"], [7, "id79"], [7, "id84"], [7, "id89"], [7, "id93"], [7, "id98"], [7, "id102"], [7, "id106"], [7, "id111"], [7, "id116"], [7, "id121"], [7, "id125"], [7, "id129"], [7, "id134"], [7, "id138"], [7, "id142"], [7, "id144"], [7, "id146"], [7, "id148"], [9, "returns"], [9, "id4"]], "Scope": [[1, "scope"]], "Share your model with the community": [[13, null]], "Supported Vocabs": [[5, "supported-vocabs"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[8, "supported-transformations"]], "Synthetic dataset generator": [[5, "synthetic-dataset-generator"], [14, "synthetic-dataset-generator"]], "Task evaluation": [[9, "task-evaluation"]], "Text Detection": [[16, "text-detection"]], "Text Recognition": [[16, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[11, null]], "Two-stage approaches": [[16, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[14, "use-your-own-datasets"]], "Using your ONNX exported model in docTR": [[15, "using-your-onnx-exported-model-in-doctr"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[16, "what-should-i-do-with-the-output"]], "Word": [[6, "word"]], "docTR Notebooks": [[10, null]], "docTR Vocabs": [[5, "id62"]], "docTR: Document Text Recognition": [[4, null]], "doctr.datasets": [[5, null], [5, "datasets"]], "doctr.io": [[6, null]], "doctr.models": [[7, null]], "doctr.models.classification": [[7, "doctr-models-classification"]], "doctr.models.detection": [[7, "doctr-models-detection"]], "doctr.models.factory": [[7, "doctr-models-factory"]], "doctr.models.recognition": [[7, "doctr-models-recognition"]], "doctr.models.zoo": [[7, "doctr-models-zoo"]], "doctr.transforms": [[8, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 
(2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2024-09-09)": [[0, "v0-7-0-2024-09-09"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[6, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[6, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[8, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[5, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[8, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[8, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[5, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[7, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[5, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[7, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[5, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[5, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[6, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[6, 
"doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[5, "doctr.datasets.encode_sequences", false]], "from_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[5, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[8, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[8, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[5, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[5, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[5, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[5, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[5, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[7, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[8, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[6, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[5, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_orientation() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[8, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[7, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[5, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[8, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[6, "doctr.io.Page", false]], "parseq() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module 
doctr.models.factory)": [[7, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[8, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[8, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[8, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[8, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[8, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[8, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[8, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[8, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[8, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[8, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[8, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[6, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[6, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[6, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[5, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[8, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[6, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[6, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[5, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[5, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[5, "doctr.datasets.SVT", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "synthtext (class in doctr.datasets)": [[5, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module 
doctr.models.classification)": [[7, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[8, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[5, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[6, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[5, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[5, 0, 1, "", "CORD"], [5, 0, 1, "", "CharacterGenerator"], [5, 0, 1, "", "DetectionDataset"], [5, 0, 1, "", "DocArtefacts"], [5, 0, 1, "", "FUNSD"], [5, 0, 1, "", "IC03"], [5, 0, 1, "", "IC13"], [5, 0, 1, "", "IIIT5K"], [5, 0, 1, "", "IIITHWS"], [5, 0, 1, "", "IMGUR5K"], [5, 0, 1, "", "MJSynth"], [5, 0, 1, "", "OCRDataset"], [5, 0, 1, "", "RecognitionDataset"], [5, 0, 1, "", "SROIE"], [5, 0, 1, "", "SVHN"], [5, 0, 1, "", "SVT"], [5, 0, 1, "", "SynthText"], [5, 0, 1, "", "WILDRECEIPT"], [5, 0, 1, "", "WordGenerator"], [5, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[5, 0, 1, "", "DataLoader"]], "doctr.io": [[6, 0, 1, "", "Artefact"], [6, 0, 1, "", "Block"], [6, 0, 1, "", "Document"], [6, 0, 1, "", "DocumentFile"], [6, 0, 1, "", "Line"], [6, 0, 1, "", "Page"], [6, 0, 1, "", "Word"], [6, 1, 1, "", "decode_img_as_tensor"], [6, 1, 1, "", "read_html"], [6, 1, 1, "", "read_img_as_numpy"], [6, 1, 1, "", "read_img_as_tensor"], [6, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[6, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[6, 2, 1, "", "from_images"], [6, 2, 1, "", "from_pdf"], [6, 2, 1, "", "from_url"]], "doctr.io.Page": [[6, 2, 1, "", "show"]], "doctr.models": [[7, 1, 1, "", "kie_predictor"], [7, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[7, 1, 1, "", "crop_orientation_predictor"], [7, 1, 1, "", "magc_resnet31"], [7, 1, 1, "", "mobilenet_v3_large"], [7, 1, 1, "", "mobilenet_v3_large_r"], [7, 1, 1, "", "mobilenet_v3_small"], [7, 1, 1, "", "mobilenet_v3_small_orientation"], [7, 1, 1, "", "mobilenet_v3_small_r"], [7, 1, 1, "", "resnet18"], [7, 1, 1, "", "resnet31"], [7, 1, 1, "", "resnet34"], [7, 1, 1, "", "resnet50"], [7, 1, 1, "", "textnet_base"], [7, 1, 1, "", "textnet_small"], [7, 1, 1, "", "textnet_tiny"], [7, 1, 1, "", "vgg16_bn_r"], [7, 1, 1, "", "vit_b"], [7, 1, 1, "", "vit_s"]], "doctr.models.detection": [[7, 1, 1, "", 
"db_mobilenet_v3_large"], [7, 1, 1, "", "db_resnet50"], [7, 1, 1, "", "detection_predictor"], [7, 1, 1, "", "linknet_resnet18"], [7, 1, 1, "", "linknet_resnet34"], [7, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[7, 1, 1, "", "from_hub"], [7, 1, 1, "", "login_to_hub"], [7, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[7, 1, 1, "", "crnn_mobilenet_v3_large"], [7, 1, 1, "", "crnn_mobilenet_v3_small"], [7, 1, 1, "", "crnn_vgg16_bn"], [7, 1, 1, "", "master"], [7, 1, 1, "", "parseq"], [7, 1, 1, "", "recognition_predictor"], [7, 1, 1, "", "sar_resnet31"], [7, 1, 1, "", "vitstr_base"], [7, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[8, 0, 1, "", "ChannelShuffle"], [8, 0, 1, "", "ColorInversion"], [8, 0, 1, "", "Compose"], [8, 0, 1, "", "GaussianBlur"], [8, 0, 1, "", "GaussianNoise"], [8, 0, 1, "", "LambdaTransformation"], [8, 0, 1, "", "Normalize"], [8, 0, 1, "", "OneOf"], [8, 0, 1, "", "RandomApply"], [8, 0, 1, "", "RandomBrightness"], [8, 0, 1, "", "RandomContrast"], [8, 0, 1, "", "RandomCrop"], [8, 0, 1, "", "RandomGamma"], [8, 0, 1, "", "RandomHorizontalFlip"], [8, 0, 1, "", "RandomHue"], [8, 0, 1, "", "RandomJpegQuality"], [8, 0, 1, "", "RandomRotate"], [8, 0, 1, "", "RandomSaturation"], [8, 0, 1, "", "RandomShadow"], [8, 0, 1, "", "Resize"], [8, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "DetectionMetric"], [9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 6, 7, 9, 13], "0": [1, 3, 5, 8, 9, 11, 14, 16], "00": 16, "01": 16, "0123456789": 5, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "02": [], "02562": 7, "03": 16, "035": 16, "0361328125": 16, "04": [], "05": 16, "06": 16, "06640625": 16, "07": 16, "08": [8, 16], "09": 16, "0966796875": 16, "1": [3, 5, 6, 7, 8, 9, 11, 14, 16], "10": [5, 9, 16], "100": [5, 8, 9, 14, 16], "1000": 16, "101": 5, "1024": [7, 9, 11, 16], "104": 5, "106": 5, "108": 5, "1095": 14, "11": 16, "110": 9, "1107": 14, "114": 5, "115": [], "1156": 14, "116": 5, "118": 5, "11800h": 16, "11th": 16, "12": [3, 16], "120": 5, "123": 5, "126": 5, "1268": 14, "128": [7, 11, 15, 16], "13": [9, 16], "130": 5, "13068": 14, "131": 5, "1337891": 14, "1357421875": 16, "1396484375": 16, "14": 16, "1420": 16, "14470v1": 5, "149": 14, "15": 16, "150": [9, 16], "154": [], "1552": 16, "16": [7, 15, 16], "160": [], "1630859375": 16, "1684": 16, "16x16": 7, "17": 16, "1778": 16, "1782": 16, "18": 7, "185546875": 16, "19": [], "1900": 16, "1910": 7, "19342": 14, "19370": 14, "195": 5, "19598": 14, "199": 16, "1999": 16, "1m": [], "2": [3, 4, 5, 6, 8, 16], "20": 16, "200": 9, "2000": 14, "2003": [4, 5], "2012": 5, "2013": [4, 5], "2015": 5, "2019": 4, "2021": [], "207901": 14, "21": 16, "2103": 5, "2186": 14, "21888": 14, "22": 
16, "224": [7, 8], "225": 8, "22672": 14, "229": [8, 14], "23": 16, "233": 14, "234": 5, "236": [], "24": 16, "246": 14, "249": 14, "25": 16, "2504": 16, "255": [6, 7, 8, 9, 16], "256": 7, "257": 14, "26": 16, "26032": 14, "264": 11, "27": 16, "2700": 14, "2710": 16, "2749": 11, "28": 16, "287": 11, "29": 16, "296": 11, "299": 11, "2d": 16, "2m": [], "3": [3, 4, 6, 7, 8, 9, 15, 16], "30": 16, "300": 14, "3000": 14, "301": 11, "30595": 16, "30ghz": 16, "31": 7, "32": [5, 7, 8, 11, 14, 15, 16], "3232421875": 16, "33": [8, 16], "33402": 14, "33608": 14, "34": [7, 16], "340": 16, "3456": 16, "35": [], "3515625": 16, "36": [], "360": 14, "37": [5, 16], "38": 16, "39": 16, "4": [7, 8, 9, 16], "40": 16, "406": 8, "41": 16, "42": 16, "43": 16, "44": 16, "45": 16, "456": 8, "46": 16, "47": 16, "472": 14, "48": [5, 16], "485": 8, "49": 16, "49377": 14, "5": [5, 8, 9, 16], "50": [7, 14, 16], "51": 16, "51171875": 16, "512": 7, "52": [5, 16], "529": 16, "53": 16, "533": [], "54": 16, "540": 16, "5478515625": 16, "55": 16, "56": 16, "57": 16, "58": 16, "580": 16, "5810546875": 16, "583": 16, "59": 16, "595": [], "597": 16, "5k": [4, 5], "5m": [], "6": [8, 16], "60": 8, "600": [7, 9, 16], "61": 16, "611": [], "62": 16, "625": [], "626": 14, "629": [], "63": 16, "630": [], "64": [7, 8, 16], "640": [], "641": 16, "647": 14, "65": 16, "66": 16, "660": [], "664": [], "666": [], "67": 16, "672": [], "68": 16, "689": [], "69": 16, "693": 11, "694": 11, "695": 11, "6m": [], "7": 16, "70": [9, 16], "700": [], "701": [], "702": [], "707470": 14, "71": 16, "7100000": 14, "713": [], "7141797": 14, "7149": 14, "72": 16, "72dpi": 6, "73": 16, "73257": 14, "733": [], "74": 16, "745": [], "75": [8, 16], "753": [], "7581382": 14, "76": 16, "77": 16, "772": 11, "772875": 14, "78": 16, "780": [], "781": [], "783": [], "785": 11, "789": [], "79": 16, "793533": 14, "796": 14, "798": 11, "7m": [], "8": [3, 7, 8, 16], "80": 16, "800": [7, 9, 14, 16], "81": 16, "817": [], "82": 16, "8275l": [], "83": 16, "830": [], "84": 16, "849": 14, "85": 16, "8564453125": 16, "857": 16, "85875": 14, "86": 16, "860": [], "8603515625": 16, "862": [], "863": [], "87": 16, "8707": 14, "875": [], "88": 16, "89": 16, "8m": [], "9": 16, "90": 16, "90k": 5, "90kdict32px": 5, "91": 16, "913": [], "914085328578949": 16, "917": [], "92": 16, "921": [], "93": 16, "94": [5, 16], "95": [9, 16], "9578408598899841": 16, "96": 16, "97": [], "98": 16, "99": 16, "9949972033500671": 16, "A": [1, 2, 4, 5, 6, 7, 10, 15], "And": [], "As": 2, "Be": 16, "Being": 1, "By": 12, "For": [1, 2, 3, 11, 16], "If": [2, 3, 6, 7, 11, 16], "In": [2, 5, 14], "It": [8, 13, 15], "Its": [4, 7], "No": [1, 16], "Of": 5, "Or": [], "The": [1, 2, 5, 6, 9, 12, 16], "Then": 7, "There": [], "To": [2, 3, 12, 13, 16], "_": [1, 5, 7], "__call__": 16, "_build": 2, "_helper": [], "_i": 9, "ab": 5, "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "abdef": [5, 14], "abl": [14, 16], "about": [1, 14, 16], "abov": 16, "abstract": [], "abstractdataset": 5, "abus": 1, "accent": [], "accept": 1, "access": [4, 6, 14, 16], "account": [1, 13], "accur": 16, "accuraci": 9, "achiev": 15, "act": 1, "action": 1, "activ": 4, "ad": [2, 7, 8], "adapt": 1, "add": [8, 9, 13, 16], "add_hook": 16, "add_label": 9, "addit": [2, 3, 6], "addition": [2, 16], "address": [1, 6], "adjust": 8, "advanc": 1, "advantag": 15, "advis": 2, "aesthet": [4, 5], "affect": 1, "after": [13, 16], "ag": 1, "again": 7, "aggreg": [9, 14], "aggress": 1, "align": [1, 6], "all": [1, 2, 5, 6, 8, 9, 14, 16], "allow": 1, 
"along": 16, "alreadi": 2, "also": [1, 7, 13, 14, 16], "alwai": 14, "amazon": [], "an": [1, 2, 4, 5, 6, 7, 9, 15, 16], "analysi": 6, "ancient_greek": 5, "angl": [6, 8], "ani": [1, 5, 6, 7, 8, 9, 16], "annot": 5, "anot": 14, "anoth": [3, 7, 11, 14], "answer": 1, "anyascii": [], "anyon": 4, "anyth": [], "anywher": [], "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 5, 8], "applic": [4, 7], "appoint": 1, "appreci": 13, "appropri": [1, 2, 16], "ar": [1, 2, 3, 5, 6, 8, 9, 10, 14, 16], "arab": 5, "arabic_diacrit": 5, "arabic_lett": 5, "arabic_punctu": 5, "arbitrarili": 7, "arch": [7, 13], "architectur": [4, 7, 13], "archiv": [], "area": 16, "arg": [], "argument": [5, 6, 7, 9, 16], "around": 1, "arrai": [6, 8, 9], "art": 4, "artefact": [9, 10, 16], "artefact_typ": 6, "artifici": [4, 5], "arxiv": [5, 7], "as_imag": [], "asarrai": 9, "ascii_lett": 5, "aspect": [4, 7, 8, 16], "assess": 9, "assign": 9, "associ": 6, "assum": 7, "assume_straight_pag": [7, 16], "astyp": [7, 9, 16], "attack": 1, "attend": [4, 7], "attent": [1, 7], "autoclass": [], "autom": 4, "automat": 16, "autoregress": [4, 7], "avail": [1, 4, 8], "averag": [8, 16], "avoid": [1, 3], "aw": [4, 16], "awar": 16, "azur": 16, "b": [7, 9, 16], "b_j": 9, "back": 2, "backbon": 7, "backend": 16, "background": 14, "bangla": [], "bar": [], "bar_cod": 14, "base": [4, 7], "baselin": [4, 7, 16], "bash": [], "batch": [5, 7, 8, 14, 16], "batch_siz": [5, 11, 14, 15], "bblanchon": [], "bbox": 16, "becaus": 12, "been": [2, 9, 14, 16], "befor": [5, 7, 8, 16], "begin": 9, "behavior": [1, 16], "being": [9, 16], "belong": 16, "below": [], "benchmark": 16, "best": 1, "beta": [], "better": [10, 16], "between": [8, 9, 16], "bgr": 6, "bilinear": 8, "bin_thresh": 16, "binar": [4, 7, 16], "binari": [6, 15, 16], "bit": 15, "blank": 9, "block": [9, 16], "block_1_1": 16, "blue": 9, "blur": 8, "bmvc": 5, "bn": 13, "bodi": [1, 16], "bool": [5, 6, 7, 8, 9], "boolean": [7, 16], "both": [4, 5, 8, 14, 16], "bottom": [7, 16], "bound": [5, 6, 7, 8, 9, 16], "box": [5, 6, 7, 8, 9, 14, 16], "box_thresh": 16, "brew": 3, "bright": 8, "broadcast": 9, "browser": [2, 4], "build": [2, 3], "built": 2, "byte": [6, 16], "c": [6, 9], "c5": [], "c_j": 9, "cach": [2, 5, 12], "cache_sampl": 5, "cairo": 3, "call": [], "callabl": [5, 8], "can": [2, 3, 11, 12, 13, 14, 16], "capabl": [2, 10, 16], "case": [5, 9], "catch": [], "cf": 16, "cfg": 16, "challeng": 5, "challenge2_test_task12_imag": 5, "challenge2_test_task1_gt": 5, "challenge2_training_task12_imag": 5, "challenge2_training_task1_gt": 5, "chang": 12, "changelog": [], "channel": [1, 2, 6, 8], "channel_prior": [], "channelshuffl": 8, "charact": [4, 5, 6, 9, 14, 16], "charactergener": [5, 14], "characterist": 1, "charg": 16, "charset": 16, "chart": 6, "check": [2, 13, 16], "checkpoint": 7, "chip": 3, "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 5, 6, 8, 9, 16], "class_nam": 11, "classif": 14, "classif_mobilenet_v3_smal": 7, "classmethod": 6, "cleaner": [], "clear": 2, "clone": 3, "close": 2, "co": 13, "code": [4, 6], "codecov": 2, "colab": 10, "collate_fn": 5, "collect": 6, "color": [8, 9], "colorinvers": 8, "column": 6, "com": [1, 3, 6, 7, 13], "combin": 16, "come": 15, "command": 2, "comment": 1, "commit": 1, "common": [1, 8, 9, 15], "commun": 1, "compar": 4, "comparison": [9, 16], "competit": 5, "compil": [10, 16], "complaint": 1, "complementari": 9, "complet": 2, "compli": [], "compon": 16, "compos": [5, 16], "comprehens": 16, "comput": [5, 9, 15, 16], "conf_threshold": [], "confid": [6, 9, 16], 
"config": 7, "configur": 7, "confus": 9, "consecut": [8, 16], "consequ": 1, "consid": [1, 2, 5, 6, 9, 16], "consist": 16, "consolid": [4, 5], "constant": 8, "constraint": [], "construct": 1, "consum": 9, "contact": 1, "contain": [5, 14], "content": [5, 6, 9, 16], "context": 7, "contib": [], "continu": 1, "contrast": 8, "contrast_factor": 8, "contrib": [], "contribut": 1, "contributor": 2, "conv_sequ": [], "convent": [], "convers": 6, "convert": [6, 8], "convert_page_to_numpi": [], "convert_to_fp16": [], "convert_to_tflit": [], "convolut": 7, "coordin": [6, 16], "cord": [4, 5, 14, 16], "core": [9, 16], "corner": 16, "correct": 8, "correspond": [3, 6, 16], "could": 1, "counterpart": 9, "cover": 2, "coverag": 2, "cpu": [4, 11], "creat": 13, "crnn": [4, 7, 13], "crnn_mobilenet_v3_larg": [7, 13, 16], "crnn_mobilenet_v3_smal": [7, 15, 16], "crnn_resnet31": [], "crnn_vgg16_bn": [7, 11, 13, 16], "crop": [7, 8, 14, 16], "crop_orient": [], "crop_orientation_predictor": 7, "crop_param": [], "croporientationpredictor": 7, "cuda": 15, "currenc": 5, "current": [2, 16], "custom": [13, 16], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": 16, "cvit": 4, "czczup": 7, "czech": 5, "d": [5, 14], "daili": [], "danish": 5, "data": [4, 5, 6, 8, 9, 11, 13], "dataload": 14, "dataset": [7, 11, 16], "dataset_info": 5, "date": [11, 16], "db": 13, "db_crnn_resnet": [], "db_crnn_vgg": [], "db_mobilenet_v3_larg": [7, 13, 16], "db_resnet34": 16, "db_resnet50": [7, 11, 13, 16], "db_resnet50_rot": [], "db_sar_resnet": [], "db_sar_vgg": [], "dbnet": [4, 7], "deal": [], "decis": 1, "decod": 6, "decode_img_as_tensor": 6, "dedic": [], "deem": 1, "deep": [7, 16], "def": 16, "default": [6, 9, 11, 12, 16], "defer": 14, "defin": [9, 15], "deform": [], "degre": 8, "degress": 6, "delet": 2, "delimit": 16, "delta": 8, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": [7, 9], "descript": 10, "design": 8, "desir": 6, "det_arch": [7, 11, 13, 15], "det_b": [], "det_model": [11, 13], "det_param": 11, "det_predictor": [11, 16], "detail": [11, 16], "detect": [5, 9, 10, 11], "detect_languag": 7, "detect_orient": 7, "detection_predictor": [7, 16], "detection_task": [], "detectiondataset": [5, 14], "detectionmetr": 9, "detectionpredictor": [7, 11], "detector": 7, "deterior": 7, "determin": 1, "dev": [2, 12], "develop": 3, "developp": [], "deviat": 8, "devic": 15, "dict": [6, 9, 16], "dictionari": [6, 9], "differ": 1, "differenti": [4, 7], "digit": [4, 5, 14], "dimens": [6, 9, 16], "dimension": 8, "direct": 5, "directli": [13, 16], "directori": [2, 12], "disabl": [1, 12, 16], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 16, "discuss": 2, "disk": [], "disparag": 1, "displai": [6, 9], "display_artefact": 9, "distanc": [], "distribut": 8, "div": 16, "divers": 1, "divid": 6, "do": [2, 3, 7], "doc": [2, 6, 15, 16], "docartefact": [5, 14], "docstr": 2, "doctr": [3, 11, 12, 13, 14, 16], "doctr_cache_dir": 12, "doctr_multiprocessing_dis": 12, "document": [5, 7, 9, 10, 14, 16], "documentbuild": 16, "documentfil": [6, 13], "doe": [], "doesn": [], "don": [11, 16], "done": 8, "download": [5, 14], "downsiz": 7, "draw": [8, 9], "draw_proba": 9, "drop": 5, "drop_last": 5, "dtype": [6, 7, 8, 9, 15], "dual": [4, 5], "dummi": 13, "dummy_img": 16, "dummy_input": 15, "dure": 1, "dutch": 5, "dynam": 5, "dynamic_seq_length": 5, "e": [1, 2, 3, 6, 7], "each": [4, 5, 6, 7, 8, 9, 14, 16], "eas": 2, "easi": [4, 9, 13], "easier": [], "easili": [6, 9, 11, 13, 
14, 16], "ec2": [], "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 5, 7], "either": [9, 16], "element": [5, 6, 7, 9, 16], "els": 2, "email": 1, "empathi": 1, "en": 16, "enabl": [5, 6], "enclos": 6, "encod": [4, 5, 6, 7, 16], "encode_sequ": 5, "encount": 2, "encrypt": 6, "end": [4, 5, 7, 9], "english": [5, 14], "enivron": [], "enough": [2, 16], "ensur": 2, "entir": [], "entri": 5, "environ": [1, 12], "eo": 5, "equiv": 16, "error": [], "estim": 7, "etc": 6, "ethnic": 1, "evalu": [14, 16], "event": 1, "everyon": 1, "everyth": [2, 16], "exact": [9, 16], "exactmatch": [], "exampl": [1, 2, 4, 5, 7, 13, 16], "exchang": 15, "exclud": [], "execut": 16, "exist": 13, "expand": 8, "expect": [6, 8, 9], "experi": 1, "explan": [1, 16], "explicit": 1, "exploit": [4, 7], "export": [6, 7, 9, 10, 16], "export_as_straight_box": [7, 16], "export_as_xml": 16, "export_model_to_onnx": 15, "express": [1, 8], "extens": 6, "extern": [1, 14], "extra": 3, "extract": [4, 5], "extract_arch": [], "extractor": 7, "f_": 9, "f_a": 9, "factor": 8, "fair": 1, "fairli": 1, "fallback": [], "fals": [5, 6, 7, 8, 9, 11, 16], "famili": 9, "faq": 1, "fascan": 13, "fast": [4, 5, 7], "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [7, 15], "fasterrcnn_mobilenet_v3_large_fpn": 7, "favorit": 16, "featur": [3, 7, 9, 10], "feed": [], "feedback": 1, "feel": [2, 13], "felix92": 13, "few": [3, 15, 16], "figsiz": 9, "figur": 9, "file": [2, 5], "file_hash": [], "file_nam": [], "final": 7, "find": [2, 3, 14], "fine": [], "finnish": 5, "first": [2, 5], "firsthand": 5, "fit": [7, 16], "fitz": [], "flag": 16, "flake8": [], "flexibl": [], "flip": 8, "float": [6, 8, 9, 15], "float16": [], "float32": [6, 7, 8, 15], "fn": 8, "focu": 13, "focus": [1, 5], "folder": 5, "follow": [1, 2, 3, 5, 8, 9, 11, 12, 13, 16], "font": [5, 9], "font_famili": [5, 9], "font_siz": 9, "foral": 9, "forc": 2, "forg": [], "form": [4, 5, 16], "format": [6, 9, 11, 14, 15, 16], "forpost": [4, 5], "forum": 2, "fp": [], "fp16": 15, "frac": 9, "frame": [], "framework": [3, 13, 14, 16], "free": [1, 2, 13], "french": [5, 11, 13, 16], "friendli": 4, "from": [1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16], "from_hub": [7, 13], "from_imag": [6, 13], "from_keras_model": [], "from_pdf": 6, "from_url": 6, "full": [5, 9, 16], "fulli": [], "function": [5, 8, 9], "funsd": [4, 5, 14, 16], "further": 14, "futur": 5, "g": [6, 7], "g_": 9, "g_x": 9, "gamma": 8, "gaussian": 8, "gaussianblur": 8, "gaussiannois": 8, "gdk": 3, "gen": 16, "gender": 1, "gener": [2, 4, 7], "generic_cyrillic_lett": [], "geometri": [4, 6, 16], "geq": 9, "german": [5, 11, 13], "get": 16, "get_artefact": [], "get_lin": [], "get_text_word": [], "get_word": [], "gettextword": [], "git": 13, "github": [2, 3, 7, 13], "give": 1, "given": [5, 6, 8, 9, 16], "global": 7, "go": 16, "good": 15, "googl": 2, "googlevis": 4, "gpu": [4, 15], "gracefulli": 1, "graph": [4, 5, 6], "grayscal": 8, "ground": 9, "groung": 9, "group": [4, 16], "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": 2, "guidanc": 14, "gvision": 16, "h": [6, 7, 8], "h_": 9, "ha": [2, 5, 9, 14], "half": [], "handl": [14, 16], "handwrit": 5, "handwritten": 14, "harass": 1, "hardwar": [], "harm": 1, "hat": 9, "have": [1, 2, 9, 11, 13, 14, 16], "head": [7, 16], "healthi": 1, "hebrew": 5, "height": 6, "hello": [9, 16], "help": 15, "here": [3, 8, 10, 14, 16], "hf": 7, "hf_hub_download": 7, "high": 6, "higher": [3, 5, 16], "hindi": [], "hindi_digit": 5, "hocr": 16, "homebrew": 3, "hook": 16, "horizont": [6, 8], "hous": 5, "how": [2, 11, 13, 14], 
"howev": 14, "hsv": 8, "html": [1, 2, 6, 16], "http": [1, 3, 5, 6, 7, 13, 16], "hub": 7, "hue": 8, "huggingfac": 7, "hw": 5, "i": [1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 15], "i7": 16, "ic03": [4, 5, 14], "ic13": [4, 5, 14], "icdar": [4, 5], "icdar2019": 5, "id": 16, "ident": 1, "identifi": 4, "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [4, 5], "iiit5k": [5, 14], "iiithw": [4, 5, 14], "imag": [4, 5, 6, 7, 8, 9, 13, 14, 16], "imagenet": 7, "imageri": 1, "images_90k_norm": 5, "img": [5, 8, 14], "img_cont": 6, "img_fold": [5, 14], "img_path": 6, "img_transform": 5, "imgur5k": [4, 5, 14], "imgur5k_annot": 5, "imlist": 5, "impact": 1, "implement": [5, 6, 7, 8, 9, 16], "import": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16], "improv": 7, "inappropri": 1, "incid": 1, "includ": [1, 3, 5, 14, 15], "inclus": 1, "incom": [], "increas": 8, "independ": [], "index": [2, 6], "indic": 9, "individu": 1, "infer": [4, 7, 8], "inference_input_typ": [], "inference_output_typ": [], "inform": [1, 2, 4, 5, 14], "inherit": [], "ini": [], "input": [2, 6, 7, 8, 15, 16], "input_crop": 7, "input_pag": [7, 9, 16], "input_shap": 15, "input_t": [], "input_tensor": 7, "inspir": [1, 8], "instal": 13, "instanc": [1, 16], "instanti": [7, 16], "instead": [5, 6, 7], "insult": 1, "int": [5, 6, 8, 9], "int64": [8, 9], "int8": [], "integ": 9, "integr": [4, 13, 14], "intel": 16, "interact": [1, 6, 9], "interfac": 13, "interoper": 15, "interpol": 8, "interpret": [5, 6], "intersect": 9, "invert": 8, "investig": 1, "invis": 1, "invoic": [], "involv": [1, 16], "io": 13, "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [4, 7, 14], "isn": 5, "isort": [], "issu": [1, 2, 13], "italian": 5, "iter": [5, 8, 14, 16], "its": [6, 7, 8, 9, 14, 16], "itself": [7, 13], "j": 9, "job": 2, "join": 2, "jpeg": 8, "jpegqual": 8, "jpg": [5, 6, 13], "json": [5, 14, 16], "json_output": 16, "jump": 2, "just": 1, "keep": [], "kei": [4, 5], "kera": [7, 15], "kernel": [7, 8], "kernel_s": [], "kernel_shap": 8, "keywoard": 7, "keyword": [5, 6, 7, 9], "kie": [7, 11], "kie_predictor": [7, 11], "kiepredictor": 7, "kind": 1, "know": 2, "kwarg": [5, 6, 7, 9], "l": 9, "l_j": 9, "label": [5, 8, 9, 14], "label_fil": [5, 14], "label_fold": 5, "label_path": [5, 14], "labels_path": [5, 14], "ladder": 1, "lambda": 8, "lambdatransform": 8, "lang": 16, "languag": [1, 4, 5, 6, 7, 13, 16], "larg": [7, 13], "largest": 9, "last": [3, 5], "latenc": 7, "later": 2, "latest": [3, 16], "latin": 5, "layer": 15, "layout": 16, "lead": 1, "leader": 1, "learn": [1, 4, 7, 15, 16], "least": 3, "left": [9, 16], "legacy_french": 5, "length": [5, 16], "less": [15, 16], "let": [], "letter": [], "level": [1, 5, 9, 16], "levenshtein": [], "leverag": 10, "lf": 13, "libffi": 3, "librari": [2, 3, 10, 11], "light": 4, "lightweight": [], "like": 1, "limits_": 9, "line": [4, 7, 9, 16], "line_1_1": 16, "link": 11, "linknet": [4, 7], "linknet16": [], "linknet_resnet18": [7, 11, 16], "linknet_resnet18_rot": [], "linknet_resnet34": [7, 15, 16], "linknet_resnet50": [7, 16], "linux": 3, "list": [5, 6, 8, 9, 13], "ll": 9, "load": [4, 5, 7], "load_state_dict": 11, "load_weight": 11, "loader": [], "loc_pr": 16, "local": [2, 4, 5, 7, 9, 14, 16], "localis": 5, "localizationconfus": 9, "locat": [2, 6, 16], "login": 7, "login_to_hub": [7, 13], "logo": [6, 14], "look": [], "love": 13, "lower": [8, 9, 16], "m": [2, 9, 16], "m1": 3, "macbook": 3, "machin": 15, "maco": 3, "made": 4, "magc_resnet31": 7, "mai": [1, 2], "mail": 1, "main": 10, "maintain": 4, "mainten": 2, "make": [1, 2, 9, 12, 13, 15, 16], "mani": 
[14, 16], "manipul": 16, "map": [5, 7], "map_loc": 11, "mask_shap": 9, "master": [4, 7, 16], "match": [9, 16], "mathcal": 9, "matplotlib": [6, 9], "max": [5, 8, 9], "max_angl": 8, "max_area": 8, "max_char": [5, 14], "max_delta": 8, "max_dist": [], "max_gain": 8, "max_gamma": 8, "max_qual": 8, "max_ratio": 8, "maximum": [5, 8], "maxval": [7, 8], "mbox": 9, "mean": [8, 9, 11], "meaniou": 9, "meant": [6, 15], "measur": 16, "media": 1, "median": 7, "meet": 11, "member": 1, "memori": [9, 12, 15], "mention": 16, "merg": 5, "messag": 2, "meta": 16, "metadata": 15, "metal": 3, "method": [6, 8, 16], "metric": [9, 16], "middl": 16, "might": [15, 16], "min": 8, "min_area": 8, "min_char": [5, 14], "min_gain": 8, "min_gamma": 8, "min_qual": 8, "min_ratio": 8, "min_val": 8, "minde": [1, 3, 4, 7], "minim": [2, 4], "minimalist": 7, "minimum": [3, 5, 8, 9, 16], "minval": 8, "miss": 3, "mistak": 1, "mix": [], "mixed_float16": 15, "mixed_precis": 15, "mjsynth": [4, 5, 14], "mnt": 5, "mobilenet": [7, 13], "mobilenet_v3_larg": 7, "mobilenet_v3_large_r": 7, "mobilenet_v3_smal": 7, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_orient": 7, "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 7, "mobilenetv3": 7, "mobilenetv3_larg": [], "mobilenetv3_smal": [], "modal": [4, 5], "mode": 3, "model": [5, 9, 12, 14], "model_nam": [7, 13, 15], "model_path": 15, "moder": 1, "modif": 2, "modifi": [7, 12, 16], "modul": [6, 7, 8, 9, 16], "moment": [], "more": [2, 9, 14, 16], "most": 16, "mozilla": 1, "multi": [4, 7], "multilingu": [5, 13], "multipl": [5, 6, 8, 16], "multipli": 8, "multiprocess": 12, "my": 7, "my_awesome_model": 13, "my_hook": 16, "mypi": [], "n": [5, 9], "na": [], "name": [5, 7, 15, 16], "nation": 1, "natur": [1, 4, 5], "nb": [], "ndarrai": [5, 6, 8, 9], "necessari": [3, 11, 12], "need": [2, 3, 5, 9, 11, 12, 13, 16], "neg": 8, "nest": 16, "nestedobject": [], "network": [4, 5, 7, 15], "neural": [4, 5, 7, 15], "new": [2, 9], "newer": [], "next": [5, 14], "nois": 8, "noisi": [4, 5], "non": [4, 5, 6, 7, 8, 9], "none": [5, 6, 7, 8, 9, 16], "normal": [7, 8], "norwegian": 5, "note": [0, 2, 5, 7, 13, 15], "now": 2, "np": [7, 8, 9, 16], "num_output_channel": 8, "num_sampl": [5, 14], "num_work": 5, "number": [5, 8, 9, 16], "numpi": [6, 7, 9, 16], "o": 3, "obb": [], "obj_detect": 13, "object": [5, 9, 10, 16], "objectness_scor": [], "oblig": 1, "obtain": 16, "occupi": 15, "ocr": [4, 5, 7, 9, 13, 14], "ocr_carea": 16, "ocr_db_crnn": 9, "ocr_lin": 16, "ocr_pag": 16, "ocr_par": 16, "ocr_predictor": [7, 11, 13, 15, 16], "ocrdataset": [5, 14], "ocrmetr": 9, "ocrpredictor": [7, 11], "ocrx_word": 16, "offens": 1, "offici": [1, 7], "offlin": 1, "offset": 8, "onc": 16, "one": [2, 5, 7, 8, 11, 13, 16], "oneof": 8, "ones": [5, 8, 9], "onli": [2, 7, 8, 9, 13, 14, 15, 16], "onlin": 1, "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": 8, "opacity_rang": 8, "open": [1, 2, 13, 15], "oper": [], "opinion": 1, "opsset": [], "optic": [4, 16], "optim": 4, "option": [5, 7, 11], "order": [2, 5, 6, 8], "org": [1, 5, 7, 16], "organ": 6, "orient": [1, 6, 7, 16], "orientationpredictor": [], "other": [1, 2], "otherwis": [1, 6, 9], "our": [2, 7, 16], "out": [2, 7, 8, 9, 16], "outpout": 16, "output": [6, 8, 15], "output_s": [6, 8], "outsid": 12, "over": [3, 5, 9, 16], "overal": [1, 7], "overlai": 6, "overview": [], "overwrit": [], "overwritten": 13, "own": 4, "p": [8, 9, 16], "packag": [2, 4, 9, 12, 14], "pad": [5, 7, 8, 16], "page": [3, 5, 7, 9, 16], "page1": 6, "page2": 6, "page_1": 16, "page_idx": [6, 16], 
"page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 7, "par_1_1": 16, "paragraph": 16, "paragraph_break": 16, "param": [8, 16], "paramet": [4, 6, 7, 15], "pars": [4, 5], "parseq": [4, 7, 13, 16], "part": [5, 8, 16], "parti": 3, "partial": 16, "particip": 1, "pass": [5, 6, 7, 16], "password": 6, "patch": [7, 9], "path": [5, 6, 14], "path_to_checkpoint": 11, "path_to_custom_model": [], "path_to_pt": 11, "pattern": 1, "pdf": [6, 7, 10], "pdf_render": [], "pdfdocument": [], "pdfpage": 6, "peopl": 1, "per": [8, 16], "perform": [4, 6, 7, 8, 9, 12, 15, 16], "period": 1, "permiss": 1, "permut": [4, 7], "persian_lett": 5, "person": [1, 14], "phase": 16, "photo": 14, "physic": [1, 6], "pick": 8, "pictur": 6, "pip": [2, 3], "pipelin": 16, "pixbuf": 3, "pixel": [6, 8, 16], "platinum": [], "pleas": 2, "plot": 9, "plt": 9, "plug": 13, "plugin": 3, "png": 6, "point": 15, "polici": 12, "polish": 5, "polit": 1, "polygon": [5, 9, 16], "pool": 7, "portugues": 5, "posit": [1, 9], "possibl": [2, 9, 13, 16], "post": [1, 16], "postprocessor": 16, "potenti": 7, "power": 4, "ppageno": 16, "pr": [], "pre": [2, 7], "precis": [9, 16], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 14, "predict": [6, 7, 9, 16], "predictor": [4, 6, 7, 11, 13, 15], "prefer": 14, "preinstal": [], "preprocessor": [11, 16], "prerequisit": 13, "present": 10, "preserv": [7, 8, 16], "preserve_aspect_ratio": [6, 7, 8, 11, 16], "pretrain": [4, 7, 9, 11, 15, 16], "pretrained_backbon": [7, 11], "print": 16, "prior": 5, "privaci": 1, "privat": 1, "probabl": 8, "problem": 2, "procedur": 8, "process": [2, 4, 6, 11, 16], "processor": 16, "produc": [10, 16], "product": 15, "profession": 1, "project": [2, 14], "promptli": 1, "proper": 2, "properli": 5, "properti": [], "provid": [1, 2, 4, 13, 14, 16], "public": [1, 4], "publicli": 16, "publish": 1, "pull": 13, "punctuat": 5, "pure": 5, "purpos": 2, "push_to_hf_hub": [7, 13], "py": 13, "pydocstyl": [], "pypdfium2": 6, "pyplot": [6, 9], "python": 2, "python3": 13, "pytorch": [3, 4, 7, 8, 11, 13, 15, 16], "q": 2, "qr": 6, "qr_code": 14, "qualiti": 8, "quantiz": [], "quantize_model": [], "question": 1, "quickli": 4, "quicktour": 10, "r": 16, "race": 1, "ramdisk": 5, "rand": [7, 8, 9, 15, 16], "random": [7, 8, 9, 16], "randomappli": 8, "randombright": 8, "randomcontrast": 8, "randomcrop": 8, "randomgamma": 8, "randomhorizontalflip": 8, "randomhu": 8, "randomjpegqu": 8, "randomli": 8, "randomres": [], "randomrot": 8, "randomsatur": 8, "randomshadow": 8, "rang": 8, "rassi": 13, "ratio": [7, 8, 16], "raw": [6, 9], "re": 15, "read": [4, 5, 7], "read_html": 6, "read_img": [], "read_img_as_numpi": 6, "read_img_as_tensor": 6, "read_pdf": 6, "readi": 15, "real": [4, 7, 8], "reason": [1, 4, 5], "rebuild": 2, "rebuilt": 2, "recal": [9, 16], "receipt": [4, 5, 16], "reco_arch": [7, 11, 13, 15], "reco_b": [], "reco_model": [11, 13], "reco_param": 11, "reco_predictor": 11, "recogn": 16, "recognit": [5, 9, 11], "recognition_predictor": [7, 16], "recognition_task": [5, 14], "recognitiondataset": [5, 14], "recognitionpredictor": [7, 11], "rectangular": 7, "recurr": [], "red": 9, "reduc": [3, 8], "refer": [2, 3, 11, 13, 14, 16], "regardless": 1, "region": 16, "regroup": 9, "regular": 14, "reject": 1, "rel": [6, 8, 9, 16], "relat": 6, "releas": [0, 3], "relev": [], "religion": 1, "relu": [], "remov": 1, "render": [6, 16], "render_pdf_topil": [], "render_to": [], "reorder": [], "repo": 7, "repo_id": [7, 13], "report": 1, "repositori": [5, 7, 13], "repres": [1, 9, 15, 16], "represent": [4, 
7], "representative_dataset": [], "request": [1, 13], "requir": [3, 8], "research": 4, "residu": 7, "resiz": [8, 16], "resnet": 7, "resnet18": [7, 13], "resnet31": 7, "resnet34": 7, "resnet50": [7, 13], "resolv": 6, "resolve_block": 16, "resolve_lin": 16, "resourc": 14, "respect": 1, "respons": 9, "rest": [2, 8, 9], "restrict": 12, "result": [2, 5, 6, 10, 13, 16], "resum": [], "return": 16, "reusabl": 16, "review": 1, "rgb": [6, 8], "rgb_mode": 6, "rgb_output": 6, "right": [1, 7, 9], "road": [], "robust": [4, 5], "root": 5, "rotat": [5, 6, 7, 8, 9, 14, 16], "rotated_bbox": [], "run": [2, 3, 7], "same": [2, 6, 9, 14, 16], "sampl": [5, 14, 16], "sample_transform": 5, "sane": [], "sar": [4, 7], "sar_resnet31": [7, 16], "sar_vgg16_bn": [], "satur": 8, "save": [7, 14], "saved_model": [], "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "scheme": [], "score": 9, "scratch": [], "script": [2, 14], "seamless": 4, "seamlessli": [4, 16], "search": 7, "searchabl": 10, "sec": 16, "second": 16, "section": [11, 13, 15, 16], "secur": [1, 12], "see": [1, 2], "seemlessli": [], "seen": 16, "segment": [4, 7, 16], "self": 16, "semant": [4, 7], "send": 16, "sens": 9, "sensit": 14, "separ": 16, "sequenc": [4, 5, 6, 7, 9, 16], "sequenti": [8, 16], "seri": 1, "serial": [], "serialized_model": [], "seriou": 1, "set": [1, 5, 7, 9, 12, 16], "set_global_polici": 15, "sever": [6, 8, 16], "sex": 1, "sexual": 1, "sha256": [], "shade": 8, "shape": [6, 7, 8, 9, 16], "share": [12, 14], "shift": 8, "shm": 12, "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9, 11, 13], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 14, "simpl": [4, 7], "simpler": 7, "sinc": [5, 14], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 16], "skew": 16, "slack": 2, "slightli": 7, "small": [2, 7], "smallest": 6, "snapshot_download": 7, "snippet": 16, "so": [2, 3, 5, 7, 13, 14], "social": 1, "socio": 1, "some": [3, 10, 13, 14], "someth": 2, "somewher": 2, "soon": 15, "sort": 1, "sourc": [5, 6, 7, 8, 9, 13], "space": [1, 16], "span": 16, "spanish": 5, "spatial": [4, 5, 6, 9], "special": [], "specif": [2, 3, 9, 11, 14, 16], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": 2, "sroie": [4, 5, 14], "stabl": 3, "stackoverflow": 2, "stage": 4, "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": [8, 11], "step": 12, "still": 16, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 14, 16], "straighten": [], "straighten_pag": 7, "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": [], "strictli": 9, "string": [5, 6, 9, 16], "strive": 3, "strong": [4, 7], "structur": [15, 16], "style": [], "subset": [5, 16], "suggest": [2, 13], "sum": 9, "summari": 9, "support": [15, 16], "supported_op": [], "supported_typ": [], "sustain": 1, "svhn": [4, 5, 14], "svt": [5, 14], "swedish": 5, "symbol": [], "symmetr": [7, 8, 16], "symmetric_pad": [7, 8, 16], "synthes": 9, "synthesize_pag": 9, "synthet": 4, "synthtext": [4, 5, 14], "system": 16, "t": [2, 5, 11, 16], "tabl": 13, "take": [1, 5, 16], "target": [5, 6, 8, 9, 14], "target_s": 5, "target_spec": [], "task": [4, 5, 7, 13, 14, 16], "task2": 5, "tax": [], "team": [], "techminde": [], "templat": [2, 4], "tensor": [5, 6, 8, 16], "tensorflow": [3, 4, 6, 7, 8, 11, 13, 15, 16], "tensorspec": 15, "term": 1, "test": [5, 14], "test_set": 5, "text": [5, 6, 7, 9, 14], "text_output": 16, "textmatch": 9, "textnet": 7, "textnet_bas": 7, "textnet_smal": 7, "textnet_tini": 7, "textract": [4, 16], "textstylebrush": [4, 5], "textual": 
[4, 5, 6, 7, 16], "tf": [3, 6, 7, 8, 13, 15], "tf_model": [], "tflite": [], "tflite_builtins_int8": [], "tfliteconvert": [], "than": [2, 3, 9, 13], "thank": 2, "thei": [1, 9], "them": [3, 5, 16], "thi": [1, 2, 3, 5, 9, 11, 12, 13, 14, 15, 16], "thing": [15, 16], "third": 3, "those": [1, 3, 6, 16], "threaten": 1, "threshold": 16, "through": [1, 8, 14], "tilman": 13, "time": [1, 4, 7, 9, 14], "tini": 7, "titl": [6, 16], "tm": 16, "tmp": 12, "togeth": [2, 6], "tograi": 8, "tool": 14, "top": [9, 16], "topic": 2, "torch": [3, 8, 11, 13, 15], "torchvis": 8, "total": 11, "toward": [1, 3], "train": [2, 5, 7, 8, 13, 14, 15, 16], "train_it": [5, 14], "train_load": [5, 14], "train_pytorch": 13, "train_set": [5, 14], "train_tensorflow": 13, "trainabl": [4, 7], "tranform": 8, "transcrib": 16, "transfer": [4, 5], "transfo": 8, "transform": [4, 5, 7], "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16], "truth": 9, "tune": 15, "tupl": [5, 6, 8, 9], "turn": [], "two": [6, 12], "txt": 5, "type": [6, 9, 13, 15, 16], "typic": 16, "u": [1, 2], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 16], "ukrainian": [], "unaccept": 1, "underli": [14, 16], "underneath": 6, "understand": [4, 5, 16], "unfortun": [], "unidecod": 9, "uniform": [7, 8], "uniformli": 8, "uninterrupt": [6, 16], "union": 9, "unittest": 2, "unlock": 6, "unoffici": 7, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 16], "updat": 9, "upgrad": 2, "upper": [5, 8], "uppercas": 14, "url": 6, "us": [1, 2, 3, 5, 7, 9, 11, 12, 13, 16], "usabl": 16, "usag": [12, 15], "use_broadcast": 9, "use_polygon": [5, 9, 14], "useabl": 16, "user": [3, 4, 6, 10], "utf": 16, "util": 15, "v0": [], "v1": 13, "v3": [7, 13, 16], "valid": 14, "valu": [2, 6, 8, 16], "valuabl": 4, "variabl": 12, "varieti": 5, "variou": [], "veri": 7, "verifi": [], "version": [1, 2, 3, 15, 16], "vgg": 7, "vgg16": 13, "vgg16_bn": [], "vgg16_bn_r": 7, "via": 1, "vietnames": 5, "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 5, 7], "visiondataset": 5, "visiontransform": 7, "visual": 4, "visualize_pag": 9, "vit_": 7, "vit_b": 7, "vitstr": [4, 7, 15], "vitstr_bas": [7, 16], "vitstr_smal": [7, 11, 15, 16], "viz": [], "vocab": [11, 13, 14, 16], "vocabulari": [5, 11, 13], "w": [6, 7, 8, 9], "w3": 16, "wa": 1, "wai": [1, 4, 14], "want": [2, 15, 16], "warm": [], "warmup": 16, "wasn": 2, "we": [1, 2, 3, 4, 6, 8, 13, 14, 15, 16], "weasyprint": 6, "web": [2, 6], "websit": 5, "weight": 11, "welcom": 1, "well": [1, 15], "were": [1, 6, 16], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9], "whether": [2, 5, 6, 8, 9, 14, 16], "which": [1, 7, 12, 14, 16], "whichev": 3, "while": [8, 16], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [4, 5, 14], "window": [3, 7, 9], "wish": 2, "within": 1, "without": [1, 5, 7], "wonder": 2, "word": [4, 5, 7, 9, 16], "word_1_1": 16, "word_1_2": 16, "word_1_3": 16, "wordgener": [5, 14], "words_onli": 9, "work": [12, 16], "worker": 5, "workflow": 2, "worklow": 2, "world": [9, 16], "worth": 7, "wrap": 16, "wrapper": [5, 8], "write": 12, "written": [1, 6], "www": [1, 6, 16], "x": [6, 8, 9], "x12larg": [], "x_ascend": 16, "x_descend": 16, "x_i": 9, "x_size": 16, "x_wconf": 16, "xeon": [], "xhtml": 16, "xmax": 6, "xmin": 6, "xml": 16, "xml_bytes_str": 16, "xml_element": 16, "xml_output": 16, "xmln": 16, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "yield": [], "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12, 13, 14, 15, 16], "your": [2, 4, 6, 9, 16], "yoursit": 6, "zero": [8, 9], "zoo": 
[], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 5, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 5, "\u00e4\u00f6\u00e4\u00f6": 5, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 5, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 5, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 5, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": [], "2024": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": 0, "8": [], "9": [], "advanc": 16, "annot": [], "approach": 16, "architectur": 16, "arg": [5, 6, 7, 8, 9], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [14, 16], "aw": 12, "backbon": [], "ban": 1, "block": 6, "bug": 2, "build": [], "changelog": 0, "choos": [14, 16], "classif": [7, 13], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 13, "compos": 8, "compress": [], "conda": [], "conduct": 1, "connect": 2, "content": [], "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 13, "correct": 1, "coven": 1, "custom": [5, 11], "data": 14, "dataload": 5, "dataset": [4, 5, 14], "detect": [4, 7, 13, 14, 16], "develop": 2, "do": 16, "docstr": [], "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 15], "document": [2, 4, 6], "end": 16, "enforc": 1, "evalu": 9, "export": 15, "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "format": [], "from": 13, "gener": [5, 14], "get": [], "git": 3, 
"guidelin": 1, "half": 15, "hub": 13, "huggingfac": 13, "i": 16, "implement": [], "import": [], "infer": 15, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 12, "let": 2, "line": 6, "lint": [], "linux": [], "lite": [], "load": [11, 13, 14], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 11, 13, 15, 16], "modifi": 2, "modul": [], "name": 13, "note": [], "notebook": 10, "object": 14, "ocr": 16, "onli": [], "onnx": 15, "optim": 15, "option": 16, "order": [], "orient": [], "our": 1, "output": 16, "own": [11, 14], "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "post": [], "pre": [], "precis": 15, "predictor": 16, "prepar": 15, "prerequisit": 3, "pretrain": 13, "privat": [], "process": [], "public": [], "push": 13, "python": 3, "qualiti": 2, "quantiz": [], "question": 2, "read": 6, "readi": 14, "recognit": [4, 7, 13, 14, 16], "refer": [], "report": 2, "request": 2, "respons": 1, "return": [5, 6, 7, 9], "right": 16, "savedmodel": [], "scope": 1, "share": 13, "should": 16, "stage": 16, "standard": 1, "start": [], "structur": [2, 6], "style": 2, "support": [4, 5, 8], "synthet": [5, 14], "task": 9, "temporari": 1, "tensorflow": [], "test": 2, "text": [4, 16], "train": 11, "transform": 8, "two": 16, "type": [], "unit": 2, "us": [14, 15], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 16, "word": 6, "your": [11, 13, 14, 15], "zoo": [4, 7]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[12, null]], "Advanced options": [[16, "advanced-options"]], "Args:": [[5, "args"], [5, "id4"], [5, "id7"], [5, "id10"], [5, "id13"], [5, "id16"], [5, "id19"], [5, "id22"], [5, "id25"], [5, "id29"], [5, "id32"], [5, "id37"], [5, "id40"], [5, "id46"], [5, "id49"], [5, "id50"], [5, "id51"], [5, "id54"], [5, "id57"], [5, "id60"], [5, "id61"], [6, "args"], [6, "id2"], [6, "id3"], [6, "id4"], [6, "id5"], [6, "id6"], [6, "id7"], [6, "id10"], [6, "id12"], [6, "id14"], [6, "id16"], [6, "id20"], [6, "id24"], [6, "id28"], [7, "args"], [7, "id3"], [7, "id8"], [7, "id13"], [7, "id17"], [7, "id21"], [7, "id26"], [7, "id31"], [7, "id36"], [7, "id41"], [7, "id45"], [7, "id49"], [7, "id54"], [7, "id58"], [7, "id63"], [7, "id68"], [7, "id72"], [7, "id76"], [7, "id81"], [7, "id86"], [7, "id90"], [7, "id95"], [7, "id99"], [7, "id103"], [7, "id108"], [7, "id113"], [7, "id118"], [7, "id122"], [7, "id126"], [7, "id131"], [7, "id135"], [7, "id139"], [7, "id143"], [7, "id145"], [7, "id147"], [7, "id149"], [8, "args"], [8, "id1"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id8"], [8, "id9"], [8, "id10"], [8, "id11"], [8, "id12"], [8, "id13"], [8, "id14"], [8, "id15"], [8, "id16"], [8, "id17"], [8, "id18"], [9, "args"], [9, "id3"], [9, "id5"], [9, "id6"], [9, "id7"], [9, "id8"], [9, "id9"], [9, "id10"], [9, "id11"]], "Artefact": [[6, "artefact"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[14, "available-datasets"]], "Available architectures": [[16, "available-architectures"], [16, "id1"], [16, "id2"]], "Block": [[6, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[14, null]], "Choosing the right model": [[16, null]], "Classification": [[13, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, 
"commits"]], "Composing transformations": [[8, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[5, "custom-dataset-loader"]], "Data Loading": [[14, "data-loading"]], "Dataloader": [[5, "dataloader"]], "Detection": [[13, "detection"], [14, "detection"]], "Detection predictors": [[16, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[6, "document"]], "Document structure": [[6, "document-structure"]], "End-to-End OCR": [[16, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[15, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[6, "file-reading"]], "Half-precision": [[15, "half-precision"]], "Installation": [[3, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[6, "line"]], "Loading from Huggingface Hub": [[13, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[11, "loading-your-custom-trained-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[15, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[13, "naming-conventions"]], "Object Detection": [[14, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[6, "page"]], "Preparing your model for inference": [[15, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[13, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[13, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[13, "recognition"], [14, "recognition"]], "Recognition predictors": [[16, "recognition-predictors"]], "Returns:": [[5, "returns"], [6, "returns"], [6, "id11"], [6, "id13"], [6, "id15"], [6, "id19"], [6, "id23"], [6, "id27"], [6, "id31"], [7, "returns"], [7, "id6"], [7, "id11"], [7, "id16"], [7, "id20"], [7, "id24"], [7, "id29"], [7, "id34"], [7, "id39"], [7, "id44"], [7, "id48"], [7, "id52"], [7, "id57"], [7, "id61"], [7, "id66"], [7, "id71"], [7, "id75"], [7, "id79"], [7, "id84"], [7, "id89"], [7, "id93"], [7, "id98"], [7, "id102"], [7, "id106"], [7, "id111"], [7, "id116"], [7, "id121"], [7, "id125"], [7, "id129"], [7, "id134"], [7, "id138"], [7, "id142"], [7, "id144"], [7, "id146"], [7, "id148"], [9, "returns"], [9, "id4"]], "Scope": [[1, "scope"]], "Share your model with the community": [[13, null]], "Supported Vocabs": [[5, "supported-vocabs"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[8, "supported-transformations"]], "Synthetic dataset generator": [[5, "synthetic-dataset-generator"], [14, "synthetic-dataset-generator"]], "Task evaluation": [[9, "task-evaluation"]], "Text Detection": [[16, "text-detection"]], "Text Recognition": [[16, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[11, null]], "Two-stage approaches": [[16, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[14, 
"use-your-own-datasets"]], "Using your ONNX exported model in docTR": [[15, "using-your-onnx-exported-model-in-doctr"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[16, "what-should-i-do-with-the-output"]], "Word": [[6, "word"]], "docTR Notebooks": [[10, null]], "docTR Vocabs": [[5, "id62"]], "docTR: Document Text Recognition": [[4, null]], "doctr.datasets": [[5, null], [5, "datasets"]], "doctr.io": [[6, null]], "doctr.models": [[7, null]], "doctr.models.classification": [[7, "doctr-models-classification"]], "doctr.models.detection": [[7, "doctr-models-detection"]], "doctr.models.factory": [[7, "doctr-models-factory"]], "doctr.models.recognition": [[7, "doctr-models-recognition"]], "doctr.models.zoo": [[7, "doctr-models-zoo"]], "doctr.transforms": [[8, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2024-09-09)": [[0, "v0-7-0-2024-09-09"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[6, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[6, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[8, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[5, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[8, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[8, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[5, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module 
doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[7, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[5, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[7, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[5, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[5, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[6, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[6, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[5, "doctr.datasets.encode_sequences", false]], "from_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[5, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[8, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[8, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[5, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[5, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[5, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[5, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[5, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[7, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[8, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[6, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[5, "doctr.datasets.MJSynth", false]], 
"mobilenet_v3_large() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_orientation() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[8, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[7, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[5, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[8, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[6, "doctr.io.Page", false]], "parseq() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[8, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[8, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[8, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[8, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[8, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[8, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[8, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[8, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[8, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[8, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[8, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[6, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[6, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[6, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[5, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[8, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.sar_resnet31", 
false]], "show() (doctr.io.document method)": [[6, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[6, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[5, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[5, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[5, "doctr.datasets.SVT", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "synthtext (class in doctr.datasets)": [[5, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[8, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[5, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[6, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[5, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[5, 0, 1, "", "CORD"], [5, 0, 1, "", "CharacterGenerator"], [5, 0, 1, "", "DetectionDataset"], [5, 0, 1, "", "DocArtefacts"], [5, 0, 1, "", "FUNSD"], [5, 0, 1, "", "IC03"], [5, 0, 1, "", "IC13"], [5, 0, 1, "", "IIIT5K"], [5, 0, 1, "", "IIITHWS"], [5, 0, 1, "", "IMGUR5K"], [5, 0, 1, "", "MJSynth"], [5, 0, 1, "", "OCRDataset"], [5, 0, 1, "", "RecognitionDataset"], [5, 0, 1, "", "SROIE"], [5, 0, 1, "", "SVHN"], [5, 0, 1, "", "SVT"], [5, 0, 1, "", "SynthText"], [5, 0, 1, "", "WILDRECEIPT"], [5, 0, 1, "", "WordGenerator"], [5, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[5, 0, 1, "", "DataLoader"]], "doctr.io": [[6, 
0, 1, "", "Artefact"], [6, 0, 1, "", "Block"], [6, 0, 1, "", "Document"], [6, 0, 1, "", "DocumentFile"], [6, 0, 1, "", "Line"], [6, 0, 1, "", "Page"], [6, 0, 1, "", "Word"], [6, 1, 1, "", "decode_img_as_tensor"], [6, 1, 1, "", "read_html"], [6, 1, 1, "", "read_img_as_numpy"], [6, 1, 1, "", "read_img_as_tensor"], [6, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[6, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[6, 2, 1, "", "from_images"], [6, 2, 1, "", "from_pdf"], [6, 2, 1, "", "from_url"]], "doctr.io.Page": [[6, 2, 1, "", "show"]], "doctr.models": [[7, 1, 1, "", "kie_predictor"], [7, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[7, 1, 1, "", "crop_orientation_predictor"], [7, 1, 1, "", "magc_resnet31"], [7, 1, 1, "", "mobilenet_v3_large"], [7, 1, 1, "", "mobilenet_v3_large_r"], [7, 1, 1, "", "mobilenet_v3_small"], [7, 1, 1, "", "mobilenet_v3_small_orientation"], [7, 1, 1, "", "mobilenet_v3_small_r"], [7, 1, 1, "", "resnet18"], [7, 1, 1, "", "resnet31"], [7, 1, 1, "", "resnet34"], [7, 1, 1, "", "resnet50"], [7, 1, 1, "", "textnet_base"], [7, 1, 1, "", "textnet_small"], [7, 1, 1, "", "textnet_tiny"], [7, 1, 1, "", "vgg16_bn_r"], [7, 1, 1, "", "vit_b"], [7, 1, 1, "", "vit_s"]], "doctr.models.detection": [[7, 1, 1, "", "db_mobilenet_v3_large"], [7, 1, 1, "", "db_resnet50"], [7, 1, 1, "", "detection_predictor"], [7, 1, 1, "", "linknet_resnet18"], [7, 1, 1, "", "linknet_resnet34"], [7, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[7, 1, 1, "", "from_hub"], [7, 1, 1, "", "login_to_hub"], [7, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[7, 1, 1, "", "crnn_mobilenet_v3_large"], [7, 1, 1, "", "crnn_mobilenet_v3_small"], [7, 1, 1, "", "crnn_vgg16_bn"], [7, 1, 1, "", "master"], [7, 1, 1, "", "parseq"], [7, 1, 1, "", "recognition_predictor"], [7, 1, 1, "", "sar_resnet31"], [7, 1, 1, "", "vitstr_base"], [7, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[8, 0, 1, "", "ChannelShuffle"], [8, 0, 1, "", "ColorInversion"], [8, 0, 1, "", "Compose"], [8, 0, 1, "", "GaussianBlur"], [8, 0, 1, "", "GaussianNoise"], [8, 0, 1, "", "LambdaTransformation"], [8, 0, 1, "", "Normalize"], [8, 0, 1, "", "OneOf"], [8, 0, 1, "", "RandomApply"], [8, 0, 1, "", "RandomBrightness"], [8, 0, 1, "", "RandomContrast"], [8, 0, 1, "", "RandomCrop"], [8, 0, 1, "", "RandomGamma"], [8, 0, 1, "", "RandomHorizontalFlip"], [8, 0, 1, "", "RandomHue"], [8, 0, 1, "", "RandomJpegQuality"], [8, 0, 1, "", "RandomRotate"], [8, 0, 1, "", "RandomSaturation"], [8, 0, 1, "", "RandomShadow"], [8, 0, 1, "", "Resize"], [8, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "DetectionMetric"], [9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 6, 7, 9, 13], "0": [1, 3, 5, 8, 9, 11, 14, 16], "00": 16, "01": 16, "0123456789": 5, 
"0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "02": [], "02562": 7, "03": 16, "035": 16, "0361328125": 16, "04": [], "05": 16, "06": 16, "06640625": 16, "07": 16, "08": [8, 16], "09": 16, "0966796875": 16, "1": [3, 5, 6, 7, 8, 9, 11, 14, 16], "10": [5, 9, 16], "100": [5, 8, 9, 14, 16], "1000": 16, "101": 5, "1024": [7, 9, 11, 16], "104": 5, "106": 5, "108": 5, "1095": 14, "11": 16, "110": 9, "1107": 14, "114": 5, "115": [], "1156": 14, "116": 5, "118": 5, "11800h": 16, "11th": 16, "12": [3, 16], "120": 5, "123": 5, "126": 5, "1268": 14, "128": [7, 11, 15, 16], "13": [9, 16], "130": 5, "13068": 14, "131": 5, "1337891": 14, "1357421875": 16, "1396484375": 16, "14": 16, "1420": 16, "14470v1": 5, "149": 14, "15": 16, "150": [9, 16], "154": [], "1552": 16, "16": [7, 15, 16], "160": [], "1630859375": 16, "1684": 16, "16x16": 7, "17": 16, "1778": 16, "1782": 16, "18": 7, "185546875": 16, "19": [], "1900": 16, "1910": 7, "19342": 14, "19370": 14, "195": 5, "19598": 14, "199": 16, "1999": 16, "1m": [], "2": [3, 4, 5, 6, 8, 16], "20": 16, "200": 9, "2000": 14, "2003": [4, 5], "2012": 5, "2013": [4, 5], "2015": 5, "2019": 4, "2021": [], "207901": 14, "21": 16, "2103": 5, "2186": 14, "21888": 14, "22": 16, "224": [7, 8], "225": 8, "22672": 14, "229": [8, 14], "23": 16, "233": 14, "234": 5, "236": [], "24": 16, "246": 14, "249": 14, "25": 16, "2504": 16, "255": [6, 7, 8, 9, 16], "256": 7, "257": 14, "26": 16, "26032": 14, "264": 11, "27": 16, "2700": 14, "2710": 16, "2749": 11, "28": 16, "287": 11, "29": 16, "296": 11, "299": 11, "2d": 16, "3": [3, 4, 6, 7, 8, 9, 15, 16], "30": 16, "300": 14, "3000": 14, "301": 11, "30595": 16, "30ghz": 16, "31": 7, "32": [5, 7, 8, 11, 14, 15, 16], "3232421875": 16, "33": [8, 16], "33402": 14, "33608": 14, "34": [7, 16], "340": 16, "3456": 16, "35": [], "3515625": 16, "36": [], "360": 14, "37": [5, 16], "38": 16, "39": 16, "4": [7, 8, 9, 16], "40": 16, "406": 8, "41": 16, "42": 16, "43": 16, "44": 16, "45": 16, "456": 8, "46": 16, "47": 16, "472": 14, "48": [5, 16], "485": 8, "49": 16, "49377": 14, "5": [5, 8, 9, 16], "50": [7, 14, 16], "51": 16, "51171875": 16, "512": 7, "52": [5, 16], "529": 16, "53": 16, "533": [], "54": 16, "540": 16, "5478515625": 16, "55": 16, "56": 16, "57": 16, "58": 16, "580": 16, "5810546875": 16, "583": 16, "59": 16, "595": [], "597": 16, "5k": [4, 5], "5m": [], "6": [8, 16], "60": 8, "600": [7, 9, 16], "61": 16, "611": [], "62": 16, "625": [], "626": 14, "629": [], "63": 16, "630": [], "64": [7, 8, 16], "640": [], "641": 16, "647": 14, "65": 16, "66": 16, "660": [], "664": [], "666": [], "67": 16, "672": [], "68": 16, "689": [], "69": 16, "693": 11, "694": 11, "695": 11, "6m": [], "7": 16, "70": [9, 16], "700": [], "701": [], "702": [], "707470": 14, "71": 16, "7100000": 14, "713": [], "7141797": 14, "7149": 14, "72": 16, "72dpi": 6, "73": 16, "73257": 14, "733": [], "74": 16, "745": [], "75": [8, 16], "753": [], "7581382": 14, "76": 16, "77": 16, "772": 11, "772875": 14, "78": 16, "780": [], "781": [], "783": [], "785": 11, "789": [], "79": 16, "793533": 14, "796": 14, "798": 11, "7m": [], "8": [3, 7, 8, 16], "80": 16, "800": [7, 9, 14, 16], "81": 16, "817": [], "82": 16, "8275l": [], "83": 16, "830": [], "84": 16, "849": 14, "85": 16, "8564453125": 16, "857": 16, "85875": 14, "86": 16, "860": [], "8603515625": 16, "862": [], "863": [], "87": 16, "8707": 14, "875": [], "88": 16, "89": 16, "8m": [], "9": 16, "90": 16, "90k": 5, "90kdict32px": 
5, "91": 16, "913": [], "914085328578949": 16, "917": [], "92": 16, "921": [], "93": 16, "94": [5, 16], "95": [9, 16], "9578408598899841": 16, "96": 16, "97": [], "98": 16, "99": 16, "9949972033500671": 16, "A": [1, 2, 4, 5, 6, 7, 10, 15], "And": [], "As": 2, "Be": 16, "Being": 1, "By": 12, "For": [1, 2, 3, 11, 16], "If": [2, 3, 6, 7, 11, 16], "In": [2, 5, 14], "It": [8, 13, 15], "Its": [4, 7], "No": [1, 16], "Of": 5, "Or": [], "The": [1, 2, 5, 6, 9, 12, 16], "Then": 7, "To": [2, 3, 12, 13, 16], "_": [1, 5, 7], "__call__": 16, "_build": 2, "_i": 9, "ab": 5, "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "abdef": [5, 14], "abl": [14, 16], "about": [1, 14, 16], "abov": 16, "abstract": [], "abstractdataset": 5, "abus": 1, "accent": [], "accept": 1, "access": [4, 6, 14, 16], "account": [1, 13], "accur": 16, "accuraci": 9, "achiev": 15, "act": 1, "action": 1, "activ": 4, "ad": [2, 7, 8], "adapt": 1, "add": [8, 9, 13, 16], "add_hook": 16, "add_label": 9, "addit": [2, 3, 6], "addition": [2, 16], "address": [1, 6], "adjust": 8, "advanc": 1, "advantag": 15, "advis": 2, "aesthet": [4, 5], "affect": 1, "after": [13, 16], "ag": 1, "again": 7, "aggreg": [9, 14], "aggress": 1, "align": [1, 6], "all": [1, 2, 5, 6, 8, 9, 14, 16], "allow": 1, "along": 16, "alreadi": 2, "also": [1, 7, 13, 14, 16], "alwai": 14, "an": [1, 2, 4, 5, 6, 7, 9, 15, 16], "analysi": 6, "ancient_greek": 5, "angl": [6, 8], "ani": [1, 5, 6, 7, 8, 9, 16], "annot": 5, "anot": 14, "anoth": [3, 7, 11, 14], "answer": 1, "anyascii": [], "anyon": 4, "anyth": [], "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 5, 8], "applic": [4, 7], "appoint": 1, "appreci": 13, "appropri": [1, 2, 16], "ar": [1, 2, 3, 5, 6, 8, 9, 10, 14, 16], "arab": 5, "arabic_diacrit": 5, "arabic_lett": 5, "arabic_punctu": 5, "arbitrarili": 7, "arch": [7, 13], "architectur": [4, 7, 13], "archiv": [], "area": 16, "arg": [], "argument": [5, 6, 7, 9, 16], "around": 1, "arrai": [6, 8, 9], "art": 4, "artefact": [9, 10, 16], "artefact_typ": 6, "artifici": [4, 5], "arxiv": [5, 7], "as_imag": [], "asarrai": 9, "ascii_lett": 5, "aspect": [4, 7, 8, 16], "assess": 9, "assign": 9, "associ": 6, "assum": 7, "assume_straight_pag": [7, 16], "astyp": [7, 9, 16], "attack": 1, "attend": [4, 7], "attent": [1, 7], "autoclass": [], "autom": 4, "automat": 16, "autoregress": [4, 7], "avail": [1, 4, 8], "averag": [8, 16], "avoid": [1, 3], "aw": [4, 16], "awar": 16, "azur": 16, "b": [7, 9, 16], "b_j": 9, "back": 2, "backbon": 7, "backend": 16, "background": 14, "bangla": [], "bar": [], "bar_cod": 14, "base": [4, 7], "baselin": [4, 7, 16], "batch": [5, 7, 8, 14, 16], "batch_siz": [5, 11, 14, 15], "bblanchon": [], "bbox": 16, "becaus": 12, "been": [2, 9, 14, 16], "befor": [5, 7, 8, 16], "begin": 9, "behavior": [1, 16], "being": [9, 16], "belong": 16, "benchmark": 16, "best": 1, "beta": [], "better": [10, 16], "between": [8, 9, 16], "bgr": 6, "bilinear": 8, "bin_thresh": 16, "binar": [4, 7, 16], "binari": [6, 15, 16], "bit": 15, "blank": 9, "block": [9, 16], "block_1_1": 16, "blue": 9, "blur": 8, "bmvc": 5, "bn": 13, "bodi": [1, 16], "bool": [5, 6, 7, 8, 9], "boolean": [7, 16], "both": [4, 5, 8, 14, 16], "bottom": [7, 16], "bound": [5, 6, 7, 8, 9, 16], "box": [5, 6, 7, 8, 9, 14, 16], "box_thresh": 16, "brew": 3, "bright": 8, "broadcast": 9, "browser": [2, 4], "build": [2, 3], "built": 2, "byte": [6, 16], "c": [6, 9], "c5": [], "c_j": 9, "cach": [2, 5, 12], "cache_sampl": 5, "cairo": 3, "call": [], "callabl": [5, 8], "can": [2, 3, 11, 12, 13, 14, 16], 
"capabl": [2, 10, 16], "case": [5, 9], "cf": 16, "cfg": 16, "challeng": 5, "challenge2_test_task12_imag": 5, "challenge2_test_task1_gt": 5, "challenge2_training_task12_imag": 5, "challenge2_training_task1_gt": 5, "chang": 12, "changelog": [], "channel": [1, 2, 6, 8], "channel_prior": [], "channelshuffl": 8, "charact": [4, 5, 6, 9, 14, 16], "charactergener": [5, 14], "characterist": 1, "charg": 16, "charset": 16, "chart": 6, "check": [2, 13, 16], "checkpoint": 7, "chip": 3, "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 5, 6, 8, 9, 16], "class_nam": 11, "classif": 14, "classif_mobilenet_v3_smal": 7, "classmethod": 6, "clear": 2, "clone": 3, "close": 2, "co": 13, "code": [4, 6], "codecov": 2, "colab": 10, "collate_fn": 5, "collect": 6, "color": [8, 9], "colorinvers": 8, "column": 6, "com": [1, 3, 6, 7, 13], "combin": 16, "come": 15, "command": 2, "comment": 1, "commit": 1, "common": [1, 8, 9, 15], "commun": 1, "compar": 4, "comparison": [9, 16], "competit": 5, "compil": [10, 16], "complaint": 1, "complementari": 9, "complet": 2, "compon": 16, "compos": [5, 16], "comprehens": 16, "comput": [5, 9, 15, 16], "conf_threshold": [], "confid": [6, 9, 16], "config": 7, "configur": 7, "confus": 9, "consecut": [8, 16], "consequ": 1, "consid": [1, 2, 5, 6, 9, 16], "consist": 16, "consolid": [4, 5], "constant": 8, "construct": 1, "consum": 9, "contact": 1, "contain": [5, 14], "content": [5, 6, 9, 16], "context": 7, "contib": [], "continu": 1, "contrast": 8, "contrast_factor": 8, "contrib": [], "contribut": 1, "contributor": 2, "conv_sequ": [], "convers": 6, "convert": [6, 8], "convert_page_to_numpi": [], "convert_to_fp16": [], "convert_to_tflit": [], "convolut": 7, "coordin": [6, 16], "cord": [4, 5, 14, 16], "core": [9, 16], "corner": 16, "correct": 8, "correspond": [3, 6, 16], "could": 1, "counterpart": 9, "cover": 2, "coverag": 2, "cpu": [4, 11], "creat": 13, "crnn": [4, 7, 13], "crnn_mobilenet_v3_larg": [7, 13, 16], "crnn_mobilenet_v3_smal": [7, 15, 16], "crnn_resnet31": [], "crnn_vgg16_bn": [7, 11, 13, 16], "crop": [7, 8, 14, 16], "crop_orient": [], "crop_orientation_predictor": 7, "crop_param": [], "croporientationpredictor": 7, "cuda": 15, "currenc": 5, "current": [2, 16], "custom": [13, 16], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": 16, "cvit": 4, "czczup": 7, "czech": 5, "d": [5, 14], "daili": [], "danish": 5, "data": [4, 5, 6, 8, 9, 11, 13], "dataload": 14, "dataset": [7, 11, 16], "dataset_info": 5, "date": [11, 16], "db": 13, "db_crnn_resnet": [], "db_crnn_vgg": [], "db_mobilenet_v3_larg": [7, 13, 16], "db_resnet34": 16, "db_resnet50": [7, 11, 13, 16], "db_resnet50_rot": [], "db_sar_resnet": [], "db_sar_vgg": [], "dbnet": [4, 7], "deal": [], "decis": 1, "decod": 6, "decode_img_as_tensor": 6, "dedic": [], "deem": 1, "deep": [7, 16], "def": 16, "default": [6, 9, 11, 12, 16], "defer": 14, "defin": [9, 15], "deform": [], "degre": 8, "degress": 6, "delet": 2, "delimit": 16, "delta": 8, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": [7, 9], "descript": 10, "design": 8, "desir": 6, "det_arch": [7, 11, 13, 15], "det_b": [], "det_model": [11, 13], "det_param": 11, "det_predictor": [11, 16], "detail": [11, 16], "detect": [5, 9, 10, 11], "detect_languag": 7, "detect_orient": 7, "detection_predictor": [7, 16], "detection_task": [], "detectiondataset": [5, 14], "detectionmetr": 9, "detectionpredictor": [7, 11], "detector": 7, "deterior": 7, "determin": 1, "dev": [2, 12], "develop": 3, "developp": [], 
"deviat": 8, "devic": 15, "dict": [6, 9, 16], "dictionari": [6, 9], "differ": 1, "differenti": [4, 7], "digit": [4, 5, 14], "dimens": [6, 9, 16], "dimension": 8, "direct": 5, "directli": [13, 16], "directori": [2, 12], "disabl": [1, 12, 16], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 16, "discuss": 2, "disk": [], "disparag": 1, "displai": [6, 9], "display_artefact": 9, "distanc": [], "distribut": 8, "div": 16, "divers": 1, "divid": 6, "do": [2, 3, 7], "doc": [2, 6, 15, 16], "docartefact": [5, 14], "docstr": 2, "doctr": [3, 11, 12, 13, 14, 16], "doctr_cache_dir": 12, "doctr_multiprocessing_dis": 12, "document": [5, 7, 9, 10, 14, 16], "documentbuild": 16, "documentfil": [6, 13], "doesn": [], "don": [11, 16], "done": 8, "download": [5, 14], "downsiz": 7, "draw": [8, 9], "draw_proba": 9, "drop": 5, "drop_last": 5, "dtype": [6, 7, 8, 9, 15], "dual": [4, 5], "dummi": 13, "dummy_img": 16, "dummy_input": 15, "dure": 1, "dutch": 5, "dynam": 5, "dynamic_seq_length": 5, "e": [1, 2, 3, 6, 7], "each": [4, 5, 6, 7, 8, 9, 14, 16], "eas": 2, "easi": [4, 9, 13], "easier": [], "easili": [6, 9, 11, 13, 14, 16], "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 5, 7], "either": [9, 16], "element": [5, 6, 7, 9, 16], "els": 2, "email": 1, "empathi": 1, "en": 16, "enabl": [5, 6], "enclos": 6, "encod": [4, 5, 6, 7, 16], "encode_sequ": 5, "encount": 2, "encrypt": 6, "end": [4, 5, 7, 9], "english": [5, 14], "enough": [2, 16], "ensur": 2, "entir": [], "entri": 5, "environ": [1, 12], "eo": 5, "equiv": 16, "error": [], "estim": 7, "etc": 6, "ethnic": 1, "evalu": [14, 16], "event": 1, "everyon": 1, "everyth": [2, 16], "exact": [9, 16], "exactmatch": [], "exampl": [1, 2, 4, 5, 7, 13, 16], "exchang": 15, "exclud": [], "execut": 16, "exist": 13, "expand": 8, "expect": [6, 8, 9], "experi": 1, "explan": [1, 16], "explicit": 1, "exploit": [4, 7], "export": [6, 7, 9, 10, 16], "export_as_straight_box": [7, 16], "export_as_xml": 16, "export_model_to_onnx": 15, "express": [1, 8], "extens": 6, "extern": [1, 14], "extra": 3, "extract": [4, 5], "extract_arch": [], "extractor": 7, "f_": 9, "f_a": 9, "factor": 8, "fair": 1, "fairli": 1, "fals": [5, 6, 7, 8, 9, 11, 16], "famili": 9, "faq": 1, "fascan": 13, "fast": [4, 5, 7], "fast_bas": [], "fast_smal": [], "fast_tini": [], "faster": [7, 15], "fasterrcnn_mobilenet_v3_large_fpn": 7, "favorit": 16, "featur": [3, 7, 9, 10], "feed": [], "feedback": 1, "feel": [2, 13], "felix92": 13, "few": [3, 15, 16], "figsiz": 9, "figur": 9, "file": [2, 5], "file_hash": [], "file_nam": [], "final": 7, "find": [2, 3, 14], "fine": [], "finnish": 5, "first": [2, 5], "firsthand": 5, "fit": [7, 16], "fitz": [], "flag": 16, "flexibl": [], "flip": 8, "float": [6, 8, 9, 15], "float32": [6, 7, 8, 15], "fn": 8, "focu": 13, "focus": [1, 5], "folder": 5, "follow": [1, 2, 3, 5, 8, 9, 11, 12, 13, 16], "font": [5, 9], "font_famili": [5, 9], "font_siz": 9, "foral": 9, "forc": 2, "forg": [], "form": [4, 5, 16], "format": [6, 9, 11, 14, 15, 16], "forpost": [4, 5], "forum": 2, "fp": [], "fp16": 15, "frac": 9, "frame": [], "framework": [3, 13, 14, 16], "free": [1, 2, 13], "french": [5, 11, 13, 16], "friendli": 4, "from": [1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16], "from_hub": [7, 13], "from_imag": [6, 13], "from_pdf": 6, "from_url": 6, "full": [5, 9, 16], "fulli": [], "function": [5, 8, 9], "funsd": [4, 5, 14, 16], "further": 14, "futur": 5, "g": [6, 7], "g_": 9, "g_x": 9, "gamma": 8, "gaussian": 8, "gaussianblur": 8, "gaussiannois": 8, "gdk": 3, "gen": 16, "gender": 1, "gener": [2, 4, 
7], "generic_cyrillic_lett": [], "geometri": [4, 6, 16], "geq": 9, "german": [5, 11, 13], "get": 16, "get_artefact": [], "get_word": [], "gettextword": [], "git": 13, "github": [2, 3, 7, 13], "give": 1, "given": [5, 6, 8, 9, 16], "global": 7, "go": 16, "good": 15, "googl": 2, "googlevis": 4, "gpu": [4, 15], "gracefulli": 1, "graph": [4, 5, 6], "grayscal": 8, "ground": 9, "groung": 9, "group": [4, 16], "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": 2, "guidanc": 14, "gvision": 16, "h": [6, 7, 8], "h_": 9, "ha": [2, 5, 9, 14], "half": [], "handl": [14, 16], "handwrit": 5, "handwritten": 14, "harass": 1, "hardwar": [], "harm": 1, "hat": 9, "have": [1, 2, 9, 11, 13, 14, 16], "head": [7, 16], "healthi": 1, "hebrew": 5, "height": 6, "hello": [9, 16], "help": 15, "here": [3, 8, 10, 14, 16], "hf": 7, "hf_hub_download": 7, "high": 6, "higher": [3, 5, 16], "hindi": [], "hindi_digit": 5, "hocr": 16, "homebrew": 3, "hook": 16, "horizont": [6, 8], "hous": 5, "how": [2, 11, 13, 14], "howev": 14, "hsv": 8, "html": [1, 2, 6, 16], "http": [1, 3, 5, 6, 7, 13, 16], "hub": 7, "hue": 8, "huggingfac": 7, "hw": 5, "i": [1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 15], "i7": 16, "ic03": [4, 5, 14], "ic13": [4, 5, 14], "icdar": [4, 5], "icdar2019": 5, "id": 16, "ident": 1, "identifi": 4, "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [4, 5], "iiit5k": [5, 14], "iiithw": [4, 5, 14], "imag": [4, 5, 6, 7, 8, 9, 13, 14, 16], "imagenet": 7, "imageri": 1, "images_90k_norm": 5, "img": [5, 8, 14], "img_cont": 6, "img_fold": [5, 14], "img_path": 6, "img_transform": 5, "imgur5k": [4, 5, 14], "imgur5k_annot": 5, "imlist": 5, "impact": 1, "implement": [5, 6, 7, 8, 9, 16], "import": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16], "improv": 7, "inappropri": 1, "incid": 1, "includ": [1, 3, 5, 14, 15], "inclus": 1, "increas": 8, "independ": [], "index": [2, 6], "indic": 9, "individu": 1, "infer": [4, 7, 8], "inform": [1, 2, 4, 5, 14], "inherit": [], "input": [2, 6, 7, 8, 15, 16], "input_crop": 7, "input_pag": [7, 9, 16], "input_shap": 15, "input_t": [], "input_tensor": 7, "inspir": [1, 8], "instal": 13, "instanc": [1, 16], "instanti": [7, 16], "instead": [5, 6, 7], "insult": 1, "int": [5, 6, 8, 9], "int64": [8, 9], "integ": 9, "integr": [4, 13, 14], "intel": 16, "interact": [1, 6, 9], "interfac": 13, "interoper": 15, "interpol": 8, "interpret": [5, 6], "intersect": 9, "invert": 8, "investig": 1, "invis": 1, "invoic": [], "involv": [1, 16], "io": 13, "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [4, 7, 14], "isn": 5, "issu": [1, 2, 13], "italian": 5, "iter": [5, 8, 14, 16], "its": [6, 7, 8, 9, 14, 16], "itself": [7, 13], "j": 9, "job": 2, "join": 2, "jpeg": 8, "jpegqual": 8, "jpg": [5, 6, 13], "json": [5, 14, 16], "json_output": 16, "jump": 2, "just": 1, "kei": [4, 5], "kera": [7, 15], "kernel": [7, 8], "kernel_s": [], "kernel_shap": 8, "keywoard": 7, "keyword": [5, 6, 7, 9], "kie": [7, 11], "kie_predictor": [7, 11], "kiepredictor": 7, "kind": 1, "know": 2, "kwarg": [5, 6, 7, 9], "l": 9, "l_j": 9, "label": [5, 8, 9, 14], "label_fil": [5, 14], "label_fold": 5, "label_path": [5, 14], "labels_path": [5, 14], "ladder": 1, "lambda": 8, "lambdatransform": 8, "lang": 16, "languag": [1, 4, 5, 6, 7, 13, 16], "larg": [7, 13], "largest": 9, "last": [3, 5], "latenc": 7, "later": 2, "latest": [3, 16], "latin": 5, "layer": 15, "layout": 16, "lead": 1, "leader": 1, "learn": [1, 4, 7, 15, 16], "least": 3, "left": [9, 16], "legacy_french": 5, "length": [5, 16], "less": [15, 16], "let": [], "letter": [], "level": [1, 5, 9, 16], 
"levenshtein": [], "leverag": 10, "lf": 13, "libffi": 3, "librari": [2, 3, 10, 11], "light": 4, "lightweight": [], "like": 1, "limits_": 9, "line": [4, 7, 9, 16], "line_1_1": 16, "link": 11, "linknet": [4, 7], "linknet16": [], "linknet_resnet18": [7, 11, 16], "linknet_resnet18_rot": [], "linknet_resnet34": [7, 15, 16], "linknet_resnet50": [7, 16], "linux": 3, "list": [5, 6, 8, 9, 13], "ll": 9, "load": [4, 5, 7], "load_state_dict": 11, "load_weight": 11, "loader": [], "loc_pr": 16, "local": [2, 4, 5, 7, 9, 14, 16], "localis": 5, "localizationconfus": 9, "locat": [2, 6, 16], "login": 7, "login_to_hub": [7, 13], "logo": [6, 14], "love": 13, "lower": [8, 9, 16], "m": [2, 9, 16], "m1": 3, "macbook": 3, "machin": 15, "maco": 3, "made": 4, "magc_resnet31": 7, "mai": [1, 2], "mail": 1, "main": 10, "maintain": 4, "mainten": 2, "make": [1, 2, 9, 12, 13, 15, 16], "mani": [14, 16], "manipul": 16, "map": [5, 7], "map_loc": 11, "mask_shap": 9, "master": [4, 7, 16], "match": [9, 16], "mathcal": 9, "matplotlib": [6, 9], "max": [5, 8, 9], "max_angl": 8, "max_area": 8, "max_char": [5, 14], "max_delta": 8, "max_dist": [], "max_gain": 8, "max_gamma": 8, "max_qual": 8, "max_ratio": 8, "maximum": [5, 8], "maxval": [7, 8], "mbox": 9, "mean": [8, 9, 11], "meaniou": 9, "meant": [6, 15], "measur": 16, "media": 1, "median": 7, "meet": 11, "member": 1, "memori": [9, 12, 15], "mention": 16, "merg": 5, "messag": 2, "meta": 16, "metadata": 15, "metal": 3, "method": [6, 8, 16], "metric": [9, 16], "middl": 16, "might": [15, 16], "min": 8, "min_area": 8, "min_char": [5, 14], "min_gain": 8, "min_gamma": 8, "min_qual": 8, "min_ratio": 8, "min_val": 8, "minde": [1, 3, 4, 7], "minim": [2, 4], "minimalist": 7, "minimum": [3, 5, 8, 9, 16], "minval": 8, "miss": 3, "mistak": 1, "mix": [], "mixed_float16": 15, "mixed_precis": 15, "mjsynth": [4, 5, 14], "mnt": 5, "mobilenet": [7, 13], "mobilenet_v3_larg": 7, "mobilenet_v3_large_r": 7, "mobilenet_v3_smal": 7, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_orient": 7, "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 7, "mobilenetv3": 7, "modal": [4, 5], "mode": 3, "model": [5, 9, 12, 14], "model_nam": [7, 13, 15], "model_path": 15, "moder": 1, "modif": 2, "modifi": [7, 12, 16], "modul": [6, 7, 8, 9, 16], "moment": [], "more": [2, 9, 14, 16], "most": 16, "mozilla": 1, "multi": [4, 7], "multilingu": [5, 13], "multipl": [5, 6, 8, 16], "multipli": 8, "multiprocess": 12, "my": 7, "my_awesome_model": 13, "my_hook": 16, "n": [5, 9], "na": [], "name": [5, 7, 15, 16], "nation": 1, "natur": [1, 4, 5], "nb": [], "ndarrai": [5, 6, 8, 9], "necessari": [3, 11, 12], "need": [2, 3, 5, 9, 11, 12, 13, 16], "neg": 8, "nest": 16, "nestedobject": [], "network": [4, 5, 7, 15], "neural": [4, 5, 7, 15], "new": [2, 9], "newer": [], "next": [5, 14], "nois": 8, "noisi": [4, 5], "non": [4, 5, 6, 7, 8, 9], "none": [5, 6, 7, 8, 9, 16], "normal": [7, 8], "norwegian": 5, "note": [0, 2, 5, 7, 13, 15], "now": 2, "np": [7, 8, 9, 16], "num_output_channel": 8, "num_sampl": [5, 14], "num_work": 5, "number": [5, 8, 9, 16], "numpi": [6, 7, 9, 16], "o": 3, "obb": [], "obj_detect": 13, "object": [5, 9, 10, 16], "objectness_scor": [], "oblig": 1, "obtain": 16, "occupi": 15, "ocr": [4, 5, 7, 9, 13, 14], "ocr_carea": 16, "ocr_db_crnn": 9, "ocr_lin": 16, "ocr_pag": 16, "ocr_par": 16, "ocr_predictor": [7, 11, 13, 15, 16], "ocrdataset": [5, 14], "ocrmetr": 9, "ocrpredictor": [7, 11], "ocrx_word": 16, "offens": 1, "offici": [1, 7], "offlin": 1, "offset": 8, "onc": 16, "one": [2, 5, 7, 8, 11, 13, 16], "oneof": 
8, "ones": [5, 8, 9], "onli": [2, 7, 8, 9, 13, 14, 15, 16], "onlin": 1, "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": 8, "opacity_rang": 8, "open": [1, 2, 13, 15], "opinion": 1, "optic": [4, 16], "optim": 4, "option": [5, 7, 11], "order": [2, 5, 6, 8], "org": [1, 5, 7, 16], "organ": 6, "orient": [1, 6, 7, 16], "orientationpredictor": [], "other": [1, 2], "otherwis": [1, 6, 9], "our": [2, 7, 16], "out": [2, 7, 8, 9, 16], "outpout": 16, "output": [6, 8, 15], "output_s": [6, 8], "outsid": 12, "over": [3, 5, 9, 16], "overal": [1, 7], "overlai": 6, "overview": [], "overwrit": [], "overwritten": 13, "own": 4, "p": [8, 9, 16], "packag": [2, 4, 9, 12, 14], "pad": [5, 7, 8, 16], "page": [3, 5, 7, 9, 16], "page1": 6, "page2": 6, "page_1": 16, "page_idx": [6, 16], "page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 7, "par_1_1": 16, "paragraph": 16, "paragraph_break": 16, "param": [8, 16], "paramet": [4, 6, 7, 15], "pars": [4, 5], "parseq": [4, 7, 13, 16], "part": [5, 8, 16], "parti": 3, "partial": 16, "particip": 1, "pass": [5, 6, 7, 16], "password": 6, "patch": [7, 9], "path": [5, 6, 14], "path_to_checkpoint": 11, "path_to_custom_model": [], "path_to_pt": 11, "pattern": 1, "pdf": [6, 7, 10], "pdfpage": 6, "peopl": 1, "per": [8, 16], "perform": [4, 6, 7, 8, 9, 12, 15, 16], "period": 1, "permiss": 1, "permut": [4, 7], "persian_lett": 5, "person": [1, 14], "phase": 16, "photo": 14, "physic": [1, 6], "pick": 8, "pictur": 6, "pip": [2, 3], "pipelin": 16, "pixbuf": 3, "pixel": [6, 8, 16], "platinum": [], "pleas": 2, "plot": 9, "plt": 9, "plug": 13, "plugin": 3, "png": 6, "point": 15, "polici": 12, "polish": 5, "polit": 1, "polygon": [5, 9, 16], "pool": 7, "portugues": 5, "posit": [1, 9], "possibl": [2, 9, 13, 16], "post": [1, 16], "postprocessor": 16, "potenti": 7, "power": 4, "ppageno": 16, "pre": [2, 7], "precis": [9, 16], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 14, "predict": [6, 7, 9, 16], "predictor": [4, 6, 7, 11, 13, 15], "prefer": 14, "preinstal": [], "preprocessor": [11, 16], "prerequisit": 13, "present": 10, "preserv": [7, 8, 16], "preserve_aspect_ratio": [6, 7, 8, 11, 16], "pretrain": [4, 7, 9, 11, 15, 16], "pretrained_backbon": [7, 11], "print": 16, "prior": 5, "privaci": 1, "privat": 1, "probabl": 8, "problem": 2, "procedur": 8, "process": [2, 4, 6, 11, 16], "processor": 16, "produc": [10, 16], "product": 15, "profession": 1, "project": [2, 14], "promptli": 1, "proper": 2, "properli": 5, "properti": [], "provid": [1, 2, 4, 13, 14, 16], "public": [1, 4], "publicli": 16, "publish": 1, "pull": 13, "punctuat": 5, "pure": 5, "purpos": 2, "push_to_hf_hub": [7, 13], "py": 13, "pypdfium2": 6, "pyplot": [6, 9], "python": 2, "python3": 13, "pytorch": [3, 4, 7, 8, 11, 13, 15, 16], "q": 2, "qr": 6, "qr_code": 14, "qualiti": 8, "quantiz": [], "quantize_model": [], "question": 1, "quickli": 4, "quicktour": 10, "r": 16, "race": 1, "ramdisk": 5, "rand": [7, 8, 9, 15, 16], "random": [7, 8, 9, 16], "randomappli": 8, "randombright": 8, "randomcontrast": 8, "randomcrop": 8, "randomgamma": 8, "randomhorizontalflip": 8, "randomhu": 8, "randomjpegqu": 8, "randomli": 8, "randomres": [], "randomrot": 8, "randomsatur": 8, "randomshadow": 8, "rang": 8, "rassi": 13, "ratio": [7, 8, 16], "raw": [6, 9], "re": 15, "read": [4, 5, 7], "read_html": 6, "read_img": [], "read_img_as_numpi": 6, "read_img_as_tensor": 6, "read_pdf": 6, "readi": 15, "real": [4, 7, 8], "reason": [1, 4, 5], "rebuild": 2, "rebuilt": 2, "recal": [9, 16], "receipt": [4, 5, 16], "reco_arch": [7, 11, 
13, 15], "reco_b": [], "reco_model": [11, 13], "reco_param": 11, "reco_predictor": 11, "recogn": 16, "recognit": [5, 9, 11], "recognition_predictor": [7, 16], "recognition_task": [5, 14], "recognitiondataset": [5, 14], "recognitionpredictor": [7, 11], "rectangular": 7, "recurr": [], "red": 9, "reduc": [3, 8], "refer": [2, 3, 11, 13, 14, 16], "regardless": 1, "region": 16, "regroup": 9, "regular": 14, "reject": 1, "rel": [6, 8, 9, 16], "relat": 6, "releas": [0, 3], "relev": [], "religion": 1, "relu": [], "remov": 1, "render": [6, 16], "repo": 7, "repo_id": [7, 13], "report": 1, "repositori": [5, 7, 13], "repres": [1, 9, 15, 16], "represent": [4, 7], "request": [1, 13], "requir": [3, 8], "research": 4, "residu": 7, "resiz": [8, 16], "resnet": 7, "resnet18": [7, 13], "resnet31": 7, "resnet34": 7, "resnet50": [7, 13], "resolv": 6, "resolve_block": 16, "resolve_lin": 16, "resourc": 14, "respect": 1, "respons": 9, "rest": [2, 8, 9], "restrict": 12, "result": [2, 5, 6, 10, 13, 16], "return": 16, "reusabl": 16, "review": 1, "rgb": [6, 8], "rgb_mode": 6, "rgb_output": 6, "right": [1, 7, 9], "robust": [4, 5], "root": 5, "rotat": [5, 6, 7, 8, 9, 14, 16], "rotated_bbox": [], "run": [2, 3, 7], "same": [2, 6, 9, 14, 16], "sampl": [5, 14, 16], "sample_transform": 5, "sar": [4, 7], "sar_resnet31": [7, 16], "sar_vgg16_bn": [], "satur": 8, "save": [7, 14], "saved_model": [], "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "scheme": [], "score": 9, "scratch": [], "script": [2, 14], "seamless": 4, "seamlessli": [4, 16], "search": 7, "searchabl": 10, "sec": 16, "second": 16, "section": [11, 13, 15, 16], "secur": [1, 12], "see": [1, 2], "seemlessli": [], "seen": 16, "segment": [4, 7, 16], "self": 16, "semant": [4, 7], "send": 16, "sens": 9, "sensit": 14, "separ": 16, "sequenc": [4, 5, 6, 7, 9, 16], "sequenti": [8, 16], "seri": 1, "serial": [], "serialized_model": [], "seriou": 1, "set": [1, 5, 7, 9, 12, 16], "set_global_polici": 15, "sever": [6, 8, 16], "sex": 1, "sexual": 1, "sha256": [], "shade": 8, "shape": [6, 7, 8, 9, 16], "share": [12, 14], "shift": 8, "shm": 12, "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9, 11, 13], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 14, "simpl": [4, 7], "simpler": 7, "sinc": [5, 14], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 16], "skew": 16, "slack": 2, "slightli": 7, "small": [2, 7], "smallest": 6, "snapshot_download": 7, "snippet": 16, "so": [2, 3, 5, 7, 13, 14], "social": 1, "socio": 1, "some": [3, 10, 13, 14], "someth": 2, "somewher": 2, "soon": 15, "sort": 1, "sourc": [5, 6, 7, 8, 9, 13], "space": [1, 16], "span": 16, "spanish": 5, "spatial": [4, 5, 6, 9], "special": [], "specif": [2, 3, 9, 11, 14, 16], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": 2, "sroie": [4, 5, 14], "stabl": 3, "stackoverflow": 2, "stage": 4, "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": [8, 11], "step": 12, "still": 16, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 14, 16], "straighten": [], "straighten_pag": 7, "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": [], "strictli": 9, "string": [5, 6, 9, 16], "strive": 3, "strong": [4, 7], "structur": [15, 16], "subset": [5, 16], "suggest": [2, 13], "sum": 9, "summari": 9, "support": [15, 16], "sustain": 1, "svhn": [4, 5, 14], "svt": [5, 14], "swedish": 5, "symbol": [], "symmetr": [7, 8, 16], "symmetric_pad": [7, 8, 16], "synthes": 9, "synthesize_pag": 9, "synthet": 4, "synthtext": [4, 5, 14], "system": 16, "t": [2, 
5, 11, 16], "tabl": 13, "take": [1, 5, 16], "target": [5, 6, 8, 9, 14], "target_s": 5, "task": [4, 5, 7, 13, 14, 16], "task2": 5, "team": [], "techminde": [], "templat": [2, 4], "tensor": [5, 6, 8, 16], "tensorflow": [3, 4, 6, 7, 8, 11, 13, 15, 16], "tensorspec": 15, "term": 1, "test": [5, 14], "test_set": 5, "text": [5, 6, 7, 9, 14], "text_output": 16, "textmatch": 9, "textnet": 7, "textnet_bas": 7, "textnet_smal": 7, "textnet_tini": 7, "textract": [4, 16], "textstylebrush": [4, 5], "textual": [4, 5, 6, 7, 16], "tf": [3, 6, 7, 8, 13, 15], "tf_model": [], "tflite": [], "than": [2, 3, 9, 13], "thank": 2, "thei": [1, 9], "them": [3, 5, 16], "thi": [1, 2, 3, 5, 9, 11, 12, 13, 14, 15, 16], "thing": [15, 16], "third": 3, "those": [1, 3, 6, 16], "threaten": 1, "threshold": 16, "through": [1, 8, 14], "tilman": 13, "time": [1, 4, 7, 9, 14], "tini": 7, "titl": [6, 16], "tm": 16, "tmp": 12, "togeth": [2, 6], "tograi": 8, "tool": 14, "top": [9, 16], "topic": 2, "torch": [3, 8, 11, 13, 15], "torchvis": 8, "total": 11, "toward": [1, 3], "train": [2, 5, 7, 8, 13, 14, 15, 16], "train_it": [5, 14], "train_load": [5, 14], "train_pytorch": 13, "train_set": [5, 14], "train_tensorflow": 13, "trainabl": [4, 7], "tranform": 8, "transcrib": 16, "transfer": [4, 5], "transfo": 8, "transform": [4, 5, 7], "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16], "truth": 9, "tune": 15, "tupl": [5, 6, 8, 9], "turn": [], "two": [6, 12], "txt": 5, "type": [6, 9, 13, 15, 16], "typic": 16, "u": [1, 2], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 16], "ukrainian": [], "unaccept": 1, "underli": [14, 16], "underneath": 6, "understand": [4, 5, 16], "unidecod": 9, "uniform": [7, 8], "uniformli": 8, "uninterrupt": [6, 16], "union": 9, "unittest": 2, "unlock": 6, "unoffici": 7, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 16], "updat": 9, "upgrad": 2, "upper": [5, 8], "uppercas": 14, "url": 6, "us": [1, 2, 3, 5, 7, 9, 11, 12, 13, 16], "usabl": 16, "usag": [12, 15], "use_broadcast": 9, "use_polygon": [5, 9, 14], "useabl": 16, "user": [3, 4, 6, 10], "utf": 16, "util": 15, "v0": [], "v1": 13, "v3": [7, 13, 16], "valid": 14, "valu": [2, 6, 8, 16], "valuabl": 4, "variabl": 12, "varieti": 5, "veri": 7, "verifi": [], "version": [1, 2, 3, 15, 16], "vgg": 7, "vgg16": 13, "vgg16_bn_r": 7, "via": 1, "vietnames": 5, "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 5, 7], "visiondataset": 5, "visiontransform": 7, "visual": 4, "visualize_pag": 9, "vit_": 7, "vit_b": 7, "vitstr": [4, 7, 15], "vitstr_bas": [7, 16], "vitstr_smal": [7, 11, 15, 16], "viz": [], "vocab": [11, 13, 14, 16], "vocabulari": [5, 11, 13], "w": [6, 7, 8, 9], "w3": 16, "wa": 1, "wai": [1, 4, 14], "want": [2, 15, 16], "warm": [], "warmup": 16, "wasn": 2, "we": [1, 2, 3, 4, 6, 8, 13, 14, 15, 16], "weasyprint": 6, "web": [2, 6], "websit": 5, "weight": 11, "welcom": 1, "well": [1, 15], "were": [1, 6, 16], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9], "whether": [2, 5, 6, 8, 9, 14, 16], "which": [1, 7, 12, 14, 16], "whichev": 3, "while": [8, 16], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [4, 5, 14], "window": [3, 7, 9], "wish": 2, "within": 1, "without": [1, 5, 7], "wonder": 2, "word": [4, 5, 7, 9, 16], "word_1_1": 16, "word_1_2": 16, "word_1_3": 16, "wordgener": [5, 14], "words_onli": 9, "work": [12, 16], "worker": 5, "workflow": 2, "worklow": 2, "world": [9, 16], "worth": 7, "wrap": 16, "wrapper": [5, 8], "write": 12, "written": [1, 6], "www": [1, 6, 16], "x": [6, 8, 9], "x12larg": 
[], "x_ascend": 16, "x_descend": 16, "x_i": 9, "x_size": 16, "x_wconf": 16, "xeon": [], "xhtml": 16, "xmax": 6, "xmin": 6, "xml": 16, "xml_bytes_str": 16, "xml_element": 16, "xml_output": 16, "xmln": 16, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12, 13, 14, 15, 16], "your": [2, 4, 6, 9, 16], "yoursit": 6, "zero": [8, 9], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 5, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 5, "\u00e4\u00f6\u00e4\u00f6": 5, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 5, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 5, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 5, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, 
"\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": [], "2024": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": 0, "8": [], "9": [], "advanc": 16, "approach": 16, "architectur": 16, "arg": [5, 6, 7, 8, 9], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [14, 16], "aw": 12, "ban": 1, "block": 6, "bug": 2, "build": [], "changelog": 0, "choos": [14, 16], "classif": [7, 13], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 13, "compos": 8, "compress": [], "conda": [], "conduct": 1, "connect": 2, "content": [], "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 13, "correct": 1, "coven": 1, "custom": [5, 11], "data": 14, "dataload": 5, 
"dataset": [4, 5, 14], "detect": [4, 7, 13, 14, 16], "develop": 2, "do": 16, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 15], "document": [2, 4, 6], "end": 16, "enforc": 1, "evalu": 9, "export": 15, "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "from": 13, "gener": [5, 14], "get": [], "git": 3, "guidelin": 1, "half": 15, "hub": 13, "huggingfac": 13, "i": 16, "implement": [], "infer": 15, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 12, "let": 2, "line": 6, "linux": [], "load": [11, 13, 14], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 11, 13, 15, 16], "modifi": 2, "modul": [], "name": 13, "note": [], "notebook": 10, "object": 14, "ocr": 16, "onli": [], "onnx": 15, "optim": 15, "option": 16, "orient": [], "our": 1, "output": 16, "own": [11, 14], "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "post": [], "pre": [], "precis": 15, "predictor": 16, "prepar": 15, "prerequisit": 3, "pretrain": 13, "process": [], "push": 13, "python": 3, "qualiti": 2, "question": 2, "read": 6, "readi": 14, "recognit": [4, 7, 13, 14, 16], "refer": [], "report": 2, "request": 2, "respons": 1, "return": [5, 6, 7, 9], "right": 16, "savedmodel": [], "scope": 1, "share": 13, "should": 16, "stage": 16, "standard": 1, "start": [], "structur": [2, 6], "style": 2, "support": [4, 5, 8], "synthet": [5, 14], "task": 9, "temporari": 1, "test": 2, "text": [4, 16], "train": 11, "transform": 8, "two": 16, "unit": 2, "us": [14, 15], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 16, "word": 6, "your": [11, 13, 14, 15], "zoo": [4, 7]}}) \ No newline at end of file diff --git a/v0.8.0/transforms.html b/v0.8.0/transforms.html index 0d1b5f7402..d42da50481 100644 --- a/v0.8.0/transforms.html +++ b/v0.8.0/transforms.html @@ -227,28 +227,21 @@ @@ -293,7 +286,7 @@

    doctr.transforms — as in torchvision, we express transformations as composable modules.

    Supported transformations

    -

    Here are all transformations that are available through docTR:

    +

    Here are all transformations that are available through DocTR:

    class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
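    A minimal usage sketch for this transform (TensorFlow backend assumed, mirroring the other examples on this page):
    >>> from doctr.transforms import Resize
    >>> import tensorflow as tf
    >>> transfo = Resize((32, 32))
    >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))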
    @@ -364,7 +357,7 @@

    Supported transformations
    -class doctr.transforms.ToGray(num_output_channels: int = 1)[source]
    +class doctr.transforms.ToGray[source]

    Convert an RGB tensor (batch of images or a single image) to a 3-channel grayscale tensor

    Example::
    >>> from doctr.transforms import ToGray
    @@ -524,88 +517,6 @@ 

    Supported transformations -
    -class doctr.transforms.RandomRotate(max_angle: float = 5.0, expand: bool = False)[source]
    -

    Randomly rotate a tensor image and its boxes

    -https://github.com/mindee/doctr/releases/download/v0.4.0/rotation_illustration.png -
    -
    Parameters:
    -
      -
    • max_angle – maximum angle for rotation, in degrees. Angles will be uniformly picked in -[-max_angle, max_angle]

    • -
    • expand – whether the image should be padded before the rotation

    • -
    -
    -
    -

    - -
    -
    -class doctr.transforms.RandomCrop(scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33))[source]
    -

    Randomly crop a tensor image and its boxes

    -
    -
    Parameters:
    -
      -
    • scale – tuple of floats, relative (min_area, max_area) of the crop

    • -
    • ratio – tuple of float, relative (min_ratio, max_ratio) where ratio = h/w

    • -
    -
    -
    -
    - -
    -
    -class doctr.transforms.GaussianBlur(kernel_shape: int | Iterable[int], std: Tuple[float, float])[source]
    -

    Randomly blur a 3-dimensional RGB image with a Gaussian kernel

    -
    -
    Example::
    >>> from doctr.transforms import GaussianBlur
    ->>> import tensorflow as tf
    ->>> transfo = GaussianBlur(3, (.1, 5))
    ->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • kernel_shape – size of the blurring kernel

    • -
    • std – min and max value of the standard deviation

    • -
    -
    -
    -
    - -
    -
    -class doctr.transforms.ChannelShuffle[source]
    -

    Randomly shuffle channel order of a given image

    -
    - -
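    Although no snippet is shown for it here, ChannelShuffle follows the same calling convention as the other transforms; a minimal sketch (TensorFlow backend assumed):
    >>> from doctr.transforms import ChannelShuffle
    >>> import tensorflow as tf
    >>> transfo = ChannelShuffle()
    >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))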
    -
    -class doctr.transforms.GaussianNoise(mean: float = 0.0, std: float = 1.0)[source]
    -

    Adds Gaussian Noise to the input tensor

    -
    -
    Example::
    >>> from doctr.transforms import GaussianNoise
    ->>> import tensorflow as tf
    ->>> transfo = GaussianNoise(0., 1.)
    ->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • mean – mean of the gaussian distribution

    • -
    • std – std of the gaussian distribution

    • -
    -
    -
    -
    -

    Composing transformations

    @@ -744,11 +655,6 @@

    Composing transformationsRandomHue
  • RandomGamma
  • RandomJpegQuality
  • -
  • RandomRotate
  • -
  • RandomCrop
  • -
  • GaussianBlur
  • -
  • ChannelShuffle
  • -
  • GaussianNoise
  • Composing transformations
      @@ -768,7 +674,7 @@

      Composing transformations +

  • diff --git a/v0.8.0/using_model_export.html b/v0.8.0/using_model_export.html deleted file mode 100644 index 9b0acb00fe..0000000000 --- a/v0.8.0/using_model_export.html +++ /dev/null @@ -1,436 +0,0 @@ - - - - - - - - - - - - - Preparing your model for inference - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
    -
    -
    - -
    - -
    -
    - -
    - -
    -
    - -
    -
    -
    - - - - - Back to top - -
    - -
    - -
    - -
    -
    -
    -

    Preparing your model for inference

    -

    A well-trained model is a good achievement but you might want to tune a few things to make it production-ready!

    -
    -

    Model compression

    -

    This section is meant to help you perform inference with compressed versions of your model.

    -
    -

    TensorFlow Lite

    -

    TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows:

    -
    >>> import tensorflow as tf
    ->>> from tensorflow.keras import Sequential
    ->>> from doctr.models import conv_sequence
    ->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> converter = tf.lite.TFLiteConverter.from_keras_model(model)
    ->>> serialized_model = converter.convert()
    -
    -
    -
    -
    -

    Half-precision

    -

    If you want to convert the model to half-precision, configure your TFLite converter as follows:

    -
    >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
    ->>> converter.target_spec.supported_types = [tf.float16]
    ->>> serialized_model = converter.convert()
    -
    -
    -
    -
    -

    Post-training quantization

    -

    Finally, if you wish to quantize the model, configure your TFLite converter as follows:

    -
    >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
->>> # Float fallback for operators that do not have an integer implementation
->>> import numpy as np
->>> input_shape = (224, 224, 3)  # must match the input shape the model was built with
->>> def representative_dataset():
->>>     for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)]
    ->>> converter.representative_dataset = representative_dataset
    ->>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    ->>> converter.inference_input_type = tf.int8
    ->>> converter.inference_output_type = tf.int8
    ->>> serialized_model = converter.convert()
    -
    -
    -
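    To sanity-check the serialized model, you can load it back with the TFLite interpreter; a minimal sketch, assuming the quantized export (serialized_model) from the snippet above:
    >>> import numpy as np
    >>> interpreter = tf.lite.Interpreter(model_content=serialized_model)
    >>> interpreter.allocate_tensors()
    >>> input_details = interpreter.get_input_details()
    >>> dummy = np.zeros(input_details[0]['shape'], dtype=input_details[0]['dtype'])
    >>> interpreter.set_tensor(input_details[0]['index'], dummy)
    >>> interpreter.invoke()
    >>> out = interpreter.get_tensor(interpreter.get_output_details()[0]['index'])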
    -
    -
    -

    Using SavedModel

    -

    Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

    -
    >>> import tensorflow as tf
    ->>> from doctr.models import db_resnet50
    ->>> model = db_resnet50(pretrained=True)
    ->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    ->>> _ = model(input_t, training=False)
    ->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
    -
    -
    -

    And loaded just as easily:

    -
    >>> import tensorflow as tf
    ->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
    -
    -
    -
    -
    - -
    -
    - -
    - -
    -
    - - - - - - - - \ No newline at end of file diff --git a/v0.8.0/using_models.html b/v0.8.0/using_models.html deleted file mode 100644 index 53cad99cac..0000000000 --- a/v0.8.0/using_models.html +++ /dev/null @@ -1,909 +0,0 @@ - - - - - - - - - - - - - Choosing the right model - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
    -
    -
    - -
    - -
    -
    - -
    - -
    -
    - -
    -
    -
    - - - - - Back to top - -
    - -
    - -
    - -
    -
    -
    -

    Choosing the right model

    -

    The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Whether they are performed at once or separately, each task corresponds to a specific type of deep learning architecture.

    -

    For a given task, docTR provides a Predictor, which is composed of two components (see the short sketch below the list):

    -
      -
    • PreProcessor: a module in charge of making inputs directly usable by the deep learning model.

    • -
    • Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable.

    • -
    -
    -
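    As a concrete illustration, each task-specific predictor can be instantiated in one line; a minimal sketch (the architecture names are among those listed in the sections below):
    >>> from doctr.models import detection_predictor, recognition_predictor, ocr_predictor
    >>> det_predictor = detection_predictor('db_resnet50', pretrained=True)
    >>> reco_predictor = recognition_predictor('crnn_vgg16_bn', pretrained=True)
    >>> predictor = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)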

    Text Detection

    -

    The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don’t).

    -
    -

    Available architectures

    -

    The following architectures are currently supported:

    - -

    For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    FUNSD

    CORD

    Architecture

    Input shape

    # params

    Recall

    Precision

    Recall

    Precision

    FPS

    db_resnet50

    (1024, 1024, 3)

    25.2 M

    82.14

    87.64

    92.49

    89.66

    2.1

    db_mobilenet_v3_large

    (1024, 1024, 3)

    4.2 M

    79.35

    84.03

    81.14

    66.85

    -
    -

    All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

    -

    Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

    -

    FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.x12large AWS instance (CPU Xeon Platinum 8275L).

    -
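    For illustration, such a throughput measurement amounts to a warmup pass followed by a timed loop; a minimal sketch (not the exact benchmark script, and the random image is a placeholder):
    >>> import time
    >>> import numpy as np
    >>> from doctr.models import detection_predictor
    >>> predictor = detection_predictor('db_resnet50', pretrained=True)
    >>> dummy_img = (255 * np.random.rand(1024, 1024, 3)).astype(np.uint8)
    >>> for _ in range(100):  # warmup phase
    >>>     _ = predictor([dummy_img])
    >>> start = time.time()
    >>> for _ in range(1000):
    >>>     _ = predictor([dummy_img])
    >>> fps = 1000 / (time.time() - start)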
    -
    -

    Detection predictors

    -

    detection_predictor wraps your detection model so that it can be used seamlessly with your favorite deep learning framework.

    -
    >>> import numpy as np
    ->>> from doctr.models import detection_predictor
    ->>> predictor = detection_predictor('db_resnet50')
    ->>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
->>> out = predictor([dummy_img])
    -
    -
    -
    -
    -
    -

    Text Recognition

    -

    The task consists of transcribing the character sequence in a given image.

    -
    -

    Available architectures

    -

    The following architectures are currently supported:

    - -

    For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    Text recognition model zoo

    Architecture

    Input shape

    # params

    FUNSD

    CORD

    FPS

    crnn_vgg16_bn

    (32, 128, 3)

    15.8M

    87.18

    92.93

    12.8

    crnn_mobilenet_v3_small

    (32, 128, 3)

    2.1M

    86.21

    90.56

    crnn_mobilenet_v3_large

    (32, 128, 3)

    4.5M

    86.95

    92.03

    sar_resnet31

    (32, 128, 3)

    56.2M

    87.70

    93.41

    2.7

    master

    (32, 128, 3)

    67.7M

    87.62

    93.27

    -
    -

    All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metric being used (exact match) are available in Task evaluation.

    -

    While most of our recognition models were trained on our French vocab (cf. Supported Vocabs), you can easily access the vocab of any model as follows:

    -
    >>> from doctr.models import recognition_predictor
    ->>> predictor = recognition_predictor('crnn_vgg16_bn')
    ->>> print(predictor.model.cfg['vocab'])
    -
    -
    -

    Disclaimer: both FUNSD subsets combined have 30595 word-level crops which might not be representative enough of the model capabilities

    -

    FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.x12large AWS instance (CPU Xeon Platinum 8275L).

    -
    -
    -

    Recognition predictors

    -

    recognition_predictor wraps your recognition model so that it can be used seamlessly with your favorite deep learning framework.

    -
    >>> import numpy as np
    ->>> from doctr.models import recognition_predictor
    ->>> predictor = recognition_predictor('crnn_vgg16_bn')
    ->>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8)
->>> out = predictor([dummy_img])
    -
    -
    -
    -
    -
    -

    End-to-End OCR

    -

    The task consists of both localizing and transcribing textual elements in a given image.

    -
    -

    Available architectures

    -

    You can use any combination of detection and recognition models supported by docTR.

    -

    For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    FUNSD

    CORD

    Architecture

    Recall

    Precision

    FPS

    Recall

    Precision

    FPS

    db_resnet50 + crnn_vgg16_bn

    71.25

    76.02

    0.85

    84.00

    81.42

    1.6

    db_resnet50 + master

    71.03

    76.06

    84.49

    81.94

    db_resnet50 + sar_resnet31

    71.25

    76.29

    0.27

    84.50

    81.96

    0.83

    db_resnet50 + crnn_mobilenet_v3_small

    69.85

    74.80

    80.85

    78.42

    0.83

    db_resnet50 + crnn_mobilenet_v3_large

    70.57

    75.57

    82.57

    80.08

    0.83

    db_mobilenet_v3_large + crnn_vgg16_bn

    67.73

    71.73

    71.65

    59.03

    Gvision text detection

    59.50

    62.50

    75.30

    70.00

    Gvision doc. text detection

    64.00

    53.30

    68.90

    61.10

    AWS textract

    78.10

    83.00

    87.50

    66.00

    -
    -

    All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

    -

    Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

    -

    FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a c5.x12large AWS instance (CPU Xeon Platinum 8275L).

    -

    Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information.

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Receipts

    Invoices

    IDs

    US Tax Forms

    Resumes

    Road Fines

    Architecture

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    db_resnet50 + crnn_vgg16_bn (ours)

    78.70

    81.12

    65.80

    70.70

    50.25

    51.78

    79.08

    92.83

    db_resnet50 + master (ours)

    79.00

    81.42

    65.57

    69.86

    51.34

    52.90

    78.86

    92.57

    db_resnet50 + sar_resnet31 (ours)

    78.94

    81.37

    65.89

    70.79

    51.78

    53.35

    79.04

    92.78

    db_resnet50 + crnn_mobilenet_v3_small (ours)

    76.81

    79.15

    64.89

    69.61

    45.03

    46.38

    78.96

    92.11

    85.91

    87.20

    84.85

    85.86

    db_resnet50 + crnn_mobilenet_v3_large (ours)

    78.01

    80.39

    65.36

    70.11

    48.00

    49.43

    79.39

    92.62

    87.68

    89.00

    85.65

    86.67

    db_mobilenet_v3_large + crnn_vgg16_bn (ours)

    78.36

    74.93

    63.04

    68.41

    39.36

    41.75

    72.14

    89.97

    Gvision doc. text detection

    68.91

    59.89

    63.20

    52.85

    43.70

    29.21

    69.79

    65.68

    AWS textract

    75.77

    77.70

    70.47

    69.13

    46.39

    43.32

    84.31

    98.11

    -
    -
    -
    -

    Two-stage approaches

    -

    Those architectures involve one stage of text detection and one stage of text recognition. The text detection output is used to produce cropped images that are then passed to the text recognition block. Everything is wrapped up with ocr_predictor.

    -
    >>> import numpy as np
    ->>> from doctr.models import ocr_predictor
    ->>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
    ->>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
    ->>> out = model([input_page])
    -
    -
    -
    -
    -

    What should I do with the output?

    -

    The ocr_predictor returns a Document object with a nested structure (with Page, Block, Line, Word, Artefact). -To get a better understanding of our document model, check our Document structure section

    -

    Here is a typical Document layout:

    -
    Document(
    -  (pages): [Page(
    -    dimensions=(340, 600)
    -    (blocks): [Block(
    -      (lines): [Line(
    -        (words): [
    -          Word(value='No.', confidence=0.91),
    -          Word(value='RECEIPT', confidence=0.99),
    -          Word(value='DATE', confidence=0.96),
    -        ]
    -      )]
    -      (artefacts): []
    -    )]
    -  )]
    -)
    -
    -
    -

    You can also export them as a nested dict, more appropriate for JSON format:

    -
    json_output = result.export()
    -
    -
    -

    For reference, here is the JSON export for the same Document as above:

    -
    {
    -  'pages': [
    -      {
    -          'page_idx': 0,
    -          'dimensions': (340, 600),
    -          'orientation': {'value': None, 'confidence': None},
    -          'language': {'value': None, 'confidence': None},
    -          'blocks': [
    -              {
    -                  'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
    -                  'lines': [
    -                      {
    -                          'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
    -                          'words': [
    -                              {
    -                                  'value': 'No.',
    -                                  'confidence': 0.914085328578949,
    -                                  'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875))
    -                              },
    -                              {
    -                                  'value': 'RECEIPT',
    -                                  'confidence': 0.9949972033500671,
    -                                  'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375))
    -                              },
    -                              {
    -                                  'value': 'DATE',
    -                                  'confidence': 0.9578408598899841,
    -                                  'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625))
    -                              }
    -                          ]
    -                      }
    -                  ],
    -                  'artefacts': []
    -              }
    -          ]
    -      }
    -  ]
    -}
    -
    -
    -
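    If you want to persist this export, the nested dict serializes directly with the standard json module; a minimal sketch (the output path is arbitrary):
    >>> import json
    >>> with open('ocr_output.json', 'w', encoding='utf-8') as f:
    >>>     json.dump(json_output, f, ensure_ascii=False, indent=2)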

    To export the output as XML (hOCR format) you can use the export_as_xml method:

    -
    xml_output = result.export_as_xml()
    -for output in xml_output:
    -  xml_bytes_string = output[0]
    -  xml_element = output[1]
    -
    -
    -

    For reference, here is a sample XML byte string output:

    -
    <?xml version="1.0" encoding="UTF-8"?>
    -<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
    -  <head>
    -    <title>docTR - hOCR</title>
    -    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    -    <meta name="ocr-system" content="doctr 0.5.0" />
    -    <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
    -  </head>
    -  <body>
    -    <div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
    -    <div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
    -      <p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
    -        <span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
    -          <span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
    -          <span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
    -          <span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
    -        </span>
    -      </p>
    -    </div>
    -  </body>
    -</html>
    -
    -
    -
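    A common follow-up is to write one hOCR file per page from the xml_output produced above; a minimal sketch (file naming is arbitrary):
    >>> for idx, (xml_bytes_string, xml_element) in enumerate(xml_output):
    >>>     with open(f'page_{idx}.hocr.xml', 'wb') as f:
    >>>         f.write(xml_bytes_string)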
    -
    -
    - -
    -
    - -
    - -
    -
    - - - - - - - - \ No newline at end of file diff --git a/v0.8.0/utils.html b/v0.8.0/utils.html index 21f708c953..1908ef4ff4 100644 --- a/v0.8.0/utils.html +++ b/v0.8.0/utils.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.utils - docTR documentation @@ -227,28 +227,21 @@ @@ -327,25 +320,6 @@

    Visualization -
    -doctr.utils.visualization.synthesize_page(page: Dict[str, Any], draw_proba: bool = False, font_size: int = 13, font_family: str | None = None) ndarray[source]
    -

    Draw the content of the element page (OCR response) on a blank page.

    -
    -
    Parameters:
    -
      -
    • page – exported Page object to represent

    • -
    • draw_proba – if True, draw words in colors to represent confidence. Blue: p=1, red: p=0

    • -
• font_size – size of the font (defaults to 13)

    • -
    • font_family – family of the font

    • -
    -
    -
    Returns:
    -

    the synthesized page

    -
    -
    -
    -
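    A minimal usage sketch, assuming result is a Document returned by an OCR predictor as in the model zoo examples:
    >>> import matplotlib.pyplot as plt
    >>> from doctr.utils.visualization import synthesize_page
    >>> page_export = result.export()['pages'][0]
    >>> plt.imshow(synthesize_page(page_export, draw_proba=True))
    >>> plt.axis('off')
    >>> plt.show()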

    Task evaluation

    @@ -382,20 +356,6 @@

    Visualization -
    -update(gt: List[str], pred: List[str]) None[source]
    -

    Update the state of the metric with new predictions

    -
    -
    Parameters:
    -
      -
• gt – list of ground-truth character sequences

    • -
    • pred – list of predicted character sequences

    • -
    -
    -
    -
    -
    summary() Dict[str, float][source]
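    Assuming the update and summary methods above belong to the TextMatch metric documented in this module, a minimal usage sketch:
    >>> from doctr.utils.metrics import TextMatch
    >>> metric = TextMatch()
    >>> metric.update(['Hello', 'world'], ['hello', 'world'])
    >>> metric.summary()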
    @@ -412,14 +372,14 @@

    Visualization
    -class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
    +class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]

    Implements common confusion metrics and mean IoU for localization evaluation.

    The aggregated metrics are computed as follows:

    \[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ +Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]

    with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and @@ -448,28 +408,9 @@

    Visualization
    Parameters:
    -
      -
    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    • -
    • use_polygons – if set to True, predictions and targets will be expected to have rotated format

    • -
    • mask_shape – if use_polygons is True, describes the spatial shape of the image used

    • -
    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

    • -
    +

    iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    -
    -
    -update(gts: ndarray, preds: ndarray) None[source]
    -

    Updates the metric

    -
    -
    Parameters:
    -
      -
    • gts – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

    • -
    • preds – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

    • -
    -
    -
    -
    -
    summary() Tuple[float | None, float | None, float | None][source]
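    A minimal usage sketch for this localization metric (the box coordinates are illustrative):
    >>> import numpy as np
    >>> from doctr.utils.metrics import LocalizationConfusion
    >>> metric = LocalizationConfusion(iou_thresh=0.5)
    >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
    >>> metric.summary()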
    @@ -485,15 +426,15 @@

    Visualization
    -class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
    -

    Implements an end-to-end OCR metric.

    +class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source] +

    Implements end-to-end OCR metric.

    The aggregated metrics are computed as follows:

    \[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ +Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]

    with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and @@ -525,116 +466,16 @@

    Visualization
    Parameters:
    -
      -
    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    • -
    • use_polygons – if set to True, predictions and targets will be expected to have rotated format

    • -
    • mask_shape – if use_polygons is True, describes the spatial shape of the image used

    • -
    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

    • -
    -
    -

    -
    -
    -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: List[str], pred_labels: List[str]) None[source]
    -

    Updates the metric

    -
    -
    Parameters:
    -
      -
    • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

    • -
    • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

    • -
    • gt_labels – a list of N string labels

    • -
    • pred_labels – a list of M string labels

    • -
    +

    iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    -
    -
    summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]

    Computes the aggregated metrics

    Returns:
    -

    a tuple with the recall & precision for each string comparison and the mean IoU

    -
    -
    -
    - - - -
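    Similarly, a minimal usage sketch for this end-to-end OCR metric (boxes and labels are illustrative):
    >>> import numpy as np
    >>> from doctr.utils.metrics import OCRMetric
    >>> metric = OCRMetric(iou_thresh=0.5)
    >>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]), ['hello'], ['hello', 'world'])
    >>> metric.summary()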
    -
    -class doctr.utils.metrics.DetectionMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
    -

    Implements an object detection metric.

    -

    The aggregated metrics are computed as follows:

    -
    -
    -\[\begin{split}\forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, -\forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ -Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ -Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
    -
    -

    with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, C}\) defined as:

    -
    -
    -\[\begin{split}\forall (b, c) \in \mathcal{B} \times \mathcal{C}, -h_{B,C}(b, c) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
    -
    -

    where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{C}\) is the set of possible class indices, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

    -
    -
    Example::
    >>> import numpy as np
    ->>> from doctr.utils import DetectionMetric
    ->>> metric = DetectionMetric(iou_thresh=0.5)
    ->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
    -np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64))
    ->>> metric.summary()
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    • -
    • use_polygons – if set to True, predictions and targets will be expected to have rotated format

    • -
    • mask_shape – if use_polygons is True, describes the spatial shape of the image used

    • -
    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

    • -
    -
    -
    -
    -
    -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: ndarray, pred_labels: ndarray) None[source]
    -

    Updates the metric

    -
    -
    Parameters:
    -
      -
    • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

    • -
    • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

    • -
    • gt_labels – an array of class indices of shape (N,)

    • -
    • pred_labels – an array of class indices of shape (M,)

    • -
    -
    -
    -
    - -
    -
    -summary() Tuple[float | None, float | None, float | None][source]
    -

    Computes the aggregated metrics

    -
    -
    Returns:
    -

    a tuple with the recall & precision for each class prediction and the mean IoU

    +

    a tuple with the recall & precision for each string comparison flexibility and the mean IoU

    @@ -649,15 +490,7 @@

    Visualization - -
    -
    - Next -
    -
    Changelog
    -
    - -
    + diff --git a/v0.8.1/_modules/doctr/datasets/classification/tensorflow.html b/v0.8.1/_modules/doctr/datasets/classification/tensorflow.html deleted file mode 100644 index 829b6efb9d..0000000000 --- a/v0.8.1/_modules/doctr/datasets/classification/tensorflow.html +++ /dev/null @@ -1,366 +0,0 @@ - - - - - - - - - - - - doctr.datasets.classification.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
    -
    -
    - -
    - -
    -
    - -
    - -
    -
    - -
    -
    -
    - - - - - Back to top - -
    -
    - -
    - -
    -
    -

    Source code for doctr.datasets.classification.tensorflow

    -# Copyright (C) 2021, Mindee.
    -
    -# This program is licensed under the Apache License version 2.
    -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
    -
    -import tensorflow as tf
    -
    -from .base import _CharacterGenerator
    -
    -__all__ = ['CharacterGenerator']
    -
    -
    -
    -[docs] -class CharacterGenerator(_CharacterGenerator): - """Implements a character image generation dataset - - Example:: - >>> from doctr.datasets import CharacterGenerator - >>> ds = CharacterGenerator(vocab='abdef') - >>> img, target = ds[0] - - Args: - vocab: vocabulary to take the character from - num_samples: number of samples that will be generated iterating over the dataset - cache_samples: whether generated images should be cached firsthand - sample_transforms: composable transformations that will be applied to each image - """ - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - - @staticmethod - def collate_fn(samples): - - images, targets = zip(*samples) - images = tf.stack(images, axis=0) - - return images, tf.convert_to_tensor(targets)
    - -
    -
    -
    -
    - - -
    -
    - - Made with Sphinx and @pradyunsg's - - Furo - -
    -
    - -
    -
    - -
    -
    - -
    -
    - - - - - - - - - \ No newline at end of file diff --git a/v0.8.1/_modules/doctr/datasets/datasets/tensorflow.html b/v0.8.1/_modules/doctr/datasets/datasets/tensorflow.html index 8a191ecfc7..fddca20034 100644 --- a/v0.8.1/_modules/doctr/datasets/datasets/tensorflow.html +++ b/v0.8.1/_modules/doctr/datasets/datasets/tensorflow.html @@ -236,7 +236,7 @@

    Package Reference

    • doctr.datasets
    • -
    • doctr.io
    • +
    • doctr.documents
    • doctr.models
    • doctr.transforms
    • doctr.utils
    • @@ -284,7 +284,6 @@

      Source code for doctr.datasets.datasets.tensorflow

      from typing import List, Any, Tuple import tensorflow as tf -from doctr.io import read_img_as_tensor from .base import _AbstractDataset, _VisionDataset @@ -293,14 +292,11 @@

      Source code for doctr.datasets.datasets.tensorflow

      class AbstractDataset(_AbstractDataset): - @staticmethod - def _get_img_shape(img: Any) -> Tuple[int, int]: - return img.shape[:2] - def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]: img_name, target = self.data[index] # Read image - img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float16 if self.fp16 else tf.float32) + img = tf.io.read_file(os.path.join(self.root, img_name)) + img = tf.image.decode_jpeg(img, channels=3) return img, target @@ -350,7 +346,7 @@

      Source code for doctr.datasets.datasets.tensorflow

      +
      diff --git a/v0.8.1/_modules/doctr/models/backbones/mobilenet/tensorflow.html b/v0.8.1/_modules/doctr/models/backbones/mobilenet/tensorflow.html deleted file mode 100644 index a0f857205e..0000000000 --- a/v0.8.1/_modules/doctr/models/backbones/mobilenet/tensorflow.html +++ /dev/null @@ -1,688 +0,0 @@ - - - - - - - - - - - - doctr.models.backbones.mobilenet.tensorflow - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
      -
      -
      - -
      - -
      -
      - -
      - -
      -
      - -
      -
      -
      - - - - - Back to top - -
      -
      - -
      - -
      -
      -

      Source code for doctr.models.backbones.mobilenet.tensorflow

      -# Copyright (C) 2021, Mindee.
      -
      -# This program is licensed under the Apache License version 2.
      -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
      -
      -# Greatly inspired by https://github.com/pytorch/vision/blob/master/torchvision/models/mobilenetv3.py
      -
      -from typing import Any, Dict, List, Optional, Tuple, Union
      -
      -import tensorflow as tf
      -from tensorflow.keras import layers
      -from tensorflow.keras.models import Sequential
      -
      -from ....datasets import VOCABS
      -from ...utils import conv_sequence, load_pretrained_params
      -
      -__all__ = ["MobileNetV3", "mobilenet_v3_small", "mobilenet_v3_small_r", "mobilenet_v3_large",
      -           "mobilenet_v3_large_r"]
      -
      -
      -default_cfgs: Dict[str, Dict[str, Any]] = {
      -    'mobilenet_v3_large': {
      -        'mean': (0.694, 0.695, 0.693),
      -        'std': (0.299, 0.296, 0.301),
      -        'input_shape': (32, 32, 3),
      -        'vocab': VOCABS['legacy_french'],
      -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/mobilenet_v3_large-d27d66f2.zip'
      -    },
      -    'mobilenet_v3_large_r': {
      -        'mean': (0.694, 0.695, 0.693),
      -        'std': (0.299, 0.296, 0.301),
      -        'input_shape': (32, 32, 3),
      -        'vocab': VOCABS['french'],
      -        'url': None,
      -    },
      -    'mobilenet_v3_small': {
      -        'mean': (0.694, 0.695, 0.693),
      -        'std': (0.299, 0.296, 0.301),
      -        'input_shape': (32, 32, 3),
      -        'vocab': VOCABS['legacy_french'],
      -        'url': 'https://github.com/mindee/doctr/releases/download/v0.3.0/mobilenet_v3_small-d624c4de.zip'
      -    },
      -    'mobilenet_v3_small_r': {
      -        'mean': (0.694, 0.695, 0.693),
      -        'std': (0.299, 0.296, 0.301),
      -        'input_shape': (32, 32, 3),
      -        'vocab': VOCABS['french'],
      -        'url': None,
      -    }
      -}
      -
      -
      -def hard_swish(x: tf.Tensor) -> tf.Tensor:
      -    return x * tf.nn.relu6(x + 3.) / 6.0
      -
      -
      -def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int:
      -    if min_value is None:
      -        min_value = divisor
      -    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
      -    # Make sure that round down does not go down by more than 10%.
      -    if new_v < 0.9 * v:
      -        new_v += divisor
      -    return new_v
      -
      -
      -class SqueezeExcitation(Sequential):
      -    """Squeeze and Excitation.
      -    """
      -    def __init__(self, chan: int, squeeze_factor: int = 4) -> None:
      -        super().__init__(
      -            [
      -                layers.GlobalAveragePooling2D(),
      -                layers.Dense(chan // squeeze_factor, activation='relu'),
      -                layers.Dense(chan, activation='hard_sigmoid'),
      -                layers.Reshape((1, 1, chan))
      -            ]
      -        )
      -
      -    def call(self, inputs: tf.Tensor, **kwargs: Any) -> tf.Tensor:
      -        x = super().call(inputs, **kwargs)
      -        x = tf.math.multiply(inputs, x)
      -        return x
      -
      -
      -class InvertedResidualConfig:
      -    def __init__(
      -        self,
      -        input_channels: int,
      -        kernel: int,
      -        expanded_channels: int,
      -        out_channels: int,
      -        use_se: bool,
      -        activation: str,
      -        stride: Union[int, Tuple[int, int]],
      -        width_mult: float = 1,
      -    ) -> None:
      -        self.input_channels = self.adjust_channels(input_channels, width_mult)
      -        self.kernel = kernel
      -        self.expanded_channels = self.adjust_channels(expanded_channels, width_mult)
      -        self.out_channels = self.adjust_channels(out_channels, width_mult)
      -        self.use_se = use_se
      -        self.use_hs = activation == "HS"
      -        self.stride = stride
      -
      -    @staticmethod
      -    def adjust_channels(channels: int, width_mult: float):
      -        return _make_divisible(channels * width_mult, 8)
      -
      -
      -class InvertedResidual(layers.Layer):
      -    """InvertedResidual for mobilenet
      -
      -    Args:
      -        conf: configuration object for inverted residual
      -    """
      -    def __init__(
      -        self,
      -        conf: InvertedResidualConfig,
      -        **kwargs: Any,
      -    ) -> None:
      -        _kwargs = {'input_shape': kwargs.pop('input_shape')} if isinstance(kwargs.get('input_shape'), tuple) else {}
      -        super().__init__(**kwargs)
      -
      -        act_fn = hard_swish if conf.use_hs else tf.nn.relu
      -
      -        _is_s1 = (isinstance(conf.stride, tuple) and conf.stride == (1, 1)) or conf.stride == 1
      -        self.use_res_connect = _is_s1 and conf.input_channels == conf.out_channels
      -
      -        _layers = []
      -        # expand
      -        if conf.expanded_channels != conf.input_channels:
      -            _layers.extend(conv_sequence(conf.expanded_channels, act_fn, kernel_size=1, bn=True, **_kwargs))
      -
      -        # depth-wise
      -        _layers.extend(conv_sequence(
      -            conf.expanded_channels, act_fn, kernel_size=conf.kernel, strides=conf.stride, bn=True,
      -            groups=conf.expanded_channels,
      -        ))
      -
      -        if conf.use_se:
      -            _layers.append(SqueezeExcitation(conf.expanded_channels))
      -
      -        # project
      -        _layers.extend(conv_sequence(
      -            conf.out_channels, None, kernel_size=1, bn=True,
      -        ))
      -
      -        self.block = Sequential(_layers)
      -
      -    def call(
      -        self,
      -        inputs: tf.Tensor,
      -        **kwargs: Any,
      -    ) -> tf.Tensor:
      -
      -        out = self.block(inputs, **kwargs)
      -        if self.use_res_connect:
      -            out = tf.add(out, inputs)
      -
      -        return out
      -
      -
      -class MobileNetV3(Sequential):
      -    """Implements MobileNetV3, inspired from both:
      -    <https://github.com/xiaochus/MobileNetV3/tree/master/model>`_.
      -    and <https://pytorch.org/vision/stable/_modules/torchvision/models/mobilenetv3.html>`_.
      -    """
      -
      -    def __init__(
      -        self,
      -        layout: List[InvertedResidualConfig],
      -        input_shape: Tuple[int, int, int],
      -        include_top: bool = False,
      -        head_chans: int = 1024,
      -        num_classes: int = 1000,
      -    ) -> None:
      -
      -        _layers = [
      -            Sequential(conv_sequence(layout[0].input_channels, hard_swish, True, kernel_size=3, strides=2,
      -                       input_shape=input_shape), name="stem")
      -        ]
      -
      -        for idx, conf in enumerate(layout):
      -            _layers.append(
      -                InvertedResidual(conf, name=f"inverted_{idx}"),
      -            )
      -
      -        _layers.append(
      -            Sequential(
      -                conv_sequence(6 * layout[-1].out_channels, hard_swish, True, kernel_size=1),
      -                name="final_block"
      -            )
      -        )
      -
      -        if include_top:
      -            _layers.extend([
      -                layers.GlobalAveragePooling2D(),
      -                layers.Dense(head_chans, activation=hard_swish),
      -                layers.Dropout(0.2),
      -                layers.Dense(num_classes),
      -            ])
      -
      -        super().__init__(_layers)
      -
      -
      -def _mobilenet_v3(
      -    arch: str,
      -    pretrained: bool,
      -    input_shape: Optional[Tuple[int, int, int]] = None,
      -    **kwargs: Any
      -) -> MobileNetV3:
      -    input_shape = input_shape or default_cfgs[arch]['input_shape']
      -
      -    # cf. Table 1 & 2 of the paper
      -    if arch.startswith("mobilenet_v3_small"):
      -        inverted_residual_setting = [
      -            InvertedResidualConfig(16, 3, 16, 16, True, "RE", 2),  # C1
      -            InvertedResidualConfig(16, 3, 72, 24, False, "RE", (2, 1) if arch.endswith("_r") else 2),  # C2
      -            InvertedResidualConfig(24, 3, 88, 24, False, "RE", 1),
      -            InvertedResidualConfig(24, 5, 96, 40, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C3
      -            InvertedResidualConfig(40, 5, 240, 40, True, "HS", 1),
      -            InvertedResidualConfig(40, 5, 240, 40, True, "HS", 1),
      -            InvertedResidualConfig(40, 5, 120, 48, True, "HS", 1),
      -            InvertedResidualConfig(48, 5, 144, 48, True, "HS", 1),
      -            InvertedResidualConfig(48, 5, 288, 96, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C4
      -            InvertedResidualConfig(96, 5, 576, 96, True, "HS", 1),
      -            InvertedResidualConfig(96, 5, 576, 96, True, "HS", 1),
      -        ]
      -        head_chans = 1024
      -    else:
      -        inverted_residual_setting = [
      -            InvertedResidualConfig(16, 3, 16, 16, False, "RE", 1),
      -            InvertedResidualConfig(16, 3, 64, 24, False, "RE", 2),  # C1
      -            InvertedResidualConfig(24, 3, 72, 24, False, "RE", 1),
      -            InvertedResidualConfig(24, 5, 72, 40, True, "RE", (2, 1) if arch.endswith("_r") else 2),  # C2
      -            InvertedResidualConfig(40, 5, 120, 40, True, "RE", 1),
      -            InvertedResidualConfig(40, 5, 120, 40, True, "RE", 1),
      -            InvertedResidualConfig(40, 3, 240, 80, False, "HS", (2, 1) if arch.endswith("_r") else 2),  # C3
      -            InvertedResidualConfig(80, 3, 200, 80, False, "HS", 1),
      -            InvertedResidualConfig(80, 3, 184, 80, False, "HS", 1),
      -            InvertedResidualConfig(80, 3, 184, 80, False, "HS", 1),
      -            InvertedResidualConfig(80, 3, 480, 112, True, "HS", 1),
      -            InvertedResidualConfig(112, 3, 672, 112, True, "HS", 1),
      -            InvertedResidualConfig(112, 5, 672, 160, True, "HS", (2, 1) if arch.endswith("_r") else 2),  # C4
      -            InvertedResidualConfig(160, 5, 960, 160, True, "HS", 1),
      -            InvertedResidualConfig(160, 5, 960, 160, True, "HS", 1),
      -        ]
      -        head_chans = 1280
      -
      -    kwargs['num_classes'] = kwargs.get('num_classes', len(default_cfgs[arch]['vocab']))
      -
      -    # Build the model
      -    model = MobileNetV3(
      -        inverted_residual_setting,
      -        input_shape,
      -        head_chans=head_chans,
      -        **kwargs,
      -    )
      -    # Load pretrained parameters
      -    if pretrained:
      -        load_pretrained_params(model, default_cfgs[arch]['url'])
      -
      -    return model
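The positional arguments in the InvertedResidualConfig entries above are easiest to read against the attribute
accesses in InvertedResidual. The annotated sketch below is an inferred reading: the field order
(input_channels, kernel, expanded_channels, out_channels, use_se, activation, stride) is an assumption based on
those accesses, not a class definition shown on this page.

    >>> # Hypothetical annotation of the C2 entry of mobilenet_v3_small above
    >>> InvertedResidualConfig(
    ...     16,      # input_channels: channels entering the block
    ...     3,       # kernel: depth-wise convolution kernel size
    ...     72,      # expanded_channels: width after the 1x1 expansion
    ...     24,      # out_channels: channels after the 1x1 projection
    ...     False,   # use_se: whether to append a SqueezeExcitation module
    ...     "RE",    # activation: "RE" -> ReLU, "HS" -> hard swish (use_hs)
    ...     2,       # stride: an int, or a (2, 1) tuple in the "_r" variants
    ... )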
      -
      -
      -
-[docs]
-def mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Small architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_small
-        >>> model = mobilenet_v3_small(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-
-    return _mobilenet_v3('mobilenet_v3_small', pretrained, **kwargs)
-
-[docs]
-def mobilenet_v3_small_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Small architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_, with rectangular pooling.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_small_r
-        >>> model = mobilenet_v3_small_r(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-
-    return _mobilenet_v3('mobilenet_v3_small_r', pretrained, **kwargs)
-
-[docs]
-def mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Large architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_large
-        >>> model = mobilenet_v3_large(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-    return _mobilenet_v3('mobilenet_v3_large', pretrained, **kwargs)
-
-[docs]
-def mobilenet_v3_large_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
-    """MobileNetV3-Large architecture as described in
-    `"Searching for MobileNetV3",
-    <https://arxiv.org/pdf/1905.02244.pdf>`_, with rectangular pooling.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import mobilenet_v3_large_r
-        >>> model = mobilenet_v3_large_r(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        a keras.Model
-    """
-    return _mobilenet_v3('mobilenet_v3_large_r', pretrained, **kwargs)
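As a quick sanity check (a sketch only: it assumes the four factory functions above are exposed under
doctr.models, as their docstring examples suggest), you can compare a square-stride model with its rectangular
"_r" counterpart, whose (2, 1) strides downsample the width less aggressively:

    >>> import tensorflow as tf
    >>> from doctr.models import mobilenet_v3_small, mobilenet_v3_small_r
    >>> x = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    >>> print(mobilenet_v3_small(pretrained=False)(x).shape)
    >>> print(mobilenet_v3_small_r(pretrained=False)(x).shape)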
\ No newline at end of file
diff --git a/v0.8.1/_modules/doctr/models/backbones/resnet/tensorflow.html b/v0.8.1/_modules/doctr/models/backbones/resnet/tensorflow.html
deleted file mode 100644
index d959be9a0f..0000000000
--- a/v0.8.1/_modules/doctr/models/backbones/resnet/tensorflow.html
+++ /dev/null
@@ -1,522 +0,0 @@
-doctr.models.backbones.resnet.tensorflow - docTR documentation

      Source code for doctr.models.backbones.resnet.tensorflow

      -# Copyright (C) 2021, Mindee.
      -
      -# This program is licensed under the Apache License version 2.
      -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
      -
      -from typing import Any, Dict, List, Optional, Tuple
      -
      -import tensorflow as tf
      -from tensorflow.keras import layers
      -from tensorflow.keras.models import Sequential
      -
      -from ...utils import conv_sequence, load_pretrained_params
      -
      -__all__ = ['ResNet', 'resnet31', 'ResnetStage']
      -
      -
      -default_cfgs: Dict[str, Dict[str, Any]] = {
      -    'resnet31': {'num_blocks': (1, 2, 5, 3), 'output_channels': (256, 256, 512, 512),
      -                 'conv_seq': (True, True, True, True), 'pooling': ((2, 2), (2, 1), None, None),
      -                 'url': None},
      -}
      -
      -
      -class ResnetBlock(layers.Layer):
      -
      -    """Implements a resnet31 block with shortcut
      -
      -    Args:
-        conv_shortcut: whether to use a convolutional shortcut
      -        output_channels: number of channels to use in Conv2D
      -        kernel_size: size of square kernels
      -        strides: strides to use in the first convolution of the block
      -    """
      -    def __init__(
      -        self,
      -        output_channels: int,
      -        conv_shortcut: bool,
      -        strides: int = 1,
      -        **kwargs
      -    ) -> None:
      -
      -        super().__init__(**kwargs)
      -        if conv_shortcut:
      -            self.shortcut = Sequential(
      -                [
      -                    layers.Conv2D(
      -                        filters=output_channels,
      -                        strides=strides,
      -                        padding='same',
      -                        kernel_size=1,
      -                        use_bias=False,
      -                        kernel_initializer='he_normal'
      -                    ),
      -                    layers.BatchNormalization()
      -                ]
      -            )
      -        else:
      -            self.shortcut = layers.Lambda(lambda x: x)
      -        self.conv_block = Sequential(
      -            self.conv_resnetblock(output_channels, 3, strides)
      -        )
      -        self.act = layers.Activation('relu')
      -
      -    @staticmethod
      -    def conv_resnetblock(
      -        output_channels: int,
      -        kernel_size: int,
      -        strides: int = 1,
      -    ) -> List[layers.Layer]:
      -        return [
      -            *conv_sequence(output_channels, activation='relu', bn=True, strides=strides, kernel_size=kernel_size),
      -            layers.Conv2D(output_channels, kernel_size, padding='same', use_bias=False, kernel_initializer='he_normal'),
      -            layers.BatchNormalization(),
      -        ]
      -
      -    def call(
      -        self,
      -        inputs: tf.Tensor
      -    ) -> tf.Tensor:
      -        clone = self.shortcut(inputs)
      -        conv_out = self.conv_block(inputs)
      -        out = self.act(clone + conv_out)
      -
      -        return out
      -
      -
      -class ResnetStage(Sequential):
      -
      -    """Implements a resnet31 stage
      -
      -    Args:
      -        num_blocks: number of blocks inside the stage
      -        output_channels: number of channels to use in Conv2D
      -        downsample: if true, performs a /2 downsampling at the first block of the stage
      -    """
      -    def __init__(
      -        self,
      -        num_blocks: int,
      -        output_channels: int,
      -        downsample: bool = False,
      -    ) -> None:
      -
      -        super().__init__()
      -        final_blocks = [
      -            ResnetBlock(output_channels, conv_shortcut=False) for _ in range(1, num_blocks)
      -        ]
      -        if downsample is True:
      -            self.add(ResnetBlock(output_channels, conv_shortcut=True, strides=2))
      -        else:
      -            self.add(ResnetBlock(output_channels, conv_shortcut=True))
      -        for final_block in final_blocks:
      -            self.add(final_block)
      -
      -
      -class ResNet(Sequential):
      -
      -    """Resnet class with two convolutions and a maxpooling before the first stage
      -
      -    Args:
-        num_blocks: number of resnet blocks in each stage
-        output_channels: number of channels in each stage
-        conv_seq: whether to add a conv_sequence after each stage
-        pooling: pooling to add after each stage (if None, no pooling)
      -        input_shape: shape of inputs
      -        include_top: whether the classifier head should be instantiated
      -    """
      -
      -    def __init__(
      -        self,
      -        num_blocks: Tuple[int, int, int, int],
      -        output_channels: Tuple[int, int, int, int],
      -        conv_seq: Tuple[bool, bool, bool, bool],
      -        pooling: Tuple[
      -            Optional[Tuple[int, int]],
      -            Optional[Tuple[int, int]],
      -            Optional[Tuple[int, int]],
      -            Optional[Tuple[int, int]]
      -        ],
      -        input_shape: Tuple[int, int, int] = (640, 640, 3),
      -        include_top: bool = False,
      -    ) -> None:
      -
      -        _layers = [
      -            *conv_sequence(out_channels=64, activation='relu', bn=True, kernel_size=3, input_shape=input_shape),
      -            *conv_sequence(out_channels=128, activation='relu', bn=True, kernel_size=3),
      -            layers.MaxPool2D(pool_size=2, strides=2, padding='valid'),
      -        ]
      -        for n_blocks, out_channels, conv, pool in zip(num_blocks, output_channels, conv_seq, pooling):
      -            _layers.append(ResnetStage(n_blocks, out_channels))
      -            if conv:
      -                _layers.extend(conv_sequence(out_channels, activation='relu', bn=True, kernel_size=3))
      -            if pool:
      -                _layers.append(layers.MaxPool2D(pool_size=pool, strides=pool, padding='valid'))
      -        super().__init__(_layers)
      -
      -
      -def _resnet(arch: str, pretrained: bool, **kwargs: Any) -> ResNet:
      -
      -    # Build the model
      -    model = ResNet(
      -        default_cfgs[arch]['num_blocks'],
      -        default_cfgs[arch]['output_channels'],
      -        default_cfgs[arch]['conv_seq'],
      -        default_cfgs[arch]['pooling'],
      -        **kwargs
      -    )
      -    # Load pretrained parameters
      -    if pretrained:
      -        load_pretrained_params(model, default_cfgs[arch]['url'])
      -
      -    return model
      -
      -
      -
-[docs]
-def resnet31(pretrained: bool = False, **kwargs: Any) -> ResNet:
-    """Resnet31 architecture with rectangular pooling windows as described in
-    `"Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition",
-    <https://arxiv.org/pdf/1811.00751.pdf>`_. Downsizing: (H, W) --> (H/8, W/4)
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import resnet31
-        >>> model = resnet31(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained: boolean, True if model is pretrained
-
-    Returns:
-        A resnet31 model
-    """
-
-    return _resnet('resnet31', pretrained, **kwargs)
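The docstring above states a (H, W) --> (H/8, W/4) downsizing. A rough way to check it (a sketch; the shape in
the comment is inferred from that ratio and the 512 output channels in default_cfgs, not a measured output):

    >>> import tensorflow as tf
    >>> from doctr.models import resnet31
    >>> model = resnet31(pretrained=False)
    >>> feat = model(tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32))
    >>> print(feat.shape)  # expected (1, 28, 56, 512) if the stated ratio holds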
\ No newline at end of file
diff --git a/v0.8.1/_modules/doctr/models/backbones/vgg/tensorflow.html b/v0.8.1/_modules/doctr/models/backbones/vgg/tensorflow.html
deleted file mode 100644
index 48c285257a..0000000000
--- a/v0.8.1/_modules/doctr/models/backbones/vgg/tensorflow.html
+++ /dev/null
@@ -1,413 +0,0 @@
-doctr.models.backbones.vgg.tensorflow - docTR documentation

      Source code for doctr.models.backbones.vgg.tensorflow

      -# Copyright (C) 2021, Mindee.
      -
      -# This program is licensed under the Apache License version 2.
      -# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
      -
      -from typing import Any, Dict, Tuple
      -
      -from tensorflow.keras import layers
      -from tensorflow.keras.models import Sequential
      -
      -from ...utils import conv_sequence, load_pretrained_params
      -
      -__all__ = ['VGG', 'vgg16_bn']
      -
      -
      -default_cfgs: Dict[str, Dict[str, Any]] = {
      -    'vgg16_bn': {'num_blocks': (2, 2, 3, 3, 3), 'planes': (64, 128, 256, 512, 512),
      -                 'rect_pools': (False, False, True, True, True),
      -                 'url': None},
      -}
      -
      -
      -class VGG(Sequential):
      -    """Implements the VGG architecture from `"Very Deep Convolutional Networks for Large-Scale Image Recognition"
      -    <https://arxiv.org/pdf/1409.1556.pdf>`_.
      -
      -    Args:
-        num_blocks: number of convolutional blocks in each stage
-        planes: number of output channels in each stage
-        rect_pools: whether pooling square kernels should be replaced with rectangular ones
-        input_shape: shape of the input tensor
      -        include_top: whether the classifier head should be instantiated
      -    """
      -    def __init__(
      -        self,
      -        num_blocks: Tuple[int, int, int, int, int],
      -        planes: Tuple[int, int, int, int, int],
      -        rect_pools: Tuple[bool, bool, bool, bool, bool],
      -        input_shape: Tuple[int, int, int] = (512, 512, 3),
      -        include_top: bool = False,
      -    ) -> None:
      -
      -        _layers = []
      -        # Specify input_shape only for the first layer
      -        kwargs = {"input_shape": input_shape}
      -        for nb_blocks, out_chan, rect_pool in zip(num_blocks, planes, rect_pools):
      -            for _ in range(nb_blocks):
      -                _layers.extend(conv_sequence(out_chan, 'relu', True, kernel_size=3, **kwargs))  # type: ignore[arg-type]
      -                kwargs = {}
      -            _layers.append(layers.MaxPooling2D((2, 1 if rect_pool else 2)))
      -        super().__init__(_layers)
      -
      -
      -def _vgg(arch: str, pretrained: bool, **kwargs: Any) -> VGG:
      -
      -    # Build the model
      -    model = VGG(default_cfgs[arch]['num_blocks'], default_cfgs[arch]['planes'],
      -                default_cfgs[arch]['rect_pools'], **kwargs)
      -    # Load pretrained parameters
      -    if pretrained:
      -        load_pretrained_params(model, default_cfgs[arch]['url'])
      -
      -    return model
      -
      -
      -
-[docs]
-def vgg16_bn(pretrained: bool = False, **kwargs: Any) -> VGG:
-    """VGG-16 architecture as described in `"Very Deep Convolutional Networks for Large-Scale Image Recognition"
-    <https://arxiv.org/pdf/1409.1556.pdf>`_, modified by adding batch normalization.
-
-    Example::
-        >>> import tensorflow as tf
-        >>> from doctr.models import vgg16_bn
-        >>> model = vgg16_bn(pretrained=False)
-        >>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
-        >>> out = model(input_tensor)
-
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-
-    Returns:
-        VGG feature extractor
-    """
-
-    return _vgg('vgg16_bn', pretrained, **kwargs)
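A brief note on the rect_pools setting (a sketch under the same assumptions as the docstring example): with the
vgg16_bn configuration (False, False, True, True, True), the last three stages pool only along the height axis,
which keeps more horizontal resolution for wide text crops such as the (32, 128, 3) recognition inputs mentioned
elsewhere in these docs:

    >>> import tensorflow as tf
    >>> from doctr.models import vgg16_bn
    >>> model = vgg16_bn(pretrained=False)
    >>> crop = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
    >>> print(model(crop).shape)  # height is divided by 32, width only by 4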
      - - - - - - - - - \ No newline at end of file diff --git a/v0.8.1/_sources/datasets.rst.txt b/v0.8.1/_sources/datasets.rst.txt index 8a00eeaedd..354122f1e5 100644 --- a/v0.8.1/_sources/datasets.rst.txt +++ b/v0.8.1/_sources/datasets.rst.txt @@ -11,42 +11,22 @@ can be a significant save of time. Available Datasets ------------------ -Here are all datasets that are available through docTR: +The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL. +.. autoclass:: doctr.datasets.datasets.VisionDataset -Public datasets -^^^^^^^^^^^^^^^ + +Here are all datasets that are available through DocTR: .. autoclass:: FUNSD .. autoclass:: SROIE .. autoclass:: CORD -.. autoclass:: IIIT5K -.. autoclass:: SVT -.. autoclass:: SVHN -.. autoclass:: SynthText -.. autoclass:: IC03 -.. autoclass:: IC13 - -docTR synthetic datasets -^^^^^^^^^^^^^^^^^^^^^^^^ - -.. autoclass:: DocArtefacts -.. autoclass:: CharacterGenerator -.. autoclass:: WordGenerator - -docTR private datasets -^^^^^^^^^^^^^^^^^^^^^^ - -Since many documents include sensitive / personal information, we are not able to share all the data that has been used for this project. However, we provide some guidance on how to format your own dataset into the same format so that you can use all docTR tools all the same. - -.. autoclass:: DetectionDataset -.. autoclass:: RecognitionDataset .. autoclass:: OCRDataset Data Loading ------------ -Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR. +Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR. .. autoclass:: doctr.datasets.loader.DataLoader @@ -56,10 +36,10 @@ Each dataset has its specific way to load a sample, but handling batch aggregati Supported Vocabs ---------------- -Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +Since textual content has to be encoded properly for models to interpret them efficiently, DocTR supports multiple sets of vocabs. -.. list-table:: docTR Vocabs +.. list-table:: DocTR Vocabs :widths: 20 5 50 :header-rows: 1 @@ -79,25 +59,10 @@ of vocabs. 
- 5 - £€¥¢฿ * - latin - - 94 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ - * - english - - 100 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ - * - legacy_french - - 123 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ + - 96 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~° * - french - - 126 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ - * - portuguese - - 131 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿ - * - spanish - - 116 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿ - * - german - - 108 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ + - 154 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿ .. autofunction:: encode_sequences diff --git a/v0.8.1/_sources/installing.rst.txt b/v0.8.1/_sources/installing.rst.txt index 8197df660d..5c8779dc1c 100644 --- a/v0.8.1/_sources/installing.rst.txt +++ b/v0.8.1/_sources/installing.rst.txt @@ -3,7 +3,7 @@ Installation ************ -This library requires `Python `_ 3.6 or higher. +This library requires Python 3.6 or higher. Prerequisites @@ -11,12 +11,12 @@ Prerequisites Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so: -* `TensorFlow 2 `_ -* `PyTorch `_ +* TensorFlow: `installation page `_. +* PyTorch: `installation page `_. If you are running another OS than Linux, you will need a few extra dependencies. -For MacOS users, you can install them using `Homebrew `_ as follows: +For MacOS users, you can install them as follows: .. code:: shell @@ -28,23 +28,13 @@ For Windows users, those dependencies are included in GTK. You can find the late Via Python Package ================== -Install the last stable release of the package using `pip `_: +Install the last stable release of the package using pip: .. code:: bash pip install python-doctr -We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows: - -.. code:: bash - - # for TensorFlow - pip install "python-doctr[tf]" - # for PyTorch - pip install "python-doctr[torch]" - - Via Git ======= @@ -54,13 +44,3 @@ Install the library in developper mode: git clone https://github.com/mindee/doctr.git pip install -e doctr/. - -Again, for framework-specific builds: - -.. code:: bash - - git clone https://github.com/mindee/doctr.git - # for TensorFlow - pip install -e doctr/.[tf] - # for PyTorch - pip install -e doctr/.[torch] diff --git a/v0.8.1/_sources/io.rst.txt b/v0.8.1/_sources/io.rst.txt deleted file mode 100644 index 8fa887e9f9..0000000000 --- a/v0.8.1/_sources/io.rst.txt +++ /dev/null @@ -1,94 +0,0 @@ -doctr.io -======== - - -.. currentmodule:: doctr.io - -The io module enables users to easily access content from documents and export analysis -results to structured formats. 
- -.. _document_structure: - -Document structure ------------------- - -Structural organization of the documents. - -Word -^^^^ -A Word is an uninterrupted sequence of characters. - -.. autoclass:: Word - -Line -^^^^ -A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines). - -.. autoclass:: Line - -Artefact -^^^^^^^^ - -An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.). - -.. autoclass:: Artefact - -Block -^^^^^ -A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath). - -.. autoclass:: Block - -Page -^^^^ - -A Page is a collection of Blocks that were on the same physical page. - -.. autoclass:: Page - - .. automethod:: show - - -Document -^^^^^^^^ - -A Document is a collection of Pages. - -.. autoclass:: Document - - .. automethod:: show - - -File reading ------------- - -High-performance file reading and conversion to processable structured data. - -.. autofunction:: read_pdf - -.. autofunction:: read_img_as_numpy - -.. autofunction:: read_img_as_tensor - -.. autofunction:: decode_img_as_tensor - -.. autofunction:: read_html - - -.. autoclass:: DocumentFile - - .. automethod:: from_pdf - - .. automethod:: from_url - - .. automethod:: from_images - -.. autoclass:: PDF - - .. automethod:: as_images - - .. automethod:: get_words - - .. automethod:: get_lines - - .. automethod:: get_artefacts diff --git a/v0.8.1/_sources/models.rst.txt b/v0.8.1/_sources/models.rst.txt index d4f36df9bb..9830c6c153 100644 --- a/v0.8.1/_sources/models.rst.txt +++ b/v0.8.1/_sources/models.rst.txt @@ -1,62 +1,215 @@ doctr.models ============ -.. currentmodule:: doctr.models - - -doctr.models.classification ----------------------- +The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. +Either performed at once or separately, to each task corresponds a type of deep learning architecture. -.. autofunction:: doctr.models.classification.vgg16_bn_r +.. currentmodule:: doctr.models -.. autofunction:: doctr.models.classification.resnet18 +For a given task, DocTR provides a Predictor, which is composed of 2 components: -.. autofunction:: doctr.models.classification.resnet31 +* PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model. +* Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable. -.. autofunction:: doctr.models.classification.mobilenet_v3_small -.. autofunction:: doctr.models.classification.mobilenet_v3_large +Text Detection +-------------- +Localizing text elements in images -.. 
autofunction:: doctr.models.classification.mobilenet_v3_small_r ++---------------------------------------------------+----------------------------+----------------------------+---------+ +| | FUNSD | CORD | | ++==================+=================+==============+============+===============+============+===============+=========+ +| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ +| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | ++------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -.. autofunction:: doctr.models.classification.mobilenet_v3_large_r +All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. -.. autofunction:: doctr.models.classification.mobilenet_v3_small_orientation +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* -.. autofunction:: doctr.models.classification.magc_resnet31 +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. -.. autofunction:: doctr.models.classification.crop_orientation_predictor +Pre-processing for detection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for detection is the following: +1. resize each input image to the target size (bilinear interpolation by default) with potential deformation. +2. batch images together +3. normalize the batch using the training data statistics -doctr.models.detection ----------------------- -.. autofunction:: doctr.models.detection.linknet_resnet18 +Detection models +^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: .. autofunction:: doctr.models.detection.db_resnet50 +.. autofunction:: doctr.models.detection.linknet16 -.. autofunction:: doctr.models.detection.db_mobilenet_v3_large +Detection predictors +^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage, predictors lets you pass numpy images as inputs and return structured information. .. autofunction:: doctr.models.detection.detection_predictor -doctr.models.recognition ------------------------- +Text Recognition +---------------- +Identifying strings in images + +.. list-table:: Text recognition model zoo + :widths: 20 20 15 10 10 10 + :header-rows: 1 + + * - Architecture + - Input shape + - # params + - FUNSD + - CORD + - FPS + * - crnn_vgg16_bn + - (32, 128, 3) + - 15.8M + - 86.02 + - 91.3 + - 12.8 + * - sar_vgg16_bn + - (32, 128, 3) + - 21.5M + - 86.2 + - 91.7 + - 3.3 + * - sar_resnet31 + - (32, 128, 3) + - 53.1M + - **86.3** + - **92.1** + - 2.7 + +All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). 
+Explanations about the metrics being used are available in :ref:`metrics`. + +All these recognition models are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the model, we feed the model with 100 random tensors of shape [1, 32, 128, 3] as a warm-up. Then, we measure the average speed of the model on 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Pre-processing for recognition +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +In DocTR, the pre-processing scheme for recognition is the following: + +1. resize each input image to the target size (bilinear interpolation by default) without deformation. +2. pad the image to the target size (with zeros by default) +3. batch images together +4. normalize the batch using the training data statistics + +Recognition models +^^^^^^^^^^^^^^^^^^ +Models expect a TensorFlow tensor as input and produces one in return. DocTR includes implementations and pretrained versions of the following models: + .. autofunction:: doctr.models.recognition.crnn_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_vgg16_bn +.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_small -.. autofunction:: doctr.models.recognition.crnn_mobilenet_v3_large +Recognition predictors +^^^^^^^^^^^^^^^^^^^^^^ +Combining the right components around a given architecture for easier usage. -.. autofunction:: doctr.models.recognition.sar_resnet31 +.. autofunction:: doctr.models.recognition.recognition_predictor -.. autofunction:: doctr.models.recognition.master -.. autofunction:: doctr.models.recognition.recognition_predictor +End-to-End OCR +-------------- +Predictors that localize and identify text elements in images ++-----------------------------+--------------------------------------+--------------------------------------+ +| | FUNSD | CORD | ++=============================+============+===============+=========+============+===============+=========+ +| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + crnn_vgg16_bn | 70.08 | 74.77 | 0.85 | 82.19 | **79.67** | 1.6 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_vgg16_bn | N/A | N/A | 0.49 | N/A | N/A | 1.0 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + sar_resnet31 | N/A | N/A | 0.27 | N/A | N/A | 0.83 | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| Gvision doc. 
text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ +| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | ++-----------------------------+------------+---------------+---------+------------+---------------+---------+ + +All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). +Explanations about the metrics being used are available in :ref:`metrics`. + +All recognition models of predictors are trained with our french vocab (cf. :ref:`vocabs`). + +*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* + +FPS (Frames per second) is computed this way: we instantiate the predictor, we warm-up the model and then we measure the average speed of the end-to-end predictor on the datasets, with a batch size of 1. +We used a c5.x12large from AWS instances (CPU Xeon Platinum 8275L) to perform experiments. + +Results on private ocr datasets + ++------------------------------------+----------------------------+----------------------------+----------------------------+ +| | Receipts | Invoices | IDs | ++====================================+============+===============+============+===============+============+===============+ +| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + crnn_vgg16_bn (ours) | **78.90** | **81.01** | 65.68 | **69.86** | **49.48** | **50.46** | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ +| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | ++------------------------------------+------------+---------------+------------+---------------+------------+---------------+ + + +Two-stage approaches +^^^^^^^^^^^^^^^^^^^^ +Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. + +.. autofunction:: doctr.models.zoo.ocr_predictor + + +Model export +------------ +Utility functions to make the most of document analysis models. + +.. currentmodule:: doctr.models.export + +Model compression +^^^^^^^^^^^^^^^^^ + +.. autofunction:: convert_to_tflite + +.. autofunction:: convert_to_fp16 + +.. autofunction:: quantize_model + +Using SavedModel +^^^^^^^^^^^^^^^^ + +Additionally, models in DocTR inherit TensorFlow 2 model properties and can be exported to +`SavedModel `_ format as follows: + + + >>> import tensorflow as tf + >>> from doctr.models import db_resnet50 + >>> model = db_resnet50(pretrained=True) + >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) + >>> _ = model(input_t, training=False) + >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') + +And loaded just as easily: -doctr.models.zoo ----------------- -.. 
autofunction:: doctr.models.ocr_predictor + >>> import tensorflow as tf + >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.8.1/_sources/notebooks.md.txt b/v0.8.1/_sources/notebooks.md.txt deleted file mode 100644 index ea43ac0f39..0000000000 --- a/v0.8.1/_sources/notebooks.md.txt +++ /dev/null @@ -1,9 +0,0 @@ -# docTR Notebooks - -Here are some notebooks compiled for users to better leverage the library capabilities: - -| Notebook | Description | | -|:----------|:-------------|------:| -| [Quicktour](https://github.com/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | A presentation of the main features of docTR | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb) | -| [Export as PDF/A](https://github.com/mindee/notebooks/blob/main/doctr/export_as_pdfa.ipynb) | Produce searchable PDFs from docTR results | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/export_as_pdfa.ipynb) | -[Artefact detection](https://github.com/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb) | Object detection for artefacts in documents | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/artefact_detection.ipynb) | diff --git a/v0.8.1/_sources/transforms.rst.txt b/v0.8.1/_sources/transforms.rst.txt index ff11a3a38e..0230fe75f5 100644 --- a/v0.8.1/_sources/transforms.rst.txt +++ b/v0.8.1/_sources/transforms.rst.txt @@ -8,7 +8,7 @@ Data transformations are part of both training and inference procedure. Drawing Supported transformations ------------------------- -Here are all transformations that are available through docTR: +Here are all transformations that are available through DocTR: .. autoclass:: Resize .. autoclass:: Normalize @@ -21,11 +21,6 @@ Here are all transformations that are available through docTR: .. autoclass:: RandomHue .. autoclass:: RandomGamma .. autoclass:: RandomJpegQuality -.. autoclass:: RandomRotate -.. autoclass:: RandomCrop -.. autoclass:: GaussianBlur -.. autoclass:: ChannelShuffle -.. autoclass:: GaussianNoise Composing transformations diff --git a/v0.8.1/_sources/using_model_export.rst.txt b/v0.8.1/_sources/using_model_export.rst.txt deleted file mode 100644 index 992f4e9866..0000000000 --- a/v0.8.1/_sources/using_model_export.rst.txt +++ /dev/null @@ -1,71 +0,0 @@ -Preparing your model for inference -================================== - -A well-trained model is a good achievement but you might want to tune a few things to make it production-ready! - -.. currentmodule:: doctr.models.export - - -Model compression ------------------ - -This section is meant to help you perform inference with compressed versions of your model. - - -TensorFlow Lite -^^^^^^^^^^^^^^^ - -TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. 
You can easily convert any Keras model into a serialized TFLite version as follows: - - >>> import tensorflow as tf - >>> from tensorflow.keras import Sequential - >>> from doctr.models import conv_sequence - >>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3))) - >>> converter = tf.lite.TFLiteConverter.from_keras_model(tf_model) - >>> serialized_model = converter.convert() - -Half-precision -^^^^^^^^^^^^^^ - -If you want to convert it to half-precision using your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> converter.target_spec.supported_types = [tf.float16] - >>> serialized_model = converter.convert() - - -Post-training quantization -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Finally if you wish to quantize the model with your TFLite converter - - >>> converter.optimizations = [tf.lite.Optimize.DEFAULT] - >>> # Float fallback for operators that do not have an integer implementation - >>> def representative_dataset(): - >>> for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)] - >>> converter.representative_dataset = representative_dataset - >>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - >>> converter.inference_input_type = tf.int8 - >>> converter.inference_output_type = tf.int8 - >>> serialized_model = converter.convert() - - -Using SavedModel ----------------- - -Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -`SavedModel `_ format as follows: - - - >>> import tensorflow as tf - >>> from doctr.models import db_resnet50 - >>> model = db_resnet50(pretrained=True) - >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32) - >>> _ = model(input_t, training=False) - >>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/') - -And loaded just as easily: - - - >>> import tensorflow as tf - >>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/') diff --git a/v0.8.1/_sources/using_models.rst.txt b/v0.8.1/_sources/using_models.rst.txt deleted file mode 100644 index 1c0752463f..0000000000 --- a/v0.8.1/_sources/using_models.rst.txt +++ /dev/null @@ -1,329 +0,0 @@ -Choosing the right model -======================== - -The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. -Either performed at once or separately, to each task corresponds a type of deep learning architecture. - -.. currentmodule:: doctr.models - -For a given task, docTR provides a Predictor, which is composed of 2 components: - -* PreProcessor: a module in charge of making inputs directly usable by the deep learning model. -* Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable. - - -Text Detection --------------- - -The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don't). 
- -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `linknet_resnet18 `_ -* `db_resnet50 `_ -* `db_mobilenet_v3_large `_ - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -+------------------------------------------------------------------+----------------------------+----------------------------+---------+ -| | FUNSD | CORD | | -+=================================+=================+==============+============+===============+============+===============+=========+ -| **Architecture** | **Input shape** | **# params** | **Recall** | **Precision** | **Recall** | **Precision** | **FPS** | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_resnet50 | (1024, 1024, 3) | 25.2 M | 82.14 | 87.64 | 92.49 | 89.66 | 2.1 | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ -| db_mobilenet_v3_large | (1024, 1024, 3) | 4.2 M | 79.35 | 84.03 | 81.14 | 66.85 | | -+---------------------------------+-----------------+--------------+------------+---------------+------------+---------------+---------+ - - -All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Detection predictors -^^^^^^^^^^^^^^^^^^^^ - -`detection_predictor `_ wraps your detection model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import detection_predictor - >>> predictor = detection_predictor('db_resnet50') - >>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -Text Recognition ----------------- - -The task consists of transcribing the character sequence in a given image. - - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -The following architectures are currently supported: - -* `crnn_vgg16_bn `_ -* `crnn_mobilenet_v3_small `_ -* `crnn_mobilenet_v3_large `_ -* `sar_resnet31 `_ -* `master `_ - - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - - -.. list-table:: Text recognition model zoo - :header-rows: 1 - - * - Architecture - - Input shape - - # params - - FUNSD - - CORD - - FPS - * - crnn_vgg16_bn - - (32, 128, 3) - - 15.8M - - 87.18 - - 92.93 - - 12.8 - * - crnn_mobilenet_v3_small - - (32, 128, 3) - - 2.1M - - 86.21 - - 90.56 - - - * - crnn_mobilenet_v3_large - - (32, 128, 3) - - 4.5M - - 86.95 - - 92.03 - - - * - sar_resnet31 - - (32, 128, 3) - - 56.2M - - **87.70** - - **93.41** - - 2.7 - * - master - - (32, 128, 3) - - 67.7M - - 87.62 - - 93.27 - - - -All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metric being used (exact match) are available in :ref:`metrics`. 
- -While most of our recognition models were trained on our french vocab (cf. :ref:`vocabs`), you can easily access the vocab of any model as follows: - - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> print(predictor.model.cfg['vocab']) - - -*Disclaimer: both FUNSD subsets combine have 30595 word-level crops which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - - -Recognition predictors -^^^^^^^^^^^^^^^^^^^^^^ -`recognition_predictor `_ wraps your recognition model to make it easily useable with your favorite deep learning framework seamlessly. - - >>> import numpy as np - >>> from doctr.models import recognition_predictor - >>> predictor = recognition_predictor('crnn_vgg16_bn') - >>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8) - >>> out = model([dummy_img]) - - -End-to-End OCR --------------- - -The task consists of both localizing and transcribing textual elements in a given image. - -Available architectures -^^^^^^^^^^^^^^^^^^^^^^^ - -You can use any combination of detection and recognition models supporte by docTR. - -For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets: - -+----------------------------------------+--------------------------------------+--------------------------------------+ -| | FUNSD | CORD | -+========================================+============+===============+=========+============+===============+=========+ -| **Architecture** | **Recall** | **Precision** | **FPS** | **Recall** | **Precision** | **FPS** | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_vgg16_bn | 71.25 | 76.02 | 0.85 | 84.00 | 81.42 | 1.6 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + master | 71.03 | 76.06 | | 84.49 | 81.94 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + sar_resnet31 | 71.25 | 76.29 | 0.27 | 84.50 | **81.96** | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_small | 69.85 | 74.80 | | 80.85 | 78.42 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_resnet50 + crnn_mobilenet_v3_large | 70.57 | 75.57 | | 82.57 | 80.08 | 0.83 | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| db_mobilenet_v3_large + crnn_vgg16_bn | 67.73 | 71.73 | | 71.65 | 59.03 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| Gvision doc. 
text detection | 64.00 | 53.30 | | 68.90 | 61.10 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ -| AWS textract | **78.10** | **83.00** | | **87.50** | 66.00 | | -+----------------------------------------+------------+---------------+---------+------------+---------------+---------+ - -All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. :ref:`datasets`). -Explanations about the metrics being used are available in :ref:`metrics`. - -*Disclaimer: both FUNSD subsets combine have 199 pages which might not be representative enough of the model capabilities* - -FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a `c5.x12large `_ AWS instance (CPU Xeon Platinum 8275L). - -Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information. - - -+----------------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | US Tax Forms | Resumes | Road Fines | -+==============================================+============+===============+============+===============+============+===============+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | 78.70 | 81.12 | 65.80 | 70.70 | 50.25 | 51.78 | 79.08 | 92.83 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + master (ours) | **79.00** | **81.42** | 65.57 | 69.86 | 51.34 | 52.90 | 78.86 | 92.57 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + sar_resnet31 (ours) | 78.94 | 81.37 | 65.89 | **70.79** | **51.78** | **53.35** | 79.04 | 92.78 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_mobilenet_v3_small (ours) | 76.81 | 79.15 | 64.89 | 69.61 | 45.03 | 46.38 | 78.96 | 92.11 | 85.91 | 87.20 | 84.85 | 85.86 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| 
db_resnet50 + crnn_mobilenet_v3_large (ours) | 78.01 | 80.39 | 65.36 | 70.11 | 48.00 | 49.43 | 79.39 | 92.62 | 87.68 | 89.00 | 85.65 | 86.67 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 | 74.93 | 63.04 | 68.41 | 39.36 | 41.75 | 72.14 | 89.97 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | | | | | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ - - -Two-stage approaches -^^^^^^^^^^^^^^^^^^^^ -Those architectures involve one stage of text detection, and one stage of text recognition. The text detection will be used to produces cropped images that will be passed into the text recognition block. Everything is wrapped up with `ocr_predictor `_. - - >>> import numpy as np - >>> from doctr.models import ocr_predictor - >>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True) - >>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8) - >>> out = model([input_page]) - - -What should I do with the output? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The ocr_predictor returns a `Document` object with a nested structure (with `Page`, `Block`, `Line`, `Word`, `Artefact`). 
-To get a better understanding of our document model, check our :ref:`document_structure` section - -Here is a typical `Document` layout:: - - Document( - (pages): [Page( - dimensions=(340, 600) - (blocks): [Block( - (lines): [Line( - (words): [ - Word(value='No.', confidence=0.91), - Word(value='RECEIPT', confidence=0.99), - Word(value='DATE', confidence=0.96), - ] - )] - (artefacts): [] - )] - )] - ) - -You can also export them as a nested dict, more appropriate for JSON format:: - - json_output = result.export() - -For reference, here is the JSON export for the same `Document` as above:: - - { - 'pages': [ - { - 'page_idx': 0, - 'dimensions': (340, 600), - 'orientation': {'value': None, 'confidence': None}, - 'language': {'value': None, 'confidence': None}, - 'blocks': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'lines': [ - { - 'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)), - 'words': [ - { - 'value': 'No.', - 'confidence': 0.914085328578949, - 'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)) - }, - { - 'value': 'RECEIPT', - 'confidence': 0.9949972033500671, - 'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)) - }, - { - 'value': 'DATE', - 'confidence': 0.9578408598899841, - 'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)) - } - ] - } - ], - 'artefacts': [] - } - ] - } - ] - } - -To export the outpout as XML (hocr-format) you can use the `export_as_xml` method:: - - xml_output = result.export_as_xml() - for output in xml_output: - xml_bytes_string = output[0] - xml_element = output[1] - -For reference, here is a sample XML byte string output:: - - - - - docTR - hOCR - - - - - -
-      (sample hOCR output, continued: the XML markup did not survive rendering; the body contains a single page whose words read "Hello", "XML" and "World")
      - - \ No newline at end of file diff --git a/v0.8.1/_sources/utils.rst.txt b/v0.8.1/_sources/utils.rst.txt index ac0b13d9df..69c1abe0eb 100644 --- a/v0.8.1/_sources/utils.rst.txt +++ b/v0.8.1/_sources/utils.rst.txt @@ -14,8 +14,6 @@ Easy-to-use functions to make sense of your model's predictions. .. autofunction:: visualize_page -.. autofunction:: synthesize_page - .. _metrics: @@ -27,20 +25,12 @@ Implementations of task-specific metrics to easily assess your model performance .. autoclass:: TextMatch - .. automethod:: update .. automethod:: summary .. autoclass:: LocalizationConfusion - .. automethod:: update .. automethod:: summary .. autoclass:: OCRMetric - .. automethod:: update - .. automethod:: summary - -.. autoclass:: DetectionMetric - - .. automethod:: update .. automethod:: summary diff --git a/v0.8.1/datasets.html b/v0.8.1/datasets.html index 1f5855cc82..640791680a 100644 --- a/v0.8.1/datasets.html +++ b/v0.8.1/datasets.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.datasets - docTR documentation @@ -227,28 +227,21 @@ @@ -294,12 +287,16 @@

      doctr.datasets

      Available Datasets

      -

      Here are all datasets that are available through docTR:

      -
      -

      Public datasets

      +

      The datasets from DocTR inherit from an abstract class that handles verified downloading from a given URL.

      +
      +
      +class doctr.datasets.datasets.VisionDataset(url: str, file_name: str | None = None, file_hash: str | None = None, extract_archive: bool = False, download: bool = False, overwrite: bool = False)[source]
      +
      + +
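For illustration, a concrete dataset could hook into this downloading logic as follows (a minimal sketch, not part of docTR: the URL, file name and hash below are placeholders to replace with real values):

>>> from doctr.datasets.datasets import VisionDataset
>>> class MyArchiveDataset(VisionDataset):
...     # Hypothetical dataset shipped as a single downloadable archive
...     def __init__(self, download: bool = False, **kwargs):
...         super().__init__(
...             url="https://example.com/my_dataset.zip",  # placeholder URL
...             file_name="my_dataset.zip",
...             file_hash=None,  # pass the archive's SHA256 here to enable verification
...             extract_archive=True,
...             download=download,
...             **kwargs,
...         )
>>> # MyArchiveDataset(download=True) would then download, verify (if a hash is given) and extract the archive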

      Here are all datasets that are available through DocTR:

      -class doctr.datasets.FUNSD(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      +class doctr.datasets.FUNSD(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

      FUNSD dataset from “FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents”.

      Example::
      >>> from doctr.datasets import FUNSD
      @@ -313,7 +310,8 @@ 

      Public datasetsParameters:
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • +
      • sample_transforms – composable transformations that will be applied to each image

      • +
      • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • **kwargs – keyword arguments from VisionDataset.

      @@ -322,7 +320,7 @@

      Public datasets
      -class doctr.datasets.SROIE(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      +class doctr.datasets.SROIE(train: bool = True, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

      SROIE dataset from “ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction”.

      Example::
      - -
      -
      -class doctr.datasets.IIIT5K(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      IIIT-5K character-level localization dataset from -“BMVC 2012 Scene Text Recognition using Higher Order Language Priors”.

      -
      -
      Example::
      >>> # NOTE: this dataset is for character-level localization
      ->>> from doctr.datasets import IIIT5K
      ->>> train_set = IIIT5K(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.SVT(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      SVT dataset from “The Street View Text Dataset - UCSD Computer Vision”.

      -
      -
      Example::
      >>> from doctr.datasets import SVT
      ->>> train_set = SVT(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.SVHN(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      SVHN dataset from “The Street View House Numbers (SVHN) Dataset”.

      -
      -
      Example::
      >>> from doctr.datasets import SVHN
      ->>> train_set = SVHN(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.SynthText(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      SynthText dataset from “Synthetic Data for Text Localisation in Natural Images” | “repository” | -“website”.

      -
      -
      Example::
      >>> from doctr.datasets import SynthText
      ->>> train_set = SynthText(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.IC03(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      IC03 dataset from “ICDAR 2003 Robust Reading Competitions: Entries, Results and Future Directions”.

      -
      -
      Example::
      >>> from doctr.datasets import IC03
      ->>> train_set = IC03(train=True, download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      • **kwargs – keyword arguments from VisionDataset.

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.IC13(img_folder: str, label_folder: str, use_polygons: bool = False, **kwargs: Any)[source]
      -

      IC13 dataset from “ICDAR 2013 Robust Reading Competition”. -Example:

      -
      >>> # NOTE: You need to download both image and label parts from Focused Scene Text challenge Task2.1 2013-2015.
      ->>> from doctr.datasets import IC13
      ->>> train_set = IC13(img_folder="/path/to/Challenge2_Training_Task12_Images",
      ->>>                  label_folder="/path/to/Challenge2_Training_Task1_GT")
      ->>> img, target = train_set[0]
      ->>> test_set = IC13(img_folder="/path/to/Challenge2_Test_Task12_Images",
      ->>>                 label_folder="/path/to/Challenge2_Test_Task1_GT")
      ->>> img, target = test_set[0]
      -
      -
      -
      -
      Parameters:
      -
        -
      • img_folder – folder with all the images of the dataset

      • -
      • label_folder – folder with all annotation files for the images

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      -
      -
      -
      - -

      -
      -

      docTR synthetic datasets

      -
      -
      -class doctr.datasets.DocArtefacts(train: bool = True, use_polygons: bool = False, **kwargs: Any)[source]
      -

      Object detection dataset for non-textual elements in documents. -The dataset includes a variety of synthetic document pages with non-textual elements.

      -
      -
      Example::
      >>> from doctr.datasets import DocArtefacts
      ->>> train_set = DocArtefacts(download=True)
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • train – whether the subset should be the training one

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • +
      • sample_transforms – composable transformations that will be applied to each image

      • +
      • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • **kwargs – keyword arguments from VisionDataset.

      -
      -
      -class doctr.datasets.CharacterGenerator(*args, **kwargs)[source]
      -

      Implements a character image generation dataset

      -
      -
      Example::
      >>> from doctr.datasets import CharacterGenerator
      ->>> ds = CharacterGenerator(vocab='abdef')
      ->>> img, target = ds[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • vocab – vocabulary to take the character from

      • -
      • num_samples – number of samples that will be generated iterating over the dataset

      • -
      • cache_samples – whether generated images should be cached firsthand

      • -
      • font_family – font to use to generate the text images

      • -
      • img_transforms – composable transformations that will be applied to each image

      • -
      • sample_transforms – composable transformations that will be applied to both the image and the target

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.WordGenerator(vocab: str, min_chars: int, max_chars: int, num_samples: int, cache_samples: bool = False, font_family: str | List[str] | None = None, img_transforms: Callable[[Any], Any] | None = None, sample_transforms: Callable[[Any, Any], Tuple[Any, Any]] | None = None)[source]
      -

      Implements a character image generation dataset

      -
      -
      Example::
      >>> from doctr.datasets import WordGenerator
      ->>> ds = WordGenerator(vocab='abdef')
      ->>> img, target = ds[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • vocab – vocabulary to take the character from

      • -
      • min_chars – minimum number of characters in a word

      • -
      • max_chars – maximum number of characters in a word

      • -
      • num_samples – number of samples that will be generated iterating over the dataset

      • -
      • cache_samples – whether generated images should be cached firsthand

      • -
      • font_family – font to use to generate the text images

      • -
      • img_transforms – composable transformations that will be applied to each image

      • -
      • sample_transforms – composable transformations that will be applied to both the image and the target

      • -
      -
      -
      -
      - -
      -
      -

      docTR private datasets

      -

      Since many documents include sensitive / personal information, we are not able to share all the data that has been used for this project. However, we provide some guidance on how to format your own dataset into the same format so that you can use all docTR tools all the same.

      -
      -
      -class doctr.datasets.DetectionDataset(img_folder: str, label_path: str, use_polygons: bool = False, **kwargs: Any)[source]
      -

      Implements a text detection dataset

      -
      -
      Example::
      >>> from doctr.datasets import DetectionDataset
      ->>> train_set = DetectionDataset(img_folder="/path/to/images", label_path="/path/to/labels.json")
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • img_folder – folder with all the images of the dataset

      • -
      • label_path – path to the annotations of each image

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • -
      -
      -
      -
      - -
      -
      -class doctr.datasets.RecognitionDataset(img_folder: str, labels_path: str, **kwargs: Any)[source]
      -

      Dataset implementation for text recognition tasks

      -
      -
      Example::
      >>> from doctr.datasets import RecognitionDataset
      ->>> train_set = RecognitionDataset(img_folder="/path/to/images", labels_path="/path/to/labels.json")
      ->>> img, target = train_set[0]
      -
      -
      -
      -
      -
      -
      Parameters:
      -
        -
      • img_folder – path to the images folder

      • -
• labels_path – path to the json file containing all labels (character sequences)

      • -
      -
      -
      -
      -
      -class doctr.datasets.OCRDataset(img_folder: str, label_file: str, use_polygons: bool = False, **kwargs: Any)[source]
      +class doctr.datasets.OCRDataset(img_folder: str, label_file: str, sample_transforms: Callable[[Any], Any] | None = None, rotated_bbox: bool = False, **kwargs: Any)[source]

      Implements an OCR dataset

      Parameters:
      • img_folder – local path to image folder (all jpg at the root)

      • label_file – local path to the label file

      • -
      • use_polygons – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • +
      • sample_transforms – composable transformations that will be applied to each image

      • +
      • rotated_bbox – whether polygons should be considered as rotated bounding box (instead of straight ones)

      • +
      • **kwargs – keyword arguments from VisionDataset.

      -

    Data Loading

    -

    Each dataset has its specific way to load a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in docTR.

    +

Each dataset has its own way of loading a sample, but handling batch aggregation and the underlying iterator is a task deferred to another object in DocTR.

    -class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, num_workers: int | None = None, collate_fn: Callable | None = None)[source]
    +class doctr.datasets.loader.DataLoader(dataset, shuffle: bool = True, batch_size: int = 1, drop_last: bool = False, workers: int | None = None)[source]

    Implements a dataset wrapper for fast data loading

    Example::
    >>> from doctr.datasets import FUNSD, DataLoader
    @@ -681,7 +408,7 @@ 

    Data Loading

    Supported Vocabs

    -

    Since textual content has to be encoded properly for models to interpret them efficiently, docTR supports multiple sets +

Since textual content has to be encoded properly for models to interpret it efficiently, DocTR supports multiple sets of vocabs.
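For instance, the mapping can be inspected directly (a small sketch, assuming the VOCABS dictionary is exposed by doctr.datasets in this version):

>>> from doctr.datasets import VOCABS
>>> len(VOCABS['french'])  # matches the size reported in the table below
126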

    - +@@ -724,39 +451,19 @@

    Data Loading

    - - - - - - - - - - + + - - - - - - - - - - - - - - + +
    docTR VocabsDocTR Vocabs

    latin

    94

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~

    english

    100

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿

    legacy_french

    123

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

    96

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°

    french

    126

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ

    portuguese

    131

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áàâãéêëíïóôõúüçÁÀÂÃÉËÍÏÓÔÕÚÜÇ¡¿

    spanish

    116

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿áéíóúüñÁÉÍÓÚÜÑ¡¿

    german

    108

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&’()*+,-./:;<=>?@[]^_`{|}~°£€¥¢฿äöüßÄÖÜẞ

    154

    0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!”#$%&'()*+,-./:;<=>?@[\]^_`{|}~°àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ£€¥¢฿

    -doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, dynamic_seq_length: bool = False, **kwargs: Any) ndarray[source]
    +doctr.datasets.encode_sequences(sequences: List[str], vocab: str, target_size: int | None = None, eos: int = -1, sos: int | None = None, pad: int | None = None, **kwargs: Any) ndarray[source]

    Encode character sequences using a given vocab as mapping

    Parameters:
    @@ -767,7 +474,6 @@

    Data LoadingReturns: @@ -784,23 +490,23 @@

    Data Loading - +
    Next
    -
    doctr.io
    +
    doctr.documents
    - +
    Previous
    -
    Preparing your model for inference
    +
    Changelog
    @@ -836,32 +542,13 @@

    Data Loadingdoctr.datasets

    diff --git a/v0.8.1/installing.html b/v0.8.1/installing.html index b79f453bd6..8068adc0ba 100644 --- a/v0.8.1/installing.html +++ b/v0.8.1/installing.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + Installation - docTR documentation @@ -227,28 +227,21 @@ @@ -290,16 +283,16 @@

    Installation

    -

    This library requires Python 3.6 or higher.

    +

    This library requires Python 3.6 or higher.

    Prerequisites

    Whichever OS you are running, you will need to install at least TensorFlow or PyTorch. You can refer to their corresponding installation pages to do so:

    If you are running another OS than Linux, you will need a few extra dependencies.

    -

    For MacOS users, you can install them using Homebrew as follows:

    +

    For MacOS users, you can install them as follows:

    brew install cairo pango gdk-pixbuf libffi
     
    @@ -307,17 +300,10 @@

    Prerequisites

    Via Python Package

    -

    Install the last stable release of the package using pip:

    +

Install the latest stable release of the package using pip:

    pip install python-doctr
     
    -

    We strive towards reducing framework-specific dependencies to a minimum, but some necessary features are developed by third-parties for specific frameworks. To avoid missing some dependencies for a specific framework, you can install specific builds as follows:

    -
    # for TensorFlow
    -pip install "python-doctr[tf]"
    -# for PyTorch
    -pip install "python-doctr[torch]"
    -
    -

    Via Git

    @@ -326,14 +312,6 @@

    Via Git¶ pip install -e doctr/.

    -

    Again, for framework-specific builds:

    -
    git clone https://github.com/mindee/doctr.git
    -# for TensorFlow
    -pip install -e doctr/.[tf]
    -# for PyTorch
    -pip install -e doctr/.[torch]
    -
    -
    @@ -342,12 +320,12 @@

    Via Git

    +

    diff --git a/v0.8.1/io.html b/v0.8.1/io.html deleted file mode 100644 index a61f5b20af..0000000000 --- a/v0.8.1/io.html +++ /dev/null @@ -1,839 +0,0 @@ - - - - - - - - - - - - - doctr.io - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -
    -
    -
    - -
    - -
    -
    - -
    - -
    -
    - -
    -
    -
    - - - - - Back to top - -
    - -
    - -
    - -
    -
    -
    -

    doctr.io

    -

    The io module enables users to easily access content from documents and export analysis -results to structured formats.

    -
    -

    Document structure

    -

    Structural organization of the documents.

    -
    -

    Word

    -

    A Word is an uninterrupted sequence of characters.

    -
    -
    -class doctr.io.Word(value: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray)[source]
    -

    Implements a word element

    -
    -
    Parameters:
    -
      -
    • value – the text string of the word

    • -
    • confidence – the confidence associated with the text prediction

    • -
    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to

    • -
    • size (the page's)

    • -
    -
    -
    -
    - -
    -
    -

    Line

    -

    A Line is a collection of Words aligned spatially and meant to be read together (on a two-column page, on the same horizontal, we will consider that there are two Lines).

    -
    -
    -class doctr.io.Line(words: List[Word], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray | None = None)[source]
    -

    Implements a line element as a collection of words

    -
    -
    Parameters:
    -
      -
    • words – list of word elements

    • -
    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all words in it.

    • -
    -
    -
    -
    - -
    -
    -

    Artefact

    -

    An Artefact is a non-textual element (e.g. QR code, picture, chart, signature, logo, etc.).

    -
    -
    -class doctr.io.Artefact(artefact_type: str, confidence: float, geometry: Tuple[Tuple[float, float], Tuple[float, float]])[source]
    -

    Implements a non-textual element

    -
    -
    Parameters:
    -
      -
    • artefact_type – the type of artefact

    • -
    • confidence – the confidence of the type prediction

    • -
    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size.

    • -
    -
    -
    -
    - -
    -
    -

    Block

    -

    A Block is a collection of Lines (e.g. an address written on several lines) and Artefacts (e.g. a graph with its title underneath).

    -
    -
    -class doctr.io.Block(lines: List[Line] = [], artefacts: List[Artefact] = [], geometry: Tuple[Tuple[float, float], Tuple[float, float]] | ndarray | None = None)[source]
    -

    Implements a block element as a collection of lines and artefacts

    -
    -
    Parameters:
    -
      -
    • lines – list of line elements

    • -
    • artefacts – list of artefacts

    • -
    • geometry – bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to -the page’s size. If not specified, it will be resolved by default to the smallest bounding box enclosing -all lines and artefacts in it.

    • -
    -
    -
    -
    - -
    -
    -

    Page

    -

    A Page is a collection of Blocks that were on the same physical page.

    -
    -
    -class doctr.io.Page(blocks: List[Block], page_idx: int, dimensions: Tuple[int, int], orientation: Dict[str, Any] | None = None, language: Dict[str, Any] | None = None)[source]
    -

    Implements a page element as a collection of blocks

    -
    -
    Parameters:
    -
      -
    • blocks – list of block elements

    • -
    • page_idx – the index of the page in the input raw document

    • -
    • dimensions – the page size in pixels in format (height, width)

    • -
    • orientation – a dictionary with the value of the rotation angle in degress and confidence of the prediction

    • -
    • language – a dictionary with the language value and confidence of the prediction

    • -
    -
    -
    -
    -
    -show(page: ndarray, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) None[source]
    -

    Overlay the result on a given image

    -
    -
    Parameters:
    -
      -
    • page – image encoded as a numpy array in uint8

    • -
    • interactive – whether the display should be interactive

    • -
    • preserve_aspect_ratio – pass True if you passed True to the predictor

    • -
    -
    -
    -
    - -
    - -
    -
    -

    Document

    -

    A Document is a collection of Pages.

    -
    -
    -class doctr.io.Document(pages: List[Page])[source]
    -

    Implements a document element as a collection of pages

    -
    -
    Parameters:
    -

    pages – list of page elements

    -
    -
    -
    -
    -show(pages: List[ndarray], **kwargs) None[source]
    -

    Overlay the result on a given image

    -
    -
    Parameters:
    -

    pages – list of images encoded as numpy arrays in uint8

    -
    -
    -
    - -
    - -
    -
    -
    -

    File reading

    -

    High-performance file reading and conversion to processable structured data.

    -
    -
    -doctr.io.read_pdf(file: str | Path | bytes, **kwargs: Any) Document[source]
    -

    Read a PDF file and convert it into an image in numpy format

    -
    -
    Example::
    >>> from doctr.documents import read_pdf
    ->>> doc = read_pdf("path/to/your/doc.pdf")
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    file – the path to the PDF file

    -
    -
    Returns:
    -

    the list of pages decoded as numpy ndarray of shape H x W x 3

    -
    -
    -
    - -
    -
    -doctr.io.read_img_as_numpy(file: str | Path | bytes, output_size: Tuple[int, int] | None = None, rgb_output: bool = True) ndarray[source]
    -

    Read an image file into numpy format

    -
    -
    Example::
    >>> from doctr.documents import read_img
    ->>> page = read_img("path/to/your/doc.jpg")
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • file – the path to the image file

    • -
    • output_size – the expected output size of each page in format H x W

    • -
    • rgb_output – whether the output ndarray channel order should be RGB instead of BGR.

    • -
    -
    -
    Returns:
    -

    the page decoded as numpy ndarray of shape H x W x 3

    -
    -
    -
    - -
    -
    -doctr.io.read_img_as_tensor(img_path: str | Path, dtype: DType = tf.float32) Tensor[source]
    -

    Read an image file as a TensorFlow tensor

    -
    -
    Parameters:
    -
      -
    • img_path – location of the image file

    • -
    • dtype – the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    • -
    -
    -
    Returns:
    -

    decoded image as a tensor

    -
    -
    -
    - -
    -
    -doctr.io.decode_img_as_tensor(img_content: bytes, dtype: DType = tf.float32) Tensor[source]
    -

    Read a byte stream as a TensorFlow tensor

    -
    -
    Parameters:
    -
      -
    • img_content – bytes of a decoded image

    • -
    • dtype – the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    • -
    -
    -
    Returns:
    -

    decoded image as a tensor

    -
    -
    -
    - -
    -
    -doctr.io.read_html(url: str, **kwargs: Any) bytes[source]
    -

    Read a PDF file and convert it into an image in numpy format

    -
    -
    Example::
    >>> from doctr.documents import read_html
    ->>> doc = read_html("https://www.yoursite.com")
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    url – URL of the target web page

    -
    -
    Returns:
    -

    decoded PDF file as a bytes stream

    -
    -
    -
    - -
    -
    -class doctr.io.DocumentFile[source]
    -

    Read a document from multiple extensions

    -
    -
    -classmethod from_pdf(file: str | Path | bytes, **kwargs) PDF[source]
    -

    Read a PDF file

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    file – the path to the PDF file or a binary stream

    -
    -
    Returns:
    -

    a PDF document

    -
    -
    -
    - -
    -
    -classmethod from_url(url: str, **kwargs) PDF[source]
    -

    Interpret a web page as a PDF document

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> doc = DocumentFile.from_url("https://www.yoursite.com")
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    url – the URL of the target web page

    -
    -
    Returns:
    -

    a PDF document

    -
    -
    -
    - -
    -
    -classmethod from_images(files: Sequence[str | Path | bytes] | str | Path | bytes, **kwargs) List[ndarray][source]
    -

    Read an image file (or a collection of image files) and convert it into an image in numpy format

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    files – the path to the image file or a binary stream, or a collection of those

    -
    -
    Returns:
    -

    the list of pages decoded as numpy ndarray of shape H x W x 3

    -
    -
    -
    - -
    - -
    -
    -class doctr.io.PDF(doc: Document)[source]
    -

    PDF document template

    -
    -
    Parameters:
    -

    doc – input PDF document

    -
    -
    -
    -
    -as_images(**kwargs) List[ndarray][source]
    -

    Convert all document pages to images

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> pages = DocumentFile.from_pdf("path/to/your/doc.pdf").as_images()
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    kwargs – keyword arguments of convert_page_to_numpy

    -
    -
    Returns:
    -

    the list of pages decoded as numpy ndarray of shape H x W x 3

    -
    -
    -
    - -
    -
    -get_words(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
    -

    Get the annotations for all words in the document

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> words = DocumentFile.from_pdf("path/to/your/doc.pdf").get_words()
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    kwargs – keyword arguments of fitz.Page.get_text_words

    -
    -
    Returns:
    -

    the list of pages annotations, represented as a list of tuple (bounding box, value)

    -
    -
    -
    - -
    -
    -get_lines(**kwargs) List[List[Tuple[Tuple[float, float, float, float], str]]][source]
    -

    Get the annotations for all lines in the document

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> lines = DocumentFile.from_pdf("path/to/your/doc.pdf").get_lines()
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    kwargs – keyword arguments of fitz.Page.get_text_words

    -
    -
    Returns:
    -

    the list of pages annotations, represented as a list of tuple (bounding box, value)

    -
    -
    -
    - -
    -
    -get_artefacts() List[List[Tuple[float, float, float, float]]][source]
    -

    Get the artefacts for the entire document

    -
    -
    Example::
    >>> from doctr.documents import DocumentFile
    ->>> artefacts = DocumentFile.from_pdf("path/to/your/doc.pdf").get_artefacts()
    -
    -
    -
    -
    -
    -
    Returns:
    -

    the list of pages artefacts, represented as a list of bounding boxes

    -
    -
    -
    - -
    - -
    -
    - -
    -
    - -
    - -
    -
    - - - - - - - - \ No newline at end of file diff --git a/v0.8.1/models.html b/v0.8.1/models.html index 04ff61d44e..270664068f 100644 --- a/v0.8.1/models.html +++ b/v0.8.1/models.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.models - docTR documentation @@ -227,28 +227,21 @@ @@ -290,286 +283,64 @@

    doctr.models

    -
    -

    doctr.models.classification

    -
    -
    -doctr.models.classification.vgg16_bn_r(pretrained: bool = False, **kwargs: Any) VGG[source]
    -

    VGG-16 architecture as described in “Very Deep Convolutional Networks for Large-Scale Image Recognition”, modified by adding batch normalization, rectangular pooling and a simpler -classification head.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import vgg16_bn_r
    ->>> model = vgg16_bn_r(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained (bool) – If True, returns a model pre-trained on ImageNet

    -
    -
    Returns:
    -

    VGG feature extractor

    -
    -
    -
    - -
    -
    -doctr.models.classification.resnet18(pretrained: bool = False, **kwargs: Any) ResNet[source]
    -

    Resnet-18 architecture as described in “Deep Residual Learning for Image Recognition”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import resnet18
    ->>> model = resnet18(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    A classification model

    -
    -
    -
    - -
    -
    -doctr.models.classification.resnet31(pretrained: bool = False, **kwargs: Any) ResNet[source]
    -

    Resnet31 architecture with rectangular pooling windows as described in -“Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition”,. Downsizing: (H, W) –> (H/8, W/4)

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import resnet31
    ->>> model = resnet31(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 224, 224, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    A classification model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Small architecture as described in -“Searching for MobileNetV3”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenetv3_large
    ->>> model = mobilenetv3_small(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Large architecture as described in -“Searching for MobileNetV3”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenetv3_large
    ->>> model = mobilenetv3_large(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_small_r(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Small architecture as described in -“Searching for MobileNetV3”,, with rectangular pooling.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenet_v3_small_r
    ->>> model = mobilenet_v3_small_r(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_large_r(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Large architecture as described in -“Searching for MobileNetV3”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenet_v3_large_r
    ->>> model = mobilenet_v3_large_r(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.mobilenet_v3_small_orientation(pretrained: bool = False, **kwargs: Any) MobileNetV3[source]
    -

    MobileNetV3-Small architecture as described in -“Searching for MobileNetV3”,.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import mobilenet_v3_small_orientation
    ->>> model = mobilenet_v3_small_orientation(pretrained=False)
    ->>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    a keras.Model

    -
    -
    -
    - -
    -
    -doctr.models.classification.magc_resnet31(pretrained: bool = False, **kwargs: Any) ResNet[source]
    -

    Resnet31 architecture with Multi-Aspect Global Context Attention as described in -“MASTER: Multi-Aspect Non-local Network for Scene Text Recognition”,.

    -
    -
    Example::
    >>> import torch
    ->>> from doctr.models import magc_resnet31
    ->>> model = magc_resnet31(pretrained=False)
    ->>> input_tensor = torch.rand((1, 3, 224, 224), dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained – boolean, True if model is pretrained

    -
    -
    Returns:
    -

    A feature extractor model

    -
    -
    -
    - -
    -
    -doctr.models.classification.crop_orientation_predictor(arch: str = 'mobilenet_v3_small_orientation', pretrained: bool = False, **kwargs: Any) CropOrientationPredictor[source]
    -

    Orientation classification architecture.

    -
    -
    Example::
    >>> import numpy as np
    ->>> from doctr.models import crop_orientation_predictor
    ->>> model = crop_orientation_predictor(arch='classif_mobilenet_v3_small', pretrained=True)
    ->>> input_crop = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
    ->>> out = model([input_crop])
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • arch – name of the architecture to use (e.g. ‘mobilenet_v3_small’)

    • -
    • pretrained – If True, returns a model pre-trained on our recognition crops dataset

    • +

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. +Whether they are performed at once or separately, each task calls for its own type of deep learning architecture.

      +

      For a given task, DocTR provides a Predictor, which is composed of 2 components:

      +
        +
      • PreProcessor: a module in charge of making inputs directly usable by the TensorFlow model.

      • +
      • Model: a deep learning model, implemented with TensorFlow backend along with its specific post-processor to make outputs structured and reusable.
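In practice these two components are rarely manipulated directly: each predictor bundles them behind a single call. A small sketch, mirroring the detection predictor example further down this page:

>>> import numpy as np
>>> from doctr.models import detection_predictor
>>> predictor = detection_predictor(pretrained=True)  # PreProcessor + model + post-processor bundled together
>>> page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
>>> out = predictor([page])  # numpy image in, structured output back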

      -
    -
    Returns:
    -

    CropOrientationPredictor

    -
    -
    -
    - -
    -
    -

    doctr.models.detection

    -
    -
    -doctr.models.detection.linknet_resnet18(pretrained: bool = False, **kwargs: Any) LinkNet[source]
    -

    LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import linknet_resnet18
    ->>> model = linknet_resnet18(pretrained=True)
    ->>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    +
    +

    Text Detection

    +

    Localizing text elements in images

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    FUNSD

    CORD

    Architecture

    Input shape

    # params

    Recall

    Precision

    Recall

    Precision

    FPS

    db_resnet50

    (1024, 1024, 3)

    25.2 M

    82.14

    87.64

    92.49

    89.66

    2.1

    -
    -
    -
    -
    Parameters:
    -

    pretrained (bool) – If True, returns a model pre-trained on our text detection dataset

    -
    -
    Returns:
    -

    text detection architecture

    -
    -
    -
    - +

    All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). +Explanations about the metrics being used are available in Task evaluation.

    +

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model's capabilities

    +

FPS (Frames per second) is computed as follows: we instantiate the model and feed it 100 random tensors of shape [1, 1024, 1024, 3] as a warm-up, then we measure its average speed over 1000 batches of 1 frame (random tensors of shape [1, 1024, 1024, 3]). +We used a c5.x12large AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.
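This protocol can be reproduced roughly as follows (a sketch only: a single random tensor is reused and only the forward passes are timed):

>>> import time
>>> import tensorflow as tf
>>> from doctr.models import db_resnet50
>>> model = db_resnet50(pretrained=True)
>>> dummy = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
>>> for _ in range(100):  # warm-up
...     _ = model(dummy, training=False)
>>> start = time.time()
>>> for _ in range(1000):  # 1000 batches of 1 frame
...     _ = model(dummy, training=False)
>>> fps = 1000 / (time.time() - start)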

    +
    +

    Pre-processing for detection

    +

In DocTR, the pre-processing scheme for detection is the following (a short code sketch is given after this list):

    +
      +
    1. resize each input image to the target size (bilinear interpolation by default) with potential deformation.

    2. +
    3. batch images together

    4. +
    5. normalize the batch using the training data statistics

    6. +
    +
    +
    +
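As a rough illustration of these steps (a sketch only, not docTR's internal code; the images and statistics below are dummy placeholders):

>>> import tensorflow as tf
>>> images = [tf.random.uniform((900, 600, 3)), tf.random.uniform((720, 1280, 3))]  # dummy pages
>>> mean, std = 0.5, 0.5  # placeholders for the training data statistics
>>> resized = [tf.image.resize(img, (1024, 1024), method="bilinear") for img in images]  # 1. resize, possibly deforming
>>> batch = tf.stack(resized, axis=0)  # 2. batch images together
>>> batch = (batch - mean) / std  # 3. normalize with the training data statistics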

    Detection models

    +

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

    doctr.models.detection.db_resnet50(pretrained: bool = False, **kwargs: Any) DBNet[source]
    @@ -595,13 +366,13 @@

    doctr.models.detection

    -
    -doctr.models.detection.db_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) DBNet[source]
    -

    DBNet as described in “Real-time Scene Text Detection with Differentiable Binarization”, using a mobilenet v3 large backbone.

    +
    +doctr.models.detection.linknet16(pretrained: bool = False, **kwargs: Any) LinkNet[source]
    +

    LinkNet as described in “LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation”.

    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import db_mobilenet_v3_large
    ->>> model = db_mobilenet_v3_large(pretrained=True)
    +>>> from doctr.models import linknet16
    +>>> model = linknet16(pretrained=True)
     >>> input_tensor = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
     >>> out = model(input_tensor)
     
    @@ -618,14 +389,18 @@

    doctr.models.detection

    +
    +
    +

    Detection predictors

    +

Combining the right components around a given architecture for easier usage, predictors let you pass numpy images as inputs and get back structured information.

    -doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, assume_straight_pages: bool = True, **kwargs: Any) DetectionPredictor[source]
    +doctr.models.detection.detection_predictor(arch: str = 'db_resnet50', pretrained: bool = False, **kwargs: Any) DetectionPredictor[source]

    Text detection architecture.

    Example::
    >>> import numpy as np
     >>> from doctr.models import detection_predictor
    ->>> model = detection_predictor(arch='db_resnet50', pretrained=True)
    +>>> model = detection_predictor(pretrained=True)
     >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
     >>> out = model([input_page])
     
    @@ -635,9 +410,8 @@

    doctr.models.detection
    Parameters:
      -
    • arch – name of the architecture to use (e.g. ‘db_resnet50’)

    • +
    • arch – name of the architecture to use (‘db_resnet50’)

    • pretrained – If True, returns a model pre-trained on our text detection dataset

    • -
    • assume_straight_pages – If True, fit straight boxes to the page

    Returns:
    @@ -647,8 +421,74 @@

    doctr.models.detection

    -
    -

    doctr.models.recognition

    +
    +
    +

    Text Recognition

    +

    Identifying strings in images

    +
    + + ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Text recognition model zoo

    Architecture

    Input shape

    # params

    FUNSD

    CORD

    FPS

    crnn_vgg16_bn

    (32, 128, 3)

    15.8M

    86.02

    91.3

    12.8

    sar_vgg16_bn

    (32, 128, 3)

    21.5M

    86.2

    91.7

    3.3

    sar_resnet31

    (32, 128, 3)

    53.1M

    86.3

    92.1

    2.7

    +
    +

    All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). +Explanations about the metrics being used are available in Task evaluation.

    +

    All these recognition models are trained with our french vocab (cf. Supported Vocabs).

    +

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model's capabilities

    +

FPS (Frames per second) is computed as follows: we instantiate the model and feed it 100 random tensors of shape [1, 32, 128, 3] as a warm-up, then we measure its average speed over 1000 batches of 1 frame (random tensors of shape [1, 32, 128, 3]). +We used a c5.x12large AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

    +
    +

    Pre-processing for recognition

    +

In DocTR, the pre-processing scheme for recognition is the following (a short code sketch is given after this list):

    +
      +
    1. resize each input image to the target size (bilinear interpolation by default) without deformation.

    2. +
    3. pad the image to the target size (with zeros by default)

    4. +
    5. batch images together

    6. +
    7. normalize the batch using the training data statistics

    8. +
    +
    +
    +
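Again as a rough illustration (a sketch only; the crops and statistics below are dummy placeholders):

>>> import tensorflow as tf
>>> crops = [tf.random.uniform((28, 90, 3)), tf.random.uniform((30, 200, 3))]  # dummy word crops
>>> mean, std = 0.5, 0.5  # placeholders for the training data statistics
>>> resized = [tf.image.resize_with_pad(crop, 32, 128) for crop in crops]  # 1. + 2. resize without deformation, then zero-pad
>>> batch = tf.stack(resized, axis=0)  # 3. batch images together
>>> batch = (batch - mean) / std  # 4. normalize with the training data statistics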

    Recognition models

    +

Models expect a TensorFlow tensor as input and produce one in return. DocTR includes implementations and pretrained versions of the following models:

    doctr.models.recognition.crnn_vgg16_bn(pretrained: bool = False, **kwargs: Any) CRNN[source]
    @@ -675,40 +515,15 @@

    doctr.models.recognition -
    -doctr.models.recognition.crnn_mobilenet_v3_small(pretrained: bool = False, **kwargs: Any) CRNN[source]
    -

    CRNN with a MobileNet V3 Small backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

    -
    -
    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import crnn_mobilenet_v3_small
    ->>> model = crnn_mobilenet_v3_small(pretrained=True)
    ->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
    ->>> out = model(input_tensor)
    -
    -
    -
    -
    -
    -
    Parameters:
    -

    pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

    -
    -
    Returns:
    -

    text recognition architecture

    -
    -
    -

    - -
    -
    -doctr.models.recognition.crnn_mobilenet_v3_large(pretrained: bool = False, **kwargs: Any) CRNN[source]
    -

    CRNN with a MobileNet V3 Large backbone as described in “An End-to-End Trainable Neural Network for Image-based -Sequence Recognition and Its Application to Scene Text Recognition”.

    +
    +doctr.models.recognition.sar_vgg16_bn(pretrained: bool = False, **kwargs: Any) SAR[source]
    +

    SAR with a VGG16 feature extractor as described in “Show, Attend and Read:A Simple and Strong +Baseline for Irregular Text Recognition”.

    Example::
    >>> import tensorflow as tf
    ->>> from doctr.models import crnn_mobilenet_v3_large
    ->>> model = crnn_mobilenet_v3_large(pretrained=True)
    ->>> input_tensor = tf.random.uniform(shape=[1, 32, 128, 3], maxval=1, dtype=tf.float32)
    +>>> from doctr.models import sar_vgg16_bn
    +>>> model = sar_vgg16_bn(pretrained=False)
    +>>> input_tensor = tf.random.uniform(shape=[1, 64, 256, 3], maxval=1, dtype=tf.float32)
     >>> out = model(input_tensor)
     
    @@ -750,17 +565,15 @@

    doctr.models.recognition
    doctr.models.recognition.master(pretrained: bool = False, **kwargs: Any) MASTER[source]
    -

    MASTER as described in paper: <https://arxiv.org/pdf/1910.02562.pdf>`_.

    -
    -
    Example::
    >>> import tensorflow as tf
    +

MASTER as described in the paper <https://arxiv.org/pdf/1910.02562.pdf>. +Example:

    +
    >>> import tensorflow as tf
     >>> from doctr.models import master
     >>> model = master(pretrained=False)
     >>> input_tensor = tf.random.uniform(shape=[1, 48, 160, 3], maxval=1, dtype=tf.float32)
     >>> out = model(input_tensor)
     
    -
    -
    Parameters:

    pretrained (bool) – If True, returns a model pre-trained on our text recognition dataset

    @@ -771,6 +584,10 @@

    doctr.models.recognition +

    Recognition predictors

    +

Combining the right components around a given architecture for easier usage, recognition predictors let you pass numpy images as inputs and get back structured information.

    doctr.models.recognition.recognition_predictor(arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) RecognitionPredictor[source]
    @@ -788,7 +605,7 @@

    doctr.models.recognition
    Parameters:
      -
    • arch – name of the architecture to use (e.g. ‘crnn_vgg16_bn’)

    • +
    • arch – name of the architecture to use (‘crnn_vgg16_bn’, ‘crnn_resnet31’, ‘sar_vgg16_bn’, ‘sar_resnet31’)

    • pretrained – If True, returns a model pre-trained on our text recognition dataset

    @@ -799,16 +616,141 @@

    doctr.models.recognition -

    doctr.models.zoo

    +

    +
    +

    End-to-End OCR

    +

    Predictors that localize and identify text elements in images

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    FUNSD

    CORD

    Architecture

    Recall

    Precision

    FPS

    Recall

    Precision

    FPS

    db_resnet50 + crnn_vgg16_bn

    70.08

    74.77

    0.85

    82.19

    79.67

    1.6

    db_resnet50 + sar_vgg16_bn

    N/A

    N/A

    0.49

    N/A

    N/A

    1.0

    db_resnet50 + sar_resnet31

    N/A

    N/A

    0.27

    N/A

    N/A

    0.83

    Gvision text detection

    59.50

    62.50

    75.30

    70.00

    Gvision doc. text detection

    64.00

    53.30

    68.90

    61.10

    AWS textract

    78.10

    83.00

    87.50

    66.00

    +
    +

    All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). +Explanations about the metrics being used are available in Task evaluation.

    +

    All recognition models of predictors are trained with our french vocab (cf. Supported Vocabs).

    +

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model's capabilities

    +

FPS (Frames per second) is computed as follows: we instantiate the predictor, warm up the model, then measure the average speed of the end-to-end predictor on the datasets with a batch size of 1. +We used a c5.x12large AWS instance (CPU Xeon Platinum 8275L) to perform the experiments.

    +

    Results on private ocr datasets

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Receipts

    Invoices

    IDs

    Architecture

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    db_resnet50 + crnn_vgg16_bn (ours)

    78.90

    81.01

    65.68

    69.86

    49.48

    50.46

    Gvision doc. text detection

    68.91

    59.89

    63.20

    52.85

    43.70

    29.21

    AWS textract

    75.77

    77.70

    70.47

    69.13

    46.39

    43.32

    +
    +
    +

    Two-stage approaches

    +

Those architectures involve one stage of text detection and one stage of text recognition. The text detection stage produces cropped images that are then passed to the text recognition block.

    -
    -doctr.models.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, assume_straight_pages: bool = True, export_as_straight_boxes: bool = False, preserve_aspect_ratio: bool = False, **kwargs: Any) OCRPredictor[source]
    +
    +doctr.models.zoo.ocr_predictor(det_arch: str = 'db_resnet50', reco_arch: str = 'crnn_vgg16_bn', pretrained: bool = False, **kwargs: Any) OCRPredictor[source]

    End-to-end OCR architecture using one model for localization, and another for text recognition.

    Example::
    >>> import numpy as np
     >>> from doctr.models import ocr_predictor
    ->>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
    +>>> model = ocr_predictor(pretrained=True)
     >>> input_page = (255 * np.random.rand(600, 800, 3)).astype(np.uint8)
     >>> out = model([input_page])
     
    @@ -818,15 +760,8 @@

    doctr.models.zoo
    Parameters:
      -
    • det_arch – name of the detection architecture to use (e.g. ‘db_resnet50’, ‘db_mobilenet_v3_large’)

    • -
    • reco_arch – name of the recognition architecture to use (e.g. ‘crnn_vgg16_bn’, ‘sar_resnet31’)

    • +
    • arch – name of the architecture to use (‘db_sar_vgg’, ‘db_sar_resnet’, ‘db_crnn_vgg’, ‘db_crnn_resnet’)

    • pretrained – If True, returns a model pre-trained on our OCR dataset

    • -
    • assume_straight_pages – if True, speeds up the inference by assuming you only pass straight pages -without rotated textual elements.

    • -
    • export_as_straight_boxes – when assume_straight_pages is set to False, export final predictions -(potentially rotated) as straight bounding boxes.

    • -
    • preserve_aspect_ratio – If True, pad the input document image to preserve the aspect ratio before running the detection model on it.

    Returns:
    @@ -835,6 +770,113 @@

    doctr.models.zoo +

    Model export

    +

    Utility functions to make the most of document analysis models.

    +
    +

    Model compression

    +
    +
    +doctr.models.export.convert_to_tflite(tf_model: Model) bytes[source]
    +

    Converts a model to TFLite format

    +
    +
    Example::
    >>> from tensorflow.keras import Sequential
    +>>> from doctr.models import convert_to_tflite, conv_sequence
    +>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
    +>>> serialized_model = convert_to_tflite(model)
    +
    +
    +
    +
    +
    +
    Parameters:
    +

    tf_model – a keras model

    +
    +
    Returns:
    +

    the serialized TFLite model

    +
    +
    Return type:
    +

    bytes

    +
    +
    +
    + +
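
    Since the returned payload is raw bytes, it can be written straight to a .tflite file or fed to the TFLite interpreter; a minimal sketch reusing serialized_model from the example above (not part of the docTR API):

    >>> import tensorflow as tf
    >>> with open('db_resnet50.tflite', 'wb') as f:
    ...     f.write(serialized_model)
    >>> interpreter = tf.lite.Interpreter(model_content=serialized_model)
    >>> interpreter.allocate_tensors()
    >>> print(interpreter.get_input_details()[0]['shape'])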
    +
    +doctr.models.export.convert_to_fp16(tf_model: Model) bytes[source]
    +

    Converts a model to half precision

    +
    +
    Example::
    >>> from tensorflow.keras import Sequential
    +>>> from doctr.models import convert_to_fp16, conv_sequence
    +>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
    +>>> serialized_model = convert_to_fp16(model)
    +
    +
    +
    +
    +
    +
    Parameters:
    +

    tf_model – a keras model

    +
    +
    Returns:
    +

    the serialized FP16 model

    +
    +
    Return type:
    +

    bytes

    +
    +
    +
    + +
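
    Both helpers return serialized bytes, so a quick way to gauge the footprint reduction is to compare payload sizes (illustrative only, reusing model from the example above):

    >>> from doctr.models import convert_to_tflite, convert_to_fp16
    >>> full_precision = convert_to_tflite(model)
    >>> half_precision = convert_to_fp16(model)
    >>> print(f'FP32: {len(full_precision) / 1e6:.2f} MB, FP16: {len(half_precision) / 1e6:.2f} MB')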
    +
    +doctr.models.export.quantize_model(tf_model: Model, input_shape: Tuple[int, int, int]) bytes[source]
    +

    Quantizes a TensorFlow model

    +
    +
    Example::
    >>> from tensorflow.keras import Sequential
    +>>> from doctr.models import quantize_model, conv_sequence
    +>>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
    +>>> serialized_model = quantize_model(model, (224, 224, 3))
    +
    +
    +
    +
    +
    +
    Parameters:
    +
      +
    • tf_model – a keras model

    • +
    • input_shape – shape of the expected input tensor (excluding batch dimension) with channel last order

    • +
    +
    +
    Returns:
    +

    the serialized quantized model

    +
    +
    Return type:
    +

    bytes

    +
    +
    +
    + +
    +
    +

    Using SavedModel

    +

    Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to SavedModel format as follows:

    +
    >>> import tensorflow as tf
    +>>> from doctr.models import db_resnet50
    +>>> model = db_resnet50(pretrained=True)
    +>>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    +>>> _ = model(input_t, training=False)
    +>>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
    +
    +
    +

    And loaded just as easily:

    +
    >>> import tensorflow as tf
    +>>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
    +
    +
    +
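
    Inference on the reloaded object then mirrors the call traced before saving (a sketch, assuming the same input signature as above):

    >>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    >>> out = model(input_t, training=False)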


    diff --git a/v0.8.1/searchindex.js b/v0.8.1/searchindex.js index ba6dbffe85..315b91e76b 100644 --- a/v0.8.1/searchindex.js +++ b/v0.8.1/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[12, null]], "Advanced options": [[16, "advanced-options"]], "Args:": [[5, "args"], [5, "id4"], [5, "id7"], [5, "id10"], [5, "id13"], [5, "id16"], [5, "id19"], [5, "id22"], [5, "id25"], [5, "id29"], [5, "id32"], [5, "id37"], [5, "id40"], [5, "id46"], [5, "id49"], [5, "id50"], [5, "id51"], [5, "id54"], [5, "id57"], [5, "id60"], [5, "id61"], [6, "args"], [6, "id2"], [6, "id3"], [6, "id4"], [6, "id5"], [6, "id6"], [6, "id7"], [6, "id10"], [6, "id12"], [6, "id14"], [6, "id16"], [6, "id20"], [6, "id24"], [6, "id28"], [7, "args"], [7, "id3"], [7, "id8"], [7, "id13"], [7, "id17"], [7, "id21"], [7, "id26"], [7, "id31"], [7, "id36"], [7, "id41"], [7, "id45"], [7, "id49"], [7, "id54"], [7, "id58"], [7, "id63"], [7, "id68"], [7, "id72"], [7, "id76"], [7, "id81"], [7, "id86"], [7, "id90"], [7, "id95"], [7, "id100"], [7, "id105"], [7, "id110"], [7, "id114"], [7, "id118"], [7, "id123"], [7, "id128"], [7, "id133"], [7, "id137"], [7, "id141"], [7, "id146"], [7, "id150"], [7, "id154"], [7, "id158"], [7, "id160"], [7, "id162"], [7, "id164"], [8, "args"], [8, "id1"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id8"], [8, "id9"], [8, "id10"], [8, "id11"], [8, "id12"], [8, "id13"], [8, "id14"], [8, "id15"], [8, "id16"], [8, "id17"], [8, "id18"], [9, "args"], [9, "id3"], [9, "id5"], [9, "id6"], [9, "id7"], [9, "id8"], [9, "id9"], [9, "id10"], [9, "id11"]], "Artefact": [[6, "artefact"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[14, "available-datasets"]], "Available architectures": [[16, "available-architectures"], [16, "id1"], [16, "id2"]], "Block": [[6, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[14, null]], "Choosing the right model": [[16, null]], "Classification": [[13, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[8, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[5, "custom-dataset-loader"]], "Data Loading": [[14, "data-loading"]], "Dataloader": [[5, "dataloader"]], "Detection": [[13, "detection"], [14, "detection"]], "Detection predictors": [[16, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[6, "document"]], "Document structure": [[6, "document-structure"]], "End-to-End OCR": [[16, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[15, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[6, "file-reading"]], "Half-precision": [[15, "half-precision"]], "Installation": [[3, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[6, "line"]], "Loading from Huggingface 
Hub": [[13, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[11, "loading-your-custom-trained-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[15, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, "modifying-the-documentation"]], "Naming conventions": [[13, "naming-conventions"]], "Object Detection": [[14, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[6, "page"]], "Preparing your model for inference": [[15, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[13, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[13, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[13, "recognition"], [14, "recognition"]], "Recognition predictors": [[16, "recognition-predictors"]], "Returns:": [[5, "returns"], [6, "returns"], [6, "id11"], [6, "id13"], [6, "id15"], [6, "id19"], [6, "id23"], [6, "id27"], [6, "id31"], [7, "returns"], [7, "id6"], [7, "id11"], [7, "id16"], [7, "id20"], [7, "id24"], [7, "id29"], [7, "id34"], [7, "id39"], [7, "id44"], [7, "id48"], [7, "id52"], [7, "id57"], [7, "id61"], [7, "id66"], [7, "id71"], [7, "id75"], [7, "id79"], [7, "id84"], [7, "id89"], [7, "id93"], [7, "id98"], [7, "id103"], [7, "id108"], [7, "id113"], [7, "id117"], [7, "id121"], [7, "id126"], [7, "id131"], [7, "id136"], [7, "id140"], [7, "id144"], [7, "id149"], [7, "id153"], [7, "id157"], [7, "id159"], [7, "id161"], [7, "id163"], [9, "returns"], [9, "id4"]], "Scope": [[1, "scope"]], "Share your model with the community": [[13, null]], "Supported Vocabs": [[5, "supported-vocabs"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[8, "supported-transformations"]], "Synthetic dataset generator": [[5, "synthetic-dataset-generator"], [14, "synthetic-dataset-generator"]], "Task evaluation": [[9, "task-evaluation"]], "Text Detection": [[16, "text-detection"]], "Text Recognition": [[16, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[11, null]], "Two-stage approaches": [[16, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[14, "use-your-own-datasets"]], "Using your ONNX exported model in docTR": [[15, "using-your-onnx-exported-model-in-doctr"]], "Via Conda (Only for Linux)": [[3, "via-conda-only-for-linux"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[16, "what-should-i-do-with-the-output"]], "Word": [[6, "word"]], "docTR Notebooks": [[10, null]], "docTR Vocabs": [[5, "id62"]], "docTR: Document Text Recognition": [[4, null]], "doctr.datasets": [[5, null], [5, "datasets"]], "doctr.io": [[6, null]], "doctr.models": [[7, null]], "doctr.models.classification": [[7, "doctr-models-classification"]], "doctr.models.detection": [[7, "doctr-models-detection"]], "doctr.models.factory": [[7, "doctr-models-factory"]], "doctr.models.recognition": [[7, "doctr-models-recognition"]], "doctr.models.zoo": [[7, "doctr-models-zoo"]], "doctr.transforms": [[8, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 
(2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], "v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[6, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[6, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[8, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[5, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[8, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[8, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[5, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[7, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[5, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[7, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[5, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.DetectionMetric", 
false]], "docartefacts (class in doctr.datasets)": [[5, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[6, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[6, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[5, "doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[7, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[7, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[7, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[5, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[8, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[8, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[5, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[5, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[5, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[5, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[5, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[7, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[8, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[6, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[5, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_orientation() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[8, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module 
doctr.models)": [[7, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[5, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[8, "doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[6, "doctr.io.Page", false]], "parseq() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[8, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[8, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[8, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[8, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[8, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[8, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[8, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[8, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[8, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[8, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[8, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[6, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[6, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[6, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[5, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[8, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[6, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[6, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[5, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[5, "doctr.datasets.SVHN", false]], "svt (class in 
doctr.datasets)": [[5, "doctr.datasets.SVT", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "synthtext (class in doctr.datasets)": [[5, "doctr.datasets.SynthText", false]], "textmatch (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[8, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[5, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[6, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[5, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[5, 0, 1, "", "CORD"], [5, 0, 1, "", "CharacterGenerator"], [5, 0, 1, "", "DetectionDataset"], [5, 0, 1, "", "DocArtefacts"], [5, 0, 1, "", "FUNSD"], [5, 0, 1, "", "IC03"], [5, 0, 1, "", "IC13"], [5, 0, 1, "", "IIIT5K"], [5, 0, 1, "", "IIITHWS"], [5, 0, 1, "", "IMGUR5K"], [5, 0, 1, "", "MJSynth"], [5, 0, 1, "", "OCRDataset"], [5, 0, 1, "", "RecognitionDataset"], [5, 0, 1, "", "SROIE"], [5, 0, 1, "", "SVHN"], [5, 0, 1, "", "SVT"], [5, 0, 1, "", "SynthText"], [5, 0, 1, "", "WILDRECEIPT"], [5, 0, 1, "", "WordGenerator"], [5, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[5, 0, 1, "", "DataLoader"]], "doctr.io": [[6, 0, 1, "", "Artefact"], [6, 0, 1, "", "Block"], [6, 0, 1, "", "Document"], [6, 0, 1, "", "DocumentFile"], [6, 0, 1, "", "Line"], [6, 0, 1, "", "Page"], [6, 0, 1, "", "Word"], [6, 1, 1, "", "decode_img_as_tensor"], [6, 1, 1, "", "read_html"], [6, 1, 1, "", "read_img_as_numpy"], [6, 1, 1, "", "read_img_as_tensor"], [6, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[6, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[6, 2, 1, "", "from_images"], [6, 2, 1, "", "from_pdf"], [6, 2, 1, "", "from_url"]], "doctr.io.Page": [[6, 2, 1, "", "show"]], "doctr.models": [[7, 1, 1, "", "kie_predictor"], [7, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[7, 1, 1, "", "crop_orientation_predictor"], [7, 1, 1, "", "magc_resnet31"], [7, 1, 1, "", "mobilenet_v3_large"], [7, 1, 
1, "", "mobilenet_v3_large_r"], [7, 1, 1, "", "mobilenet_v3_small"], [7, 1, 1, "", "mobilenet_v3_small_orientation"], [7, 1, 1, "", "mobilenet_v3_small_r"], [7, 1, 1, "", "resnet18"], [7, 1, 1, "", "resnet31"], [7, 1, 1, "", "resnet34"], [7, 1, 1, "", "resnet50"], [7, 1, 1, "", "textnet_base"], [7, 1, 1, "", "textnet_small"], [7, 1, 1, "", "textnet_tiny"], [7, 1, 1, "", "vgg16_bn_r"], [7, 1, 1, "", "vit_b"], [7, 1, 1, "", "vit_s"]], "doctr.models.detection": [[7, 1, 1, "", "db_mobilenet_v3_large"], [7, 1, 1, "", "db_resnet50"], [7, 1, 1, "", "detection_predictor"], [7, 1, 1, "", "fast_base"], [7, 1, 1, "", "fast_small"], [7, 1, 1, "", "fast_tiny"], [7, 1, 1, "", "linknet_resnet18"], [7, 1, 1, "", "linknet_resnet34"], [7, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[7, 1, 1, "", "from_hub"], [7, 1, 1, "", "login_to_hub"], [7, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[7, 1, 1, "", "crnn_mobilenet_v3_large"], [7, 1, 1, "", "crnn_mobilenet_v3_small"], [7, 1, 1, "", "crnn_vgg16_bn"], [7, 1, 1, "", "master"], [7, 1, 1, "", "parseq"], [7, 1, 1, "", "recognition_predictor"], [7, 1, 1, "", "sar_resnet31"], [7, 1, 1, "", "vitstr_base"], [7, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[8, 0, 1, "", "ChannelShuffle"], [8, 0, 1, "", "ColorInversion"], [8, 0, 1, "", "Compose"], [8, 0, 1, "", "GaussianBlur"], [8, 0, 1, "", "GaussianNoise"], [8, 0, 1, "", "LambdaTransformation"], [8, 0, 1, "", "Normalize"], [8, 0, 1, "", "OneOf"], [8, 0, 1, "", "RandomApply"], [8, 0, 1, "", "RandomBrightness"], [8, 0, 1, "", "RandomContrast"], [8, 0, 1, "", "RandomCrop"], [8, 0, 1, "", "RandomGamma"], [8, 0, 1, "", "RandomHorizontalFlip"], [8, 0, 1, "", "RandomHue"], [8, 0, 1, "", "RandomJpegQuality"], [8, 0, 1, "", "RandomRotate"], [8, 0, 1, "", "RandomSaturation"], [8, 0, 1, "", "RandomShadow"], [8, 0, 1, "", "Resize"], [8, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "DetectionMetric"], [9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 6, 7, 9, 13], "0": [1, 3, 5, 8, 9, 11, 14, 16], "00": 16, "01": 16, "0123456789": 5, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "02": [], "02562": 7, "03": 16, "035": 16, "0361328125": 16, "04": [], "05": 16, "06": 16, "06640625": 16, "07": 16, "08": [8, 16], "09": 16, "0966796875": 16, "1": [3, 5, 6, 7, 8, 9, 11, 14, 16], "10": [5, 9, 16], "100": [5, 8, 9, 14, 16], "1000": 16, "101": 5, "1024": [7, 9, 11, 16], "104": 5, "106": 5, "108": 5, "1095": 14, "11": 16, "110": 9, "1107": 14, "114": 5, "115": [], "1156": 14, "116": 5, "118": 5, "11800h": 16, "11th": 16, "12": [3, 16], "120": 5, "123": 5, "126": 5, "1268": 14, "128": [7, 11, 15, 16], "13": [9, 16], "130": 5, "13068": 14, "131": 5, "1337891": 14, "1357421875": 16, 
"1396484375": 16, "14": 16, "1420": 16, "14470v1": 5, "149": 14, "15": 16, "150": [9, 16], "154": [], "1552": 16, "16": [7, 15, 16], "160": [], "1630859375": 16, "1684": 16, "16x16": 7, "17": 16, "1778": 16, "1782": 16, "18": 7, "185546875": 16, "19": [], "1900": 16, "1910": 7, "19342": 14, "19370": 14, "195": 5, "19598": 14, "199": 16, "1999": 16, "1m": [], "2": [3, 4, 5, 6, 8, 16], "20": 16, "200": 9, "2000": 14, "2003": [4, 5], "2012": 5, "2013": [4, 5], "2015": 5, "2019": 4, "2021": [], "207901": 14, "21": 16, "2103": 5, "2186": 14, "21888": 14, "22": 16, "224": [7, 8], "225": 8, "22672": 14, "229": [8, 14], "23": 16, "233": 14, "234": 5, "236": [], "24": 16, "246": 14, "249": 14, "25": 16, "2504": 16, "255": [6, 7, 8, 9, 16], "256": 7, "257": 14, "26": 16, "26032": 14, "264": 11, "27": 16, "2700": 14, "2710": 16, "2749": 11, "28": 16, "287": 11, "29": 16, "296": 11, "299": 11, "2d": 16, "2m": [], "3": [3, 4, 6, 7, 8, 9, 15, 16], "30": 16, "300": 14, "3000": 14, "301": 11, "30595": 16, "30ghz": 16, "31": 7, "32": [5, 7, 8, 11, 14, 15, 16], "3232421875": 16, "33": [8, 16], "33402": 14, "33608": 14, "34": [7, 16], "340": 16, "3456": 16, "35": [], "3515625": 16, "36": [], "360": 14, "37": [5, 16], "38": 16, "39": 16, "4": [7, 8, 9, 16], "40": 16, "406": 8, "41": 16, "42": 16, "43": 16, "44": 16, "45": 16, "456": 8, "46": 16, "47": 16, "472": 14, "48": [5, 16], "485": 8, "49": 16, "49377": 14, "5": [5, 8, 9, 16], "50": [7, 14, 16], "51": 16, "51171875": 16, "512": 7, "52": [5, 16], "529": 16, "53": 16, "533": [], "54": 16, "540": 16, "5478515625": 16, "55": 16, "56": 16, "57": 16, "58": 16, "580": 16, "5810546875": 16, "583": 16, "59": 16, "595": [], "597": 16, "5k": [4, 5], "5m": 16, "6": [8, 16], "60": 8, "600": [7, 9, 16], "61": 16, "611": [], "62": 16, "625": [], "626": 14, "629": [], "63": 16, "630": [], "64": [7, 8, 16], "640": [], "641": 16, "647": 14, "65": 16, "66": 16, "660": [], "664": [], "666": [], "67": 16, "672": [], "68": 16, "689": [], "69": 16, "693": 11, "694": 11, "695": 11, "6m": 16, "7": 16, "70": [9, 16], "700": [], "701": [], "702": [], "707470": 14, "71": 16, "7100000": 14, "713": [], "7141797": 14, "7149": 14, "72": 16, "72dpi": 6, "73": 16, "73257": 14, "733": [], "74": 16, "745": [], "75": [8, 16], "753": [], "7581382": 14, "76": 16, "77": 16, "772": 11, "772875": 14, "78": 16, "780": [], "781": [], "783": [], "785": 11, "789": [], "79": 16, "793533": 14, "796": 14, "798": 11, "7m": 16, "8": [3, 7, 8, 16], "80": 16, "800": [7, 9, 14, 16], "81": 16, "817": [], "82": 16, "8275l": [], "83": 16, "830": [], "84": 16, "849": 14, "85": 16, "8564453125": 16, "857": 16, "85875": 14, "86": 16, "860": [], "8603515625": 16, "862": [], "863": [], "87": 16, "8707": 14, "875": [], "88": 16, "89": 16, "8m": [], "9": 16, "90": 16, "90k": 5, "90kdict32px": 5, "91": 16, "913": [], "914085328578949": 16, "917": [], "92": 16, "921": [], "93": 16, "94": [5, 16], "95": [9, 16], "9578408598899841": 16, "96": 16, "97": [], "98": 16, "99": 16, "9949972033500671": 16, "A": [1, 2, 4, 5, 6, 7, 10, 15], "And": [], "As": 2, "Be": 16, "Being": 1, "By": 12, "For": [1, 2, 3, 11, 16], "If": [2, 3, 6, 7, 11, 16], "In": [2, 5, 14], "It": [8, 13, 15], "Its": [4, 7], "No": [1, 16], "Of": 5, "Or": [], "The": [1, 2, 5, 6, 9, 12, 16], "Then": 7, "There": [], "To": [2, 3, 12, 13, 16], "_": [1, 5, 7], "__call__": 16, "_build": 2, "_helper": [], "_i": 9, "ab": 5, "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "abdef": [5, 14], "abl": [14, 16], "about": [1, 14, 16], "abov": 16, 
"abstract": [], "abstractdataset": 5, "abus": 1, "accent": [], "accept": 1, "access": [4, 6, 14, 16], "account": [1, 13], "accur": 16, "accuraci": 9, "achiev": 15, "act": 1, "action": 1, "activ": 4, "ad": [2, 7, 8], "adapt": 1, "add": [8, 9, 13, 16], "add_hook": 16, "add_label": 9, "addit": [2, 3, 6], "addition": [2, 16], "address": [1, 6], "adjust": 8, "advanc": 1, "advantag": 15, "advis": 2, "aesthet": [4, 5], "affect": 1, "after": [13, 16], "ag": 1, "again": 7, "aggreg": [9, 14], "aggress": 1, "align": [1, 6], "all": [1, 2, 5, 6, 8, 9, 14, 16], "allow": 1, "along": 16, "alreadi": 2, "also": [1, 7, 13, 14, 16], "alwai": 14, "amazon": [], "an": [1, 2, 4, 5, 6, 7, 9, 15, 16], "analysi": 6, "ancient_greek": 5, "angl": [6, 8], "ani": [1, 5, 6, 7, 8, 9, 16], "annot": 5, "anot": 14, "anoth": [3, 7, 11, 14], "answer": 1, "anyascii": [], "anyon": 4, "anyth": [], "anywher": [], "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 5, 8], "applic": [4, 7], "appoint": 1, "appreci": 13, "appropri": [1, 2, 16], "ar": [1, 2, 3, 5, 6, 8, 9, 10, 14, 16], "arab": 5, "arabic_diacrit": 5, "arabic_lett": 5, "arabic_punctu": 5, "arbitrarili": [4, 7], "arch": [7, 13], "architectur": [4, 7, 13], "archiv": [], "area": 16, "arg": [], "argument": [5, 6, 7, 9, 16], "around": 1, "arrai": [6, 8, 9], "art": 4, "artefact": [9, 10, 16], "artefact_typ": 6, "artifici": [4, 5], "arxiv": [5, 7], "as_imag": [], "asarrai": 9, "ascii_lett": 5, "aspect": [4, 7, 8, 16], "assess": 9, "assign": 9, "associ": 6, "assum": 7, "assume_straight_pag": [7, 16], "astyp": [7, 9, 16], "attack": 1, "attend": [4, 7], "attent": [1, 7], "autoclass": [], "autom": 4, "automat": 16, "autoregress": [4, 7], "avail": [1, 4, 8], "averag": [8, 16], "avoid": [1, 3], "aw": [4, 16], "awar": 16, "azur": 16, "b": [7, 9, 16], "b_j": 9, "back": 2, "backbon": 7, "backend": 16, "background": 14, "bangla": [], "bar": [], "bar_cod": 14, "base": [4, 7], "baselin": [4, 7, 16], "bash": [], "batch": [5, 7, 8, 14, 16], "batch_siz": [5, 11, 14, 15], "bblanchon": 3, "bbox": 16, "becaus": 12, "been": [2, 9, 14, 16], "befor": [5, 7, 8, 16], "begin": 9, "behavior": [1, 16], "being": [9, 16], "belong": 16, "below": [], "benchmark": 16, "best": 1, "beta": [], "better": [10, 16], "between": [8, 9, 16], "bgr": 6, "bilinear": 8, "bin_thresh": 16, "binar": [4, 7, 16], "binari": [6, 15, 16], "bit": 15, "blank": 9, "block": [9, 16], "block_1_1": 16, "blue": 9, "blur": 8, "bmvc": 5, "bn": 13, "bodi": [1, 16], "bool": [5, 6, 7, 8, 9], "boolean": [7, 16], "both": [4, 5, 8, 14, 16], "bottom": [7, 16], "bound": [5, 6, 7, 8, 9, 16], "box": [5, 6, 7, 8, 9, 14, 16], "box_thresh": 16, "brew": 3, "bright": 8, "broadcast": 9, "browser": [2, 4], "build": [2, 3], "built": 2, "byte": [6, 16], "c": [3, 6, 9], "c5": [], "c_j": 9, "cach": [2, 5, 12], "cache_sampl": 5, "cairo": 3, "call": [], "callabl": [5, 8], "can": [2, 3, 11, 12, 13, 14, 16], "capabl": [2, 10, 16], "case": [5, 9], "catch": [], "cf": 16, "cfg": 16, "challeng": 5, "challenge2_test_task12_imag": 5, "challenge2_test_task1_gt": 5, "challenge2_training_task12_imag": 5, "challenge2_training_task1_gt": 5, "chang": 12, "changelog": [], "channel": [1, 2, 6, 8], "channel_prior": 3, "channelshuffl": 8, "charact": [4, 5, 6, 9, 14, 16], "charactergener": [5, 14], "characterist": 1, "charg": 16, "charset": 16, "chart": 6, "check": [2, 13, 16], "checkpoint": 7, "chip": 3, "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 5, 6, 8, 9, 16], "class_nam": 11, "classif": 14, "classif_mobilenet_v3_smal": 7, "classmethod": 6, 
"cleaner": [], "clear": 2, "clone": 3, "close": 2, "co": 13, "code": [4, 6], "codecov": 2, "colab": 10, "collate_fn": 5, "collect": 6, "color": [8, 9], "colorinvers": 8, "column": 6, "com": [1, 3, 6, 7, 13], "combin": 16, "come": 15, "command": 2, "comment": 1, "commit": 1, "common": [1, 8, 9, 15], "commun": 1, "compar": 4, "comparison": [9, 16], "competit": 5, "compil": [10, 16], "complaint": 1, "complementari": 9, "complet": 2, "compli": [], "compon": 16, "compos": [5, 16], "comprehens": 16, "comput": [5, 9, 15, 16], "conf_threshold": [], "confid": [6, 9, 16], "config": [3, 7], "configur": 7, "confus": 9, "consecut": [8, 16], "consequ": 1, "consid": [1, 2, 5, 6, 9, 16], "consist": 16, "consolid": [4, 5], "constant": 8, "constraint": [], "construct": 1, "consum": 9, "contact": 1, "contain": [5, 14], "content": [5, 6, 9, 16], "context": 7, "contib": [], "continu": 1, "contrast": 8, "contrast_factor": 8, "contrib": [], "contribut": 1, "contributor": 2, "conv_sequ": [], "convent": [], "convers": 6, "convert": [6, 8], "convert_page_to_numpi": [], "convert_to_fp16": [], "convert_to_tflit": [], "convolut": 7, "coordin": [6, 16], "cord": [4, 5, 14, 16], "core": [9, 16], "corner": 16, "correct": 8, "correspond": [3, 6, 16], "could": 1, "counterpart": 9, "cover": 2, "coverag": 2, "cpu": [4, 11], "creat": 13, "crnn": [4, 7, 13], "crnn_mobilenet_v3_larg": [7, 13, 16], "crnn_mobilenet_v3_smal": [7, 15, 16], "crnn_resnet31": [], "crnn_vgg16_bn": [7, 11, 13, 16], "crop": [7, 8, 14, 16], "crop_orient": [], "crop_orientation_predictor": 7, "crop_param": [], "croporientationpredictor": 7, "cuda": 15, "currenc": 5, "current": [2, 16], "custom": [13, 16], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": 16, "cvit": 4, "czczup": 7, "czech": 5, "d": [5, 14], "daili": [], "danish": 5, "data": [4, 5, 6, 8, 9, 11, 13], "dataload": 14, "dataset": [7, 11, 16], "dataset_info": 5, "date": [11, 16], "db": 13, "db_crnn_resnet": [], "db_crnn_vgg": [], "db_mobilenet_v3_larg": [7, 13, 16], "db_resnet34": 16, "db_resnet50": [7, 11, 13, 16], "db_resnet50_rot": [], "db_sar_resnet": [], "db_sar_vgg": [], "dbnet": [4, 7], "deal": [], "decis": 1, "decod": 6, "decode_img_as_tensor": 6, "dedic": [], "deem": 1, "deep": [7, 16], "def": 16, "default": [3, 6, 9, 11, 12, 16], "defer": 14, "defin": [9, 15], "deform": [], "degre": 8, "degress": 6, "delet": 2, "delimit": 16, "delta": 8, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": [7, 9], "descript": 10, "design": 8, "desir": 6, "det_arch": [7, 11, 13, 15], "det_b": [], "det_model": [11, 13], "det_param": 11, "det_predictor": [11, 16], "detail": [11, 16], "detect": [5, 9, 10, 11], "detect_languag": 7, "detect_orient": 7, "detection_predictor": [7, 16], "detection_task": [], "detectiondataset": [5, 14], "detectionmetr": 9, "detectionpredictor": [7, 11], "detector": [4, 7], "deterior": 7, "determin": 1, "dev": [2, 12], "develop": 3, "developp": [], "deviat": 8, "devic": 15, "dict": [6, 9, 16], "dictionari": [6, 9], "differ": 1, "differenti": [4, 7], "digit": [4, 5, 14], "dimens": [6, 9, 16], "dimension": 8, "direct": 5, "directli": [13, 16], "directori": [2, 12], "disabl": [1, 12, 16], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 16, "discuss": 2, "disk": [], "disparag": 1, "displai": [6, 9], "display_artefact": 9, "distanc": [], "distribut": 8, "div": 16, "divers": 1, "divid": 6, "do": [2, 3, 7], "doc": [2, 6, 15, 16], "docartefact": [5, 14], "docstr": 2, "doctr": [3, 
11, 12, 13, 14, 16], "doctr_cache_dir": 12, "doctr_multiprocessing_dis": 12, "document": [5, 7, 9, 10, 14, 16], "documentbuild": 16, "documentfil": [6, 13], "doe": [], "doesn": [], "don": [11, 16], "done": 8, "download": [5, 14], "downsiz": 7, "draw": [8, 9], "draw_proba": 9, "drop": 5, "drop_last": 5, "dtype": [6, 7, 8, 9, 15], "dual": [4, 5], "dummi": 13, "dummy_img": 16, "dummy_input": 15, "dure": 1, "dutch": 5, "dynam": 5, "dynamic_seq_length": 5, "e": [1, 2, 3, 6, 7], "each": [4, 5, 6, 7, 8, 9, 14, 16], "eas": 2, "easi": [4, 9, 13], "easier": [], "easili": [6, 9, 11, 13, 14, 16], "ec2": [], "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 5, 7], "either": [9, 16], "element": [5, 6, 7, 9, 16], "els": 2, "email": 1, "empathi": 1, "en": 16, "enabl": [5, 6], "enclos": 6, "encod": [4, 5, 6, 7, 16], "encode_sequ": 5, "encount": 2, "encrypt": 6, "end": [4, 5, 7, 9], "english": [5, 14], "enivron": [], "enough": [2, 16], "ensur": 2, "entir": [], "entri": 5, "environ": [1, 12], "eo": 5, "equiv": 16, "error": [], "estim": 7, "etc": 6, "ethnic": 1, "evalu": [14, 16], "event": 1, "everyon": 1, "everyth": [2, 16], "exact": [9, 16], "exactmatch": [], "exampl": [1, 2, 4, 5, 7, 13, 16], "exchang": 15, "exclud": [], "execut": 16, "exist": 13, "expand": 8, "expect": [6, 8, 9], "experi": 1, "explan": [1, 16], "explicit": 1, "exploit": [4, 7], "export": [6, 7, 9, 10, 16], "export_as_straight_box": [7, 16], "export_as_xml": 16, "export_model_to_onnx": 15, "express": [1, 8], "extens": 6, "extern": [1, 14], "extra": 3, "extract": [4, 5], "extract_arch": [], "extractor": 7, "f_": 9, "f_a": 9, "factor": 8, "fair": 1, "fairli": 1, "fallback": [], "fals": [5, 6, 7, 8, 9, 11, 16], "famili": 9, "faq": 1, "fascan": 13, "fast": [4, 5, 7], "fast_bas": [7, 16], "fast_smal": [7, 16], "fast_tini": [7, 16], "faster": [4, 7, 15], "fasterrcnn_mobilenet_v3_large_fpn": 7, "favorit": 16, "featur": [3, 7, 9, 10], "feed": [], "feedback": 1, "feel": [2, 13], "felix92": 13, "few": [3, 15, 16], "figsiz": 9, "figur": 9, "file": [2, 5], "file_hash": [], "file_nam": [], "final": 7, "find": [2, 3, 14], "fine": [], "finnish": 5, "first": [2, 5], "firsthand": 5, "fit": [7, 16], "fitz": [], "flag": 16, "flake8": [], "flexibl": [], "flip": 8, "float": [6, 8, 9, 15], "float16": [], "float32": [6, 7, 8, 15], "fn": 8, "focu": 13, "focus": [1, 5], "folder": 5, "follow": [1, 2, 3, 5, 8, 9, 11, 12, 13, 16], "font": [5, 9], "font_famili": [5, 9], "font_siz": 9, "foral": 9, "forc": 2, "forg": 3, "form": [4, 5, 16], "format": [6, 9, 11, 14, 15, 16], "forpost": [4, 5], "forum": 2, "fp": [], "fp16": 15, "frac": 9, "frame": [], "framework": [3, 13, 14, 16], "free": [1, 2, 13], "french": [5, 11, 13, 16], "friendli": 4, "from": [1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16], "from_hub": [7, 13], "from_imag": [6, 13], "from_keras_model": [], "from_pdf": 6, "from_url": 6, "full": [5, 9, 16], "fulli": [], "function": [5, 8, 9], "funsd": [4, 5, 14, 16], "further": 14, "futur": 5, "g": [6, 7], "g_": 9, "g_x": 9, "gamma": 8, "gaussian": 8, "gaussianblur": 8, "gaussiannois": 8, "gdk": 3, "gen": 16, "gender": 1, "gener": [2, 4, 7], "generic_cyrillic_lett": [], "geometri": [4, 6, 16], "geq": 9, "german": [5, 11, 13], "get": 16, "get_artefact": [], "get_lin": [], "get_text_word": [], "get_word": [], "gettextword": [], "git": 13, "github": [2, 3, 7, 13], "give": 1, "given": [5, 6, 8, 9, 16], "global": 7, "go": 16, "good": 15, "googl": 2, "googlevis": 4, "gpu": [4, 15], "gracefulli": 1, "graph": [4, 5, 6], "grayscal": 8, "ground": 9, "groung": 9, 
"group": [4, 16], "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": 2, "guidanc": 14, "gvision": 16, "h": [6, 7, 8], "h_": 9, "ha": [2, 5, 9, 14], "half": [], "handl": [14, 16], "handwrit": 5, "handwritten": 14, "harass": 1, "hardwar": [], "harm": 1, "hat": 9, "have": [1, 2, 9, 11, 13, 14, 16], "head": [7, 16], "healthi": 1, "hebrew": 5, "height": 6, "hello": [9, 16], "help": 15, "here": [3, 8, 10, 14, 16], "hf": 7, "hf_hub_download": 7, "high": 6, "higher": [3, 5, 16], "hindi": [], "hindi_digit": 5, "hocr": 16, "homebrew": 3, "hook": 16, "horizont": [6, 8], "hous": 5, "how": [2, 11, 13, 14], "howev": 14, "hsv": 8, "html": [1, 2, 6, 16], "http": [1, 3, 5, 6, 7, 13, 16], "hub": 7, "hue": 8, "huggingfac": 7, "hw": 5, "i": [1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 15], "i7": 16, "ic03": [4, 5, 14], "ic13": [4, 5, 14], "icdar": [4, 5], "icdar2019": 5, "id": 16, "ident": 1, "identifi": 4, "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [4, 5], "iiit5k": [5, 14], "iiithw": [4, 5, 14], "imag": [4, 5, 6, 7, 8, 9, 13, 14, 16], "imagenet": 7, "imageri": 1, "images_90k_norm": 5, "img": [5, 8, 14], "img_cont": 6, "img_fold": [5, 14], "img_path": 6, "img_transform": 5, "imgur5k": [4, 5, 14], "imgur5k_annot": 5, "imlist": 5, "impact": 1, "implement": [5, 6, 7, 8, 9, 16], "import": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16], "improv": 7, "inappropri": 1, "incid": 1, "includ": [1, 3, 5, 14, 15], "inclus": 1, "incom": [], "increas": 8, "independ": [], "index": [2, 6], "indic": 9, "individu": 1, "infer": [4, 7, 8], "inference_input_typ": [], "inference_output_typ": [], "inform": [1, 2, 4, 5, 14], "inherit": [], "ini": [], "input": [2, 6, 7, 8, 15, 16], "input_crop": 7, "input_pag": [7, 9, 16], "input_shap": 15, "input_t": [], "input_tensor": 7, "inspir": [1, 8], "instal": 13, "instanc": [1, 16], "instanti": [7, 16], "instead": [5, 6, 7], "insult": 1, "int": [5, 6, 8, 9], "int64": [8, 9], "int8": [], "integ": 9, "integr": [4, 13, 14], "intel": 16, "interact": [1, 6, 9], "interfac": 13, "interoper": 15, "interpol": 8, "interpret": [5, 6], "intersect": 9, "invert": 8, "investig": 1, "invis": 1, "invoic": [], "involv": [1, 16], "io": 13, "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [4, 7, 14], "isn": 5, "isort": [], "issu": [1, 2, 13], "italian": 5, "iter": [5, 8, 14, 16], "its": [6, 7, 8, 9, 14, 16], "itself": [7, 13], "j": 9, "job": 2, "join": 2, "jpeg": 8, "jpegqual": 8, "jpg": [5, 6, 13], "json": [5, 14, 16], "json_output": 16, "jump": 2, "just": 1, "keep": [], "kei": [4, 5], "kera": [7, 15], "kernel": [4, 7, 8], "kernel_s": [], "kernel_shap": 8, "keywoard": 7, "keyword": [5, 6, 7, 9], "kie": [7, 11], "kie_predictor": [7, 11], "kiepredictor": 7, "kind": 1, "know": 2, "kwarg": [5, 6, 7, 9], "l": 9, "l_j": 9, "label": [5, 8, 9, 14], "label_fil": [5, 14], "label_fold": 5, "label_path": [5, 14], "labels_path": [5, 14], "ladder": 1, "lambda": 8, "lambdatransform": 8, "lang": 16, "languag": [1, 4, 5, 6, 7, 13, 16], "larg": [7, 13], "largest": 9, "last": [3, 5], "latenc": 7, "later": 2, "latest": [3, 16], "latin": 5, "layer": 15, "layout": 16, "lead": 1, "leader": 1, "learn": [1, 4, 7, 15, 16], "least": 3, "left": [9, 16], "legacy_french": 5, "length": [5, 16], "less": [15, 16], "let": [], "letter": [], "level": [1, 5, 9, 16], "levenshtein": [], "leverag": 10, "lf": 13, "libffi": 3, "librari": [2, 3, 10, 11], "light": 4, "lightweight": [], "like": 1, "limits_": 9, "line": [4, 7, 9, 16], "line_1_1": 16, "link": 11, "linknet": [4, 7], "linknet16": [], "linknet_resnet18": [7, 11, 16], 
"linknet_resnet18_rot": [], "linknet_resnet34": [7, 15, 16], "linknet_resnet50": [7, 16], "linux": [], "list": [5, 6, 8, 9, 13], "ll": 9, "load": [4, 5, 7], "load_state_dict": 11, "load_weight": 11, "loader": [], "loc_pr": 16, "local": [2, 4, 5, 7, 9, 14, 16], "localis": 5, "localizationconfus": 9, "locat": [2, 6, 16], "login": 7, "login_to_hub": [7, 13], "logo": [6, 14], "look": [], "love": 13, "lower": [8, 9, 16], "m": [2, 9, 16], "m1": 3, "macbook": 3, "machin": 15, "maco": 3, "made": 4, "magc_resnet31": 7, "mai": [1, 2], "mail": 1, "main": 10, "maintain": 4, "mainten": 2, "make": [1, 2, 9, 12, 13, 15, 16], "mani": [14, 16], "manipul": 16, "map": [5, 7], "map_loc": 11, "mask_shap": 9, "master": [4, 7, 16], "match": [9, 16], "mathcal": 9, "matplotlib": [6, 9], "max": [5, 8, 9], "max_angl": 8, "max_area": 8, "max_char": [5, 14], "max_delta": 8, "max_dist": [], "max_gain": 8, "max_gamma": 8, "max_qual": 8, "max_ratio": 8, "maximum": [5, 8], "maxval": [7, 8], "mbox": 9, "mean": [8, 9, 11], "meaniou": 9, "meant": [6, 15], "measur": 16, "media": 1, "median": 7, "meet": 11, "member": 1, "memori": [9, 12, 15], "mention": 16, "merg": 5, "messag": 2, "meta": 16, "metadata": 15, "metal": 3, "method": [6, 8, 16], "metric": [9, 16], "middl": 16, "might": [15, 16], "min": 8, "min_area": 8, "min_char": [5, 14], "min_gain": 8, "min_gamma": 8, "min_qual": 8, "min_ratio": 8, "min_val": 8, "minde": [1, 3, 4, 7], "minim": [2, 4], "minimalist": [4, 7], "minimum": [3, 5, 8, 9, 16], "minval": 8, "miss": 3, "mistak": 1, "mix": [], "mixed_float16": 15, "mixed_precis": 15, "mjsynth": [4, 5, 14], "mnt": 5, "mobilenet": [7, 13], "mobilenet_v3_larg": 7, "mobilenet_v3_large_r": 7, "mobilenet_v3_smal": 7, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_orient": 7, "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 7, "mobilenetv3": 7, "mobilenetv3_larg": [], "mobilenetv3_smal": [], "modal": [4, 5], "mode": 3, "model": [5, 9, 12, 14], "model_nam": [7, 13, 15], "model_path": 15, "moder": 1, "modif": 2, "modifi": [7, 12, 16], "modul": [6, 7, 8, 9, 16], "moment": [], "more": [2, 9, 14, 16], "most": 16, "mozilla": 1, "multi": [4, 7], "multilingu": [5, 13], "multipl": [5, 6, 8, 16], "multipli": 8, "multiprocess": 12, "my": 7, "my_awesome_model": 13, "my_hook": 16, "mypi": [], "n": [5, 9], "na": [], "name": [5, 7, 15, 16], "nation": 1, "natur": [1, 4, 5], "nb": [], "ndarrai": [5, 6, 8, 9], "necessari": [3, 11, 12], "need": [2, 3, 5, 9, 11, 12, 13, 16], "neg": 8, "nest": 16, "nestedobject": [], "network": [4, 5, 7, 15], "neural": [4, 5, 7, 15], "new": [2, 9], "newer": [], "next": [5, 14], "nois": 8, "noisi": [4, 5], "non": [4, 5, 6, 7, 8, 9], "none": [5, 6, 7, 8, 9, 16], "normal": [7, 8], "norwegian": 5, "note": [0, 2, 5, 7, 13, 15], "now": 2, "np": [7, 8, 9, 16], "num_output_channel": 8, "num_sampl": [5, 14], "num_work": 5, "number": [5, 8, 9, 16], "numpi": [6, 7, 9, 16], "o": 3, "obb": [], "obj_detect": 13, "object": [5, 9, 10, 16], "objectness_scor": [], "oblig": 1, "obtain": 16, "occupi": 15, "ocr": [4, 5, 7, 9, 13, 14], "ocr_carea": 16, "ocr_db_crnn": 9, "ocr_lin": 16, "ocr_pag": 16, "ocr_par": 16, "ocr_predictor": [7, 11, 13, 15, 16], "ocrdataset": [5, 14], "ocrmetr": 9, "ocrpredictor": [7, 11], "ocrx_word": 16, "offens": 1, "offici": [1, 7], "offlin": 1, "offset": 8, "onc": 16, "one": [2, 5, 7, 8, 11, 13, 16], "oneof": 8, "ones": [5, 8, 9], "onli": [2, 7, 8, 9, 13, 14, 15, 16], "onlin": 1, "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": 8, "opacity_rang": 8, "open": [1, 2, 13, 15], "oper": [], 
"opinion": 1, "opsset": [], "optic": [4, 16], "optim": 4, "option": [5, 7, 11], "order": [2, 5, 6, 8], "org": [1, 5, 7, 16], "organ": 6, "orient": [1, 6, 7, 16], "orientationpredictor": [], "other": [1, 2], "otherwis": [1, 6, 9], "our": [2, 7, 16], "out": [2, 7, 8, 9, 16], "outpout": 16, "output": [6, 8, 15], "output_s": [6, 8], "outsid": 12, "over": [3, 5, 9, 16], "overal": [1, 7], "overlai": 6, "overview": [], "overwrit": [], "overwritten": 13, "own": 4, "p": [8, 9, 16], "packag": [2, 4, 9, 12, 14], "pad": [5, 7, 8, 16], "page": [3, 5, 7, 9, 16], "page1": 6, "page2": 6, "page_1": 16, "page_idx": [6, 16], "page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 7, "par_1_1": 16, "paragraph": 16, "paragraph_break": 16, "param": [8, 16], "paramet": [4, 6, 7, 15], "pars": [4, 5], "parseq": [4, 7, 13, 16], "part": [5, 8, 16], "parti": 3, "partial": 16, "particip": 1, "pass": [5, 6, 7, 16], "password": 6, "patch": [7, 9], "path": [5, 6, 14], "path_to_checkpoint": 11, "path_to_custom_model": [], "path_to_pt": 11, "pattern": 1, "pdf": [6, 7, 10], "pdf_render": [], "pdfdocument": [], "pdfpage": 6, "peopl": 1, "per": [8, 16], "perform": [4, 6, 7, 8, 9, 12, 15, 16], "period": 1, "permiss": 1, "permut": [4, 7], "persian_lett": 5, "person": [1, 14], "phase": 16, "photo": 14, "physic": [1, 6], "pick": 8, "pictur": 6, "pip": [2, 3], "pipelin": 16, "pixbuf": 3, "pixel": [6, 8, 16], "platinum": [], "pleas": 2, "plot": 9, "plt": 9, "plug": 13, "plugin": 3, "png": 6, "point": 15, "polici": 12, "polish": 5, "polit": 1, "polygon": [5, 9, 16], "pool": 7, "portugues": 5, "posit": [1, 9], "possibl": [2, 9, 13, 16], "post": [1, 16], "postprocessor": 16, "potenti": 7, "power": 4, "ppageno": 16, "pr": [], "pre": [2, 7], "precis": [9, 16], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 14, "predict": [6, 7, 9, 16], "predictor": [4, 6, 7, 11, 13, 15], "prefer": 14, "preinstal": [], "preprocessor": [11, 16], "prerequisit": 13, "present": 10, "preserv": [7, 8, 16], "preserve_aspect_ratio": [6, 7, 8, 11, 16], "pretrain": [4, 7, 9, 11, 15, 16], "pretrained_backbon": [7, 11], "print": 16, "prior": 5, "privaci": 1, "privat": 1, "probabl": 8, "problem": 2, "procedur": 8, "process": [2, 4, 6, 11, 16], "processor": 16, "produc": [10, 16], "product": 15, "profession": 1, "project": [2, 14], "promptli": 1, "proper": 2, "properli": 5, "properti": [], "provid": [1, 2, 4, 13, 14, 16], "public": [1, 4], "publicli": 16, "publish": 1, "pull": 13, "punctuat": 5, "pure": 5, "purpos": 2, "push_to_hf_hub": [7, 13], "py": 13, "pydocstyl": [], "pypdfium2": [3, 6], "pyplot": [6, 9], "python": 2, "python3": 13, "pytorch": [3, 4, 7, 8, 11, 13, 15, 16], "q": 2, "qr": 6, "qr_code": 14, "qualiti": 8, "quantiz": [], "quantize_model": [], "question": 1, "quickli": 4, "quicktour": 10, "r": 16, "race": 1, "ramdisk": 5, "rand": [7, 8, 9, 15, 16], "random": [7, 8, 9, 16], "randomappli": 8, "randombright": 8, "randomcontrast": 8, "randomcrop": 8, "randomgamma": 8, "randomhorizontalflip": 8, "randomhu": 8, "randomjpegqu": 8, "randomli": 8, "randomres": [], "randomrot": 8, "randomsatur": 8, "randomshadow": 8, "rang": 8, "rassi": 13, "ratio": [7, 8, 16], "raw": [6, 9], "re": 15, "read": [4, 5, 7], "read_html": 6, "read_img": [], "read_img_as_numpi": 6, "read_img_as_tensor": 6, "read_pdf": 6, "readi": 15, "real": [4, 7, 8], "reason": [1, 4, 5], "rebuild": 2, "rebuilt": 2, "recal": [9, 16], "receipt": [4, 5, 16], "reco_arch": [7, 11, 13, 15], "reco_b": [], "reco_model": [11, 13], "reco_param": 11, "reco_predictor": 11, 
"recogn": 16, "recognit": [5, 9, 11], "recognition_predictor": [7, 16], "recognition_task": [5, 14], "recognitiondataset": [5, 14], "recognitionpredictor": [7, 11], "rectangular": 7, "recurr": [], "red": 9, "reduc": [3, 8], "refer": [2, 3, 11, 13, 14, 16], "regardless": 1, "region": 16, "regroup": 9, "regular": 14, "reject": 1, "rel": [6, 8, 9, 16], "relat": 6, "releas": [0, 3], "relev": [], "religion": 1, "relu": [], "remov": 1, "render": [6, 16], "render_pdf_topil": [], "render_to": [], "reorder": [], "repo": 7, "repo_id": [7, 13], "report": 1, "repositori": [5, 7, 13], "repres": [1, 9, 15, 16], "represent": [4, 7], "representative_dataset": [], "request": [1, 13], "requir": [3, 8], "research": 4, "residu": 7, "resiz": [8, 16], "resnet": 7, "resnet18": [7, 13], "resnet31": 7, "resnet34": 7, "resnet50": [7, 13], "resolv": 6, "resolve_block": 16, "resolve_lin": 16, "resourc": 14, "respect": 1, "respons": 9, "rest": [2, 8, 9], "restrict": 12, "result": [2, 5, 6, 10, 13, 16], "resum": [], "return": 16, "reusabl": 16, "review": 1, "rgb": [6, 8], "rgb_mode": 6, "rgb_output": 6, "right": [1, 7, 9], "road": [], "robust": [4, 5], "root": 5, "rotat": [5, 6, 7, 8, 9, 14, 16], "rotated_bbox": [], "run": [2, 3, 7], "same": [2, 6, 9, 14, 16], "sampl": [5, 14, 16], "sample_transform": 5, "sane": [], "sar": [4, 7], "sar_resnet31": [7, 16], "sar_vgg16_bn": [], "satur": 8, "save": [7, 14], "saved_model": [], "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "scheme": [], "score": 9, "scratch": [], "script": [2, 14], "seamless": 4, "seamlessli": [4, 16], "search": 7, "searchabl": 10, "sec": 16, "second": 16, "section": [11, 13, 15, 16], "secur": [1, 12], "see": [1, 2], "seemlessli": [], "seen": 16, "segment": [4, 7, 16], "self": 16, "semant": [4, 7], "send": 16, "sens": 9, "sensit": 14, "separ": 16, "sequenc": [4, 5, 6, 7, 9, 16], "sequenti": [8, 16], "seri": 1, "serial": [], "serialized_model": [], "seriou": 1, "set": [1, 3, 5, 7, 9, 12, 16], "set_global_polici": 15, "sever": [6, 8, 16], "sex": 1, "sexual": 1, "sha256": [], "shade": 8, "shape": [4, 6, 7, 8, 9, 16], "share": [12, 14], "shift": 8, "shm": 12, "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9, 11, 13], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 14, "simpl": [4, 7], "simpler": 7, "sinc": [5, 14], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 16], "skew": 16, "slack": 2, "slightli": 7, "small": [2, 7], "smallest": 6, "snapshot_download": 7, "snippet": 16, "so": [2, 3, 5, 7, 13, 14], "social": 1, "socio": 1, "some": [3, 10, 13, 14], "someth": 2, "somewher": 2, "soon": 15, "sort": 1, "sourc": [5, 6, 7, 8, 9, 13], "space": [1, 16], "span": 16, "spanish": 5, "spatial": [4, 5, 6, 9], "special": [], "specif": [2, 3, 9, 11, 14, 16], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": 2, "sroie": [4, 5, 14], "stabl": 3, "stackoverflow": 2, "stage": 4, "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": [8, 11], "step": 12, "still": 16, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 14, 16], "straighten": [], "straighten_pag": 7, "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": 3, "strictli": 9, "string": [5, 6, 9, 16], "strive": 3, "strong": [4, 7], "structur": [15, 16], "style": [], "subset": [5, 16], "suggest": [2, 13], "sum": 9, "summari": 9, "support": [15, 16], "supported_op": [], "supported_typ": [], "sustain": 1, "svhn": [4, 5, 14], "svt": [5, 14], "swedish": 5, "symbol": [], "symmetr": [7, 8, 16], "symmetric_pad": [7, 8, 16], 
"synthes": 9, "synthesize_pag": 9, "synthet": 4, "synthtext": [4, 5, 14], "system": 16, "t": [2, 5, 11, 16], "tabl": 13, "take": [1, 5, 16], "target": [5, 6, 8, 9, 14], "target_s": 5, "target_spec": [], "task": [4, 5, 7, 13, 14, 16], "task2": 5, "tax": [], "team": 3, "techminde": 3, "templat": [2, 4], "tensor": [5, 6, 8, 16], "tensorflow": [3, 4, 6, 7, 8, 11, 13, 15, 16], "tensorspec": 15, "term": 1, "test": [5, 14], "test_set": 5, "text": [5, 6, 7, 9, 14], "text_output": 16, "textmatch": 9, "textnet": 7, "textnet_bas": 7, "textnet_smal": 7, "textnet_tini": 7, "textract": [4, 16], "textstylebrush": [4, 5], "textual": [4, 5, 6, 7, 16], "tf": [3, 6, 7, 8, 13, 15], "tf_model": [], "tflite": [], "tflite_builtins_int8": [], "tfliteconvert": [], "than": [2, 3, 9, 13], "thank": 2, "thei": [1, 9], "them": [3, 5, 16], "thi": [1, 2, 3, 5, 9, 11, 12, 13, 14, 15, 16], "thing": [15, 16], "third": 3, "those": [1, 3, 6, 16], "threaten": 1, "threshold": 16, "through": [1, 8, 14], "tilman": 13, "time": [1, 4, 7, 9, 14], "tini": 7, "titl": [6, 16], "tm": 16, "tmp": 12, "togeth": [2, 6], "tograi": 8, "tool": 14, "top": [9, 16], "topic": 2, "torch": [3, 8, 11, 13, 15], "torchvis": 8, "total": 11, "toward": [1, 3], "train": [2, 5, 7, 8, 13, 14, 15, 16], "train_it": [5, 14], "train_load": [5, 14], "train_pytorch": 13, "train_set": [5, 14], "train_tensorflow": 13, "trainabl": [4, 7], "tranform": 8, "transcrib": 16, "transfer": [4, 5], "transfo": 8, "transform": [4, 5, 7], "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16], "truth": 9, "tune": 15, "tupl": [5, 6, 8, 9], "turn": [], "two": [6, 12], "txt": 5, "type": [6, 9, 13, 15, 16], "typic": 16, "u": [1, 2], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 16], "ukrainian": [], "unaccept": 1, "underli": [14, 16], "underneath": 6, "understand": [4, 5, 16], "unfortun": [], "unidecod": 9, "uniform": [7, 8], "uniformli": 8, "uninterrupt": [6, 16], "union": 9, "unittest": 2, "unlock": 6, "unoffici": 7, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 16], "updat": 9, "upgrad": 2, "upper": [5, 8], "uppercas": 14, "url": 6, "us": [1, 2, 3, 5, 7, 9, 11, 12, 13, 16], "usabl": 16, "usag": [12, 15], "use_broadcast": 9, "use_polygon": [5, 9, 14], "useabl": 16, "user": [3, 4, 6, 10], "utf": 16, "util": 15, "v0": [], "v1": 13, "v3": [7, 13, 16], "valid": 14, "valu": [2, 6, 8, 16], "valuabl": 4, "variabl": 12, "varieti": 5, "variou": [], "veri": 7, "verifi": [], "version": [1, 2, 3, 15, 16], "vgg": 7, "vgg16": 13, "vgg16_bn": [], "vgg16_bn_r": 7, "via": 1, "vietnames": 5, "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 5, 7], "visiondataset": 5, "visiontransform": 7, "visual": 4, "visualize_pag": 9, "vit_": 7, "vit_b": 7, "vitstr": [4, 7, 15], "vitstr_bas": [7, 16], "vitstr_smal": [7, 11, 15, 16], "viz": [], "vocab": [11, 13, 14, 16], "vocabulari": [5, 11, 13], "w": [6, 7, 8, 9], "w3": 16, "wa": 1, "wai": [1, 4, 14], "want": [2, 15, 16], "warm": [], "warmup": 16, "wasn": 2, "we": [1, 2, 3, 4, 6, 8, 13, 14, 15, 16], "weasyprint": 6, "web": [2, 6], "websit": 5, "weight": 11, "welcom": 1, "well": [1, 15], "were": [1, 6, 16], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9], "whether": [2, 5, 6, 8, 9, 14, 16], "which": [1, 7, 12, 14, 16], "whichev": 3, "while": [8, 16], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [4, 5, 14], "window": [3, 7, 9], "wish": 2, "within": 1, "without": [1, 5, 7], "wonder": 2, "word": [4, 5, 7, 9, 16], "word_1_1": 16, "word_1_2": 16, "word_1_3": 16, "wordgener": [5, 
14], "words_onli": 9, "work": [12, 16], "worker": 5, "workflow": 2, "worklow": 2, "world": [9, 16], "worth": 7, "wrap": 16, "wrapper": [5, 8], "write": 12, "written": [1, 6], "www": [1, 6, 16], "x": [6, 8, 9], "x12larg": [], "x_ascend": 16, "x_descend": 16, "x_i": 9, "x_size": 16, "x_wconf": 16, "xeon": [], "xhtml": 16, "xmax": 6, "xmin": 6, "xml": 16, "xml_bytes_str": 16, "xml_element": 16, "xml_output": 16, "xmln": 16, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "yield": [], "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12, 13, 14, 15, 16], "your": [2, 4, 6, 9, 16], "yoursit": 6, "zero": [8, 9], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 5, "\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 5, "\u00e4\u00f6\u00e4\u00f6": 5, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 5, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 5, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 5, 
"\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], "\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": 0, "8": 0, "9": [], "advanc": 16, "annot": [], "approach": 16, "architectur": 16, "arg": [5, 6, 7, 8, 9], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [14, 16], "aw": 12, "backbon": [], "ban": 1, "block": 6, "bug": 2, "build": [], "changelog": 0, "choos": [14, 
16], "classif": [7, 13], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 13, "compos": 8, "compress": [], "conda": 3, "conduct": 1, "connect": 2, "content": [], "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 13, "correct": 1, "coven": 1, "custom": [5, 11], "data": 14, "dataload": 5, "dataset": [4, 5, 14], "detect": [4, 7, 13, 14, 16], "develop": 2, "do": 16, "docstr": [], "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 15], "document": [2, 4, 6], "end": 16, "enforc": 1, "evalu": 9, "export": 15, "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "format": [], "from": 13, "gener": [5, 14], "get": [], "git": 3, "guidelin": 1, "half": 15, "hub": 13, "huggingfac": 13, "i": 16, "implement": [], "import": [], "infer": 15, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 12, "let": 2, "line": 6, "lint": [], "linux": 3, "lite": [], "load": [11, 13, 14], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 11, 13, 15, 16], "modifi": 2, "modul": [], "name": 13, "note": [], "notebook": 10, "object": 14, "ocr": 16, "onli": 3, "onnx": 15, "optim": 15, "option": 16, "order": [], "orient": [], "our": 1, "output": 16, "own": [11, 14], "packag": 3, "page": 6, "perman": 1, "pipelin": [], "pledg": 1, "post": [], "pre": [], "precis": 15, "predictor": 16, "prepar": 15, "prerequisit": 3, "pretrain": 13, "privat": [], "process": [], "public": [], "push": 13, "python": 3, "qualiti": 2, "quantiz": [], "question": 2, "read": 6, "readi": 14, "recognit": [4, 7, 13, 14, 16], "refer": [], "report": 2, "request": 2, "respons": 1, "return": [5, 6, 7, 9], "right": 16, "savedmodel": [], "scope": 1, "share": 13, "should": 16, "stage": 16, "standard": 1, "start": [], "structur": [2, 6], "style": 2, "support": [4, 5, 8], "synthet": [5, 14], "task": 9, "temporari": 1, "tensorflow": [], "test": 2, "text": [4, 16], "train": 11, "transform": 8, "two": 16, "type": [], "unit": 2, "us": [14, 15], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 16, "word": 6, "your": [11, 13, 14, 15], "zoo": [4, 7]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1. Correction": [[1, "correction"]], "2. Warning": [[1, "warning"]], "3. Temporary Ban": [[1, "temporary-ban"]], "4. 
Permanent Ban": [[1, "permanent-ban"]], "AWS Lambda": [[12, null]], "Advanced options": [[16, "advanced-options"]], "Args:": [[5, "args"], [5, "id4"], [5, "id7"], [5, "id10"], [5, "id13"], [5, "id16"], [5, "id19"], [5, "id22"], [5, "id25"], [5, "id29"], [5, "id32"], [5, "id37"], [5, "id40"], [5, "id46"], [5, "id49"], [5, "id50"], [5, "id51"], [5, "id54"], [5, "id57"], [5, "id60"], [5, "id61"], [6, "args"], [6, "id2"], [6, "id3"], [6, "id4"], [6, "id5"], [6, "id6"], [6, "id7"], [6, "id10"], [6, "id12"], [6, "id14"], [6, "id16"], [6, "id20"], [6, "id24"], [6, "id28"], [7, "args"], [7, "id3"], [7, "id8"], [7, "id13"], [7, "id17"], [7, "id21"], [7, "id26"], [7, "id31"], [7, "id36"], [7, "id41"], [7, "id45"], [7, "id49"], [7, "id54"], [7, "id58"], [7, "id63"], [7, "id68"], [7, "id72"], [7, "id76"], [7, "id81"], [7, "id86"], [7, "id90"], [7, "id95"], [7, "id100"], [7, "id105"], [7, "id110"], [7, "id114"], [7, "id118"], [7, "id123"], [7, "id128"], [7, "id133"], [7, "id137"], [7, "id141"], [7, "id146"], [7, "id150"], [7, "id154"], [7, "id158"], [7, "id160"], [7, "id162"], [7, "id164"], [8, "args"], [8, "id1"], [8, "id2"], [8, "id3"], [8, "id4"], [8, "id5"], [8, "id6"], [8, "id7"], [8, "id8"], [8, "id9"], [8, "id10"], [8, "id11"], [8, "id12"], [8, "id13"], [8, "id14"], [8, "id15"], [8, "id16"], [8, "id17"], [8, "id18"], [9, "args"], [9, "id3"], [9, "id5"], [9, "id6"], [9, "id7"], [9, "id8"], [9, "id9"], [9, "id10"], [9, "id11"]], "Artefact": [[6, "artefact"]], "Attribution": [[1, "attribution"]], "Available Datasets": [[14, "available-datasets"]], "Available architectures": [[16, "available-architectures"], [16, "id1"], [16, "id2"]], "Block": [[6, "block"]], "Changelog": [[0, null]], "Choose a ready to use dataset": [[14, null]], "Choosing the right model": [[16, null]], "Classification": [[13, "classification"]], "Code quality": [[2, "code-quality"]], "Code style verification": [[2, "code-style-verification"]], "Codebase structure": [[2, "codebase-structure"]], "Commits": [[2, "commits"]], "Composing transformations": [[8, "composing-transformations"]], "Continuous Integration": [[2, "continuous-integration"]], "Contributing to docTR": [[2, null]], "Contributor Covenant Code of Conduct": [[1, null]], "Custom dataset loader": [[5, "custom-dataset-loader"]], "Data Loading": [[14, "data-loading"]], "Dataloader": [[5, "dataloader"]], "Detection": [[13, "detection"], [14, "detection"]], "Detection predictors": [[16, "detection-predictors"]], "Developer mode installation": [[2, "developer-mode-installation"]], "Developing docTR": [[2, "developing-doctr"]], "Document": [[6, "document"]], "Document structure": [[6, "document-structure"]], "End-to-End OCR": [[16, "end-to-end-ocr"]], "Enforcement": [[1, "enforcement"]], "Enforcement Guidelines": [[1, "enforcement-guidelines"]], "Enforcement Responsibilities": [[1, "enforcement-responsibilities"]], "Export to ONNX": [[15, "export-to-onnx"]], "Feature requests & bug report": [[2, "feature-requests-bug-report"]], "Feedback": [[2, "feedback"]], "File reading": [[6, "file-reading"]], "Half-precision": [[15, "half-precision"]], "Installation": [[3, null]], "Let\u2019s connect": [[2, "let-s-connect"]], "Line": [[6, "line"]], "Loading from Huggingface Hub": [[13, "loading-from-huggingface-hub"]], "Loading your custom trained model": [[11, "loading-your-custom-trained-model"]], "Main Features": [[4, "main-features"]], "Model optimization": [[15, "model-optimization"]], "Model zoo": [[4, "model-zoo"]], "Modifying the documentation": [[2, 
"modifying-the-documentation"]], "Naming conventions": [[13, "naming-conventions"]], "Object Detection": [[14, "object-detection"]], "Our Pledge": [[1, "our-pledge"]], "Our Standards": [[1, "our-standards"]], "Page": [[6, "page"]], "Preparing your model for inference": [[15, null]], "Prerequisites": [[3, "prerequisites"]], "Pretrained community models": [[13, "pretrained-community-models"]], "Pushing to the Huggingface Hub": [[13, "pushing-to-the-huggingface-hub"]], "Questions": [[2, "questions"]], "Recognition": [[13, "recognition"], [14, "recognition"]], "Recognition predictors": [[16, "recognition-predictors"]], "Returns:": [[5, "returns"], [6, "returns"], [6, "id11"], [6, "id13"], [6, "id15"], [6, "id19"], [6, "id23"], [6, "id27"], [6, "id31"], [7, "returns"], [7, "id6"], [7, "id11"], [7, "id16"], [7, "id20"], [7, "id24"], [7, "id29"], [7, "id34"], [7, "id39"], [7, "id44"], [7, "id48"], [7, "id52"], [7, "id57"], [7, "id61"], [7, "id66"], [7, "id71"], [7, "id75"], [7, "id79"], [7, "id84"], [7, "id89"], [7, "id93"], [7, "id98"], [7, "id103"], [7, "id108"], [7, "id113"], [7, "id117"], [7, "id121"], [7, "id126"], [7, "id131"], [7, "id136"], [7, "id140"], [7, "id144"], [7, "id149"], [7, "id153"], [7, "id157"], [7, "id159"], [7, "id161"], [7, "id163"], [9, "returns"], [9, "id4"]], "Scope": [[1, "scope"]], "Share your model with the community": [[13, null]], "Supported Vocabs": [[5, "supported-vocabs"]], "Supported datasets": [[4, "supported-datasets"]], "Supported transformations": [[8, "supported-transformations"]], "Synthetic dataset generator": [[5, "synthetic-dataset-generator"], [14, "synthetic-dataset-generator"]], "Task evaluation": [[9, "task-evaluation"]], "Text Detection": [[16, "text-detection"]], "Text Recognition": [[16, "text-recognition"]], "Text detection models": [[4, "text-detection-models"]], "Text recognition models": [[4, "text-recognition-models"]], "Train your own model": [[11, null]], "Two-stage approaches": [[16, "two-stage-approaches"]], "Unit tests": [[2, "unit-tests"]], "Use your own datasets": [[14, "use-your-own-datasets"]], "Using your ONNX exported model in docTR": [[15, "using-your-onnx-exported-model-in-doctr"]], "Via Conda (Only for Linux)": [[3, "via-conda-only-for-linux"]], "Via Git": [[3, "via-git"]], "Via Python Package": [[3, "via-python-package"]], "Visualization": [[9, "visualization"]], "What should I do with the output?": [[16, "what-should-i-do-with-the-output"]], "Word": [[6, "word"]], "docTR Notebooks": [[10, null]], "docTR Vocabs": [[5, "id62"]], "docTR: Document Text Recognition": [[4, null]], "doctr.datasets": [[5, null], [5, "datasets"]], "doctr.io": [[6, null]], "doctr.models": [[7, null]], "doctr.models.classification": [[7, "doctr-models-classification"]], "doctr.models.detection": [[7, "doctr-models-detection"]], "doctr.models.factory": [[7, "doctr-models-factory"]], "doctr.models.recognition": [[7, "doctr-models-recognition"]], "doctr.models.zoo": [[7, "doctr-models-zoo"]], "doctr.transforms": [[8, null]], "doctr.utils": [[9, null]], "v0.1.0 (2021-03-05)": [[0, "v0-1-0-2021-03-05"]], "v0.1.1 (2021-03-18)": [[0, "v0-1-1-2021-03-18"]], "v0.2.0 (2021-05-11)": [[0, "v0-2-0-2021-05-11"]], "v0.2.1 (2021-05-28)": [[0, "v0-2-1-2021-05-28"]], "v0.3.0 (2021-07-02)": [[0, "v0-3-0-2021-07-02"]], "v0.3.1 (2021-08-27)": [[0, "v0-3-1-2021-08-27"]], "v0.4.0 (2021-10-01)": [[0, "v0-4-0-2021-10-01"]], "v0.4.1 (2021-11-22)": [[0, "v0-4-1-2021-11-22"]], "v0.5.0 (2021-12-31)": [[0, "v0-5-0-2021-12-31"]], "v0.5.1 (2022-03-22)": [[0, "v0-5-1-2022-03-22"]], 
"v0.6.0 (2022-09-29)": [[0, "v0-6-0-2022-09-29"]], "v0.7.0 (2023-09-09)": [[0, "v0-7-0-2023-09-09"]], "v0.8.0 (2024-02-28)": [[0, "v0-8-0-2024-02-28"]]}, "docnames": ["changelog", "contributing/code_of_conduct", "contributing/contributing", "getting_started/installing", "index", "modules/datasets", "modules/io", "modules/models", "modules/transforms", "modules/utils", "notebooks", "using_doctr/custom_models_training", "using_doctr/running_on_aws", "using_doctr/sharing_models", "using_doctr/using_datasets", "using_doctr/using_model_export", "using_doctr/using_models"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.viewcode": 1}, "filenames": ["changelog.rst", "contributing/code_of_conduct.md", "contributing/contributing.md", "getting_started/installing.rst", "index.rst", "modules/datasets.rst", "modules/io.rst", "modules/models.rst", "modules/transforms.rst", "modules/utils.rst", "notebooks.rst", "using_doctr/custom_models_training.rst", "using_doctr/running_on_aws.rst", "using_doctr/sharing_models.rst", "using_doctr/using_datasets.rst", "using_doctr/using_model_export.rst", "using_doctr/using_models.rst"], "indexentries": {"artefact (class in doctr.io)": [[6, "doctr.io.Artefact", false]], "block (class in doctr.io)": [[6, "doctr.io.Block", false]], "channelshuffle (class in doctr.transforms)": [[8, "doctr.transforms.ChannelShuffle", false]], "charactergenerator (class in doctr.datasets)": [[5, "doctr.datasets.CharacterGenerator", false]], "colorinversion (class in doctr.transforms)": [[8, "doctr.transforms.ColorInversion", false]], "compose (class in doctr.transforms)": [[8, "doctr.transforms.Compose", false]], "cord (class in doctr.datasets)": [[5, "doctr.datasets.CORD", false]], "crnn_mobilenet_v3_large() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_large", false]], "crnn_mobilenet_v3_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_mobilenet_v3_small", false]], "crnn_vgg16_bn() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.crnn_vgg16_bn", false]], "crop_orientation_predictor() (in module doctr.models.classification)": [[7, "doctr.models.classification.crop_orientation_predictor", false]], "dataloader (class in doctr.datasets.loader)": [[5, "doctr.datasets.loader.DataLoader", false]], "db_mobilenet_v3_large() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_mobilenet_v3_large", false]], "db_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.db_resnet50", false]], "decode_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.decode_img_as_tensor", false]], "detection_predictor() (in module doctr.models.detection)": [[7, "doctr.models.detection.detection_predictor", false]], "detectiondataset (class in doctr.datasets)": [[5, "doctr.datasets.DetectionDataset", false]], "detectionmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.DetectionMetric", false]], "docartefacts (class in doctr.datasets)": [[5, "doctr.datasets.DocArtefacts", false]], "document (class in doctr.io)": [[6, "doctr.io.Document", false]], "documentfile (class in doctr.io)": [[6, "doctr.io.DocumentFile", false]], "encode_sequences() (in module doctr.datasets)": [[5, 
"doctr.datasets.encode_sequences", false]], "fast_base() (in module doctr.models.detection)": [[7, "doctr.models.detection.fast_base", false]], "fast_small() (in module doctr.models.detection)": [[7, "doctr.models.detection.fast_small", false]], "fast_tiny() (in module doctr.models.detection)": [[7, "doctr.models.detection.fast_tiny", false]], "from_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.from_hub", false]], "from_images() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_images", false]], "from_pdf() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_pdf", false]], "from_url() (doctr.io.documentfile class method)": [[6, "doctr.io.DocumentFile.from_url", false]], "funsd (class in doctr.datasets)": [[5, "doctr.datasets.FUNSD", false]], "gaussianblur (class in doctr.transforms)": [[8, "doctr.transforms.GaussianBlur", false]], "gaussiannoise (class in doctr.transforms)": [[8, "doctr.transforms.GaussianNoise", false]], "ic03 (class in doctr.datasets)": [[5, "doctr.datasets.IC03", false]], "ic13 (class in doctr.datasets)": [[5, "doctr.datasets.IC13", false]], "iiit5k (class in doctr.datasets)": [[5, "doctr.datasets.IIIT5K", false]], "iiithws (class in doctr.datasets)": [[5, "doctr.datasets.IIITHWS", false]], "imgur5k (class in doctr.datasets)": [[5, "doctr.datasets.IMGUR5K", false]], "kie_predictor() (in module doctr.models)": [[7, "doctr.models.kie_predictor", false]], "lambdatransformation (class in doctr.transforms)": [[8, "doctr.transforms.LambdaTransformation", false]], "line (class in doctr.io)": [[6, "doctr.io.Line", false]], "linknet_resnet18() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet18", false]], "linknet_resnet34() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet34", false]], "linknet_resnet50() (in module doctr.models.detection)": [[7, "doctr.models.detection.linknet_resnet50", false]], "localizationconfusion (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.LocalizationConfusion", false]], "login_to_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.login_to_hub", false]], "magc_resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.magc_resnet31", false]], "master() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.master", false]], "mjsynth (class in doctr.datasets)": [[5, "doctr.datasets.MJSynth", false]], "mobilenet_v3_large() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large", false]], "mobilenet_v3_large_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_large_r", false]], "mobilenet_v3_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small", false]], "mobilenet_v3_small_orientation() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_orientation", false]], "mobilenet_v3_small_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.mobilenet_v3_small_r", false]], "normalize (class in doctr.transforms)": [[8, "doctr.transforms.Normalize", false]], "ocr_predictor() (in module doctr.models)": [[7, "doctr.models.ocr_predictor", false]], "ocrdataset (class in doctr.datasets)": [[5, "doctr.datasets.OCRDataset", false]], "ocrmetric (class in doctr.utils.metrics)": [[9, "doctr.utils.metrics.OCRMetric", false]], "oneof (class in doctr.transforms)": [[8, 
"doctr.transforms.OneOf", false]], "page (class in doctr.io)": [[6, "doctr.io.Page", false]], "parseq() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.parseq", false]], "push_to_hf_hub() (in module doctr.models.factory)": [[7, "doctr.models.factory.push_to_hf_hub", false]], "randomapply (class in doctr.transforms)": [[8, "doctr.transforms.RandomApply", false]], "randombrightness (class in doctr.transforms)": [[8, "doctr.transforms.RandomBrightness", false]], "randomcontrast (class in doctr.transforms)": [[8, "doctr.transforms.RandomContrast", false]], "randomcrop (class in doctr.transforms)": [[8, "doctr.transforms.RandomCrop", false]], "randomgamma (class in doctr.transforms)": [[8, "doctr.transforms.RandomGamma", false]], "randomhorizontalflip (class in doctr.transforms)": [[8, "doctr.transforms.RandomHorizontalFlip", false]], "randomhue (class in doctr.transforms)": [[8, "doctr.transforms.RandomHue", false]], "randomjpegquality (class in doctr.transforms)": [[8, "doctr.transforms.RandomJpegQuality", false]], "randomrotate (class in doctr.transforms)": [[8, "doctr.transforms.RandomRotate", false]], "randomsaturation (class in doctr.transforms)": [[8, "doctr.transforms.RandomSaturation", false]], "randomshadow (class in doctr.transforms)": [[8, "doctr.transforms.RandomShadow", false]], "read_html() (in module doctr.io)": [[6, "doctr.io.read_html", false]], "read_img_as_numpy() (in module doctr.io)": [[6, "doctr.io.read_img_as_numpy", false]], "read_img_as_tensor() (in module doctr.io)": [[6, "doctr.io.read_img_as_tensor", false]], "read_pdf() (in module doctr.io)": [[6, "doctr.io.read_pdf", false]], "recognition_predictor() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.recognition_predictor", false]], "recognitiondataset (class in doctr.datasets)": [[5, "doctr.datasets.RecognitionDataset", false]], "resize (class in doctr.transforms)": [[8, "doctr.transforms.Resize", false]], "resnet18() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet18", false]], "resnet31() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet31", false]], "resnet34() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet34", false]], "resnet50() (in module doctr.models.classification)": [[7, "doctr.models.classification.resnet50", false]], "sar_resnet31() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.sar_resnet31", false]], "show() (doctr.io.document method)": [[6, "doctr.io.Document.show", false]], "show() (doctr.io.page method)": [[6, "doctr.io.Page.show", false]], "sroie (class in doctr.datasets)": [[5, "doctr.datasets.SROIE", false]], "summary() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.summary", false]], "summary() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.summary", false]], "summary() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.summary", false]], "summary() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.summary", false]], "svhn (class in doctr.datasets)": [[5, "doctr.datasets.SVHN", false]], "svt (class in doctr.datasets)": [[5, "doctr.datasets.SVT", false]], "synthesize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.synthesize_page", false]], "synthtext (class in doctr.datasets)": [[5, "doctr.datasets.SynthText", false]], "textmatch (class in 
doctr.utils.metrics)": [[9, "doctr.utils.metrics.TextMatch", false]], "textnet_base() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_base", false]], "textnet_small() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_small", false]], "textnet_tiny() (in module doctr.models.classification)": [[7, "doctr.models.classification.textnet_tiny", false]], "togray (class in doctr.transforms)": [[8, "doctr.transforms.ToGray", false]], "update() (doctr.utils.metrics.detectionmetric method)": [[9, "doctr.utils.metrics.DetectionMetric.update", false]], "update() (doctr.utils.metrics.localizationconfusion method)": [[9, "doctr.utils.metrics.LocalizationConfusion.update", false]], "update() (doctr.utils.metrics.ocrmetric method)": [[9, "doctr.utils.metrics.OCRMetric.update", false]], "update() (doctr.utils.metrics.textmatch method)": [[9, "doctr.utils.metrics.TextMatch.update", false]], "vgg16_bn_r() (in module doctr.models.classification)": [[7, "doctr.models.classification.vgg16_bn_r", false]], "visualize_page() (in module doctr.utils.visualization)": [[9, "doctr.utils.visualization.visualize_page", false]], "vit_b() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_b", false]], "vit_s() (in module doctr.models.classification)": [[7, "doctr.models.classification.vit_s", false]], "vitstr_base() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_base", false]], "vitstr_small() (in module doctr.models.recognition)": [[7, "doctr.models.recognition.vitstr_small", false]], "wildreceipt (class in doctr.datasets)": [[5, "doctr.datasets.WILDRECEIPT", false]], "word (class in doctr.io)": [[6, "doctr.io.Word", false]], "wordgenerator (class in doctr.datasets)": [[5, "doctr.datasets.WordGenerator", false]]}, "objects": {"doctr.datasets": [[5, 0, 1, "", "CORD"], [5, 0, 1, "", "CharacterGenerator"], [5, 0, 1, "", "DetectionDataset"], [5, 0, 1, "", "DocArtefacts"], [5, 0, 1, "", "FUNSD"], [5, 0, 1, "", "IC03"], [5, 0, 1, "", "IC13"], [5, 0, 1, "", "IIIT5K"], [5, 0, 1, "", "IIITHWS"], [5, 0, 1, "", "IMGUR5K"], [5, 0, 1, "", "MJSynth"], [5, 0, 1, "", "OCRDataset"], [5, 0, 1, "", "RecognitionDataset"], [5, 0, 1, "", "SROIE"], [5, 0, 1, "", "SVHN"], [5, 0, 1, "", "SVT"], [5, 0, 1, "", "SynthText"], [5, 0, 1, "", "WILDRECEIPT"], [5, 0, 1, "", "WordGenerator"], [5, 1, 1, "", "encode_sequences"]], "doctr.datasets.loader": [[5, 0, 1, "", "DataLoader"]], "doctr.io": [[6, 0, 1, "", "Artefact"], [6, 0, 1, "", "Block"], [6, 0, 1, "", "Document"], [6, 0, 1, "", "DocumentFile"], [6, 0, 1, "", "Line"], [6, 0, 1, "", "Page"], [6, 0, 1, "", "Word"], [6, 1, 1, "", "decode_img_as_tensor"], [6, 1, 1, "", "read_html"], [6, 1, 1, "", "read_img_as_numpy"], [6, 1, 1, "", "read_img_as_tensor"], [6, 1, 1, "", "read_pdf"]], "doctr.io.Document": [[6, 2, 1, "", "show"]], "doctr.io.DocumentFile": [[6, 2, 1, "", "from_images"], [6, 2, 1, "", "from_pdf"], [6, 2, 1, "", "from_url"]], "doctr.io.Page": [[6, 2, 1, "", "show"]], "doctr.models": [[7, 1, 1, "", "kie_predictor"], [7, 1, 1, "", "ocr_predictor"]], "doctr.models.classification": [[7, 1, 1, "", "crop_orientation_predictor"], [7, 1, 1, "", "magc_resnet31"], [7, 1, 1, "", "mobilenet_v3_large"], [7, 1, 1, "", "mobilenet_v3_large_r"], [7, 1, 1, "", "mobilenet_v3_small"], [7, 1, 1, "", "mobilenet_v3_small_orientation"], [7, 1, 1, "", "mobilenet_v3_small_r"], [7, 1, 1, "", "resnet18"], [7, 1, 1, "", "resnet31"], [7, 1, 1, "", "resnet34"], [7, 1, 1, "", "resnet50"], [7, 1, 1, 
"", "textnet_base"], [7, 1, 1, "", "textnet_small"], [7, 1, 1, "", "textnet_tiny"], [7, 1, 1, "", "vgg16_bn_r"], [7, 1, 1, "", "vit_b"], [7, 1, 1, "", "vit_s"]], "doctr.models.detection": [[7, 1, 1, "", "db_mobilenet_v3_large"], [7, 1, 1, "", "db_resnet50"], [7, 1, 1, "", "detection_predictor"], [7, 1, 1, "", "fast_base"], [7, 1, 1, "", "fast_small"], [7, 1, 1, "", "fast_tiny"], [7, 1, 1, "", "linknet_resnet18"], [7, 1, 1, "", "linknet_resnet34"], [7, 1, 1, "", "linknet_resnet50"]], "doctr.models.factory": [[7, 1, 1, "", "from_hub"], [7, 1, 1, "", "login_to_hub"], [7, 1, 1, "", "push_to_hf_hub"]], "doctr.models.recognition": [[7, 1, 1, "", "crnn_mobilenet_v3_large"], [7, 1, 1, "", "crnn_mobilenet_v3_small"], [7, 1, 1, "", "crnn_vgg16_bn"], [7, 1, 1, "", "master"], [7, 1, 1, "", "parseq"], [7, 1, 1, "", "recognition_predictor"], [7, 1, 1, "", "sar_resnet31"], [7, 1, 1, "", "vitstr_base"], [7, 1, 1, "", "vitstr_small"]], "doctr.transforms": [[8, 0, 1, "", "ChannelShuffle"], [8, 0, 1, "", "ColorInversion"], [8, 0, 1, "", "Compose"], [8, 0, 1, "", "GaussianBlur"], [8, 0, 1, "", "GaussianNoise"], [8, 0, 1, "", "LambdaTransformation"], [8, 0, 1, "", "Normalize"], [8, 0, 1, "", "OneOf"], [8, 0, 1, "", "RandomApply"], [8, 0, 1, "", "RandomBrightness"], [8, 0, 1, "", "RandomContrast"], [8, 0, 1, "", "RandomCrop"], [8, 0, 1, "", "RandomGamma"], [8, 0, 1, "", "RandomHorizontalFlip"], [8, 0, 1, "", "RandomHue"], [8, 0, 1, "", "RandomJpegQuality"], [8, 0, 1, "", "RandomRotate"], [8, 0, 1, "", "RandomSaturation"], [8, 0, 1, "", "RandomShadow"], [8, 0, 1, "", "Resize"], [8, 0, 1, "", "ToGray"]], "doctr.utils.metrics": [[9, 0, 1, "", "DetectionMetric"], [9, 0, 1, "", "LocalizationConfusion"], [9, 0, 1, "", "OCRMetric"], [9, 0, 1, "", "TextMatch"]], "doctr.utils.metrics.DetectionMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.LocalizationConfusion": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.OCRMetric": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.metrics.TextMatch": [[9, 2, 1, "", "summary"], [9, 2, 1, "", "update"]], "doctr.utils.visualization": [[9, 1, 1, "", "synthesize_page"], [9, 1, 1, "", "visualize_page"]]}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "function", "Python function"], "2": ["py", "method", "Python method"]}, "objtypes": {"0": "py:class", "1": "py:function", "2": "py:method"}, "terms": {"": [1, 6, 7, 9, 13], "0": [1, 3, 5, 8, 9, 11, 14, 16], "00": 16, "01": 16, "0123456789": 5, "0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "0123456789\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "02": [], "02562": 7, "03": 16, "035": 16, "0361328125": 16, "04": [], "05": 16, "06": 16, "06640625": 16, "07": 16, "08": [8, 16], "09": 16, "0966796875": 16, "1": [3, 5, 6, 7, 8, 9, 11, 14, 16], "10": [5, 9, 16], "100": [5, 8, 9, 14, 16], "1000": 16, "101": 5, "1024": [7, 9, 11, 16], "104": 5, "106": 5, "108": 5, "1095": 14, "11": 16, "110": 9, "1107": 14, "114": 5, "115": [], "1156": 14, "116": 5, "118": 5, "11800h": 16, "11th": 16, "12": [3, 16], "120": 5, "123": 5, "126": 5, "1268": 14, "128": [7, 11, 15, 16], "13": [9, 16], "130": 5, "13068": 14, "131": 5, "1337891": 14, "1357421875": 16, "1396484375": 16, "14": 16, "1420": 16, "14470v1": 5, "149": 14, "15": 16, "150": [9, 16], "154": [], "1552": 16, "16": [7, 15, 16], "160": [], "1630859375": 16, "1684": 16, "16x16": 7, "17": 16, "1778": 16, "1782": 16, "18": 7, "185546875": 16, "19": [], "1900": 16, 
"1910": 7, "19342": 14, "19370": 14, "195": 5, "19598": 14, "199": 16, "1999": 16, "1m": [], "2": [3, 4, 5, 6, 8, 16], "20": 16, "200": 9, "2000": 14, "2003": [4, 5], "2012": 5, "2013": [4, 5], "2015": 5, "2019": 4, "2021": [], "207901": 14, "21": 16, "2103": 5, "2186": 14, "21888": 14, "22": 16, "224": [7, 8], "225": 8, "22672": 14, "229": [8, 14], "23": 16, "233": 14, "234": 5, "236": [], "24": 16, "246": 14, "249": 14, "25": 16, "2504": 16, "255": [6, 7, 8, 9, 16], "256": 7, "257": 14, "26": 16, "26032": 14, "264": 11, "27": 16, "2700": 14, "2710": 16, "2749": 11, "28": 16, "287": 11, "29": 16, "296": 11, "299": 11, "2d": 16, "3": [3, 4, 6, 7, 8, 9, 15, 16], "30": 16, "300": 14, "3000": 14, "301": 11, "30595": 16, "30ghz": 16, "31": 7, "32": [5, 7, 8, 11, 14, 15, 16], "3232421875": 16, "33": [8, 16], "33402": 14, "33608": 14, "34": [7, 16], "340": 16, "3456": 16, "35": [], "3515625": 16, "36": [], "360": 14, "37": [5, 16], "38": 16, "39": 16, "4": [7, 8, 9, 16], "40": 16, "406": 8, "41": 16, "42": 16, "43": 16, "44": 16, "45": 16, "456": 8, "46": 16, "47": 16, "472": 14, "48": [5, 16], "485": 8, "49": 16, "49377": 14, "5": [5, 8, 9, 16], "50": [7, 14, 16], "51": 16, "51171875": 16, "512": 7, "52": [5, 16], "529": 16, "53": 16, "533": [], "54": 16, "540": 16, "5478515625": 16, "55": 16, "56": 16, "57": 16, "58": 16, "580": 16, "5810546875": 16, "583": 16, "59": 16, "595": [], "597": 16, "5k": [4, 5], "5m": 16, "6": [8, 16], "60": 8, "600": [7, 9, 16], "61": 16, "611": [], "62": 16, "625": [], "626": 14, "629": [], "63": 16, "630": [], "64": [7, 8, 16], "640": [], "641": 16, "647": 14, "65": 16, "66": 16, "660": [], "664": [], "666": [], "67": 16, "672": [], "68": 16, "689": [], "69": 16, "693": 11, "694": 11, "695": 11, "6m": 16, "7": 16, "70": [9, 16], "700": [], "701": [], "702": [], "707470": 14, "71": 16, "7100000": 14, "713": [], "7141797": 14, "7149": 14, "72": 16, "72dpi": 6, "73": 16, "73257": 14, "733": [], "74": 16, "745": [], "75": [8, 16], "753": [], "7581382": 14, "76": 16, "77": 16, "772": 11, "772875": 14, "78": 16, "780": [], "781": [], "783": [], "785": 11, "789": [], "79": 16, "793533": 14, "796": 14, "798": 11, "7m": 16, "8": [3, 7, 8, 16], "80": 16, "800": [7, 9, 14, 16], "81": 16, "817": [], "82": 16, "8275l": [], "83": 16, "830": [], "84": 16, "849": 14, "85": 16, "8564453125": 16, "857": 16, "85875": 14, "86": 16, "860": [], "8603515625": 16, "862": [], "863": [], "87": 16, "8707": 14, "875": [], "88": 16, "89": 16, "8m": [], "9": 16, "90": 16, "90k": 5, "90kdict32px": 5, "91": 16, "913": [], "914085328578949": 16, "917": [], "92": 16, "921": [], "93": 16, "94": [5, 16], "95": [9, 16], "9578408598899841": 16, "96": 16, "97": [], "98": 16, "99": 16, "9949972033500671": 16, "A": [1, 2, 4, 5, 6, 7, 10, 15], "And": [], "As": 2, "Be": 16, "Being": 1, "By": 12, "For": [1, 2, 3, 11, 16], "If": [2, 3, 6, 7, 11, 16], "In": [2, 5, 14], "It": [8, 13, 15], "Its": [4, 7], "No": [1, 16], "Of": 5, "Or": [], "The": [1, 2, 5, 6, 9, 12, 16], "Then": 7, "To": [2, 3, 12, 13, 16], "_": [1, 5, 7], "__call__": 16, "_build": 2, "_i": 9, "ab": 5, "abc": [], "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz": 5, "abdef": [5, 14], "abl": [14, 16], "about": [1, 14, 16], "abov": 16, "abstract": [], "abstractdataset": 5, "abus": 1, "accent": [], "accept": 1, "access": [4, 6, 14, 16], "account": [1, 13], "accur": 16, "accuraci": 9, "achiev": 15, "act": 1, "action": 1, "activ": 4, "ad": [2, 7, 8], "adapt": 1, "add": [8, 9, 13, 16], "add_hook": 16, "add_label": 9, "addit": [2, 3, 6], 
"addition": [2, 16], "address": [1, 6], "adjust": 8, "advanc": 1, "advantag": 15, "advis": 2, "aesthet": [4, 5], "affect": 1, "after": [13, 16], "ag": 1, "again": 7, "aggreg": [9, 14], "aggress": 1, "align": [1, 6], "all": [1, 2, 5, 6, 8, 9, 14, 16], "allow": 1, "along": 16, "alreadi": 2, "also": [1, 7, 13, 14, 16], "alwai": 14, "an": [1, 2, 4, 5, 6, 7, 9, 15, 16], "analysi": 6, "ancient_greek": 5, "angl": [6, 8], "ani": [1, 5, 6, 7, 8, 9, 16], "annot": 5, "anot": 14, "anoth": [3, 7, 11, 14], "answer": 1, "anyascii": [], "anyon": 4, "anyth": [], "api": [2, 4], "apolog": 1, "apologi": 1, "app": 2, "appear": 1, "appli": [1, 5, 8], "applic": [4, 7], "appoint": 1, "appreci": 13, "appropri": [1, 2, 16], "ar": [1, 2, 3, 5, 6, 8, 9, 10, 14, 16], "arab": 5, "arabic_diacrit": 5, "arabic_lett": 5, "arabic_punctu": 5, "arbitrarili": [4, 7], "arch": [7, 13], "architectur": [4, 7, 13], "archiv": [], "area": 16, "arg": [], "argument": [5, 6, 7, 9, 16], "around": 1, "arrai": [6, 8, 9], "art": 4, "artefact": [9, 10, 16], "artefact_typ": 6, "artifici": [4, 5], "arxiv": [5, 7], "as_imag": [], "asarrai": 9, "ascii_lett": 5, "aspect": [4, 7, 8, 16], "assess": 9, "assign": 9, "associ": 6, "assum": 7, "assume_straight_pag": [7, 16], "astyp": [7, 9, 16], "attack": 1, "attend": [4, 7], "attent": [1, 7], "autoclass": [], "autom": 4, "automat": 16, "autoregress": [4, 7], "avail": [1, 4, 8], "averag": [8, 16], "avoid": [1, 3], "aw": [4, 16], "awar": 16, "azur": 16, "b": [7, 9, 16], "b_j": 9, "back": 2, "backbon": 7, "backend": 16, "background": 14, "bangla": [], "bar": [], "bar_cod": 14, "base": [4, 7], "baselin": [4, 7, 16], "batch": [5, 7, 8, 14, 16], "batch_siz": [5, 11, 14, 15], "bblanchon": 3, "bbox": 16, "becaus": 12, "been": [2, 9, 14, 16], "befor": [5, 7, 8, 16], "begin": 9, "behavior": [1, 16], "being": [9, 16], "belong": 16, "benchmark": 16, "best": 1, "beta": [], "better": [10, 16], "between": [8, 9, 16], "bgr": 6, "bilinear": 8, "bin_thresh": 16, "binar": [4, 7, 16], "binari": [6, 15, 16], "bit": 15, "blank": 9, "block": [9, 16], "block_1_1": 16, "blue": 9, "blur": 8, "bmvc": 5, "bn": 13, "bodi": [1, 16], "bool": [5, 6, 7, 8, 9], "boolean": [7, 16], "both": [4, 5, 8, 14, 16], "bottom": [7, 16], "bound": [5, 6, 7, 8, 9, 16], "box": [5, 6, 7, 8, 9, 14, 16], "box_thresh": 16, "brew": 3, "bright": 8, "broadcast": 9, "browser": [2, 4], "build": [2, 3], "built": 2, "byte": [6, 16], "c": [3, 6, 9], "c5": [], "c_j": 9, "cach": [2, 5, 12], "cache_sampl": 5, "cairo": 3, "call": [], "callabl": [5, 8], "can": [2, 3, 11, 12, 13, 14, 16], "capabl": [2, 10, 16], "case": [5, 9], "cf": 16, "cfg": 16, "challeng": 5, "challenge2_test_task12_imag": 5, "challenge2_test_task1_gt": 5, "challenge2_training_task12_imag": 5, "challenge2_training_task1_gt": 5, "chang": 12, "changelog": [], "channel": [1, 2, 6, 8], "channel_prior": 3, "channelshuffl": 8, "charact": [4, 5, 6, 9, 14, 16], "charactergener": [5, 14], "characterist": 1, "charg": 16, "charset": 16, "chart": 6, "check": [2, 13, 16], "checkpoint": 7, "chip": 3, "ci": 2, "clarifi": 1, "clariti": 1, "class": [1, 5, 6, 8, 9, 16], "class_nam": 11, "classif": 14, "classif_mobilenet_v3_smal": 7, "classmethod": 6, "clear": 2, "clone": 3, "close": 2, "co": 13, "code": [4, 6], "codecov": 2, "colab": 10, "collate_fn": 5, "collect": 6, "color": [8, 9], "colorinvers": 8, "column": 6, "com": [1, 3, 6, 7, 13], "combin": 16, "come": 15, "command": 2, "comment": 1, "commit": 1, "common": [1, 8, 9, 15], "commun": 1, "compar": 4, "comparison": [9, 16], "competit": 5, "compil": [10, 16], 
"complaint": 1, "complementari": 9, "complet": 2, "compon": 16, "compos": [5, 16], "comprehens": 16, "comput": [5, 9, 15, 16], "conf_threshold": [], "confid": [6, 9, 16], "config": [3, 7], "configur": 7, "confus": 9, "consecut": [8, 16], "consequ": 1, "consid": [1, 2, 5, 6, 9, 16], "consist": 16, "consolid": [4, 5], "constant": 8, "construct": 1, "consum": 9, "contact": 1, "contain": [5, 14], "content": [5, 6, 9, 16], "context": 7, "contib": [], "continu": 1, "contrast": 8, "contrast_factor": 8, "contrib": [], "contribut": 1, "contributor": 2, "conv_sequ": [], "convers": 6, "convert": [6, 8], "convert_page_to_numpi": [], "convert_to_fp16": [], "convert_to_tflit": [], "convolut": 7, "coordin": [6, 16], "cord": [4, 5, 14, 16], "core": [9, 16], "corner": 16, "correct": 8, "correspond": [3, 6, 16], "could": 1, "counterpart": 9, "cover": 2, "coverag": 2, "cpu": [4, 11], "creat": 13, "crnn": [4, 7, 13], "crnn_mobilenet_v3_larg": [7, 13, 16], "crnn_mobilenet_v3_smal": [7, 15, 16], "crnn_resnet31": [], "crnn_vgg16_bn": [7, 11, 13, 16], "crop": [7, 8, 14, 16], "crop_orient": [], "crop_orientation_predictor": 7, "crop_param": [], "croporientationpredictor": 7, "cuda": 15, "currenc": 5, "current": [2, 16], "custom": [13, 16], "custom_crop_orientation_model": [], "custom_page_orientation_model": [], "customhook": 16, "cvit": 4, "czczup": 7, "czech": 5, "d": [5, 14], "daili": [], "danish": 5, "data": [4, 5, 6, 8, 9, 11, 13], "dataload": 14, "dataset": [7, 11, 16], "dataset_info": 5, "date": [11, 16], "db": 13, "db_crnn_resnet": [], "db_crnn_vgg": [], "db_mobilenet_v3_larg": [7, 13, 16], "db_resnet34": 16, "db_resnet50": [7, 11, 13, 16], "db_resnet50_rot": [], "db_sar_resnet": [], "db_sar_vgg": [], "dbnet": [4, 7], "deal": [], "decis": 1, "decod": 6, "decode_img_as_tensor": 6, "dedic": [], "deem": 1, "deep": [7, 16], "def": 16, "default": [3, 6, 9, 11, 12, 16], "defer": 14, "defin": [9, 15], "deform": [], "degre": 8, "degress": 6, "delet": 2, "delimit": 16, "delta": 8, "demo": [2, 4], "demonstr": 1, "depend": [2, 3, 4], "deploi": 2, "deploy": 4, "derogatori": 1, "describ": [7, 9], "descript": 10, "design": 8, "desir": 6, "det_arch": [7, 11, 13, 15], "det_b": [], "det_model": [11, 13], "det_param": 11, "det_predictor": [11, 16], "detail": [11, 16], "detect": [5, 9, 10, 11], "detect_languag": 7, "detect_orient": 7, "detection_predictor": [7, 16], "detection_task": [], "detectiondataset": [5, 14], "detectionmetr": 9, "detectionpredictor": [7, 11], "detector": [4, 7], "deterior": 7, "determin": 1, "dev": [2, 12], "develop": 3, "developp": [], "deviat": 8, "devic": 15, "dict": [6, 9, 16], "dictionari": [6, 9], "differ": 1, "differenti": [4, 7], "digit": [4, 5, 14], "dimens": [6, 9, 16], "dimension": 8, "direct": 5, "directli": [13, 16], "directori": [2, 12], "disabl": [1, 12, 16], "disable_crop_orient": [], "disable_page_orient": [], "disclaim": 16, "discuss": 2, "disk": [], "disparag": 1, "displai": [6, 9], "display_artefact": 9, "distanc": [], "distribut": 8, "div": 16, "divers": 1, "divid": 6, "do": [2, 3, 7], "doc": [2, 6, 15, 16], "docartefact": [5, 14], "docstr": 2, "doctr": [3, 11, 12, 13, 14, 16], "doctr_cache_dir": 12, "doctr_multiprocessing_dis": 12, "document": [5, 7, 9, 10, 14, 16], "documentbuild": 16, "documentfil": [6, 13], "doesn": [], "don": [11, 16], "done": 8, "download": [5, 14], "downsiz": 7, "draw": [8, 9], "draw_proba": 9, "drop": 5, "drop_last": 5, "dtype": [6, 7, 8, 9, 15], "dual": [4, 5], "dummi": 13, "dummy_img": 16, "dummy_input": 15, "dure": 1, "dutch": 5, "dynam": 5, 
"dynamic_seq_length": 5, "e": [1, 2, 3, 6, 7], "each": [4, 5, 6, 7, 8, 9, 14, 16], "eas": 2, "easi": [4, 9, 13], "easier": [], "easili": [6, 9, 11, 13, 14, 16], "econom": 1, "edit": 1, "educ": 1, "effect": [], "effici": [2, 4, 5, 7], "either": [9, 16], "element": [5, 6, 7, 9, 16], "els": 2, "email": 1, "empathi": 1, "en": 16, "enabl": [5, 6], "enclos": 6, "encod": [4, 5, 6, 7, 16], "encode_sequ": 5, "encount": 2, "encrypt": 6, "end": [4, 5, 7, 9], "english": [5, 14], "enough": [2, 16], "ensur": 2, "entir": [], "entri": 5, "environ": [1, 12], "eo": 5, "equiv": 16, "error": [], "estim": 7, "etc": 6, "ethnic": 1, "evalu": [14, 16], "event": 1, "everyon": 1, "everyth": [2, 16], "exact": [9, 16], "exactmatch": [], "exampl": [1, 2, 4, 5, 7, 13, 16], "exchang": 15, "exclud": [], "execut": 16, "exist": 13, "expand": 8, "expect": [6, 8, 9], "experi": 1, "explan": [1, 16], "explicit": 1, "exploit": [4, 7], "export": [6, 7, 9, 10, 16], "export_as_straight_box": [7, 16], "export_as_xml": 16, "export_model_to_onnx": 15, "express": [1, 8], "extens": 6, "extern": [1, 14], "extra": 3, "extract": [4, 5], "extract_arch": [], "extractor": 7, "f_": 9, "f_a": 9, "factor": 8, "fair": 1, "fairli": 1, "fals": [5, 6, 7, 8, 9, 11, 16], "famili": 9, "faq": 1, "fascan": 13, "fast": [4, 5, 7], "fast_bas": [7, 16], "fast_smal": [7, 16], "fast_tini": [7, 16], "faster": [4, 7, 15], "fasterrcnn_mobilenet_v3_large_fpn": 7, "favorit": 16, "featur": [3, 7, 9, 10], "feed": [], "feedback": 1, "feel": [2, 13], "felix92": 13, "few": [3, 15, 16], "figsiz": 9, "figur": 9, "file": [2, 5], "file_hash": [], "file_nam": [], "final": 7, "find": [2, 3, 14], "fine": [], "finnish": 5, "first": [2, 5], "firsthand": 5, "fit": [7, 16], "fitz": [], "flag": 16, "flexibl": [], "flip": 8, "float": [6, 8, 9, 15], "float32": [6, 7, 8, 15], "fn": 8, "focu": 13, "focus": [1, 5], "folder": 5, "follow": [1, 2, 3, 5, 8, 9, 11, 12, 13, 16], "font": [5, 9], "font_famili": [5, 9], "font_siz": 9, "foral": 9, "forc": 2, "forg": 3, "form": [4, 5, 16], "format": [6, 9, 11, 14, 15, 16], "forpost": [4, 5], "forum": 2, "fp": [], "fp16": 15, "frac": 9, "frame": [], "framework": [3, 13, 14, 16], "free": [1, 2, 13], "french": [5, 11, 13, 16], "friendli": 4, "from": [1, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16], "from_hub": [7, 13], "from_imag": [6, 13], "from_pdf": 6, "from_url": 6, "full": [5, 9, 16], "fulli": [], "function": [5, 8, 9], "funsd": [4, 5, 14, 16], "further": 14, "futur": 5, "g": [6, 7], "g_": 9, "g_x": 9, "gamma": 8, "gaussian": 8, "gaussianblur": 8, "gaussiannois": 8, "gdk": 3, "gen": 16, "gender": 1, "gener": [2, 4, 7], "generic_cyrillic_lett": [], "geometri": [4, 6, 16], "geq": 9, "german": [5, 11, 13], "get": 16, "get_artefact": [], "get_word": [], "gettextword": [], "git": 13, "github": [2, 3, 7, 13], "give": 1, "given": [5, 6, 8, 9, 16], "global": 7, "go": 16, "good": 15, "googl": 2, "googlevis": 4, "gpu": [4, 15], "gracefulli": 1, "graph": [4, 5, 6], "grayscal": 8, "ground": 9, "groung": 9, "group": [4, 16], "gt": 9, "gt_box": 9, "gt_label": 9, "gtk": 3, "guid": 2, "guidanc": 14, "gvision": 16, "h": [6, 7, 8], "h_": 9, "ha": [2, 5, 9, 14], "half": [], "handl": [14, 16], "handwrit": 5, "handwritten": 14, "harass": 1, "hardwar": [], "harm": 1, "hat": 9, "have": [1, 2, 9, 11, 13, 14, 16], "head": [7, 16], "healthi": 1, "hebrew": 5, "height": 6, "hello": [9, 16], "help": 15, "here": [3, 8, 10, 14, 16], "hf": 7, "hf_hub_download": 7, "high": 6, "higher": [3, 5, 16], "hindi": [], "hindi_digit": 5, "hocr": 16, "homebrew": 3, "hook": 16, "horizont": [6, 
8], "hous": 5, "how": [2, 11, 13, 14], "howev": 14, "hsv": 8, "html": [1, 2, 6, 16], "http": [1, 3, 5, 6, 7, 13, 16], "hub": 7, "hue": 8, "huggingfac": 7, "hw": 5, "i": [1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 15], "i7": 16, "ic03": [4, 5, 14], "ic13": [4, 5, 14], "icdar": [4, 5], "icdar2019": 5, "id": 16, "ident": 1, "identifi": 4, "ignor": [], "ignore_acc": [], "ignore_cas": [], "iiit": [4, 5], "iiit5k": [5, 14], "iiithw": [4, 5, 14], "imag": [4, 5, 6, 7, 8, 9, 13, 14, 16], "imagenet": 7, "imageri": 1, "images_90k_norm": 5, "img": [5, 8, 14], "img_cont": 6, "img_fold": [5, 14], "img_path": 6, "img_transform": 5, "imgur5k": [4, 5, 14], "imgur5k_annot": 5, "imlist": 5, "impact": 1, "implement": [5, 6, 7, 8, 9, 16], "import": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16], "improv": 7, "inappropri": 1, "incid": 1, "includ": [1, 3, 5, 14, 15], "inclus": 1, "increas": 8, "independ": [], "index": [2, 6], "indic": 9, "individu": 1, "infer": [4, 7, 8], "inform": [1, 2, 4, 5, 14], "inherit": [], "input": [2, 6, 7, 8, 15, 16], "input_crop": 7, "input_pag": [7, 9, 16], "input_shap": 15, "input_t": [], "input_tensor": 7, "inspir": [1, 8], "instal": 13, "instanc": [1, 16], "instanti": [7, 16], "instead": [5, 6, 7], "insult": 1, "int": [5, 6, 8, 9], "int64": [8, 9], "integ": 9, "integr": [4, 13, 14], "intel": 16, "interact": [1, 6, 9], "interfac": 13, "interoper": 15, "interpol": 8, "interpret": [5, 6], "intersect": 9, "invert": 8, "investig": 1, "invis": 1, "invoic": [], "involv": [1, 16], "io": 13, "iou": 9, "iou_thresh": 9, "iou_threshold": [], "irregular": [4, 7, 14], "isn": 5, "issu": [1, 2, 13], "italian": 5, "iter": [5, 8, 14, 16], "its": [6, 7, 8, 9, 14, 16], "itself": [7, 13], "j": 9, "job": 2, "join": 2, "jpeg": 8, "jpegqual": 8, "jpg": [5, 6, 13], "json": [5, 14, 16], "json_output": 16, "jump": 2, "just": 1, "kei": [4, 5], "kera": [7, 15], "kernel": [4, 7, 8], "kernel_s": [], "kernel_shap": 8, "keywoard": 7, "keyword": [5, 6, 7, 9], "kie": [7, 11], "kie_predictor": [7, 11], "kiepredictor": 7, "kind": 1, "know": 2, "kwarg": [5, 6, 7, 9], "l": 9, "l_j": 9, "label": [5, 8, 9, 14], "label_fil": [5, 14], "label_fold": 5, "label_path": [5, 14], "labels_path": [5, 14], "ladder": 1, "lambda": 8, "lambdatransform": 8, "lang": 16, "languag": [1, 4, 5, 6, 7, 13, 16], "larg": [7, 13], "largest": 9, "last": [3, 5], "latenc": 7, "later": 2, "latest": [3, 16], "latin": 5, "layer": 15, "layout": 16, "lead": 1, "leader": 1, "learn": [1, 4, 7, 15, 16], "least": 3, "left": [9, 16], "legacy_french": 5, "length": [5, 16], "less": [15, 16], "let": [], "letter": [], "level": [1, 5, 9, 16], "levenshtein": [], "leverag": 10, "lf": 13, "libffi": 3, "librari": [2, 3, 10, 11], "light": 4, "lightweight": [], "like": 1, "limits_": 9, "line": [4, 7, 9, 16], "line_1_1": 16, "link": 11, "linknet": [4, 7], "linknet16": [], "linknet_resnet18": [7, 11, 16], "linknet_resnet18_rot": [], "linknet_resnet34": [7, 15, 16], "linknet_resnet50": [7, 16], "linux": [], "list": [5, 6, 8, 9, 13], "ll": 9, "load": [4, 5, 7], "load_state_dict": 11, "load_weight": 11, "loader": [], "loc_pr": 16, "local": [2, 4, 5, 7, 9, 14, 16], "localis": 5, "localizationconfus": 9, "locat": [2, 6, 16], "login": 7, "login_to_hub": [7, 13], "logo": [6, 14], "love": 13, "lower": [8, 9, 16], "m": [2, 9, 16], "m1": 3, "macbook": 3, "machin": 15, "maco": 3, "made": 4, "magc_resnet31": 7, "mai": [1, 2], "mail": 1, "main": 10, "maintain": 4, "mainten": 2, "make": [1, 2, 9, 12, 13, 15, 16], "mani": [14, 16], "manipul": 16, "map": [5, 7], "map_loc": 11, "mask_shap": 9, "master": [4, 
7, 16], "match": [9, 16], "mathcal": 9, "matplotlib": [6, 9], "max": [5, 8, 9], "max_angl": 8, "max_area": 8, "max_char": [5, 14], "max_delta": 8, "max_dist": [], "max_gain": 8, "max_gamma": 8, "max_qual": 8, "max_ratio": 8, "maximum": [5, 8], "maxval": [7, 8], "mbox": 9, "mean": [8, 9, 11], "meaniou": 9, "meant": [6, 15], "measur": 16, "media": 1, "median": 7, "meet": 11, "member": 1, "memori": [9, 12, 15], "mention": 16, "merg": 5, "messag": 2, "meta": 16, "metadata": 15, "metal": 3, "method": [6, 8, 16], "metric": [9, 16], "middl": 16, "might": [15, 16], "min": 8, "min_area": 8, "min_char": [5, 14], "min_gain": 8, "min_gamma": 8, "min_qual": 8, "min_ratio": 8, "min_val": 8, "minde": [1, 3, 4, 7], "minim": [2, 4], "minimalist": [4, 7], "minimum": [3, 5, 8, 9, 16], "minval": 8, "miss": 3, "mistak": 1, "mix": [], "mixed_float16": 15, "mixed_precis": 15, "mjsynth": [4, 5, 14], "mnt": 5, "mobilenet": [7, 13], "mobilenet_v3_larg": 7, "mobilenet_v3_large_r": 7, "mobilenet_v3_smal": 7, "mobilenet_v3_small_crop_orient": [], "mobilenet_v3_small_orient": 7, "mobilenet_v3_small_page_orient": [], "mobilenet_v3_small_r": 7, "mobilenetv3": 7, "modal": [4, 5], "mode": 3, "model": [5, 9, 12, 14], "model_nam": [7, 13, 15], "model_path": 15, "moder": 1, "modif": 2, "modifi": [7, 12, 16], "modul": [6, 7, 8, 9, 16], "moment": [], "more": [2, 9, 14, 16], "most": 16, "mozilla": 1, "multi": [4, 7], "multilingu": [5, 13], "multipl": [5, 6, 8, 16], "multipli": 8, "multiprocess": 12, "my": 7, "my_awesome_model": 13, "my_hook": 16, "n": [5, 9], "na": [], "name": [5, 7, 15, 16], "nation": 1, "natur": [1, 4, 5], "nb": [], "ndarrai": [5, 6, 8, 9], "necessari": [3, 11, 12], "need": [2, 3, 5, 9, 11, 12, 13, 16], "neg": 8, "nest": 16, "nestedobject": [], "network": [4, 5, 7, 15], "neural": [4, 5, 7, 15], "new": [2, 9], "newer": [], "next": [5, 14], "nois": 8, "noisi": [4, 5], "non": [4, 5, 6, 7, 8, 9], "none": [5, 6, 7, 8, 9, 16], "normal": [7, 8], "norwegian": 5, "note": [0, 2, 5, 7, 13, 15], "now": 2, "np": [7, 8, 9, 16], "num_output_channel": 8, "num_sampl": [5, 14], "num_work": 5, "number": [5, 8, 9, 16], "numpi": [6, 7, 9, 16], "o": 3, "obb": [], "obj_detect": 13, "object": [5, 9, 10, 16], "objectness_scor": [], "oblig": 1, "obtain": 16, "occupi": 15, "ocr": [4, 5, 7, 9, 13, 14], "ocr_carea": 16, "ocr_db_crnn": 9, "ocr_lin": 16, "ocr_pag": 16, "ocr_par": 16, "ocr_predictor": [7, 11, 13, 15, 16], "ocrdataset": [5, 14], "ocrmetr": 9, "ocrpredictor": [7, 11], "ocrx_word": 16, "offens": 1, "offici": [1, 7], "offlin": 1, "offset": 8, "onc": 16, "one": [2, 5, 7, 8, 11, 13, 16], "oneof": 8, "ones": [5, 8, 9], "onli": [2, 7, 8, 9, 13, 14, 15, 16], "onlin": 1, "onnx": [], "onnxruntim": [], "onnxtr": [], "opac": 8, "opacity_rang": 8, "open": [1, 2, 13, 15], "opinion": 1, "optic": [4, 16], "optim": 4, "option": [5, 7, 11], "order": [2, 5, 6, 8], "org": [1, 5, 7, 16], "organ": 6, "orient": [1, 6, 7, 16], "orientationpredictor": [], "other": [1, 2], "otherwis": [1, 6, 9], "our": [2, 7, 16], "out": [2, 7, 8, 9, 16], "outpout": 16, "output": [6, 8, 15], "output_s": [6, 8], "outsid": 12, "over": [3, 5, 9, 16], "overal": [1, 7], "overlai": 6, "overview": [], "overwrit": [], "overwritten": 13, "own": 4, "p": [8, 9, 16], "packag": [2, 4, 9, 12, 14], "pad": [5, 7, 8, 16], "page": [3, 5, 7, 9, 16], "page1": 6, "page2": 6, "page_1": 16, "page_idx": [6, 16], "page_orientation_predictor": [], "page_param": [], "pair": 9, "pango": 3, "paper": 7, "par_1_1": 16, "paragraph": 16, "paragraph_break": 16, "param": [8, 16], "paramet": [4, 6, 7, 
15], "pars": [4, 5], "parseq": [4, 7, 13, 16], "part": [5, 8, 16], "parti": 3, "partial": 16, "particip": 1, "pass": [5, 6, 7, 16], "password": 6, "patch": [7, 9], "path": [5, 6, 14], "path_to_checkpoint": 11, "path_to_custom_model": [], "path_to_pt": 11, "pattern": 1, "pdf": [6, 7, 10], "pdfpage": 6, "peopl": 1, "per": [8, 16], "perform": [4, 6, 7, 8, 9, 12, 15, 16], "period": 1, "permiss": 1, "permut": [4, 7], "persian_lett": 5, "person": [1, 14], "phase": 16, "photo": 14, "physic": [1, 6], "pick": 8, "pictur": 6, "pip": [2, 3], "pipelin": 16, "pixbuf": 3, "pixel": [6, 8, 16], "platinum": [], "pleas": 2, "plot": 9, "plt": 9, "plug": 13, "plugin": 3, "png": 6, "point": 15, "polici": 12, "polish": 5, "polit": 1, "polygon": [5, 9, 16], "pool": 7, "portugues": 5, "posit": [1, 9], "possibl": [2, 9, 13, 16], "post": [1, 16], "postprocessor": 16, "potenti": 7, "power": 4, "ppageno": 16, "pre": [2, 7], "precis": [9, 16], "pred": 9, "pred_box": 9, "pred_label": 9, "predefin": 14, "predict": [6, 7, 9, 16], "predictor": [4, 6, 7, 11, 13, 15], "prefer": 14, "preinstal": [], "preprocessor": [11, 16], "prerequisit": 13, "present": 10, "preserv": [7, 8, 16], "preserve_aspect_ratio": [6, 7, 8, 11, 16], "pretrain": [4, 7, 9, 11, 15, 16], "pretrained_backbon": [7, 11], "print": 16, "prior": 5, "privaci": 1, "privat": 1, "probabl": 8, "problem": 2, "procedur": 8, "process": [2, 4, 6, 11, 16], "processor": 16, "produc": [10, 16], "product": 15, "profession": 1, "project": [2, 14], "promptli": 1, "proper": 2, "properli": 5, "properti": [], "provid": [1, 2, 4, 13, 14, 16], "public": [1, 4], "publicli": 16, "publish": 1, "pull": 13, "punctuat": 5, "pure": 5, "purpos": 2, "push_to_hf_hub": [7, 13], "py": 13, "pypdfium2": [3, 6], "pyplot": [6, 9], "python": 2, "python3": 13, "pytorch": [3, 4, 7, 8, 11, 13, 15, 16], "q": 2, "qr": 6, "qr_code": 14, "qualiti": 8, "quantiz": [], "quantize_model": [], "question": 1, "quickli": 4, "quicktour": 10, "r": 16, "race": 1, "ramdisk": 5, "rand": [7, 8, 9, 15, 16], "random": [7, 8, 9, 16], "randomappli": 8, "randombright": 8, "randomcontrast": 8, "randomcrop": 8, "randomgamma": 8, "randomhorizontalflip": 8, "randomhu": 8, "randomjpegqu": 8, "randomli": 8, "randomres": [], "randomrot": 8, "randomsatur": 8, "randomshadow": 8, "rang": 8, "rassi": 13, "ratio": [7, 8, 16], "raw": [6, 9], "re": 15, "read": [4, 5, 7], "read_html": 6, "read_img": [], "read_img_as_numpi": 6, "read_img_as_tensor": 6, "read_pdf": 6, "readi": 15, "real": [4, 7, 8], "reason": [1, 4, 5], "rebuild": 2, "rebuilt": 2, "recal": [9, 16], "receipt": [4, 5, 16], "reco_arch": [7, 11, 13, 15], "reco_b": [], "reco_model": [11, 13], "reco_param": 11, "reco_predictor": 11, "recogn": 16, "recognit": [5, 9, 11], "recognition_predictor": [7, 16], "recognition_task": [5, 14], "recognitiondataset": [5, 14], "recognitionpredictor": [7, 11], "rectangular": 7, "recurr": [], "red": 9, "reduc": [3, 8], "refer": [2, 3, 11, 13, 14, 16], "regardless": 1, "region": 16, "regroup": 9, "regular": 14, "reject": 1, "rel": [6, 8, 9, 16], "relat": 6, "releas": [0, 3], "relev": [], "religion": 1, "relu": [], "remov": 1, "render": [6, 16], "repo": 7, "repo_id": [7, 13], "report": 1, "repositori": [5, 7, 13], "repres": [1, 9, 15, 16], "represent": [4, 7], "request": [1, 13], "requir": [3, 8], "research": 4, "residu": 7, "resiz": [8, 16], "resnet": 7, "resnet18": [7, 13], "resnet31": 7, "resnet34": 7, "resnet50": [7, 13], "resolv": 6, "resolve_block": 16, "resolve_lin": 16, "resourc": 14, "respect": 1, "respons": 9, "rest": [2, 8, 9], 
"restrict": 12, "result": [2, 5, 6, 10, 13, 16], "return": 16, "reusabl": 16, "review": 1, "rgb": [6, 8], "rgb_mode": 6, "rgb_output": 6, "right": [1, 7, 9], "robust": [4, 5], "root": 5, "rotat": [5, 6, 7, 8, 9, 14, 16], "rotated_bbox": [], "run": [2, 3, 7], "same": [2, 6, 9, 14, 16], "sampl": [5, 14, 16], "sample_transform": 5, "sar": [4, 7], "sar_resnet31": [7, 16], "sar_vgg16_bn": [], "satur": 8, "save": [7, 14], "saved_model": [], "scale": [6, 7, 8, 9], "scale_rang": [], "scan": [4, 5], "scene": [4, 5, 7], "scheme": [], "score": 9, "scratch": [], "script": [2, 14], "seamless": 4, "seamlessli": [4, 16], "search": 7, "searchabl": 10, "sec": 16, "second": 16, "section": [11, 13, 15, 16], "secur": [1, 12], "see": [1, 2], "seemlessli": [], "seen": 16, "segment": [4, 7, 16], "self": 16, "semant": [4, 7], "send": 16, "sens": 9, "sensit": 14, "separ": 16, "sequenc": [4, 5, 6, 7, 9, 16], "sequenti": [8, 16], "seri": 1, "serial": [], "serialized_model": [], "seriou": 1, "set": [1, 3, 5, 7, 9, 12, 16], "set_global_polici": 15, "sever": [6, 8, 16], "sex": 1, "sexual": 1, "sha256": [], "shade": 8, "shape": [4, 6, 7, 8, 9, 16], "share": [12, 14], "shift": 8, "shm": 12, "should": [2, 5, 6, 8, 9], "show": [4, 6, 7, 9, 11, 13], "showcas": 2, "shuffl": [5, 8], "side": 9, "signatur": 6, "signific": 14, "simpl": [4, 7], "simpler": 7, "sinc": [5, 14], "singl": [1, 2, 4, 5], "single_img_doc": [], "size": [1, 5, 6, 8, 9, 16], "skew": 16, "slack": 2, "slightli": 7, "small": [2, 7], "smallest": 6, "snapshot_download": 7, "snippet": 16, "so": [2, 3, 5, 7, 13, 14], "social": 1, "socio": 1, "some": [3, 10, 13, 14], "someth": 2, "somewher": 2, "soon": 15, "sort": 1, "sourc": [5, 6, 7, 8, 9, 13], "space": [1, 16], "span": 16, "spanish": 5, "spatial": [4, 5, 6, 9], "special": [], "specif": [2, 3, 9, 11, 14, 16], "specifi": [1, 5, 6], "speed": [4, 7], "sphinx": 2, "sroie": [4, 5, 14], "stabl": 3, "stackoverflow": 2, "stage": 4, "standard": 8, "start": 5, "state": [4, 9], "static": 9, "statist": [], "statu": 1, "std": [8, 11], "step": 12, "still": 16, "str": [5, 6, 7, 8, 9], "straight": [5, 7, 14, 16], "straighten": [], "straighten_pag": 7, "straigten_pag": [], "stream": 6, "street": [4, 5], "strict": 3, "strictli": 9, "string": [5, 6, 9, 16], "strive": 3, "strong": [4, 7], "structur": [15, 16], "subset": [5, 16], "suggest": [2, 13], "sum": 9, "summari": 9, "support": [15, 16], "sustain": 1, "svhn": [4, 5, 14], "svt": [5, 14], "swedish": 5, "symbol": [], "symmetr": [7, 8, 16], "symmetric_pad": [7, 8, 16], "synthes": 9, "synthesize_pag": 9, "synthet": 4, "synthtext": [4, 5, 14], "system": 16, "t": [2, 5, 11, 16], "tabl": 13, "take": [1, 5, 16], "target": [5, 6, 8, 9, 14], "target_s": 5, "task": [4, 5, 7, 13, 14, 16], "task2": 5, "team": 3, "techminde": 3, "templat": [2, 4], "tensor": [5, 6, 8, 16], "tensorflow": [3, 4, 6, 7, 8, 11, 13, 15, 16], "tensorspec": 15, "term": 1, "test": [5, 14], "test_set": 5, "text": [5, 6, 7, 9, 14], "text_output": 16, "textmatch": 9, "textnet": 7, "textnet_bas": 7, "textnet_smal": 7, "textnet_tini": 7, "textract": [4, 16], "textstylebrush": [4, 5], "textual": [4, 5, 6, 7, 16], "tf": [3, 6, 7, 8, 13, 15], "tf_model": [], "tflite": [], "than": [2, 3, 9, 13], "thank": 2, "thei": [1, 9], "them": [3, 5, 16], "thi": [1, 2, 3, 5, 9, 11, 12, 13, 14, 15, 16], "thing": [15, 16], "third": 3, "those": [1, 3, 6, 16], "threaten": 1, "threshold": 16, "through": [1, 8, 14], "tilman": 13, "time": [1, 4, 7, 9, 14], "tini": 7, "titl": [6, 16], "tm": 16, "tmp": 12, "togeth": [2, 6], "tograi": 8, "tool": 14, 
"top": [9, 16], "topic": 2, "torch": [3, 8, 11, 13, 15], "torchvis": 8, "total": 11, "toward": [1, 3], "train": [2, 5, 7, 8, 13, 14, 15, 16], "train_it": [5, 14], "train_load": [5, 14], "train_pytorch": 13, "train_set": [5, 14], "train_tensorflow": 13, "trainabl": [4, 7], "tranform": 8, "transcrib": 16, "transfer": [4, 5], "transfo": 8, "transform": [4, 5, 7], "translat": 1, "troll": 1, "true": [5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16], "truth": 9, "tune": 15, "tupl": [5, 6, 8, 9], "turn": [], "two": [6, 12], "txt": 5, "type": [6, 9, 13, 15, 16], "typic": 16, "u": [1, 2], "ucsd": 5, "udac": 2, "uint8": [6, 7, 9, 16], "ukrainian": [], "unaccept": 1, "underli": [14, 16], "underneath": 6, "understand": [4, 5, 16], "unidecod": 9, "uniform": [7, 8], "uniformli": 8, "uninterrupt": [6, 16], "union": 9, "unittest": 2, "unlock": 6, "unoffici": 7, "unprofession": 1, "unsolicit": 1, "unsupervis": 4, "unwelcom": 1, "up": [7, 16], "updat": 9, "upgrad": 2, "upper": [5, 8], "uppercas": 14, "url": 6, "us": [1, 2, 3, 5, 7, 9, 11, 12, 13, 16], "usabl": 16, "usag": [12, 15], "use_broadcast": 9, "use_polygon": [5, 9, 14], "useabl": 16, "user": [3, 4, 6, 10], "utf": 16, "util": 15, "v0": [], "v1": 13, "v3": [7, 13, 16], "valid": 14, "valu": [2, 6, 8, 16], "valuabl": 4, "variabl": 12, "varieti": 5, "veri": 7, "verifi": [], "version": [1, 2, 3, 15, 16], "vgg": 7, "vgg16": 13, "vgg16_bn_r": 7, "via": 1, "vietnames": 5, "view": [4, 5], "viewpoint": 1, "violat": 1, "visibl": 1, "vision": [4, 5, 7], "visiondataset": 5, "visiontransform": 7, "visual": 4, "visualize_pag": 9, "vit_": 7, "vit_b": 7, "vitstr": [4, 7, 15], "vitstr_bas": [7, 16], "vitstr_smal": [7, 11, 15, 16], "viz": [], "vocab": [11, 13, 14, 16], "vocabulari": [5, 11, 13], "w": [6, 7, 8, 9], "w3": 16, "wa": 1, "wai": [1, 4, 14], "want": [2, 15, 16], "warm": [], "warmup": 16, "wasn": 2, "we": [1, 2, 3, 4, 6, 8, 13, 14, 15, 16], "weasyprint": 6, "web": [2, 6], "websit": 5, "weight": 11, "welcom": 1, "well": [1, 15], "were": [1, 6, 16], "what": 1, "when": [1, 2, 7], "whenev": 2, "where": [2, 6, 8, 9], "whether": [2, 5, 6, 8, 9, 14, 16], "which": [1, 7, 12, 14, 16], "whichev": 3, "while": [8, 16], "why": 1, "width": 6, "wiki": 1, "wildreceipt": [4, 5, 14], "window": [3, 7, 9], "wish": 2, "within": 1, "without": [1, 5, 7], "wonder": 2, "word": [4, 5, 7, 9, 16], "word_1_1": 16, "word_1_2": 16, "word_1_3": 16, "wordgener": [5, 14], "words_onli": 9, "work": [12, 16], "worker": 5, "workflow": 2, "worklow": 2, "world": [9, 16], "worth": 7, "wrap": 16, "wrapper": [5, 8], "write": 12, "written": [1, 6], "www": [1, 6, 16], "x": [6, 8, 9], "x12larg": [], "x_ascend": 16, "x_descend": 16, "x_i": 9, "x_size": 16, "x_wconf": 16, "xeon": [], "xhtml": 16, "xmax": 6, "xmin": 6, "xml": 16, "xml_bytes_str": 16, "xml_element": 16, "xml_output": 16, "xmln": 16, "y": 9, "y_i": 9, "y_j": 9, "yet": [], "ymax": 6, "ymin": 6, "yolov8": [], "you": [2, 3, 5, 6, 7, 11, 12, 13, 14, 15, 16], "your": [2, 4, 6, 9, 16], "yoursit": 6, "zero": [8, 9], "zoo": [], "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7\u00e0\u00e2\u00e9\u00e8\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00e7": 5, "\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7\u00e0\u00e2\u00e9\u00e8\u00ea\u00eb\u00ee\u00ef\u00f4\u00f9\u00fb\u00fc\u00e7": 5, "\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa\u00e0\u00e8\u00e9\u00ec\u00ed\u00ee\u00f2\u00f3\u00f9\u00fa": 5, 
"\u00e1\u00e0\u00e2\u00e3\u00e9\u00ea\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7\u00e1\u00e0\u00e2\u00e3\u00e9\u00eb\u00ed\u00ef\u00f3\u00f4\u00f5\u00fa\u00fc\u00e7": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": 5, "\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5\u00e1\u00e0\u1ea3\u1ea1\u00e3\u0103\u1eaf\u1eb1\u1eb3\u1eb5\u1eb7\u00e2\u1ea5\u1ea7\u1ea9\u1eab\u1ead\u0111\u00e9\u00e8\u1ebb\u1ebd\u1eb9\u00ea\u1ebf\u1ec1\u1ec3\u1ec5\u1ec7\u00f3\u00f2\u1ecf\u00f5\u1ecd\u00f4\u1ed1\u1ed3\u1ed5\u1ed9\u1ed7\u01a1\u1edb\u1edd\u1edf\u1ee3\u1ee1\u00fa\u00f9\u1ee7\u0169\u1ee5\u01b0\u1ee9\u1eeb\u1eed\u1eef\u1ef1i\u00ed\u00ec\u1ec9\u0129\u1ecb\u00fd\u1ef3\u1ef7\u1ef9\u1ef5": [], "\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1\u00e1\u00e9\u00ed\u00f3\u00fa\u00fc\u00f1": 5, "\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e\u00e1\u010d\u010f\u00e9\u011b\u00ed\u0148\u00f3\u0159\u0161\u0165\u00fa\u016f\u00fd\u017e": 5, "\u00e4\u00f6\u00e4\u00f6": 5, "\u00e4\u00f6\u00fc\u00df\u00e4\u00f6\u00fc\u00df": 5, "\u00e5\u00e4\u00f6\u00e5\u00e4\u00f6": 5, "\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5": 5, "\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c\u0105\u0107\u0119\u0142\u0144\u00f3\u015b\u017a\u017c": 5, "\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9\u03b1\u03b2\u03b3\u03b4\u03b5\u03b6\u03b7\u03b8\u03b9\u03ba\u03bb\u03bc\u03bd\u03be\u03bf\u03c0\u03c1\u03c3\u03c4\u03c5\u03c6\u03c7\u03c8\u03c9": 5, "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f": [], "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f0123456789": [], "\u0491\u0456\u0457\u0454\u0491\u0456\u0457\u0454": [], 
"\u05d0\u05d1\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05db\u05dc\u05de\u05e0\u05e1\u05e2\u05e4\u05e6\u05e7\u05e8\u05e9\u05ea": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a": 5, "\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669": 5, "\u067e\u0686\u06a2\u06a4\u06af": 5, "\u0905": [], "\u0905\u0906\u0907\u0908\u0909\u090a\u090b\u0960\u090c\u0961\u090f\u0910\u0913\u0914\u0905": [], "\u0915\u0916\u0917\u0918\u0919\u091a\u091b\u091c\u091d\u091e\u091f\u0920\u0921\u0922\u0923\u0924\u0925\u0926\u0927\u0928\u092a\u092b\u092c\u092d\u092e\u092f\u0930\u0932\u0935\u0936\u0937\u0938\u0939\u0966\u0967\u0968\u0969\u096a\u096b\u096c\u096d\u096e\u096f": [], "\u0950": [], "\u0985\u0986\u0987\u0988\u0989\u098a\u098b\u098f\u0990\u0993\u0994\u0995\u0996\u0997\u0998\u0999\u099a\u099b\u099c\u099d\u099e\u099f\u09a0\u09a1\u09a2\u09a3\u09a4\u09a5\u09a6\u09a7\u09a8\u09aa\u09ab\u09ac\u09ad\u09ae\u09af\u09b0\u09b2\u09b6\u09b7\u09b8\u09b9": [], "\u09bd": [], "\u09ce": [], "\u09e6\u09e7\u09e8\u09e9\u09ea\u09eb\u09ec\u09ed\u09ee\u09ef": []}, "titles": ["Changelog", "Contributor Covenant Code of Conduct", "Contributing to docTR", "Installation", "docTR: Document Text Recognition", "doctr.datasets", "doctr.io", "doctr.models", "doctr.transforms", "doctr.utils", "docTR Notebooks", "Train your own model", "AWS Lambda", "Share your model with the community", "Choose a ready to use dataset", "Preparing your model for inference", "Choosing the right model"], "titleterms": {"": 2, "0": 0, "01": 0, "02": 0, "03": 0, "04": [], "05": 0, "07": 0, "08": 0, "09": 0, "1": [0, 1], "10": 0, "11": 0, "12": 0, "18": 0, "2": [0, 1], "2021": 0, "2022": 0, "2023": 0, "2024": 0, "22": 0, "27": 0, "28": 0, "29": 0, "3": [0, 1], "31": 0, "4": [0, 1], "5": 0, "6": 0, "7": 0, "8": 0, "9": [], "advanc": 16, "approach": 16, "architectur": 16, "arg": [5, 6, 7, 8, 9], "artefact": 6, "artefactdetect": [], "attribut": 1, "avail": [14, 16], "aw": 12, "ban": 1, "block": 6, "bug": 2, "build": [], "changelog": 0, "choos": [14, 16], "classif": [7, 13], "code": [1, 2], "codebas": 2, "commit": 2, "commun": 13, "compos": 8, "compress": [], "conda": 3, "conduct": 1, "connect": 2, "content": [], "continu": 2, "contrib": [], "contribut": 2, "contributor": 1, "convent": 13, "correct": 1, "coven": 1, "custom": [5, 11], "data": 14, "dataload": 5, "dataset": [4, 5, 14], "detect": [4, 7, 13, 14, 16], "develop": 2, "do": 16, "doctr": [2, 4, 5, 6, 7, 8, 9, 10, 15], "document": [2, 4, 6], "end": 16, "enforc": 1, "evalu": 9, "export": 15, "factori": 7, "featur": [2, 4], "feedback": 2, "file": 6, "from": 13, "gener": [5, 14], "get": [], "git": 3, "guidelin": 1, "half": 15, "hub": 13, "huggingfac": 13, "i": 16, "implement": [], "infer": 15, "instal": [2, 3], "integr": 2, "io": 6, "lambda": 12, "let": 2, "line": 6, "linux": 3, "load": [11, 13, 14], "loader": 5, "main": 4, "mode": 2, "model": [4, 7, 11, 13, 15, 16], "modifi": 2, "modul": [], "name": 13, "note": [], "notebook": 10, "object": 14, "ocr": 16, "onli": 3, "onnx": 15, "optim": 15, "option": 16, "orient": [], "our": 1, "output": 16, "own": [11, 14], "packag": 3, 
"page": 6, "perman": 1, "pipelin": [], "pledg": 1, "post": [], "pre": [], "precis": 15, "predictor": 16, "prepar": 15, "prerequisit": 3, "pretrain": 13, "process": [], "push": 13, "python": 3, "qualiti": 2, "question": 2, "read": 6, "readi": 14, "recognit": [4, 7, 13, 14, 16], "refer": [], "report": 2, "request": 2, "respons": 1, "return": [5, 6, 7, 9], "right": 16, "savedmodel": [], "scope": 1, "share": 13, "should": 16, "stage": 16, "standard": 1, "start": [], "structur": [2, 6], "style": 2, "support": [4, 5, 8], "synthet": [5, 14], "task": 9, "temporari": 1, "test": 2, "text": [4, 16], "train": 11, "transform": 8, "two": 16, "unit": 2, "us": [14, 15], "util": 9, "v0": 0, "verif": 2, "via": 3, "visual": 9, "vocab": 5, "warn": 1, "what": 16, "word": 6, "your": [11, 13, 14, 15], "zoo": [4, 7]}}) \ No newline at end of file diff --git a/v0.8.1/transforms.html b/v0.8.1/transforms.html index 0d1b5f7402..d42da50481 100644 --- a/v0.8.1/transforms.html +++ b/v0.8.1/transforms.html @@ -227,28 +227,21 @@ @@ -293,7 +286,7 @@

doctr.transforms

Drawing inspiration from the design of torchvision, we express transformations as composable modules.

    Supported transformations

    -

    Here are all transformations that are available through docTR:

    +

    Here are all transformations that are available through DocTR:

    class doctr.transforms.Resize(output_size: Tuple[int, int], method: str = 'bilinear', preserve_aspect_ratio: bool = False, symmetric_pad: bool = False)[source]
    @@ -364,7 +357,7 @@

    Supported transformations
    -class doctr.transforms.ToGray(num_output_channels: int = 1)[source]
    +class doctr.transforms.ToGray[source]

Convert an RGB tensor (image or batch of images) to a 3-channel grayscale tensor

    Example::
    >>> from doctr.transforms import ToGray
    @@ -524,88 +517,6 @@ 

    Supported transformations -
    -class doctr.transforms.RandomRotate(max_angle: float = 5.0, expand: bool = False)[source]
    -

    Randomly rotate a tensor image and its boxes

    -https://github.com/mindee/doctr/releases/download/v0.4.0/rotation_illustration.png -
    -
    Parameters:
    -
      -
    • max_angle – maximum angle for rotation, in degrees. Angles will be uniformly picked in -[-max_angle, max_angle]

    • -
    • expand – whether the image should be padded before the rotation

    • -
    -
    -
    -

    - -
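Example (a minimal usage sketch; it assumes the transform is called with an image tensor together with its relative boxes, as the description above suggests, and returns the rotated pair)::
>>> import numpy as np
>>> import tensorflow as tf
>>> from doctr.transforms import RandomRotate
>>> transfo = RandomRotate(max_angle=10., expand=True)
>>> boxes = np.array([[0.1, 0.1, 0.4, 0.3]], dtype=np.float32)  # relative (xmin, ymin, xmax, ymax)
>>> rotated_img, rotated_boxes = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1), boxes)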
    -
    -class doctr.transforms.RandomCrop(scale: Tuple[float, float] = (0.08, 1.0), ratio: Tuple[float, float] = (0.75, 1.33))[source]
    -

    Randomly crop a tensor image and its boxes

    -
    -
    Parameters:
    -
      -
    • scale – tuple of floats, relative (min_area, max_area) of the crop

    • -
    • ratio – tuple of float, relative (min_ratio, max_ratio) where ratio = h/w

    • -
    -
    -
    -
    - -
    -
    -class doctr.transforms.GaussianBlur(kernel_shape: int | Iterable[int], std: Tuple[float, float])[source]
    -

Apply a Gaussian blur to a 3-dimensional RGB image, with a standard deviation randomly sampled from the given range

    -
    -
    Example::
    >>> from doctr.transforms import GaussianBlur
    ->>> import tensorflow as tf
    ->>> transfo = GaussianBlur(3, (.1, 5))
    ->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • kernel_shape – size of the blurring kernel

    • -
    • std – min and max value of the standard deviation

    • -
    -
    -
    -
    - -
    -
    -class doctr.transforms.ChannelShuffle[source]
    -

    Randomly shuffle channel order of a given image

    -
    - -
    -
    -class doctr.transforms.GaussianNoise(mean: float = 0.0, std: float = 1.0)[source]
    -

    Adds Gaussian Noise to the input tensor

    -
    -
    Example::
    >>> from doctr.transforms import GaussianNoise
    ->>> import tensorflow as tf
    ->>> transfo = GaussianNoise(0., 1.)
    ->>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • mean – mean of the gaussian distribution

    • -
    • std – std of the gaussian distribution

    • -
    -
    -
    -
    -

    Composing transformations

    @@ -744,11 +655,6 @@

    Composing transformationsRandomHue
  • RandomGamma
  • RandomJpegQuality
  • -
  • RandomRotate
  • -
  • RandomCrop
  • -
  • GaussianBlur
  • -
  • ChannelShuffle
  • -
  • GaussianNoise
  • Composing transformations
      @@ -768,7 +674,7 @@

      Composing transformations +

  • diff --git a/v0.8.1/using_model_export.html b/v0.8.1/using_model_export.html deleted file mode 100644 index 9b0acb00fe..0000000000 --- a/v0.8.1/using_model_export.html +++ /dev/null @@ -1,436 +0,0 @@ - - - - - - - - - - - - - Preparing your model for inference - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

    Preparing your model for inference

    -

    A well-trained model is a good achievement but you might want to tune a few things to make it production-ready!

    -
    -

    Model compression

    -

    This section is meant to help you perform inference with compressed versions of your model.

    -
    -

    TensorFlow Lite

    -

    TensorFlow provides utilities packaged as TensorFlow Lite to take resource constraints into account. You can easily convert any Keras model into a serialized TFLite version as follows:

    -
    >>> import tensorflow as tf
    ->>> from tensorflow.keras import Sequential
    ->>> from doctr.models import conv_sequence
    ->>> model = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=(224, 224, 3)))
->>> converter = tf.lite.TFLiteConverter.from_keras_model(model)
    ->>> serialized_model = converter.convert()
    -
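If you want to sanity-check the serialized model, you can run it with the TFLite interpreter bundled with TensorFlow (a minimal sketch, reusing serialized_model and the (224, 224, 3) input shape from the snippet above):

>>> import numpy as np
>>> interpreter = tf.lite.Interpreter(model_content=serialized_model)
>>> interpreter.allocate_tensors()
>>> input_details = interpreter.get_input_details()
>>> output_details = interpreter.get_output_details()
>>> # feed a dummy batch matching the model input shape
>>> interpreter.set_tensor(input_details[0]['index'], np.random.rand(1, 224, 224, 3).astype(np.float32))
>>> interpreter.invoke()
>>> out = interpreter.get_tensor(output_details[0]['index'])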
    -
    -
    -
    -

    Half-precision

    -

If you want to convert the model to half-precision, configure your TFLite converter as follows:

    -
    >>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
    ->>> converter.target_spec.supported_types = [tf.float16]
    ->>> serialized_model = converter.convert()
    -
    -
    -
    -
    -

    Post-training quantization

    -

Finally, if you wish to quantize the model, set up your TFLite converter with a representative dataset:

    -
>>> import numpy as np
->>> input_shape = (224, 224, 3)  # same input shape as the model defined above
->>> converter.optimizations = [tf.lite.Optimize.DEFAULT]
->>> # Float fallback for operators that do not have an integer implementation
->>> def representative_dataset():
->>>     for _ in range(100): yield [np.random.rand(1, *input_shape).astype(np.float32)]
    ->>> converter.representative_dataset = representative_dataset
    ->>> converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
    ->>> converter.inference_input_type = tf.int8
    ->>> converter.inference_output_type = tf.int8
    ->>> serialized_model = converter.convert()
    -
    -
    -
    -
    -
    -

    Using SavedModel

    -

    Additionally, models in docTR inherit TensorFlow 2 model properties and can be exported to -SavedModel format as follows:

    -
    >>> import tensorflow as tf
    ->>> from doctr.models import db_resnet50
    ->>> model = db_resnet50(pretrained=True)
    ->>> input_t = tf.random.uniform(shape=[1, 1024, 1024, 3], maxval=1, dtype=tf.float32)
    ->>> _ = model(input_t, training=False)
    ->>> tf.saved_model.save(model, 'path/to/your/folder/db_resnet50/')
    -
    -
    -

    And loaded just as easily:

    -
    >>> import tensorflow as tf
    ->>> model = tf.saved_model.load('path/to/your/folder/db_resnet50/')
    -
    -
    -
    -
    - -
    -
    - -
    - -
    -
    - - - - - - - - \ No newline at end of file diff --git a/v0.8.1/using_models.html b/v0.8.1/using_models.html deleted file mode 100644 index 53cad99cac..0000000000 --- a/v0.8.1/using_models.html +++ /dev/null @@ -1,909 +0,0 @@ - - - - - - - - - - - - - Choosing the right model - docTR documentation - - - - - - - - - - - - - - - - - - - Contents - - - - - - Menu - - - - - - - - Expand - - - - - - Light mode - - - - - - - - - - - - - - Dark mode - - - - - - - Auto light/dark, in light mode - - - - - - - - - - - - - - - Auto light/dark, in dark mode - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Skip to content - - - -

    Choosing the right model

    -

The full Optical Character Recognition task can be seen as two consecutive tasks: text detection and text recognition. Whether they are performed jointly or separately, each task calls for a specific type of deep learning architecture.

    -

    For a given task, docTR provides a Predictor, which is composed of 2 components:

    -
      -
    • PreProcessor: a module in charge of making inputs directly usable by the deep learning model.

    • -
    • Model: a deep learning model, implemented with all supported deep learning backends (TensorFlow & PyTorch) along with its specific post-processor to make outputs structured and reusable.

    • -
    -
    -
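In practice, both components are bundled for you. For instance, building an end-to-end predictor and inspecting its two stages might look like the following sketch (the det_predictor / reco_predictor attribute names are an assumption borrowed from the training guide; check your installed version):

>>> from doctr.models import ocr_predictor
>>> predictor = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
>>> det = predictor.det_predictor    # PreProcessor + detection model + post-processor
>>> reco = predictor.reco_predictor  # PreProcessor + recognition model + post-processor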

    Text Detection

    -

    The task consists of localizing textual elements in a given image. -While those text elements can represent many things, in docTR, we will consider uninterrupted character sequences (words). Additionally, the localization can take several forms: from straight bounding boxes (delimited by the 2D coordinates of the top-left and bottom-right corner), to polygons, or binary segmentation (flagging which pixels belong to this element, and which don’t).

    -
    -

    Available architectures

    -

    The following architectures are currently supported:

    - -

    For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

    -
Architecture            Input shape        # params   FUNSD Recall   FUNSD Precision   CORD Recall   CORD Precision   FPS
db_resnet50             (1024, 1024, 3)    25.2 M     82.14          87.64             92.49         89.66            2.1
db_mobilenet_v3_large   (1024, 1024, 3)    4.2 M      79.35          84.03             81.14         66.85            --

    All text detection models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

    -

    Disclaimer: both FUNSD subsets combined have 199 pages which might not be representative enough of the model capabilities

    -

FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

    -
    -
    -

    Detection predictors

    -

detection_predictor wraps your detection model so that it can be used seamlessly with your favorite deep learning framework.

    -
    >>> import numpy as np
    ->>> from doctr.models import detection_predictor
    ->>> predictor = detection_predictor('db_resnet50')
    ->>> dummy_img = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
->>> out = predictor([dummy_img])
    -
    -
    -
    -
    -
    -

    Text Recognition

    -

    The task consists of transcribing the character sequence in a given image.

    -
    -

    Available architectures

    -

    The following architectures are currently supported:

    - -

    For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

    -
Text recognition model zoo

Architecture              Input shape    # params   FUNSD   CORD    FPS
crnn_vgg16_bn             (32, 128, 3)   15.8M      87.18   92.93   12.8
crnn_mobilenet_v3_small   (32, 128, 3)   2.1M       86.21   90.56   --
crnn_mobilenet_v3_large   (32, 128, 3)   4.5M       86.95   92.03   --
sar_resnet31              (32, 128, 3)   56.2M      87.70   93.41   2.7
master                    (32, 128, 3)   67.7M      87.62   93.27   --

    All text recognition models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metric being used (exact match) are available in Task evaluation.

    -

While most of our recognition models were trained on our French vocab (cf. Supported Vocabs), you can easily access the vocab of any model as follows:

    -
    >>> from doctr.models import recognition_predictor
    ->>> predictor = recognition_predictor('crnn_vgg16_bn')
    ->>> print(predictor.model.cfg['vocab'])
    -
    -
    -

Disclaimer: both FUNSD subsets combined have 30595 word-level crops, which might not be representative enough of the model's capabilities

    -

FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed tensors per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

    -
    -
    -

    Recognition predictors

    -

recognition_predictor wraps your recognition model so that it can be used seamlessly with your favorite deep learning framework.

    -
    >>> import numpy as np
    ->>> from doctr.models import recognition_predictor
    ->>> predictor = recognition_predictor('crnn_vgg16_bn')
    ->>> dummy_img = (255 * np.random.rand(50, 150, 3)).astype(np.uint8)
->>> out = predictor([dummy_img])
    -
    -
    -
    -
    -
    -

    End-to-End OCR

    -

    The task consists of both localizing and transcribing textual elements in a given image.

    -
    -

    Available architectures

    -

You can use any combination of detection and recognition models supported by docTR.

    -

    For a comprehensive comparison, we have compiled a detailed benchmark on publicly available datasets:

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    FUNSD

    CORD

    Architecture

    Recall

    Precision

    FPS

    Recall

    Precision

    FPS

    db_resnet50 + crnn_vgg16_bn

    71.25

    76.02

    0.85

    84.00

    81.42

    1.6

    db_resnet50 + master

    71.03

    76.06

    84.49

    81.94

    db_resnet50 + sar_resnet31

    71.25

    76.29

    0.27

    84.50

    81.96

    0.83

    db_resnet50 + crnn_mobilenet_v3_small

    69.85

    74.80

    80.85

    78.42

    0.83

    db_resnet50 + crnn_mobilenet_v3_large

    70.57

    75.57

    82.57

    80.08

    0.83

    db_mobilenet_v3_large + crnn_vgg16_bn

    67.73

    71.73

    71.65

    59.03

    Gvision text detection

    59.50

    62.50

    75.30

    70.00

    Gvision doc. text detection

    64.00

    53.30

    68.90

    61.10

    AWS textract

    78.10

    83.00

    87.50

    66.00

    -
    -

    All OCR models above have been evaluated using both the training and evaluation sets of FUNSD and CORD (cf. Available Datasets). -Explanations about the metrics being used are available in Task evaluation.

    -

Disclaimer: both FUNSD subsets combined have 199 pages, which might not be representative enough of the model's capabilities

    -

FPS (Frames per second) is computed after a warmup phase of 100 tensors (where the batch size is 1), by measuring the average number of processed frames per second over 1000 samples. Those results were obtained on a c5.12xlarge AWS instance (CPU Xeon Platinum 8275L).

    -

    Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information.

    -
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    Receipts

    Invoices

    IDs

    US Tax Forms

    Resumes

    Road Fines

    Architecture

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    Recall

    Precision

    db_resnet50 + crnn_vgg16_bn (ours)

    78.70

    81.12

    65.80

    70.70

    50.25

    51.78

    79.08

    92.83

    db_resnet50 + master (ours)

    79.00

    81.42

    65.57

    69.86

    51.34

    52.90

    78.86

    92.57

    db_resnet50 + sar_resnet31 (ours)

    78.94

    81.37

    65.89

    70.79

    51.78

    53.35

    79.04

    92.78

    db_resnet50 + crnn_mobilenet_v3_small (ours)

    76.81

    79.15

    64.89

    69.61

    45.03

    46.38

    78.96

    92.11

    85.91

    87.20

    84.85

    85.86

    db_resnet50 + crnn_mobilenet_v3_large (ours)

    78.01

    80.39

    65.36

    70.11

    48.00

    49.43

    79.39

    92.62

    87.68

    89.00

    85.65

    86.67

    db_mobilenet_v3_large + crnn_vgg16_bn (ours)

    78.36

    74.93

    63.04

    68.41

    39.36

    41.75

    72.14

    89.97

    Gvision doc. text detection

    68.91

    59.89

    63.20

    52.85

    43.70

    29.21

    69.79

    65.68

    AWS textract

    75.77

    77.70

    70.47

    69.13

    46.39

    43.32

    84.31

    98.11

    -
    -
    -
    -

    Two-stage approaches

    -

Those architectures involve one stage of text detection, and one stage of text recognition. The text detection stage produces cropped images that are passed to the text recognition block. Everything is wrapped up with ocr_predictor.

    -
    >>> import numpy as np
    ->>> from doctr.models import ocr_predictor
    ->>> model = ocr_predictor('db_resnet50', 'crnn_vgg16_bn', pretrained=True)
    ->>> input_page = (255 * np.random.rand(800, 600, 3)).astype(np.uint8)
    ->>> out = model([input_page])
    -
    -
    -
    -
    -

    What should I do with the output?

    -

    The ocr_predictor returns a Document object with a nested structure (with Page, Block, Line, Word, Artefact). -To get a better understanding of our document model, check our Document structure section

    -

    Here is a typical Document layout:

    -
    Document(
    -  (pages): [Page(
    -    dimensions=(340, 600)
    -    (blocks): [Block(
    -      (lines): [Line(
    -        (words): [
    -          Word(value='No.', confidence=0.91),
    -          Word(value='RECEIPT', confidence=0.99),
    -          Word(value='DATE', confidence=0.96),
    -        ]
    -      )]
    -      (artefacts): []
    -    )]
    -  )]
    -)
    -
    -
    -

    You can also export them as a nested dict, more appropriate for JSON format:

    -
    json_output = result.export()
    -
    -
    -

    For reference, here is the JSON export for the same Document as above:

    -
    {
    -  'pages': [
    -      {
    -          'page_idx': 0,
    -          'dimensions': (340, 600),
    -          'orientation': {'value': None, 'confidence': None},
    -          'language': {'value': None, 'confidence': None},
    -          'blocks': [
    -              {
    -                  'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
    -                  'lines': [
    -                      {
    -                          'geometry': ((0.1357421875, 0.0361328125), (0.8564453125, 0.8603515625)),
    -                          'words': [
    -                              {
    -                                  'value': 'No.',
    -                                  'confidence': 0.914085328578949,
    -                                  'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875))
    -                              },
    -                              {
    -                                  'value': 'RECEIPT',
    -                                  'confidence': 0.9949972033500671,
    -                                  'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375))
    -                              },
    -                              {
    -                                  'value': 'DATE',
    -                                  'confidence': 0.9578408598899841,
    -                                  'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625))
    -                              }
    -                          ]
    -                      }
    -                  ],
    -                  'artefacts': []
    -              }
    -          ]
    -      }
    -  ]
    -}
    -
    -
    -
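Since the export mirrors the nested structure shown above, you can walk the dictionary directly; for instance, to print every word with its confidence:

>>> for page in json_output['pages']:
>>>     for block in page['blocks']:
>>>         for line in block['lines']:
>>>             for word in line['words']:
>>>                 print(word['value'], word['confidence'])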

To export the output as XML (hOCR format), you can use the export_as_xml method:

    -
    xml_output = result.export_as_xml()
    -for output in xml_output:
    -  xml_bytes_string = output[0]
    -  xml_element = output[1]
    -
    -
    -

    For reference, here is a sample XML byte string output:

    -
    <?xml version="1.0" encoding="UTF-8"?>
    -<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
    -  <head>
    -    <title>docTR - hOCR</title>
    -    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    -    <meta name="ocr-system" content="doctr 0.5.0" />
    -    <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
    -  </head>
    -  <body>
    -    <div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
    -    <div class="ocr_carea" id="block_1_1" title="bbox 857 529 2504 2710">
    -      <p class="ocr_par" id="par_1_1" title="bbox 857 529 2504 2710">
    -        <span class="ocr_line" id="line_1_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
    -          <span class="ocrx_word" id="word_1_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
    -          <span class="ocrx_word" id="word_1_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
    -          <span class="ocrx_word" id="word_1_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
    -        </span>
    -      </p>
    -    </div>
    -  </body>
    -</html>
    -
    -
    -
    -
    -
    - -
    -
    - -
    - -
    -
    - - - - - - - - \ No newline at end of file diff --git a/v0.8.1/utils.html b/v0.8.1/utils.html index 21f708c953..1908ef4ff4 100644 --- a/v0.8.1/utils.html +++ b/v0.8.1/utils.html @@ -12,7 +12,7 @@ gtag('js', new Date()); gtag('config', 'G-40DVRMX8T4'); - + doctr.utils - docTR documentation @@ -227,28 +227,21 @@ @@ -327,25 +320,6 @@

    Visualization -
    -doctr.utils.visualization.synthesize_page(page: Dict[str, Any], draw_proba: bool = False, font_size: int = 13, font_family: str | None = None) ndarray[source]
    -

Draw the content of the element page (OCR response) on a blank page.

    -
    -
    Parameters:
    -
      -
    • page – exported Page object to represent

    • -
    • draw_proba – if True, draw words in colors to represent confidence. Blue: p=1, red: p=0

    • -
    • font_size – size of the font, default font = 13

    • -
    • font_family – family of the font

    • -
    -
    -
    Returns:
    -

    the synthesized page

    -
    -
    -
    -
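For illustration, a minimal usage sketch (it assumes result is the Document returned by an OCR predictor, so that its export provides the Page dictionary expected here):

>>> import matplotlib.pyplot as plt
>>> from doctr.utils.visualization import synthesize_page
>>> page_export = result.export()['pages'][0]
>>> plt.imshow(synthesize_page(page_export, draw_proba=True))
>>> plt.show()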

    Task evaluation

    @@ -382,20 +356,6 @@

    Visualization -
    -update(gt: List[str], pred: List[str]) None[source]
    -

    Update the state of the metric with new predictions

    -
    -
    Parameters:
    -
      -
• gt – list of ground-truth character sequences

    • -
    • pred – list of predicted character sequences

    • -
    -
    -
    -
    -
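For illustration, a minimal update/summary sketch (assuming the metric is exposed as doctr.utils.TextMatch, like the other metrics on this page):

>>> from doctr.utils import TextMatch
>>> metric = TextMatch()
>>> metric.update(['Hello', 'world'], ['hello', 'world'])
>>> metric.summary()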
    summary() Dict[str, float][source]
    @@ -412,14 +372,14 @@

    Visualization
    -class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
    +class doctr.utils.metrics.LocalizationConfusion(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source]

    Implements common confusion metrics and mean IoU for localization evaluation.

    The aggregated metrics are computed as follows:

    \[\begin{split}\forall Y \in \mathcal{B}^N, \forall X \in \mathcal{B}^M, \\ Recall(X, Y) = \frac{1}{N} \sum\limits_{i=1}^N g_{X}(Y_i) \\ -Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M g_{X}(Y_i) \\ +Precision(X, Y) = \frac{1}{M} \sum\limits_{i=1}^N g_{X}(Y_i) \\ meanIoU(X, Y) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(X_i, Y_j)\end{split}\]

    with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and @@ -448,28 +408,9 @@

    Visualization
    Parameters:
    -
      -
    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    • -
    • use_polygons – if set to True, predictions and targets will be expected to have rotated format

    • -
    • mask_shape – if use_polygons is True, describes the spatial shape of the image used

    • -
    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

    • -
    +

    iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    -
    -
    -update(gts: ndarray, preds: ndarray) None[source]
    -

    Updates the metric

    -
    -
    Parameters:
    -
      -
    • gts – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

    • -
    • preds – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

    • -
    -
    -
    -
    -
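For illustration, a minimal sketch mirroring the DetectionMetric example further down (the box format is assumed to match the default configuration):

>>> import numpy as np
>>> from doctr.utils import LocalizationConfusion
>>> metric = LocalizationConfusion(iou_thresh=0.5)
>>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]))
>>> metric.summary()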
    summary() Tuple[float | None, float | None, float | None][source]
    @@ -485,15 +426,15 @@

    Visualization
    -class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
    -

    Implements an end-to-end OCR metric.

    +class doctr.utils.metrics.OCRMetric(iou_thresh: float = 0.5, rotated_bbox: bool = False, mask_shape: Tuple[int, int] = (1024, 1024))[source] +

    Implements end-to-end OCR metric.

    The aggregated metrics are computed as follows:

    \[\begin{split}\forall (B, L) \in \mathcal{B}^N \times \mathcal{L}^N, \forall (\hat{B}, \hat{L}) \in \mathcal{B}^M \times \mathcal{L}^M, \\ Recall(B, \hat{B}, L, \hat{L}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ -Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,L}(\hat{B}_i, \hat{L}_i) \\ +Precision(B, \hat{B}, L, \hat{L}) = \frac{1}{M} \sum\limits_{i=1}^N h_{B,L}(\hat{B}_i, \hat{L}_i) \\ meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]

    with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and @@ -525,116 +466,16 @@

    Visualization
    Parameters:
    -
      -
    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    • -
    • use_polygons – if set to True, predictions and targets will be expected to have rotated format

    • -
    • mask_shape – if use_polygons is True, describes the spatial shape of the image used

    • -
    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

    • -
    -
    -

    -
    -
    -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: List[str], pred_labels: List[str]) None[source]
    -

    Updates the metric

    -
    -
    Parameters:
    -
      -
    • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

    • -
    • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

    • -
    • gt_labels – a list of N string labels

    • -
    • pred_labels – a list of M string labels

    • -
    +

    iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    -
    -
    summary() Tuple[Dict[str, float | None], Dict[str, float | None], float | None][source]

    Computes the aggregated metrics

    Returns:
    -

    a tuple with the recall & precision for each string comparison and the mean IoU

    -
    -
    -
    - - - -
    -
    -class doctr.utils.metrics.DetectionMetric(iou_thresh: float = 0.5, use_polygons: bool = False, mask_shape: Tuple[int, int] = (1024, 1024), use_broadcasting: bool = True)[source]
    -

    Implements an object detection metric.

    -

    The aggregated metrics are computed as follows:

    -
    -
    -\[\begin{split}\forall (B, C) \in \mathcal{B}^N \times \mathcal{C}^N, -\forall (\hat{B}, \hat{C}) \in \mathcal{B}^M \times \mathcal{C}^M, \\ -Recall(B, \hat{B}, C, \hat{C}) = \frac{1}{N} \sum\limits_{i=1}^N h_{B,C}(\hat{B}_i, \hat{C}_i) \\ -Precision(B, \hat{B}, C, \hat{C}) = \frac{1}{M} \sum\limits_{i=1}^M h_{B,C}(\hat{B}_i, \hat{C}_i) \\ -meanIoU(B, \hat{B}) = \frac{1}{M} \sum\limits_{i=1}^M \max\limits_{j \in [1, N]} IoU(\hat{B}_i, B_j)\end{split}\]
    -
    -

    with the function \(IoU(x, y)\) being the Intersection over Union between bounding boxes \(x\) and -\(y\), and the function \(h_{B, C}\) defined as:

    -
    -
    -\[\begin{split}\forall (b, c) \in \mathcal{B} \times \mathcal{C}, -h_{B,C}(b, c) = \left\{ - \begin{array}{ll} - 1 & \mbox{if } b\mbox{ has been assigned to a given }B_j\mbox{ with an } \\ - & IoU \geq 0.5 \mbox{ and that for this assignment, } c = C_j\\ - 0 & \mbox{otherwise.} - \end{array} -\right.\end{split}\]
    -
    -

    where \(\mathcal{B}\) is the set of possible bounding boxes, -\(\mathcal{C}\) is the set of possible class indices, -\(N\) (number of ground truths) and \(M\) (number of predictions) are strictly positive integers.

    -
    -
    Example::
    >>> import numpy as np
    ->>> from doctr.utils import DetectionMetric
    ->>> metric = DetectionMetric(iou_thresh=0.5)
    ->>> metric.update(np.asarray([[0, 0, 100, 100]]), np.asarray([[0, 0, 70, 70], [110, 95, 200, 150]]),
    -np.zeros(1, dtype=np.int64), np.array([0, 1], dtype=np.int64))
    ->>> metric.summary()
    -
    -
    -
    -
    -
    -
    Parameters:
    -
      -
    • iou_thresh – minimum IoU to consider a pair of prediction and ground truth as a match

    • -
    • use_polygons – if set to True, predictions and targets will be expected to have rotated format

    • -
    • mask_shape – if use_polygons is True, describes the spatial shape of the image used

    • -
    • use_broadcasting – if use_polygons is True, use broadcasting for IoU computation by consuming more memory

    • -
    -
    -
    -
    -
    -update(gt_boxes: ndarray, pred_boxes: ndarray, gt_labels: ndarray, pred_labels: ndarray) None[source]
    -

    Updates the metric

    -
    -
    Parameters:
    -
      -
    • gt_boxes – a set of relative bounding boxes either of shape (N, 4) or (N, 5) if they are rotated ones

    • -
    • pred_boxes – a set of relative bounding boxes either of shape (M, 4) or (M, 5) if they are rotated ones

    • -
    • gt_labels – an array of class indices of shape (N,)

    • -
    • pred_labels – an array of class indices of shape (M,)

    • -
    -
    -
    -
    - -
    -
    -summary() Tuple[float | None, float | None, float | None][source]
    -

    Computes the aggregated metrics

    -
    -
    Returns:
    -

    a tuple with the recall & precision for each class prediction and the mean IoU

    +

    a tuple with the recall & precision for each string comparison flexibility and the mean IoU

    @@ -649,15 +490,7 @@

    Visualization - -
    -
    - Next -
    -
    Changelog
    -
    - -
    +