v 0.1.0

gsoykan · Jul 10, 2023 · 824cfa2 · 824cfa2
commit 824cfa2
Show file tree

Hide file tree

Showing 226 changed files with 26,219 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,204 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+### VisualStudioCode
+# Ignore everything inside .vscode/ except the four shared files that the
+# `!` patterns below re-include.
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+*.code-workspace
+# NOTE(review): this pattern ignores the `.vscode` directory itself at any
+# depth, including the repository root. Git cannot re-include a file whose
+# parent directory is excluded, so this appears to cancel the `!` exceptions
+# above - confirm which behaviour is intended.
+**/.vscode
+
+# JetBrains
+.idea/
+
+# Lightning-Hydra-Template
+data/
+logs/
+wandb/
+.env
+.autoenv
+weights/
+/example_comic_pages/Hellblazer 001 (1988) (digital-Empire)/
+
+temp_sb.jpg
+panel_text_associations/
+
+temp_page_graph.pdf
+
+scripts/test.csv
+
+scripts/temp_sb_161.jpg
+
+scripts/new_ocr_comparison_986.csv
+
+scripts/new_ocr_comparison_912_detect.csv
+
+scripts/new_ocr_comparison_441.csv
+
+scripts/new_ocr_comparison_661.csv
+
+scripts/new_ocr_comparison_834.csv
+
+scripts/NEW_COMICS_ocr_file_0.csv
+
+scripts/new_comics/neuspell_checking_results.csv
+
+scripts/ground-truth-additional-textbox-postprocess.csv
+
+scripts/ground-truth-textbox-with-preds.csv
+
+scripts/ground-truth-textbox_backup.csv
+
+scripts/new_comics/contextual_spell_checking_results.csv
+
+scripts/errors_NEW_COMICS_ocr_file_0.csv
+
+scripts/.~lock.ground-truth-textbox.csv#
+
+scripts/.~lock.comics_ocr_gt.csv#
+
+scripts/new_ocr_comparison.csv
+
+scripts/**/*.csv
+scripts/**/*.jpg
+scripts/**/*.zip
+
+modern_speech_bub/
+new_ocr/
+new_ocr_backup/
+new_ocr_errors/
+new_ocr_errors_backup/
+new_ocr_errors_fixed/
+new_ocr_errors_updated/
+new_ocr_skipped/
+new_ocr_skipped_errors/
+
+*.zip
diff --git a/README.md b/README.md
@@ -0,0 +1,5 @@
+To install the package with its `cuda` extra, run the following from the repository root:
+
+```shell
+pip install .[cuda] -f https://download.pytorch.org/whl/torch_stable.html
+```
diff --git a/comics_ocr/__init__.py b/comics_ocr/__init__.py
@@ -0,0 +1,42 @@
+from comics_ocr.comics_ocr import ComicsOCR  # re-exported so `from comics_ocr import ComicsOCR` works
+
+__author__ = 'Gürkan Soykan'
+__email__ = '[email protected]'  # NOTE(review): reads like a redaction placeholder, not a real address - confirm
+__version__ = '0.1.0'
+
+# module level doc-string (NOTE: the example paths below are absolute and appear specific to the author's machines)
+__doc__ = """
+COMICS OCR
+================
+
+Description
+-----------
+ComicsOCR is a Python package created for easily distributing OCR models trained for golden age of comics.
+
+Example
+-------
+>>> # Import library
+>>> from comics_ocr import ComicsOCR
+>>> # Initialize
+>>> e2e_ocr_model = ComicsOCR(ocr_detector_config="/scratch/users/gsoykan20/projects/mmocr/work_dirs/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom.py",
+                 ocr_detector_checkpoint='/scratch/users/gsoykan20/projects/mmocr/work_dirs/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom/best_0_hmean-iou:hmean_epoch_5.pth',
+                 recog_config='/scratch/users/gsoykan20/projects/mmocr/work_dirs/master_custom_dataset/master_custom_dataset.py',
+                 ocr_recognition_checkpoint='/scratch/users/gsoykan20/projects/mmocr/work_dirs/master_custom_dataset/best_0_1-N.E.D_epoch_4.pth',
+                 det='FCE_CTW_DCNv2',
+                 recog='MASTER')
+or 
+>>> e2e_ocr_model = ComicsOCR(ocr_detector_config="/home/gsoykan20/Desktop/self_development/mmocr/work_dirs/fce_best/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom.py",
+                 ocr_detector_checkpoint='/home/gsoykan20/Desktop/self_development/mmocr/work_dirs/fce_best/best_0_hmean-iou hmean_epoch_5.pth',
+                 recog_config='/home/gsoykan20/Desktop/self_development/mmocr/work_dirs/master_best/master_custom_dataset.py',
+                 ocr_recognition_checkpoint='/home/gsoykan20/Desktop/self_development/mmocr/work_dirs/master_best/best_0_1-N.E.D_epoch_4.pth',
+                 det='FCE_CTW_DCNv2',
+                 recog='MASTER')
+>>> # Run the model
+>>> img_path = "/home/gsoykan20/Desktop/self_development/amazing-mysteries-of-gutter-demystified/data/mtl_crop/speech/0/3/9.jpg"
+>>> text, preprocessed_text, sanitized_text = e2e_ocr_model.extract_text(img_path)
+
+References
+----------
+* https://github.com/gsoykan/comics_text_plus
+
+"""
diff --git a/comics_ocr/comics_ocr.py b/comics_ocr/comics_ocr.py
@@ -0,0 +1,77 @@
+import sys
+
+# source: https://github.com/artefactory/NLPretext
+from nlpretext import Preprocessor
+from nlpretext.basic.preprocess import (normalize_whitespace, remove_eol_characters, lower_text,
+                                        fix_bad_unicode)
+from nltk.tokenize import WordPunctTokenizer
+
+# for suppressing warnings (see the no-op `warn` shim defined below)
+from comics_ocr.text_extractor import TextExtractor
+
+
+def warn(*args, **kwargs):
+    pass
+
+
+import warnings
+
+warnings.warn = warn  # global monkeypatch: any later `warnings.warn(...)` call in this process hits the no-op above
+
+sys.path.append('../')  # adds the relative entry '../' to the import search path; NOTE(review): presumably for a sibling checkout - confirm it is still needed
+
+
+class ComicsOCR:
+    def __init__(self,
+                 ocr_detector_config="/scratch/users/gsoykan20/projects/mmocr/work_dirs/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom.py",
+                 ocr_detector_checkpoint='/scratch/users/gsoykan20/projects/mmocr/work_dirs/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom/best_0_hmean-iou:hmean_epoch_5.pth',
+                 recog_config='/scratch/users/gsoykan20/projects/mmocr/work_dirs/master_custom_dataset/master_custom_dataset.py',
+                 ocr_recognition_checkpoint='/scratch/users/gsoykan20/projects/mmocr/work_dirs/master_custom_dataset/best_0_1-N.E.D_epoch_4.pth',
+                 det='FCE_CTW_DCNv2',
+                 recog='MASTER'):
+        self.text_extractor, self.text_preprocessor = ComicsOCR.set_text_processors(
+            ocr_detector_config,
+            ocr_detector_checkpoint,
+            recog_config,
+            ocr_recognition_checkpoint,
+            det,
+            recog)
+
+    def extract_text(self, img_path: str):
+        text = self.text_extractor.extract_text(img_path)
+        preprocessed_text = self.text_preprocessor.run(text)
+        sanitized_text = ComicsOCR.sanitize_text(preprocessed_text, self.text_preprocessor)
+        return text, preprocessed_text, sanitized_text
+
+    @staticmethod
+    def sanitize_text(text, text_preprocessor) -> str:
+        punc_tokenizer = WordPunctTokenizer()
+        return ' '.join(punc_tokenizer.tokenize(text_preprocessor.run(text)))
+
+    @staticmethod
+    def set_text_processors(
+            ocr_detector_config="/scratch/users/gsoykan20/projects/mmocr/work_dirs/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom.py",
+            ocr_detector_checkpoint='/scratch/users/gsoykan20/projects/mmocr/work_dirs/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom/best_0_hmean-iou:hmean_epoch_5.pth',
+            recog_config='/scratch/users/gsoykan20/projects/mmocr/work_dirs/master_custom_dataset/master_custom_dataset.py',
+            ocr_recognition_checkpoint='/scratch/users/gsoykan20/projects/mmocr/work_dirs/master_custom_dataset/best_0_1-N.E.D_epoch_4.pth',
+            det='FCE_CTW_DCNv2',
+            recog='MASTER',
+    ):
+        text_preprocessor = ComicsOCR.get_minimal_text_preprocessor()
+        text_extractor = TextExtractor(batch_mode=True,
+                                       det=det,
+                                       det_ckpt=ocr_detector_checkpoint,
+                                       det_config=ocr_detector_config,
+                                       recog=recog,
+                                       recog_ckpt=ocr_recognition_checkpoint,
+                                       recog_config=recog_config)
+        return text_extractor, text_preprocessor
+
+    @staticmethod
+    def get_minimal_text_preprocessor():
+        preprocessor = Preprocessor()
+        preprocessor.pipe(lower_text)
+        preprocessor.pipe(remove_eol_characters)
+        preprocessor.pipe(normalize_whitespace)
+        preprocessor.pipe(fix_bad_unicode)
+        return preprocessor