diff --git a/.github/workflows/test_on_push.yaml b/.github/workflows/test_on_push.yaml index 0e55963c..9b2f1917 100644 --- a/.github/workflows/test_on_push.yaml +++ b/.github/workflows/test_on_push.yaml @@ -6,10 +6,16 @@ on: branches: - develop - master + paths-ignore: + - 'VERSION' + - 'docs/source/changelog.rst' push: branches: - develop - master + paths-ignore: + - 'VERSION' + - 'docs/source/changelog.rst' # Allows you to run this workflow manually from the Actions tab workflow_dispatch: diff --git a/.gitignore b/.gitignore index 41616e04..16467ec7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +dedoc/version.py + ### Python template # Byte-compiled / optimized / DLL files *__pycache__* diff --git a/VERSION b/VERSION index f76f9131..2774f858 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.9.2 \ No newline at end of file +0.10.0 \ No newline at end of file diff --git a/dedoc/__init__.py b/dedoc/__init__.py index e69de29b..82dbebad 100644 --- a/dedoc/__init__.py +++ b/dedoc/__init__.py @@ -0,0 +1,2 @@ +from .dedoc_manager import DedocManager # noqa +from .version import __version__ # noqa diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index 0c383b3b..463ab2c9 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -13,6 +13,7 @@ class QueryParameters(BaseModel): need_content_analysis: Optional[str] recursion_deep_attachments: Optional[str] return_base64: Optional[str] + attachments_dir: Optional[str] insert_table: Optional[str] need_pdf_table_analysis: Optional[str] @@ -44,6 +45,7 @@ def __init__(self, need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None), recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None), return_base64: Optional[str] = Body(description="returns images in base64 format. 
Default: 'false'", default=None), + attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None), # tables handling insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None), @@ -79,6 +81,7 @@ def __init__(self, self.need_content_analysis: str = need_content_analysis or 'false' self.recursion_deep_attachments: str = recursion_deep_attachments or '10' self.return_base64: str = return_base64 or 'false' + self.attachments_dir: str = attachments_dir self.insert_table: str = insert_table or 'false' self.need_pdf_table_analysis: str = need_pdf_table_analysis or 'true' diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py index 0a8201b7..ad62a2fa 100644 --- a/dedoc/api/dedoc_api.py +++ b/dedoc/api/dedoc_api.py @@ -1,5 +1,6 @@ import importlib import os +import tempfile import uvicorn from fastapi import Response, FastAPI, Request, Depends, UploadFile, File @@ -7,12 +8,14 @@ from fastapi.staticfiles import StaticFiles from starlette.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse +import dedoc from dedoc.api.api_args import QueryParameters from dedoc.api.api_utils import json2html, json2tree, json2collapsed_tree from dedoc.common.exceptions.dedoc_exception import DedocException from dedoc.common.exceptions.missing_file_exception import MissingFileException from dedoc.config import get_config -from dedoc.manager.dedoc_thread_manager import DedocThreadedManager +from dedoc.dedoc_manager import DedocManager +from dedoc.utils.utils import save_upload_file config = get_config() PORT = config["api_port"] @@ -24,8 +27,7 @@ module_api_args = importlib.import_module(config['import_path_init_api_args']) logger = config["logger"] -version_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "VERSION")) -manager = DedocThreadedManager.from_config(config=config, 
version=open(version_file_path).read().strip()) +manager = DedocManager(config=config) @app.get("/") @@ -47,7 +49,7 @@ def get_static_file(request: Request) -> Response: @app.get('/version') def get_version() -> Response: - return PlainTextResponse(manager.version) + return PlainTextResponse(dedoc.__version__) def _get_static_file_path(request: Request) -> str: @@ -63,13 +65,14 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D parameters = query_params.dict(by_alias=True) if not file or file.filename == "": - raise MissingFileException("Error: Missing content in request_post file parameter", version=manager.version) + raise MissingFileException("Error: Missing content in request_post file parameter", version=dedoc.__version__) # check if the post request_post has the file part - logger.info("Get file {} with parameters {}".format(file.filename, parameters)) - warnings = [] - document_tree = manager.parse_file(file, parameters=dict(parameters)) - document_tree.warnings.extend(warnings) + logger.info(f"Get file {file.filename} with parameters {parameters}") + with tempfile.TemporaryDirectory() as tmpdir: + file_path = save_upload_file(file, tmpdir) + document_tree = manager.parse(file_path, parameters=dict(parameters)) + return_format = str(parameters.get("return_format", "json")).lower() if return_format == "html": html_content = json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0) @@ -83,7 +86,7 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D html_content = json2collapsed_tree(paragraph=document_tree.content.structure) return HTMLResponse(content=html_content, status_code=200) else: - logger.info("Send result. File {} with parameters {}".format(file.filename, parameters)) + logger.info(f"Send result. 
File {file.filename} with parameters {parameters}") return ORJSONResponse(content=document_tree.to_dict(), status_code=200) @@ -96,10 +99,7 @@ async def exception_handler(request: Request, exc: DedocException) -> Response: result["dedoc_version"] = exc.version if exc.metadata: result["metadata"] = exc.metadata - return JSONResponse( - status_code=exc.code, - content=result, - ) + return JSONResponse(status_code=exc.code, content=result) def get_api() -> FastAPI: diff --git a/dedoc/api/train_dataset/api_collect_train_dataset.py b/dedoc/api/train_dataset/api_collect_train_dataset.py index db2ac57a..96d44848 100644 --- a/dedoc/api/train_dataset/api_collect_train_dataset.py +++ b/dedoc/api/train_dataset/api_collect_train_dataset.py @@ -12,7 +12,7 @@ from dedoc.api.train_dataset.api_args import TrainDatasetParameters from dedoc.api.train_dataset.async_archive_handler import AsyncHandler from dedoc.config import get_config -from dedoc.manager.dedoc_thread_manager import DedocThreadedManager +from dedoc.dedoc_manager import DedocManager from dedoc.train_dataset.taskers.concrete_taskers.filtered_line_label_tasker import FilteredLineLabelTasker from dedoc.train_dataset.taskers.concrete_taskers.header_footer_tasker import HeaderFooterTasker from dedoc.train_dataset.taskers.concrete_taskers.line_label_tasker import LineLabelTasker @@ -33,8 +33,7 @@ app.mount('/static', StaticFiles(directory=static_path), name="static") templates = Jinja2Templates(directory=os.path.join(static_path, "train_dataset")) -version_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "VERSION")) -manager = DedocThreadedManager.from_config(config=config, version=open(version_file_path).read().strip()) +manager = DedocManager(config=config) project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) diff --git a/dedoc/api/train_dataset/async_archive_handler.py b/dedoc/api/train_dataset/async_archive_handler.py index e47fd8dd..43b63b1a 
100644 --- a/dedoc/api/train_dataset/async_archive_handler.py +++ b/dedoc/api/train_dataset/async_archive_handler.py @@ -11,20 +11,13 @@ from fastapi import UploadFile from dedoc.common.exceptions.bad_file_exception import BadFileFormatException -from dedoc.manager.dedoc_thread_manager import DedocThreadedManager +from dedoc.dedoc_manager import DedocManager from dedoc.train_dataset.taskers.tasker import Tasker class _ArchiveHandler(Thread): - def __init__(self, - queue: Queue, - results: dict, - progress: dict, - tasker: Tasker, - manager: DedocThreadedManager, - *, - config: dict) -> None: + def __init__(self, queue: Queue, results: dict, progress: dict, tasker: Tasker, manager: DedocManager, *, config: dict) -> None: Thread.__init__(self) self.progress = progress self.config = config @@ -77,7 +70,7 @@ def __handle_one_file(self, archive: zipfile.ZipFile, file: str, parameters: dic if not path_out.endswith("/"): with open(path_out, "wb") as file_out: file_out.write(item.read()) - self.manager.parse_existing_file(path=path_out, parameters=parameters) + self.manager.parse(file_path=path_out, parameters=parameters) except BadFileFormatException as e: self.logger.warning("Can't handle file {}, exception {}".format(file, str(e))) self.logger.info("Finish handle {}".format(file)) @@ -85,7 +78,7 @@ def __handle_one_file(self, archive: zipfile.ZipFile, file: str, parameters: dic class AsyncHandler: - def __init__(self, tasker: Tasker, manager: DedocThreadedManager, *, config: dict) -> None: + def __init__(self, tasker: Tasker, manager: DedocManager, *, config: dict) -> None: super().__init__() self.queue = Queue() self.__results = {} diff --git a/dedoc/attachments_handler/__init__.py b/dedoc/attachments_handler/__init__.py index e69de29b..61ae37d2 100644 --- a/dedoc/attachments_handler/__init__.py +++ b/dedoc/attachments_handler/__init__.py @@ -0,0 +1,3 @@ +from .attachments_handler import AttachmentsHandler + +__all__ = ["AttachmentsHandler"] diff --git 
a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py index a4e92f49..8dcfce0a 100644 --- a/dedoc/attachments_handler/attachments_handler.py +++ b/dedoc/attachments_handler/attachments_handler.py @@ -1,16 +1,106 @@ +import copy import logging +import os +import shutil +import tempfile +import time +from typing import List +from dedoc.attachments_extractors import AbstractAttachmentsExtractor +from dedoc.common.exceptions.dedoc_exception import DedocException +from dedoc.data_structures import ParsedDocument, DocumentMetadata, AttachedFile from dedoc.data_structures.unstructured_document import UnstructuredDocument +from dedoc.utils.utils import get_empty_content class AttachmentsHandler: + """ + This class is used for handling attached files: + + - they may be stored in the custom directory (use `attachments_dir` key in the parameters to set output directory path); + - they may be ignored (if the option `with_attachments=false` in parameters); + - the metadata of the attachments may be added without files parsing (if `with_attachments=true, need_content_analysis=false` in parameters) + - they may be parsed (if `with_attachments=true, need_content_analysis=true` in parameters), \ + the parsing recursion may be set via `recursion_deep_attachments` parameter. + """ def __init__(self, *, config: dict) -> None: + """ + :param config: configuration of the handler, e.g. logger for logging + """ self.config = config self.logger = self.config.get("logger", logging.getLogger()) - def handle_attachments(self, document: UnstructuredDocument, parameters: dict) -> None: + def handle_attachments(self, document_parser: "DedocManager", document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]: # noqa """ - Handle attached files, for example save it on disk or S3 storage + Handle attachments of the document in the intermediate representation. 
+ + :param document_parser: class with `parse` method for parsing attachments if needed; + :param document: intermediate representation of the document whose attachments need to be handled; + :param parameters: parameters for attachments handling (with_attachments, need_content_analysis, recursion_deep_attachments, attachments_dir \ + are important, look to the API parameters documentation for more details). + :return: list of parsed document attachments """ - pass + parsed_attachment_files = [] + recursion_deep_attachments = int(parameters.get("recursion_deep_attachments", 10)) - 1 + + if not AbstractAttachmentsExtractor.with_attachments(parameters) or recursion_deep_attachments < 0: + return parsed_attachment_files + + self._handle_attachments(document=document, parameters=parameters) + + previous_log_time = time.time() + + for i, attachment in enumerate(document.attachments): + current_time = time.time() + if current_time - previous_log_time > 3: + previous_log_time = current_time # not log too often + self.logger.info(f"Handle attachment {i} of {len(document.attachments)}") + + if not attachment.get_original_filename(): # TODO check for docx https://jira.ispras.ru/browse/TLDR-185 + continue + + parameters_copy = copy.deepcopy(parameters) + parameters_copy["is_attached"] = True + parameters_copy["attachment"] = attachment + parameters_copy["recursion_deep_attachments"] = str(recursion_deep_attachments) + + try: + if attachment.need_content_analysis: + with tempfile.TemporaryDirectory() as tmpdir: + attachment_path = os.path.join(tmpdir, attachment.get_original_filename()) + shutil.copy(attachment.get_filename_in_path(), attachment_path) + parsed_file = document_parser.parse(attachment_path, parameters=parameters_copy) + else: + parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy) + except DedocException: + # return empty ParsedDocument with Meta information + parsed_file = 
self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy) + + parsed_file.metadata.set_uid(attachment.uid) + parsed_attachment_files.append(parsed_file) + return parsed_attachment_files + + def _handle_attachments(self, document: UnstructuredDocument, parameters: dict) -> None: + """ + Handle attached files, for example save it on disk or S3 storage. + This method can be redefined by other AttachmentHandler class. + """ + attachments_dir = parameters.get("attachments_dir") + if not attachments_dir: + return + + for attachment in document.attachments: + new_path = os.path.join(attachments_dir, os.path.split(attachment.get_filename_in_path())[1]) + shutil.move(attachment.get_filename_in_path(), new_path) + attachment.tmp_file_path = new_path + + def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa + unstructured_document = UnstructuredDocument(lines=[], tables=[], attachments=[]) + attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path()) + unstructured_document = document_parser.document_metadata_extractor.add_metadata(document=unstructured_document, directory=attachment_dir, + filename=attachment_name, converted_filename=attachment_name, + original_filename=attachment.get_original_filename(), + parameters=parameters) + metadata = DocumentMetadata(**unstructured_document.metadata) + return ParsedDocument(content=get_empty_content(), metadata=metadata) diff --git a/dedoc/common/exceptions/dedoc_exception.py b/dedoc/common/exceptions/dedoc_exception.py index cff0305e..1b9cbf5d 100644 --- a/dedoc/common/exceptions/dedoc_exception.py +++ b/dedoc/common/exceptions/dedoc_exception.py @@ -1,5 +1,7 @@ from typing import Optional +import dedoc + class DedocException(Exception): def __init__(self, @@ -12,7 +14,7 @@ def __init__(self, self.msg = msg self.msg_api = msg if msg_api is None else msg_api self.filename = 
filename - self.version = version + self.version = version if version is not None else dedoc.__version__ self.metadata = metadata def __str__(self) -> str: diff --git a/dedoc/configuration_manager.py b/dedoc/configuration_manager.py deleted file mode 100644 index c9bf36a6..00000000 --- a/dedoc/configuration_manager.py +++ /dev/null @@ -1,35 +0,0 @@ -class ConfigurationManager(object): - """ - Pattern Singleton for configuration service - INFO: Configuration class and config are created once at the first call - For initialization ConfigurationManager call ConfigurationManager.getInstance().initConfig(new_config: dict) - If you need default config, call ConfigurationManager.getInstance() - """ - __instance = None - __config = None - - @classmethod - def getInstance(cls) -> "ConfigurationManager": # noqa - """ - Actual object creation will happen when we use ConfigurationManager.getInstance() - """ - if not cls.__instance: - cls.__instance = ConfigurationManager() - - return cls.__instance - - def initConfig(self, config: dict, new_config: dict = None) -> None: - if new_config is None: - from dedoc.manager_config import get_manager_config - self.__instance.__config = get_manager_config(config) - else: - self.__instance.__config = new_config - - def getConfig(self, config: dict) -> dict: - if self.getInstance().__config is None: - self.initConfig(config) - return self.__instance.__config - - -def get_manager_config(config: dict) -> dict: - return ConfigurationManager().getConfig(config) diff --git a/dedoc/converters/concrete_converters/pdf_converter.py b/dedoc/converters/concrete_converters/pdf_converter.py index fecb41e8..380fd508 100644 --- a/dedoc/converters/concrete_converters/pdf_converter.py +++ b/dedoc/converters/concrete_converters/pdf_converter.py @@ -11,7 +11,7 @@ class PDFConverter(AbstractConverter): """ def __init__(self, *, config: dict) -> None: super().__init__(config=config) - self.timeout = 20 + self.timeout = 60 def can_convert(self, extension: str, 
mime: str, parameters: Optional[dict] = None) -> bool: """ diff --git a/dedoc/data_structures/annotation.py b/dedoc/data_structures/annotation.py index 7a800b78..4c102e86 100644 --- a/dedoc/data_structures/annotation.py +++ b/dedoc/data_structures/annotation.py @@ -12,7 +12,7 @@ class Annotation(Serializable): Look to the concrete kind of annotations to get mode examples. """ - def __init__(self, start: int, end: int, name: str, value: str) -> None: + def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bool = True) -> None: """ Some kind of text information about symbols between start and end. For example Annotation(1, 13, "italic", "True") says that text between 1st and 13th symbol was writen in italic. @@ -21,11 +21,13 @@ def __init__(self, start: int, end: int, name: str, value: str) -> None: :param end: end of the annotated text (end isn't included) :param name: annotation's name :param value: information about annotated text + :param is_mergeable: is it possible to merge annotations with the same value """ self.start = start self.end = end self.name = name self.value = value + self.is_mergeable = is_mergeable def __eq__(self, o: object) -> bool: if not isinstance(o, Annotation): diff --git a/dedoc/data_structures/concrete_annotations/__init__.py b/dedoc/data_structures/concrete_annotations/__init__.py index f8b17429..529acaa0 100644 --- a/dedoc/data_structures/concrete_annotations/__init__.py +++ b/dedoc/data_structures/concrete_annotations/__init__.py @@ -3,6 +3,7 @@ from .bbox_annotation import BBoxAnnotation from .bold_annotation import BoldAnnotation from .color_annotation import ColorAnnotation +from .confidence_annotation import ConfidenceAnnotation from .indentation_annotation import IndentationAnnotation from .italic_annotation import ItalicAnnotation from .linked_text_annotation import LinkedTextAnnotation @@ -15,6 +16,6 @@ from .table_annotation import TableAnnotation from .underlined_annotation import UnderlinedAnnotation 
-__all__ = ['AlignmentAnnotation', 'AttachAnnotation', 'BBoxAnnotation', 'BoldAnnotation', 'ColorAnnotation', 'IndentationAnnotation', - 'ItalicAnnotation', 'LinkedTextAnnotation', 'SizeAnnotation', 'SpacingAnnotation', 'StrikeAnnotation', 'StyleAnnotation', - 'SubscriptAnnotation', 'SuperscriptAnnotation', 'TableAnnotation', 'UnderlinedAnnotation'] +__all__ = ['AlignmentAnnotation', 'AttachAnnotation', 'BBoxAnnotation', 'BoldAnnotation', 'ColorAnnotation', 'ConfidenceAnnotation', + 'IndentationAnnotation', 'ItalicAnnotation', 'LinkedTextAnnotation', 'SizeAnnotation', 'SpacingAnnotation', 'StrikeAnnotation', + 'StyleAnnotation', 'SubscriptAnnotation', 'SuperscriptAnnotation', 'TableAnnotation', 'UnderlinedAnnotation'] diff --git a/dedoc/data_structures/concrete_annotations/confidence_annotation.py b/dedoc/data_structures/concrete_annotations/confidence_annotation.py new file mode 100644 index 00000000..af18120f --- /dev/null +++ b/dedoc/data_structures/concrete_annotations/confidence_annotation.py @@ -0,0 +1,32 @@ +from flask_restx import fields, Api, Model + +from dedoc.data_structures.annotation import Annotation + + +class ConfidenceAnnotation(Annotation): + """ + Confidence level of some recognized with OCR text inside the line. 
+ """ + name = "confidence" + + def __init__(self, start: int, end: int, value: str) -> None: + """ + :param start: start of the text + :param end: end of the text (not included) + :param value: confidence level in "percents" (float or integer number from 0 to 100) + """ + try: + assert 0.0 <= float(value) <= 100.0 + except ValueError: + raise ValueError("the value of confidence annotation should be float value") + except AssertionError: + raise ValueError("the value of confidence annotation should be in range [0, 100]") + super().__init__(start=start, end=end, name=ConfidenceAnnotation.name, value=value, is_mergeable=False) + + @staticmethod + def get_api_dict(api: Api) -> Model: + return api.model('BoldAnnotation', { + 'start': fields.Integer(description='annotation start index', required=True, example=0), + 'end': fields.Integer(description='annotation end index', required=True, example=4), + 'value': fields.String(description='confidence value', required=True, example="95") + }) diff --git a/dedoc/data_structures/document_metadata.py b/dedoc/data_structures/document_metadata.py index c838c28d..a05777d3 100644 --- a/dedoc/data_structures/document_metadata.py +++ b/dedoc/data_structures/document_metadata.py @@ -14,6 +14,7 @@ class DocumentMetadata(Serializable): def __init__(self, file_name: str, + temporary_file_name: str, size: int, modified_time: int, created_time: int, @@ -24,6 +25,7 @@ def __init__(self, """ :param uid: document unique identifier (useful for attached files) :param file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on) + :param temporary_file_name: file name during parsing (unique name after rename and conversion); :param size: size of the original file in bytes :param modified_time: time of the last modification in unix time format (seconds since the epoch) :param created_time: time of the creation in unixtime @@ -32,6 +34,7 @@ def __init__(self, :param other_fields: additional 
fields of user metadata """ self.file_name = file_name + self.temporary_file_name = temporary_file_name self.size = size self.modified_time = modified_time self.created_time = created_time @@ -62,6 +65,7 @@ def to_dict(self) -> dict: res = OrderedDict() res["uid"] = self.uid res["file_name"] = self.file_name + res["temporary_file_name"] = self.temporary_file_name res["size"] = self.size res["modified_time"] = self.modified_time res["created_time"] = self.created_time @@ -78,6 +82,7 @@ def get_api_dict(api: Api) -> Model: return api.model('DocumentMetadata', { "uid": fields.String(description='unique document identifier', example="doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0"), 'file_name': fields.String(description='file name', example="example.odt"), + 'temporary_file_name': fields.String(description='file name', example="123.odt"), 'size': fields.Integer(description='file size in bytes', example="20060"), 'modified_time': fields.Integer(description='modification time of the document in the format UnixTime', example="1590579805"), 'created_time': fields.Integer(description='creation time of the document in the format UnixTime', example="1590579805"), diff --git a/dedoc/data_structures/parsed_document.py b/dedoc/data_structures/parsed_document.py index 912ebade..81cd1bf2 100644 --- a/dedoc/data_structures/parsed_document.py +++ b/dedoc/data_structures/parsed_document.py @@ -3,6 +3,7 @@ from flask_restx import fields, Api, Model +import dedoc from dedoc.data_structures.document_content import DocumentContent from dedoc.data_structures.document_metadata import DocumentMetadata from dedoc.data_structures.serializable import Serializable @@ -15,21 +16,17 @@ class ParsedDocument(Serializable): def __init__(self, metadata: DocumentMetadata, content: Optional[DocumentContent], - version: str, warnings: List[str] = None, attachments: Optional[List["ParsedDocument"]] = None) -> None: """ :param metadata: document metadata such as size, creation date and so on. 
:param content: text and tables :param attachments: result of analysis of attached files - :param version: the version of the program that parsed this document :param warnings: list of warnings and possible errors, arising in the process of document parsing """ self.metadata = metadata self.content = content self.attachments = [] if attachments is None else attachments - assert version is not None - self.version = version self.warnings = warnings if warnings is not None else [] def add_attachments(self, new_attachment: List["ParsedDocument"]) -> None: @@ -42,7 +39,7 @@ def set_metadata(self, metadata: DocumentMetadata) -> None: def to_dict(self, depth: int = 0) -> dict: res = OrderedDict() - res["version"] = self.version + res["version"] = dedoc.__version__ res["warnings"] = self.warnings res["content"] = self.content.to_dict() if self.content is not None else [] res["metadata"] = self.metadata.to_dict() @@ -56,7 +53,7 @@ def get_api_dict(api: Api, depth: int = 0, name: str = 'ParsedDocument') -> Mode return api.model(name, { 'content': fields.Nested(DocumentContent.get_api_dict(api), description='Document content structure'), 'metadata': fields.Nested(DocumentMetadata.get_api_dict(api), allow_null=False, skip_none=True, description='Document meta information'), - 'version': fields.String(description='the version of the program that parsed this document', example="2020.07.11"), + 'version': fields.String(description='the version of the program that parsed this document', example="0.9.1"), 'warnings': fields.List(fields.String(description='list of warnings and possible errors', example="DOCX: seems that document corrupted")), 'attachments': fields.List(fields.Nested(api.model('others_ParsedDocument', {})), description='structure of attachments', required=False) if depth == 10 # TODO delete this diff --git a/dedoc/dedoc_manager.py b/dedoc/dedoc_manager.py new file mode 100644 index 00000000..62d7faa7 --- /dev/null +++ b/dedoc/dedoc_manager.py @@ -0,0 +1,134 @@ 
+import logging +import os.path +import shutil +import tempfile +from typing import Optional, Dict + +from dedoc.common.exceptions.dedoc_exception import DedocException +from dedoc.config import get_config +from dedoc.manager_config import get_manager_config +from dedoc.data_structures import ParsedDocument, UnstructuredDocument +from dedoc.metadata_extractors import BaseMetadataExtractor +from dedoc.train_dataset.train_dataset_utils import save_line_with_meta, get_path_original_documents +from dedoc.utils.utils import get_unique_name + + +class DedocManager: + """ + This class allows to run the whole pipeline of the document processing: + + 1. Converting + 2. Reading + 3. Metadata extraction + 4. Structure extraction + 5. Output structure construction + 6. Attachments handling + """ + + def __init__(self, config: Optional[dict] = None, manager_config: Optional[dict] = None) -> None: + """ + :param config: config for document processing + :param manager_config: dictionary with different stage document processors. 
+ + The following keys should be in the `manager_config` dictionary: + - converter (optional) (:class:`~dedoc.converters.FileConverterComposition`) + - reader (:class:`~dedoc.readers.ReaderComposition`) + - structure_extractor (:class:`~dedoc.structure_extractors.StructureExtractorComposition`) + - structure_constructor (:class:`~dedoc.structure_constructors.StructureConstructorComposition`) + - document_metadata_extractor (:class:`~dedoc.metadata_extractors.MetadataExtractorComposition`) + - attachments_handler (:class:`~dedoc.attachments_handler.AttachmentsHandler`) + """ + self.config = get_config() if config is None else config + self.logger = self.config.get("logger", logging.getLogger()) + manager_config = get_manager_config(self.config) if manager_config is None else manager_config + + self.converter = manager_config.get("converter", None) + self.reader = manager_config.get("reader", None) + assert self.reader is not None, "Reader shouldn't be None" + self.structure_extractor = manager_config.get("structure_extractor", None) + assert self.structure_extractor is not None, "Structure extractor shouldn't be None" + self.structure_constructor = manager_config.get("structure_constructor", None) + assert self.structure_constructor is not None, "Structure constructor shouldn't be None" + self.document_metadata_extractor = manager_config.get("document_metadata_extractor", None) + assert self.document_metadata_extractor is not None, "Document metadata extractor shouldn't be None" + self.attachments_handler = manager_config.get("attachments_handler", None) + assert self.attachments_handler is not None, "Attachments handler shouldn't be None" + + def parse(self, file_path: str, parameters: Optional[Dict[str, str]] = None) -> ParsedDocument: + """ + Run the whole pipeline of the document processing. + If some error occurred, file metadata are stored in the exception's metadata field. 
+ + :param file_path: full path where the file is located + :param parameters: any parameters, specify how to parse file (see API parameters documentation for more details) + :return: parsed document + """ + parameters = {} if parameters is None else parameters + + try: + return self.__parse_no_error_handling(file_path=file_path, parameters=parameters) + except DedocException as e: + file_dir, file_name = os.path.split(file_path) + e.filename = file_name + e.metadata = BaseMetadataExtractor._get_base_meta_information(directory=file_dir, filename=file_name, name_actual=file_name) + raise e + + def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) -> ParsedDocument: + """ + Function of complete document parsing without errors handling. + + :param file_path: full path where the file is located + :param parameters: any parameters, specify how to parse file + :return: parsed document + """ + if not os.path.isfile(path=file_path): + raise FileNotFoundError() + self.logger.info(f"Start handle {file_path}") + file_dir, file_name = os.path.split(file_path) + unique_filename = get_unique_name(file_name) + + with tempfile.TemporaryDirectory() as tmp_dir: + shutil.copy(file_path, os.path.join(tmp_dir, unique_filename)) + + # Step 1 - Converting + converted_filename = self.converter.do_converting(tmp_dir, unique_filename, parameters=parameters) + self.logger.info(f"Finish conversion {file_name} -> {converted_filename}") + + # Step 2 - Reading content + unstructured_document = self.reader.parse_file(tmp_dir=tmp_dir, filename=converted_filename, parameters=parameters) + self.logger.info(f"Finish parse file {file_name}") + + # Step 3 - Adding meta-information + unstructured_document = self.document_metadata_extractor.add_metadata(document=unstructured_document, + directory=tmp_dir, + filename=unique_filename, + converted_filename=converted_filename, + original_filename=file_name, + parameters=parameters, + other_fields=unstructured_document.metadata) + 
self.logger.info(f"Add metadata of file {file_name}") + + # Step 4 - Extract structure + unstructured_document = self.structure_extractor.extract_structure(unstructured_document, parameters) + self.logger.info(f"Extract structure from file {file_name}") + + if self.config.get("labeling_mode", False): + self.__save(os.path.join(tmp_dir, unique_filename), unstructured_document) + + # Step 5 - Form the output structure + parsed_document = self.structure_constructor.structure_document(document=unstructured_document, + structure_type=parameters.get("structure_type"), + parameters=parameters) + self.logger.info(f"Get structured document {file_name}") + + # Step 6 - Get attachments + attachments = self.attachments_handler.handle_attachments(document_parser=self, document=unstructured_document, parameters=parameters) + parsed_document.add_attachments(attachments) + self.logger.info(f"Get attachments {file_name}") + + self.logger.info(f"Finish handle {file_name}") + return parsed_document + + def __save(self, file_path: str, classified_document: UnstructuredDocument) -> None: + save_line_with_meta(lines=classified_document.lines, config=self.config, original_document=os.path.basename(file_path)) + shutil.copy(file_path, os.path.join(get_path_original_documents(self.config), os.path.basename(file_path))) diff --git a/dedoc/download_models.py b/dedoc/download_models.py index 376d82bc..dd511463 100644 --- a/dedoc/download_models.py +++ b/dedoc/download_models.py @@ -12,7 +12,7 @@ Keys are the names of repositories with models. 
""" model_hash_dict = dict( - catboost_detect_tl_correctness="cafb0684f59d49c9daca0bfd2ede216955cb457e", + txtlayer_classifier="93b10fea2b661d7eca79381b47e5c4ebe2a22e75", scan_orientation_efficient_net_b0="0160965f8a920d12afacf62b8a5a8a3b365b11ef", font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07", paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864", @@ -28,8 +28,8 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str def download(resources_path: str) -> None: download_from_hub(out_dir=resources_path, - out_name="catboost_detect_tl_correctness.pkl.gz", - repo_name="catboost_detect_tl_correctness", + out_name="txtlayer_classifier.pkl.gz", + repo_name="txtlayer_classifier", hub_name="model.pkl.gz") download_from_hub(out_dir=resources_path, diff --git a/dedoc/main.py b/dedoc/main.py index e3741900..b784fad0 100644 --- a/dedoc/main.py +++ b/dedoc/main.py @@ -1,7 +1,6 @@ import argparse from dedoc.config import Configuration, get_config -from dedoc.configuration_manager import ConfigurationManager from dedoc.api.dedoc_api import run_api, get_api # noqa @@ -17,7 +16,6 @@ def main() -> None: parser_config.add_argument('-v', "--unitest_verbose_mode", nargs='?', help="to enable verbose mode of unittest. 
Only for tests") args_config = parser_config.parse_args() - ConfigurationManager().getInstance() Configuration.getInstance().getConfig(args_config) config = get_config() diff --git a/dedoc/manager/__init__.py b/dedoc/manager/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/dedoc/manager/dedoc_manager.py b/dedoc/manager/dedoc_manager.py deleted file mode 100644 index 4e875ed9..00000000 --- a/dedoc/manager/dedoc_manager.py +++ /dev/null @@ -1,220 +0,0 @@ -import copy -import logging -import os -import shutil -import tempfile -import time -from typing import Optional, List, Dict - -from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor -from dedoc.attachments_handler.attachments_handler import AttachmentsHandler -from dedoc.common.exceptions.dedoc_exception import DedocException -from dedoc.converters.file_converter import FileConverterComposition -from dedoc.data_structures.document_metadata import DocumentMetadata -from dedoc.data_structures.parsed_document import ParsedDocument -from dedoc.data_structures.unstructured_document import UnstructuredDocument -from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor -from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition -from dedoc.readers.reader_composition import ReaderComposition -from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition -from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition -from dedoc.train_dataset.train_dataset_utils import save_line_with_meta, get_path_original_documents -from dedoc.utils.utils import get_unique_name, get_empty_content - - -class DedocManager: - - def __init__(self, - converter: FileConverterComposition, - attachments_handler: AttachmentsHandler, - reader: ReaderComposition, - structure_extractor: 
StructureExtractorComposition, - structure_constructor: StructureConstructorComposition, - document_metadata_extractor: MetadataExtractorComposition, - logger: logging.Logger, - version: str, - *, - config: dict) -> None: - self.version = version - self.converter = converter - self.attachments_handler = attachments_handler - self.reader = reader - self.structure_extractor = structure_extractor - self.structure_constructor = structure_constructor - self.document_metadata_extractor = document_metadata_extractor - self.logger = logger - self.config = config - - @staticmethod - def from_config(version: str, manager_config: dict, *, config: dict) -> "DedocManager": - """ - this method helps to construct dedoc manager from config - :param version: str, actual version of dedoc (or your lib, based on dedoc) Lay in file VERSION - :param manager_config: dict, you may get example of managers config dict in manager_config.py - :param config: any additional parameters for dedoc lay in config, see config.py - :return: DedocManager - """ - - logger = config.get("logger") - logger = logger if logger is not None else logging.getLogger(__name__) - manager = DedocManager( - converter=manager_config["converter"], - attachments_handler=manager_config["attachments_extractor"], - reader=manager_config["reader"], - structure_extractor=manager_config["structure_extractor"], - structure_constructor=manager_config["structure_constructor"], - document_metadata_extractor=manager_config["document_metadata_extractor"], - logger=logger, - version=version, - config=config - ) - - return manager - - def parse_file(self, - file_path: str, - parameters: Dict[str, str], - original_file_name: Optional[str] = None) -> ParsedDocument: - """ - Function of complete parsing document with 'filename' with attachment files analyze - :param file_path: full path where file lay - :param parameters: any parameters, specify how we want to parse file - :param original_file_name: name of original file (None if file 
was not renamed) - :return: - """ - try: - return self._parse_file_no_error_handling(file_path=file_path, - parameters=parameters, - original_file_name=original_file_name) - except DedocException as e: - e.version = self.version - e.filename = original_file_name - file_dir, file_name = os.path.split(file_path) - e.metadata = BaseMetadataExtractor._get_base_meta_information(directory=file_dir, - filename=file_name, - name_actual=file_name, - parameters={}) - raise e - - def _parse_file_no_error_handling(self, - file_path: str, - parameters: Dict[str, str], - original_file_name: Optional[str] = None) -> ParsedDocument: - """ - Function of complete parsing document with 'filename' with attachment files analyze - :param file_path: full path where file lay - :param parameters: any parameters, specify how we want to parse file - :param original_file_name: name of original file (None if file was not ranamed) - :return: - """ - warnings = [] - if not os.path.isfile(path=file_path): - raise FileNotFoundError() - self.logger.info("Start handle {}".format(file_path)) - if original_file_name is None: - original_file_name = os.path.basename(file_path) - filename = get_unique_name(file_path) - with tempfile.TemporaryDirectory() as tmp_dir: - shutil.copy(file_path, os.path.join(tmp_dir, filename)) - - # Step 1 - Converting - filename_convert = self.converter.do_converting(tmp_dir, filename, parameters=parameters) - self.logger.info("Finish conversion {} -> {}".format(filename, filename_convert)) - # Step 2 - Parsing content of converted file - unstructured_document = self.reader.parse_file(tmp_dir=tmp_dir, filename=filename_convert, parameters=parameters) - self.logger.info("Finish parse file {}".format(filename_convert)) - # Step 3 - Adding meta-information - unstructured_document = self.document_metadata_extractor.add_metadata(document=unstructured_document, - directory=tmp_dir, - filename=filename, - converted_filename=filename_convert, - original_filename=original_file_name, 
- parameters=parameters, - version=self.version, - other_fields=unstructured_document.metadata) - self.logger.info("Add metadata of file {}".format(filename_convert)) - # Step 4 - Extract structure - unstructured_document = self.structure_extractor.extract_structure(unstructured_document, parameters) - warnings.extend(unstructured_document.warnings) - self.logger.info("Extract structure from file {}".format(filename_convert)) - - if self.config.get("labeling_mode", False): - self.__save(os.path.join(tmp_dir, filename), unstructured_document) - - # Step 5 - Form the output structure - structure_type = parameters.get("structure_type") - parsed_document = self.structure_constructor.structure_document(document=unstructured_document, - version=self.version, - structure_type=structure_type, - parameters=parameters) - warnings.extend(parsed_document.warnings) - self.logger.info("Get structured document {}".format(filename_convert)) - - if AbstractAttachmentsExtractor.with_attachments(parameters): - self.logger.info("Start handle attachments") - parsed_attachment_files = self.__handle_attachments(document=unstructured_document, parameters=parameters, tmp_dir=tmp_dir) - self.logger.info("Get attachments {}".format(filename_convert)) - parsed_document.add_attachments(parsed_attachment_files) - else: - parsed_document.attachments = None - parsed_document.version = self.version - parsed_document.warnings.extend(warnings) - self.logger.info("Finish handle {}".format(filename)) - return parsed_document - - def __save(self, path: str, classified_document: UnstructuredDocument) -> None: - save_line_with_meta(lines=classified_document.lines, config=self.config, - original_document=os.path.basename(path)) - shutil.copy(path, os.path.join(get_path_original_documents(self.config), os.path.basename(path))) - - def __handle_attachments(self, - document: UnstructuredDocument, - parameters: dict, - tmp_dir: str) -> List[ParsedDocument]: - parsed_attachment_files = [] - 
self.attachments_handler.handle_attachments(document=document, parameters=parameters) - previous_log_time = time.time() - for i, attachment in enumerate(document.attachments): - current_time = time.time() - if current_time - previous_log_time > 3: - previous_log_time = current_time # not log too often - self.logger.info("Handle attachment {} of {}".format(i, len(document.attachments))) - parameters_copy = copy.deepcopy(parameters) - parameters_copy["is_attached"] = True - parameters_copy["attachment"] = attachment - try: - # TODO handle nested attachments according to recursion_deep_attachments (https://jira.ispras.ru/browse/TLDR-300) - if attachment.need_content_analysis: - file_path = os.path.join(tmp_dir, attachment.get_filename_in_path()) - parsed_file = self.parse_file(file_path, - parameters=parameters_copy, - original_file_name=attachment.get_original_filename()) - else: - parsed_file = self.__get_empty_document(directory=tmp_dir, - filename=attachment.get_filename_in_path(), - converted_filename=attachment.get_filename_in_path(), - original_file_name=attachment.get_original_filename(), - parameters=parameters_copy) - except DedocException: - # return empty ParsedDocument with Meta information - parsed_file = self.__get_empty_document(directory=tmp_dir, - filename=attachment.get_filename_in_path(), - converted_filename=attachment.get_filename_in_path(), - original_file_name=attachment.get_original_filename(), - parameters=parameters_copy) - parsed_file.metadata.set_uid(attachment.uid) - parsed_attachment_files.append(parsed_file) - return parsed_attachment_files - - def __get_empty_document(self, directory: str, filename: str, converted_filename: str, original_file_name: str, - parameters: dict) -> ParsedDocument: - unstructured_document = UnstructuredDocument(lines=[], tables=[], attachments=[]) - unstructured_document = self.document_metadata_extractor.add_metadata(document=unstructured_document, - directory=directory, - filename=filename, - 
converted_filename=converted_filename, - original_filename=original_file_name, - parameters=parameters, - version=self.version) - metadata = DocumentMetadata(**unstructured_document.metadata) - return ParsedDocument(content=get_empty_content(), metadata=metadata, version=self.version) diff --git a/dedoc/manager/dedoc_thread_manager.py b/dedoc/manager/dedoc_thread_manager.py deleted file mode 100644 index 305168cc..00000000 --- a/dedoc/manager/dedoc_thread_manager.py +++ /dev/null @@ -1,159 +0,0 @@ -import logging -import os -import shutil -import tempfile -import uuid -from queue import Queue -from threading import Thread -from time import sleep -from typing import Optional, Dict - -from fastapi import UploadFile - -from dedoc.configuration_manager import get_manager_config -from dedoc.data_structures.parsed_document import ParsedDocument -from dedoc.manager.dedoc_manager import DedocManager -from dedoc.utils.utils import get_unique_name - - -class ThreadManager(Thread): - - def __init__(self, - manager_config: dict, - queue: Queue, - result: dict, - logger: logging.Logger, - version: str, - *, - config: dict) -> None: - Thread.__init__(self) - self.version = version - self.converter = manager_config["converter"] - self.reader = manager_config["reader"] - self.structure_constructor = manager_config["structure_constructor"] - self.document_metadata_extractor = manager_config["document_metadata_extractor"] - self.attachments_extractor = manager_config["attachments_extractor"] - self.queue = queue - self.result = result - self.logger = logger - - self.manager = DedocManager.from_config(version=version, manager_config=manager_config, config=config) - - def run(self) -> None: - sleep_time = 0.01 - while True: - if self.queue.empty(): - sleep(sleep_time) - sleep_time = min(sleep_time * 2, 1) - else: - sleep_time = 0.01 - self.logger.info("{} files to handle".format(self.queue.qsize())) - uid, directory, filename, parameters, original_file_name = self.queue.get() - 
self.logger.info("Start handle file {}".format(filename)) - with tempfile.TemporaryDirectory() as tmp_dir: - file_original = os.path.join(directory, filename) - file_new_location = os.path.join(tmp_dir, filename) - self.logger.debug("Move file from {} to {}".format(file_original, file_new_location)) - shutil.move(file_original, file_new_location) - try: - result = self.manager.parse_file(file_path=file_new_location, - parameters=parameters, - original_file_name=original_file_name) - self.logger.info("Finish handle file {}".format(original_file_name)) - self.result[uid] = result - except Exception as e: - self.result[uid] = e - - -class DedocThreadedManager(object): - - @staticmethod - def from_config(version: str, tmp_dir: Optional[str] = None, *, config: dict) -> "DedocThreadedManager": - manager_config = get_manager_config(config=config) - - if tmp_dir is not None and not os.path.exists(tmp_dir): - os.mkdir(tmp_dir) - - result = {} - queue = Queue() - logger = config.get("logger") - logger = logger if logger is not None else logging.getLogger(__name__) - thread_manager = ThreadManager(manager_config=manager_config, - queue=queue, - result=result, - logger=logger, - version=version, - config=config) - thread_manager.start() - return DedocThreadedManager(tmp_dir=tmp_dir, - thread_manager=thread_manager, - queue=queue, - result=result, - logger=logger, - config=config, - version=version) - - def __init__(self, - tmp_dir: str, - thread_manager: ThreadManager, - queue: Queue, - result: dict, - logger: logging.Logger, - version: str, - *, - config: dict) -> None: - self.version = version - self.tmp_dir = tmp_dir - self.queue = queue - self.result = result - self.thread_manager = thread_manager - self.logger = logger - self.config = config - - def parse_file(self, file: UploadFile, parameters: Dict[str, str]) -> ParsedDocument: - original_filename = file.filename.split("/")[-1] - self.logger.info("Get file {}".format(original_filename)) - filename = 
get_unique_name(original_filename) - self.logger.info("Rename file {} to {}".format(original_filename, filename)) - - if self.tmp_dir is None: - with tempfile.TemporaryDirectory() as tmp_dir: - tmp_path = os.path.join(tmp_dir, filename) - with open(tmp_path, "wb") as df: - shutil.copyfileobj(file.file, df) - - return self.__parse_file( - tmp_dir=tmp_dir, - filename=filename, - parameters=parameters, - original_file_name=original_filename - ) - - tmp_path = os.path.join(self.tmp_dir, filename) - with open(tmp_path, "wb") as df: - shutil.copyfileobj(file.file, df) - return self.__parse_file( - tmp_dir=self.tmp_dir, - filename=filename, - parameters=parameters, - original_file_name=original_filename - ) - - def parse_existing_file(self, path: str, parameters: Dict[str, str]) -> ParsedDocument: - self.logger.info("Parse existing file {}".format(path)) - with open(path, 'rb') as fp: - file = UploadFile(file=fp, filename=path) - return self.parse_file(file=file, parameters=parameters) - - def __parse_file(self, tmp_dir: str, filename: str, parameters: dict, original_file_name: str) -> ParsedDocument: - sleep_time = 0.01 - uid = str(uuid.uuid1()) - self.logger.info("Put file in queue {}".format(filename)) - self.queue.put((uid, tmp_dir, filename, parameters, original_file_name)) - while uid not in self.result: - sleep(sleep_time) - sleep_time = min(0.3, sleep_time * 2) - result = self.result.pop(uid) - if isinstance(result, Exception): - raise result - return result diff --git a/dedoc/manager_config.py b/dedoc/manager_config.py index 7c579b1c..f2cd01c7 100644 --- a/dedoc/manager_config.py +++ b/dedoc/manager_config.py @@ -43,7 +43,7 @@ """MANAGER SETTINGS""" -def get_manager_config(config: dict) -> dict: +def _get_manager_config(config: dict) -> dict: converters = [ DocxConverter(config=config), ExcelConverter(config=config), @@ -99,5 +99,41 @@ def get_manager_config(config: dict) -> dict: default_constructor=TreeConstructor() ), 
class ConfigurationManager(object):
    """
    Singleton that stores the manager configuration.

    The instance is created on the first call of ConfigurationManager.getInstance().
    The stored config is created once: either explicitly with
    ConfigurationManager.getInstance().initConfig(config, new_config=...)
    or lazily, on the first getConfig(config) call, from _get_manager_config(config).
    """
    __instance = None
    __config = None

    @classmethod
    def getInstance(cls: type) -> "ConfigurationManager":
        """
        Return the single ConfigurationManager instance, the object is created on the first call.
        """
        if cls.__instance is None:
            cls.__instance = ConfigurationManager()

        return cls.__instance

    def initConfig(self, config: dict, new_config: dict = None) -> None:
        """
        Set the stored manager config.

        :param config: configuration passed to _get_manager_config, used only when new_config is None
        :param new_config: ready manager config, if given it is stored as is
        """
        self.__config = _get_manager_config(config) if new_config is None else new_config

    def getConfig(self, config: dict) -> dict:
        """
        Return the stored manager config, initializing it from `config` if nothing was stored before.

        :param config: configuration used for the lazy initialization, ignored when a config is already stored
        :return: the stored manager config
        """
        if self.__config is None:
            self.initConfig(config)
        return self.__config


def get_manager_config(config: dict) -> dict:
    """
    Return the manager config kept by the ConfigurationManager singleton (it is created from `config` on the first call).
    """
    # getInstance is a classmethod: call it on the class instead of building a throwaway instance first
    return ConfigurationManager.getInstance().getConfig(config)
example 23141.docx) :type original_filename: name of the file before renaming - :type version: version of the dedoc library :type parameters: additional parameters for document parsing :type other_fields: other fields that should be added to the document's metadata :return: document content with added metadata attribute (dict with information about the document) diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py index b91275e3..f545ee1e 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/base_metadata_extractor.py @@ -13,6 +13,7 @@ class BaseMetadataExtractor(AbstractMetadataExtractor): It returns the following information about the given file: - file name; + - file name during parsing (unique); - file type (MIME); - file size in bytes; - time when the file was last accessed; @@ -40,7 +41,6 @@ def add_metadata(self, filename: str, converted_filename: str, original_filename: str, - version: str, parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> UnstructuredDocument: """ @@ -48,7 +48,7 @@ def add_metadata(self, Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. 
""" parameters = {} if parameters is None else parameters - meta_info = self._get_base_meta_information(directory, filename, original_filename, parameters) + meta_info = self._get_base_meta_information(directory, filename, original_filename) if parameters.get("is_attached", False) and str(parameters.get("return_base64", "false")).lower() == "true": other_fields = {} if other_fields is None else other_fields @@ -63,10 +63,11 @@ def add_metadata(self, return document @staticmethod - def _get_base_meta_information(directory: str, filename: str, name_actual: str, parameters: dict) -> dict: + def _get_base_meta_information(directory: str, filename: str, name_actual: str) -> dict: (mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime) = os.stat(os.path.join(directory, filename)) meta = { "file_name": name_actual, + "temporary_file_name": filename, "file_type": get_file_mime_type(os.path.join(directory, filename)), "size": size, # in bytes "access_time": atime, diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py index 68412f9a..377cba55 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/docx_metadata_extractor.py @@ -43,7 +43,6 @@ def add_metadata(self, filename: str, converted_filename: str, original_filename: str, - version: str, parameters: dict = None, other_fields: Optional[dict] = None) -> UnstructuredDocument: """ @@ -52,14 +51,8 @@ def add_metadata(self, """ parameters = {} if parameters is None else parameters - result = super().add_metadata(document=document, - directory=directory, - filename=filename, - converted_filename=converted_filename, - original_filename=original_filename, - parameters=parameters, - version=version, - other_fields=other_fields) + result = super().add_metadata(document=document, directory=directory, 
filename=filename, converted_filename=converted_filename, + original_filename=original_filename, parameters=parameters, other_fields=other_fields) file_path = os.path.join(directory, converted_filename) docx_other_fields = self._get_docx_fields(file_path) diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py index 6418f13a..b1e2399e 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/image_metadata_extractor.py @@ -73,21 +73,14 @@ def add_metadata(self, filename: str, converted_filename: str, original_filename: str, - version: str, parameters: dict = None, other_fields: Optional[dict] = None) -> UnstructuredDocument: """ Add the predefined list of metadata for images. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. 
""" - result = super().add_metadata(document=document, - directory=directory, - filename=filename, - converted_filename=converted_filename, - original_filename=original_filename, - parameters=parameters, - version=version, - other_fields=other_fields) + result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename, + original_filename=original_filename, parameters=parameters, other_fields=other_fields) path = os.path.join(directory, filename) exif_fields = self._get_exif(path) @@ -134,6 +127,7 @@ def _get_exif(self, path: str) -> dict: encoded_dict = {key_renamed: encode_function(exif.get(key)) for key, (key_renamed, encode_function) in self.keys.items() if key in exif} encoded_dict = {k: v for k, v in encoded_dict.items() if k is not None if v is not None} + image.close() return encoded_dict except Exception as e: # noqa self.logger.debug(e) diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py index d674a9f8..14839416 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/note_metadata_extarctor.py @@ -37,7 +37,6 @@ def add_metadata(self, filename: str, converted_filename: str, original_filename: str, - version: str, parameters: dict = None, other_fields: Optional[dict] = None) -> UnstructuredDocument: """ diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py index f41b7879..74660e9e 100644 --- a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py +++ b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py @@ -66,21 +66,14 @@ def add_metadata(self, filename: str, converted_filename: str, 
original_filename: str, - version: str, parameters: dict = None, other_fields: Optional[dict] = None) -> UnstructuredDocument: """ Add the predefined list of metadata for the pdf documents. Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters. """ - result = super().add_metadata(document=document, - directory=directory, - filename=filename, - converted_filename=converted_filename, - original_filename=original_filename, - parameters=parameters, - version=version, - other_fields=other_fields) + result = super().add_metadata(document=document, directory=directory, filename=filename, converted_filename=converted_filename, + original_filename=original_filename, parameters=parameters, other_fields=other_fields) path = os.path.join(directory, filename) pdf_fields = self._get_pdf_info(path) if len(pdf_fields) > 0: diff --git a/dedoc/metadata_extractors/metadata_extractor_composition.py b/dedoc/metadata_extractors/metadata_extractor_composition.py index 9483e55f..68e308c9 100644 --- a/dedoc/metadata_extractors/metadata_extractor_composition.py +++ b/dedoc/metadata_extractors/metadata_extractor_composition.py @@ -23,7 +23,6 @@ def add_metadata(self, filename: str, converted_filename: str, original_filename: str, - version: str, parameters: Optional[dict] = None, other_fields: Optional[dict] = None) -> UnstructuredDocument: """ @@ -45,6 +44,5 @@ def add_metadata(self, converted_filename=converted_filename, original_filename=original_filename, parameters=parameters, - other_fields=other_fields, - version=version) + other_fields=other_fields) raise Exception(f"Can't extract metadata from from file {filename}") diff --git a/dedoc/readers/docx_reader/docx_reader.py b/dedoc/readers/docx_reader/docx_reader.py index e2599914..814cfdc3 100644 --- a/dedoc/readers/docx_reader/docx_reader.py +++ b/dedoc/readers/docx_reader/docx_reader.py @@ -17,6 +17,9 @@ class DocxReader(BaseReader): Please use 
:class:`~dedoc.converters.DocxConverter` for getting docx file from similar formats. """ def __init__(self, *, config: dict) -> None: + """ + :param config: configuration of the reader, e.g. logger for logging + """ self.attachment_extractor = DocxAttachmentsExtractor() self.logger = config.get("logger", logging.getLogger()) diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py index e69f473e..fe35ac03 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py @@ -4,11 +4,12 @@ import pickle from typing import List -import catboost.core +from xgboost import XGBClassifier from dedoc.config import get_config from dedoc.data_structures import LineWithMeta from dedoc.download_models import download_from_hub +from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_feature_extractor import TxtlayerFeatureExtractor class TxtlayerClassifier: @@ -19,26 +20,18 @@ def __init__(self, *, config: dict) -> None: self.config = config self.logger = config.get("logger", logging.getLogger()) - eng = list(map(chr, range(ord('a'), ord('z') + 1))) - rus = [chr(i) for i in range(ord('а'), ord('а') + 32)] + ["ё"] - digits = [str(i) for i in range(10)] - special_symbols = [i for i in "<>~!@#$%^&*_+-/\"|?.,:;'`= "] - brackets = [i for i in "{}[]()"] - - self.letters_list = eng + [i.upper() for i in eng] + rus + [i.upper() for i in rus] - self.symbols_list = digits + special_symbols + brackets - - self.path = os.path.join(get_config()["resources_path"], "catboost_detect_tl_correctness.pkl.gz") + self.feature_extractor = TxtlayerFeatureExtractor() + self.path = os.path.join(get_config()["resources_path"], "txtlayer_classifier.pkl.gz") self.__model = None @property - def __get_model(self) -> catboost.core.CatBoostClassifier: + def __get_model(self) -> XGBClassifier: if self.__model is not None: return self.__model if not 
os.path.isfile(self.path): out_dir, out_name = os.path.split(self.path) - download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="catboost_detect_tl_correctness", hub_name="model.pkl.gz") + download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.pkl.gz") assert os.path.isfile(self.path) with gzip.open(self.path, 'rb') as f: @@ -53,32 +46,12 @@ def predict(self, lines: List[LineWithMeta]) -> bool: :param lines: list of document textual lines. :returns: True if the textual layer is correct, False otherwise. """ - text_layer = u"".join([line.line for line in lines]) + text_layer = "".join([line.line for line in lines]) if not text_layer: return False - features = self.__get_features_for_predict(text_layer) - return self.__get_model.predict(features) == 1 - - def __get_features_for_predict(self, text: str) -> List[float]: - features = [] - num_letters_in_data = self._count_letters(text) - num_other_symbol_in_data = self._count_other(text) - - for symbol in self.letters_list: - # proportion of occurring english and russian letters - features.append(round(text.count(symbol) / num_letters_in_data, 5) if num_letters_in_data != 0 else 0.0) - - for symbol in self.symbols_list: - # number of symbols - features.append(text.count(symbol)) - - # proportion of letters with symbols - features.append((num_letters_in_data + num_other_symbol_in_data) / len(text) if len(text) != 0 else 0) - return features - - def _count_letters(self, text: str) -> int: - return sum(1 for symbol in text if symbol in self.letters_list) + if len(text_layer) < 150: + text_layer = f"\n{text_layer}" * (150 // len(text_layer)) - def _count_other(self, text: str) -> int: - return sum(1 for symbol in text if symbol in self.symbols_list) + features = self.feature_extractor.transform([text_layer]) + return self.__get_model.predict(features)[0] == 1 diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_feature_extractor.py 
class TxtlayerFeatureExtractor:
    """
    Compute numeric features of texts: proportions of symbol groups, occurrences of separate symbols,
    frequencies of changes between neighbouring symbols and the mean word length.
    """

    def __init__(self) -> None:
        eng = "".join(list(map(chr, range(ord('a'), ord('z') + 1))))
        rus = "".join([chr(i) for i in range(ord('а'), ord('а') + 32)] + ["ё"])

        self.lower_letters = eng + rus
        self.upper_letters = self.lower_letters.upper()
        self.letters = self.upper_letters + self.lower_letters
        self.digits = "".join([str(i) for i in range(10)])
        self.special_symbols = "<>~!@#$%^&*_+-/\"|?.,:;'`= "
        self.brackets = "{}[]()"
        self.symbols = self.letters + self.digits + self.brackets + self.special_symbols
        self.consonants = "".join(i for i in self.lower_letters if i not in "аоуыэяёюиеaeiouy")

        # these symbols are replaced by "symbol<index>" in the names of the features
        self.prohibited_symbols = {s: i for i, s in enumerate("[]<")}

    def transform(self, texts: List[str]) -> pd.DataFrame:
        """
        Build the table of features: one row per text, columns are sorted by name, all values are floats.

        For an empty text every proportion is 0.0 (instead of a division by zero).
        For a text without words (empty or whitespace only) `mean_word_length` is NaN.

        :param texts: list of texts to compute features for
        :return: data frame with len(texts) rows
        """
        features = defaultdict(list)

        # sets give constant-time membership checks for the pairwise features below
        lower_letters, upper_letters = set(self.lower_letters), set(self.upper_letters)
        letters, symbols = set(self.letters), set(self.symbols)

        for text in texts:
            text_length = len(text)
            num_letters = self.__count_symbols(text, self.letters)
            num_digits = self.__count_symbols(text, self.digits)
            num_special_symbols = self.__count_symbols(text, self.special_symbols)
            num_brackets = self.__count_symbols(text, self.brackets)
            num_consonants = self.__count_symbols(text.lower(), self.consonants)

            features["letters_proportion"].append(self.__ratio(num_letters, text_length))
            features["digits_proportion"].append(self.__ratio(num_digits, text_length))
            features["special_symbols_proportion"].append(self.__ratio(num_special_symbols, text_length))
            features["brackets_proportion"].append(self.__ratio(num_brackets, text_length))
            features["consonants_proportion"].append(self.__ratio(num_consonants, num_letters))

            # proportion of each letter and digit among all letters and digits of the text
            num_letters_and_digits = num_letters + num_digits
            for symbol in self.letters + self.digits:
                features[f"{symbol}_proportion"].append(self.__ratio(text.count(symbol), num_letters_and_digits))

            # number of occurrences of each special symbol and bracket
            for symbol in self.special_symbols + self.brackets:
                symbol_name = symbol if symbol not in self.prohibited_symbols else f"symbol{self.prohibited_symbols[symbol]}"
                features[f"{symbol_name}_number"].append(text.count(symbol))

            # proportion of all known symbols (letters, digits, brackets, special symbols) in the text
            num_known_symbols = num_letters + num_digits + num_brackets + num_special_symbols
            features["all_proportion"].append(self.__ratio(num_known_symbols, text_length))

            # features based on the pairs of neighbouring symbols
            pairs = list(zip(text, text[1:]))
            case_changes = sum(1 for s1, s2 in pairs if s1 in lower_letters and s2 in upper_letters)
            features["case_changes"].append(self.__ratio(case_changes, text_length))
            symbol_changes = sum(1 for s1, s2 in pairs if (s1 in symbols) != (s2 in symbols))
            features["symbol_changes"].append(self.__ratio(symbol_changes, text_length))
            letter_changes = sum(1 for s1, s2 in pairs if s1 in letters and s2 not in symbols)
            features["letter_changes"].append(self.__ratio(letter_changes, text_length))

            # np.mean of an empty list is NaN with a RuntimeWarning, keep the value without the warning
            word_lengths = [len(word) for word in text.split()]
            features["mean_word_length"].append(np.mean(word_lengths) if word_lengths else np.nan)

        features = pd.DataFrame(features)
        return features[sorted(features.columns)].astype(float)

    def __count_symbols(self, text: str, symbol_list: str) -> int:
        """
        Count the symbols of the text that are present in the symbol_list.
        """
        return sum(1 for symbol in text if symbol in symbol_list)

    @staticmethod
    def __ratio(numerator: int, denominator: int) -> float:
        """
        Divide numerator by denominator, return 0.0 if the denominator is zero.
        """
        return numerator / denominator if denominator != 0 else 0.0
bbox=line.bbox, line_num=line_num)
+        line_boxes = [TextWithBBox(text=line.text, page_num=page_num, bbox=line.bbox, line_num=line_num, annotations=line.get_confidence())
                       for line_num, line in enumerate(output_dict.lines)]
 
         return line_boxes
diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py
index 2e5f53b9..b98f74e3 100644
--- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py
+++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_line.py
@@ -1,6 +1,7 @@
 from typing import List
 
 from dedoc.data_structures.bbox import BBox
+from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_tuple import OcrElement
 from dedoc.readers.pdf_reader.pdf_image_reader.ocr.ocr_page.ocr_word import OcrWord
 
@@ -19,6 +20,26 @@ def __init__(self, order: int, bbox: BBox, words: List[OcrWord]) -> None:
     def text(self) -> str:
         return " ".join(word.text for word in self.words if word.text != "") + "\n"
 
+    def get_confidence(self) -> List[ConfidenceAnnotation]:
+        """
+        Make confidence annotations for the non-empty words of the line.
+
+        :return: list with one annotation for each non-empty word, the boundaries of the annotation are the positions of the word in the line text
+        """
+        start = 0
+        annotations = []
+
+        for word in self.words:
+            # empty words are absent in the line text as well
+            if word.text == "":
+                continue
+
+            annotations.append(ConfidenceAnnotation(start, start + len(word.text), str(word.confidence)))
+            # + 1 for the space that separates the words in the line text
+            start += len(word.text) + 1
+
+        return annotations
+
     @staticmethod
     def from_list(line: List[OcrElement], ocr_conf_thr: float) -> "OcrLine":
 
@@ -32,5 +53,5 @@ def from_list(line: List[OcrElement], ocr_conf_thr: float) -> "OcrLine":
             words.append(element)
         line = sorted(line, key=lambda word: word.line_num)
         line = list(filter(lambda word: float(word.conf) >= ocr_conf_thr, line))
-        ocr_words = [OcrWord(bbox=word.bbox, text=word.text, order=word.word_num) for word in line]
+        ocr_words = [OcrWord(bbox=word.bbox, text=word.text, confidence=word.conf, order=word.word_num) for word in line]
         return OcrLine(order=head.line_num,
words=ocr_words, bbox=head.bbox) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_word.py b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_word.py index 76dfc1e0..2a5aa843 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_word.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/ocr/ocr_page/ocr_word.py @@ -4,7 +4,7 @@ class OcrWord: level = 5 - def __init__(self, text: str, bbox: BBox, order: int) -> None: + def __init__(self, text: str, bbox: BBox, confidence: float, order: int) -> None: """ Single word from ocr. :param text: extracted text @@ -14,4 +14,5 @@ def __init__(self, text: str, bbox: BBox, order: int) -> None: super().__init__() self.text = text.replace("—", " ") self.bbox = bbox + self.confidence = confidence self.order = order diff --git a/dedoc/scripts/benchmark_tl_correctness.py b/dedoc/scripts/benchmark_tl_correctness.py index a32d490d..6258f9f4 100644 --- a/dedoc/scripts/benchmark_tl_correctness.py +++ b/dedoc/scripts/benchmark_tl_correctness.py @@ -2,16 +2,15 @@ import os import zipfile from collections import OrderedDict, namedtuple -from tempfile import TemporaryDirectory import requests import wget from tqdm import tqdm +from config import get_config from dedoc.utils.utils import send_file -path_result = os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks") -path_result = os.path.abspath(path_result) +path_result = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "benchmarks")) os.makedirs(path_result, exist_ok=True) path_result = os.path.join(path_result, "benchmarks_tl_correctness.json") @@ -23,46 +22,56 @@ def errors_param_for_text_layer(path_base: str, tl_type: str, tl_path: str, para failed = [] total_incorrect_files = 0 directory = os.path.join(path_base, tl_path) - files_list = os.listdir(directory) + files_list = [file_name for file_name in os.listdir(directory) if file_name.endswith(".pdf")] total_file_size = len(files_list) 
- print(files_list) - print(total_file_size) + print(f"Files: {files_list}\nFiles number: {total_file_size}") for file in tqdm(files_list): file_path = os.path.join(directory, file) r = send_file(host=host, file_name=file, file_path=file_path, parameters=parameters) - if r["warnings"][-1].find(tl_type) != -1: + + found = False + for warning in r["warnings"]: + if warning.find(tl_type) != -1: + found = True + break + + if found: total_incorrect_files += 1 failed.append(file) return param_dist_errors(total_file_size, total_incorrect_files, failed) -def main() -> None: - with TemporaryDirectory() as path_base: - path_out = os.path.join(path_base, "data_with_text_layer.zip") - wget.download("https://at.ispras.ru/owncloud/index.php/s/BvABW6mx9wKWqPp/download", path_out) +if __name__ == "__main__": + data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data") + os.makedirs(data_dir, exist_ok=True) + benchmark_data_dir = os.path.join(data_dir, "data_with_text_layer") + + if not os.path.isdir(benchmark_data_dir): + path_out = os.path.join(data_dir, "data_with_text_layer.zip") + wget.download("https://at.ispras.ru/owncloud/index.php/s/axacSYXf7YCLcbb/download", path_out) with zipfile.ZipFile(path_out, 'r') as zip_ref: - zip_ref.extractall(path_base) - directory = os.path.join(path_base, 'data_with_text_layer') - print(directory) - result = OrderedDict() - result["version"] = requests.get("{}/version".format(host)).text - parameters = dict(pdf_with_text_layer="auto", pages="0:1") - result_item = OrderedDict() - incorrect_tl_result = errors_param_for_text_layer(directory, 'incorrect', 'data_correct_text_layer', - parameters) - result_item["percentage_of_guessed_correct_tl"] = 1 - incorrect_tl_result.total_incorrect_files / \ - incorrect_tl_result.total_file_size - result_item["list_of_file_with_incorrect_tl"] = incorrect_tl_result.failed - correct_tl_result = errors_param_for_text_layer(directory, 'correct', 'data_incorrect_text_layer', - 
parameters)
-        result_item["percentage_of_guessed_incorrect_tl"] = 1 - correct_tl_result.total_incorrect_files / \
-            correct_tl_result.total_file_size
-        result_item["list_of_file_with_correct_tl"] = correct_tl_result.failed
-        result["guessing_the_correctness_of_the_text"] = result_item
-
-    with open(path_result, "w") as file_out:
-        json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
-    print("save result in" + path_result)
-
-
-main()
+            zip_ref.extractall(data_dir)
+        os.remove(path_out)
+        print(f"Benchmark data downloaded to {benchmark_data_dir}")
+    else:
+        print(f"Use cached benchmark data from {benchmark_data_dir}")
+
+    assert os.path.isdir(benchmark_data_dir)
+
+    result = OrderedDict()
+    result["version"] = requests.get(f"{host}/version").text
+    parameters = dict(pdf_with_text_layer="auto", pages="1:1")
+    result_item = OrderedDict()
+
+    incorrect_tl_result = errors_param_for_text_layer(benchmark_data_dir, ' incorrect ', 'data_correct_text_layer', parameters)
+    result_item["percentage_of_guessed_correct_tl"] = 1 - incorrect_tl_result.total_incorrect_files / incorrect_tl_result.total_file_size
+    result_item["list_of_file_with_incorrect_tl"] = incorrect_tl_result.failed
+
+    correct_tl_result = errors_param_for_text_layer(benchmark_data_dir, ' correct ', 'data_incorrect_text_layer', parameters)
+    result_item["percentage_of_guessed_incorrect_tl"] = 1 - correct_tl_result.total_incorrect_files / correct_tl_result.total_file_size
+    result_item["list_of_file_with_correct_tl"] = correct_tl_result.failed
+    result["guessing_the_correctness_of_the_text"] = result_item
+
+    with open(path_result, "w") as file_out:
+        json.dump(obj=result, fp=file_out, indent=4, ensure_ascii=False)
+    print("Save result in" + path_result)
diff --git a/dedoc/scripts/create_txtlayer_dataset.py b/dedoc/scripts/create_txtlayer_dataset.py
new file mode 100644
index 00000000..0714b4ed
--- /dev/null
+++ b/dedoc/scripts/create_txtlayer_dataset.py
@@ -0,0 +1,283 @@
+import argparse
+import os
+import random
+import re
+import tempfile
+from abc import ABC, abstractmethod
+from typing import List, Tuple
+
+import numpy as np
+import requests
+from PIL import ImageFont, Image, ImageDraw
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+from dedoc.readers import PdfImageReader
+
+
+class CorrectTextGenerator:
+    """
+    Generator of correct texts: it downloads random Wikipedia articles and clears their texts.
+    """
+    def __init__(self) -> None:
+        self.citation = re.compile(r'\[\d+]')
+        self.meta = re.compile(r'\[править \| править код]')
+        self.symbols = re.compile(r'[→←↑]')
+
+        self.title_url = "https://{lang}.wikipedia.org/w/api.php?origin=*&action=query&format=json&list=random&rnlimit=1&rnnamespace=0"
+        self.article_url = "https://{lang}.wikipedia.org/w/api.php?origin=*&action=parse&format=json&page={title}&prop=text"
+
+    def get_random_text(self, lang: str) -> str:
+        """
+        Get the cleared text of a random Wikipedia article, the attempts are repeated until a non-empty text is obtained.
+
+        :param lang: language of Wikipedia (it is substituted into the domain name), e.g. "ru" or "en"
+        :return: text of the article without citation marks, "edit" links and arrows
+        """
+        article_text_fixed = ""
+
+        while len(article_text_fixed) == 0:
+            try:
+                # 1 - Get random title of the article in Wikipedia
+                title_result = requests.post(self.title_url.format(lang=lang))
+                title_result_dict = title_result.json()
+                title = title_result_dict["query"]["random"][0]["title"]
+
+                # 2 - Get text the article
+                article_result = requests.post(self.article_url.format(lang=lang, title=title))
+                article_result_dict = article_result.json()
+                article = article_result_dict["parse"]["text"]['*']
+                bs = BeautifulSoup(article, 'html.parser')
+                article_text = bs.get_text()
+
+                # 3 - Clear text of the article from unused symbols
+                article_text_fixed = re.sub(self.citation, '', article_text)
+                article_text_fixed = re.sub(self.meta, "", article_text_fixed)
+                article_text_fixed = re.sub(self.symbols, "", article_text_fixed)
+                article_text_fixed = re.sub(r'\n+', "\n", article_text_fixed)
+            except Exception:
+                # best effort: any request or parsing error leads to a new attempt with another article,
+                # but KeyboardInterrupt and SystemExit are not swallowed, so the endless loop can be interrupted
+                article_text_fixed = ""
+
+        return article_text_fixed
+
+
+class Corruptor(ABC):
+    """
+    Base class for the corruptors that make an incorrect text from a correct one.
+    """
+    @abstractmethod
+    def corrupt(self, text: str, lang: str) -> str:
+        """
+        :param text: correct text
+        :param lang: language of the text ("ru" or "en")
+        :return: corrupted text
+        """
+        pass
+
+
+class EncodingCorruptor(Corruptor):
+    """
+    Corrupt the text: encode it using one encoding and decode the result using another encoding.
+    """
+    def __init__(self) -> None:
+        self.encodings = {
+            "en": {
+                "input": ['cp1026'],
+                "output": ['cp1256', 'cp437', 'cp775', 'cp852', 'cp855', 'cp857', 'cp860', 'cp861', 'cp862', 'cp863', 'cp866', 'gb18030', 'hp_roman8',
+                           'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14', 'iso8859_16', 'iso8859_2', 'iso8859_4', 'iso8859_5', 'koi8_r',
+                           'mac_cyrillic', 'mac_greek', 'mac_latin2', 'mac_roman']
+
+            },
+            "ru": {
+                "input": ['cp855', 'cp866', 'gb18030', 'iso8859_5', 'koi8_r', 'mac_cyrillic', 'utf_8'],
+                "output": ['cp1026', 'cp1256', 'cp437', 'cp775', 'cp850', 'cp852', 'cp863', 'cp866', 'hp_roman8', 'iso8859_10', 'iso8859_11',
+                           'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_16', 'iso8859_2', 'iso8859_4', 'iso8859_5', 'iso8859_9', 'koi8_r',
+                           'mac_cyrillic', 'mac_greek', 'mac_latin2', 'mac_roman', 'cp1140', 'cp273', 'cp855', 'cp860', 'cp861', 'cp857', 'cp500',
+                           'cp862', 'gb18030']
+
+            }
+        }
+
+    def corrupt(self, text: str, lang: str) -> str:
+        """
+        Encode the text and decode it using a wrong encoding, the symbols and bytes that can't be encoded or decoded are removed.
+
+        :param text: correct text
+        :param lang: language of the text, the key of the `encodings` dictionary ("ru" or "en")
+        :return: corrupted text, it is empty if nothing remains after the removal of the bad symbols and bytes
+        """
+        input_encoding, output_encoding = "", ""
+        while input_encoding == output_encoding:
+            input_encoding = random.choice(self.encodings[lang]["input"])
+            output_encoding = random.choice(self.encodings[lang]["output"])
+
+        encoded = b""
+        while text:
+            try:
+                encoded = text.encode(encoding=input_encoding)
+                break
+            except UnicodeEncodeError as e:
+                # remove the symbols that can't be encoded and try once more
+                text = text[:e.start] + text[e.end:]
+
+        while encoded:
+            try:
+                return encoded.decode(encoding=output_encoding)
+            except UnicodeDecodeError as e:
+                # remove the bytes that can't be decoded and try once more
+                encoded = encoded[:e.start] + encoded[e.end:]
+
+        # nothing remains to decode: stop here (the comparison of empty bytes with "" never ends such a loop)
+        return ""
+
+
+class OCRCorruptor(Corruptor):
+    """
+    Corrupt the text: draw it on images and recognize the images using OCR with another language.
+    """
+    def __init__(self) -> None:
+        self.image_reader = PdfImageReader(config=dict(n_jobs=1))
+        self.max_images = 10
+        self.page_size = (2480, 3508)
+        self.font_size = 50
+        self.line_gap = 50
+        self.horizontal_padding = 50
+        self.vertical_padding = 50
+
+        self.text_color = (0, 0, 0)
+        self.background_color = (255, 255, 255)
+
+        font_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources", "Arial_Narrow.ttf"))
+        self.font = ImageFont.truetype(font_path, self.font_size)
+
+    def corrupt(self, text: str, lang: str) -> str:
+        """
+        Draw the text on images and read it from the images using OCR with a wrong language.
+
+        :param text: correct text, only its beginning that fits into `max_images` pages is used
+        :param lang: language of the text: the text in "ru" is recognized as "en", the text in any other language - as "ru"
+        :return: recognized text
+        """
+        ocr_lang = "en" if lang == "ru" else "ru"
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # 1 - save images with text
+            images_path_list = self.__create_images(text, tmpdir)
+
+            # 2 - read text from the image using OCR with another language
+            lines = []
+            for image_path in images_path_list:
+                document = self.image_reader.read(image_path, document_type=None, parameters=dict(language=ocr_lang,
+                                                                                                   need_pdf_table_analysis="false",
+                                                                                                   document_orientation="no_change",
+                                                                                                   is_one_column_document="true"))
+                lines.extend(document.lines)
+
+        return "".join([line.line for line in lines])
+
+    def __create_images(self, text: str, out_dir: str) -> List[str]:
+        """
+        Draw the text word by word on pages and save the pages as images.
+
+        :param text: text to draw, its lines are separated by the line break
+        :param out_dir: path to the directory where to save the images
+        :return: paths to the saved images in the order of the pages, no more than `max_images` paths
+        """
+        image_paths = []
+        img, draw = self.__create_page()
+        has_words = False
+        x, y = self.horizontal_padding, self.vertical_padding
+
+        for line in text.split("\n"):
+            for word in line.split():
+                x_min, y_min, x_max, y_max = draw.textbbox((x, y), word, self.font)
+                if x_max + self.horizontal_padding >= self.page_size[0]:
+                    # the word is moved to the next line: its box is recomputed to check the bottom of the page for the new position
+                    x = self.horizontal_padding
+                    y += self.font_size + self.line_gap
+                    x_min, y_min, x_max, y_max = draw.textbbox((x, y), word, self.font)
+
+                if y_max + self.vertical_padding >= self.page_size[1]:
+                    image_paths.append(self.__save_page(img, out_dir, len(image_paths)))
+                    if len(image_paths) >= self.max_images:
+                        return image_paths
+
+                    img, draw = self.__create_page()
+                    x, y = self.horizontal_padding, self.vertical_padding
+
+                x_min, y_min, x_max, y_max = draw.textbbox((x, y), word, self.font)
+                draw.text((x, y), word, self.text_color, font=self.font)
+                has_words = True
+                x = x_max + self.font_size
+
+            y += self.font_size + self.line_gap
+            x = self.horizontal_padding
+
+        # the loop saves only the filled pages: the last page is saved here unless no word has been drawn at all
+        if has_words:
+            image_paths.append(self.__save_page(img, out_dir, len(image_paths)))
+        return image_paths
+
+    def __save_page(self, img: Image.Image, out_dir: str, page_number: int) -> str:
+        """Save the page image to the given directory, return the path to the saved file."""
+        img_path = os.path.join(out_dir, f"{page_number}.png")
+        img.save(img_path)
+        return img_path
+
+    def __create_page(self) -> Tuple[Image.Image, ImageDraw.ImageDraw]:
+        """Create an empty page filled with the background color, return the image and the object for drawing on it."""
+        page_size = self.page_size
+        img_arr = np.zeros((page_size[1], page_size[0], 3), dtype=np.uint8)
+        img_arr[:, :] = self.background_color
+        img = Image.fromarray(img_arr)
+        draw = ImageDraw.Draw(img)
+        return img, draw
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dataset_size", type=int, help="Number of images in each text group (correct/incorrect) for each language (ru, en)."
+                                                         " E.g. if dataset_size=1000, 4000 images will be generated overall", default=1000)
+    parser.add_argument("--start_number", type=int, help="Number from which to start images numbering", default=0)
+    parser.add_argument("--out_dir", help="Path to the directory to save the data", default="generated_data")
+    parser.add_argument("--correct_dir", help="Name of the directory with correct texts", default="correct")
+    parser.add_argument("--incorrect_dir", help="Name of the directory with incorrect texts", default="incorrect")
+    args = parser.parse_args()
+
+    text_generator = CorrectTextGenerator()
+    corruptor_list = [OCRCorruptor(), EncodingCorruptor()]
+
+    os.makedirs(os.path.join(args.out_dir, args.correct_dir), exist_ok=True)
+    os.makedirs(os.path.join(args.out_dir, args.incorrect_dir), exist_ok=True)
+
+    i = args.start_number
+    print("Generating incorrect texts")
+    for _ in tqdm(range(args.dataset_size)):
+        for language in ("ru", "en"):
+            text = ""
+
+            while not text:
+                try:
+                    text = text_generator.get_random_text(lang=language)
+                    corruptor = random.choice(corruptor_list)
+                    text = corruptor.corrupt(text, lang=language)
+                except Exception as e:
+                    print(e)
+                    text = ""
+
+            with open(os.path.join(args.out_dir, args.incorrect_dir, f"{i:08d}_{language}.txt"), "w", encoding="utf-8") as f:
+                f.write(text)
+            i += 1
+
+    i = args.start_number
+    print("Generating correct texts")
+    for _ in tqdm(range(args.dataset_size)):
+        for language in ("ru", "en"):
+
+            text = text_generator.get_random_text(lang=language)
+
+            with open(os.path.join(args.out_dir, args.correct_dir, f"{i:08d}_{language}.txt"), "w", encoding="utf-8") as f:
+                f.write(text)
+            i += 1
diff --git a/dedoc/scripts/train/train_catboost_detect_tl_correctness.py
b/dedoc/scripts/train/train_catboost_detect_tl_correctness.py deleted file mode 100644 index 9d9e1ff9..00000000 --- a/dedoc/scripts/train/train_catboost_detect_tl_correctness.py +++ /dev/null @@ -1,133 +0,0 @@ -import os -from pathlib import Path -from typing import List - -import pandas as pd -from catboost import CatBoostClassifier, Pool -from sklearn.metrics import f1_score -import gzip -import pickle - -from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier import TxtlayerClassifier - - -class GetTextAndTarget: - """ - The GetTextAndTarget class is used for loading and processing text data from correct and incorrect text files. - """ - def __init__(self, path_correct_texts: str, path_incorrect_texts: str) -> None: - self.path_correct_texts = self.make_path(Path(path_correct_texts)) - self.path_incorrect_texts = self.make_path(Path(path_incorrect_texts)) - self.path_all = self.path_correct_texts + self.path_incorrect_texts - - def make_path(self, path: Path) -> List[str]: - path_all = [] - if path.is_dir(): - for subdir in path.iterdir(): - for subsubdir in subdir.iterdir(): - path_all.append(str(subsubdir)) - else: - print("Empty dir ", path) - return path_all - - def __len__(self) -> int: - return len(self.path_all) - - def __getitem__(self, item: int) -> dict: - try: - with open(self.path_all[item], mode="r") as f: - text = f.read() - except Exception as e: - print(f'Bad file {str(e)}: ', self.path_all[item]) - - try: - if len(text.strip()) == 0: - raise Exception('Empty file') - except Exception as error: - print('Caught this error: ' + str(error)) - - label = 1 if self.path_all[item] in str(self.path_correct_texts) else 0 - - return {"text": text, "label": label} - - -class GetFeaturesFromText(TxtlayerClassifier): - """ - The GetFeaturesFromText class is used for extracting features from text data. 
- """ - def __init__(self, *, config: dict) -> None: - super().__init__(config=config) - - def __len__(self) -> int: - return len(self.symbols_list) - - def get_feature(self, correct_data_path: str, not_correct_data_path: str) -> dict: - """ - Generate features and labels for the given dataset. - :param correct_data_path: Path to the directory containing correct text files. - :param not_correct_data_path: Path to the directory containing incorrect text files. - :returns: a dictionary containing features and labels. - """ - dataset = GetTextAndTarget(path_correct_texts=correct_data_path, path_incorrect_texts=not_correct_data_path) - label = [] - features = [] - for data in dataset: - list_of_sub = [] - num_letters_in_data = self._count_letters(data["text"]) - num_other_symboll_in_data = self._count_other(data["text"]) - for symbol in self.letters_list: - if num_letters_in_data != 0: - list_of_sub.append(round(data["text"].count(symbol) / num_letters_in_data, 5)) - else: - list_of_sub.append(0.0) - for symbol in self.symbols_list: - list_of_sub.append(data["text"].count(symbol)) - list_of_sub.append(num_letters_in_data + num_other_symboll_in_data / len(data["text"]) if len(data["text"]) != 0 else 0) - features.append(list_of_sub) - label.append(data["label"]) - return {"features": features, "label": label} - - def get_need_dataframe(self, correct_data_path: str, not_correct_data_path: str, csv_name: str) -> pd.DataFrame: - """ - Create a DataFrame from the given dataset and save it as a CSV file. - :param correct_data_path: Path to the directory containing correct text files. - :param not_correct_data_path: Path to the directory containing incorrect text files. - :param csv_name: Name of the output CSV file. - :returns: The generated DataFrame. 
-        """
-        features = self.get_feature(correct_data_path=correct_data_path, not_correct_data_path=not_correct_data_path)
-        df = pd.DataFrame(features["features"])
-        df.to_csv(csv_name, sep='\t', index=False)
-        return df
-
-
-def train() -> None:
-    boost = GetFeaturesFromText(config={})
-    features_train = boost.get_feature(correct_data_path=os.getcwd() + "/data/correct/",
-                                       not_correct_data_path=os.getcwd() + "/data/not_correct/")
-    features_test = boost.get_feature(correct_data_path=os.getcwd() + "/data/correct_test/",
-                                      not_correct_data_path=os.getcwd() + "/data/not_correct_test/")
-    features_val = boost.get_feature(correct_data_path=os.getcwd() + "/data/correct_val/",
-                                     not_correct_data_path=os.getcwd() + "/data/not_correct_val/")
-
-    df_train = pd.DataFrame(features_train["features"])
-    df_test = pd.DataFrame(features_test["features"])
-    df_val = pd.DataFrame(features_val["features"])
-    df_train_label = features_train["label"]
-    df_test_label = features_test["label"]
-    df_val_label = features_val["label"]
-
-    booster = CatBoostClassifier(iterations=100, verbose=10, task_type="CPU", devices="0")
-
-    train_data = Pool(df_train, df_train_label)
-    test_data = Pool(df_test, df_test_label)
-    val_data = Pool(df_val, df_val_label)
-
-    booster.fit(train_data, eval_set=val_data, plot=True)
-
-    test_preds = booster.predict(test_data)
-
-    f1_score(df_test_label, test_preds)
-
-    with gzip.open('catboost_detect_tl_correctness.pkl.gz', 'wb') as file:
-        pickle.dump(booster, file)
diff --git a/dedoc/scripts/train/train_txtlayer_classifier.py b/dedoc/scripts/train/train_txtlayer_classifier.py
new file mode 100644
index 00000000..152c11c9
--- /dev/null
+++ b/dedoc/scripts/train/train_txtlayer_classifier.py
@@ -0,0 +1,125 @@
+import gzip
+import os
+import pickle
+import zipfile
+from pathlib import Path
+from typing import List, Tuple
+
+import wget
+import xgbfir
+from sklearn.metrics import f1_score
+from xgboost import XGBClassifier
+
+from dedoc.config import get_config
+from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_feature_extractor import TxtlayerFeatureExtractor
+
+
+class GetTextAndTarget:
+    """
+    The GetTextAndTarget class is used for loading and processing text data from correct and incorrect text files.
+    """
+    def __init__(self, path_correct_texts: str, path_incorrect_texts: str) -> None:
+        """
+        :param path_correct_texts: path to the directory whose subdirectories contain the files (*.txt) with correct texts
+        :param path_incorrect_texts: path to the directory whose subdirectories contain the files (*.txt) with incorrect texts
+        """
+        self.path_correct_texts = self.__make_path(Path(path_correct_texts))
+        self.path_incorrect_texts = self.__make_path(Path(path_incorrect_texts))
+        self.path_all = self.path_correct_texts + self.path_incorrect_texts
+
+    def __make_path(self, path: Path) -> List[str]:
+        """
+        Collect the paths to the *.txt files that lie in the subdirectories of the given directory.
+
+        :param path: directory to look through, only the files from its direct subdirectories are taken
+        :return: list of paths to the text files, the list is empty if the directory doesn't exist
+        """
+        if not path.is_dir():
+            return []
+
+        path_all = []
+        for subdir in path.iterdir():
+            if not subdir.is_dir():
+                continue
+            for file_path in subdir.iterdir():
+                if str(file_path).endswith(".txt"):
+                    path_all.append(str(file_path))
+
+        return path_all
+
+    def get_texts_and_targets(self) -> Tuple[List[str], List[int]]:
+        """
+        Read all text files, the files that can't be read and the empty files are reported and skipped.
+
+        :return: list of texts and list of their labels (1 - correct text, 0 - incorrect text), correct texts go first
+        """
+        texts, labels = [], []
+
+        # the label is defined by the list the path is taken from: the substring search in str(list) takes O(n) for each file
+        # and fails when the representation of the list escapes the path (e.g. the paths with backslashes)
+        for label, paths in ((1, self.path_correct_texts), (0, self.path_incorrect_texts)):
+            for path in paths:
+                try:
+                    with open(path, mode="r") as f:
+                        text = f.read()
+                except Exception as e:
+                    print(f'Bad file {str(e)}: {path}')
+                    continue
+
+                if len(text.strip()) == 0:
+                    print(f'Empty file: {path}')
+                    continue
+
+                texts.append(text)
+                labels.append(label)
+
+        return texts, labels
+
+
+if __name__ == "__main__":
+    data_dir = os.path.join(get_config()["intermediate_data_path"], "text_layer_correctness_data")
+    os.makedirs(data_dir, exist_ok=True)
+    txtlayer_classifier_dataset_dir = os.path.join(data_dir, "data")
+
+    if not os.path.isdir(txtlayer_classifier_dataset_dir):
+        path_out = os.path.join(data_dir, "data.zip")
+        wget.download("https://at.ispras.ru/owncloud/index.php/s/z9WLFiKKFo2WMgW/download", path_out)
+        with zipfile.ZipFile(path_out, 'r') as zip_ref:
+            zip_ref.extractall(data_dir)
+        os.remove(path_out)
+        print(f"Dataset downloaded to {txtlayer_classifier_dataset_dir}")
+    else:
+        print(f"Use cached dataset from {txtlayer_classifier_dataset_dir}")
+
+    assert os.path.isdir(txtlayer_classifier_dataset_dir)
+
+    features_extractor = TxtlayerFeatureExtractor()
+    stages_data = {}
+
+    for stage in ["train", "test", "val"]:
+        texts, labels = GetTextAndTarget(os.path.join(txtlayer_classifier_dataset_dir, f"correct_{stage}"),
+                                         os.path.join(txtlayer_classifier_dataset_dir, f"not_correct_{stage}")).get_texts_and_targets()
+        features = features_extractor.transform(texts)
+        stages_data[stage] = dict(features=features, labels=labels)
+
+    clf = XGBClassifier(random_state=42, learning_rate=0.5, n_estimators=600, booster="gbtree", tree_method="hist", max_depth=3)
+    clf.fit(
+        X=stages_data["train"]["features"],
+        y=stages_data["train"]["labels"],
+        eval_set=[(stages_data["val"]["features"], stages_data["val"]["labels"])],
+    )
+    test_preds = clf.predict(stages_data["test"]["features"])
+
+    score = f1_score(stages_data["test"]["labels"], test_preds)
+    print(f"F1 score = {score}")
+
+    resources_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "..", "..", "resources")
+    feature_importances_dir = os.path.join(resources_dir, "feature_importances")
+    # make sure that the directories for the model and for the report exist before writing to them
+    os.makedirs(feature_importances_dir, exist_ok=True)
+    with gzip.open(os.path.join(resources_dir, 'txtlayer_classifier.pkl.gz'), 'wb') as file:
+        pickle.dump(clf, file)
+
+    xgbfir.saveXgbFI(clf,
+                     feature_names=stages_data["train"]["features"].columns,
+                     OutputXlsxFile=os.path.join(feature_importances_dir, 'txtlayer_classifier_feature_importances.xlsx'))
diff --git a/dedoc/structure_constructors/abstract_structure_constructor.py b/dedoc/structure_constructors/abstract_structure_constructor.py
index 674c2db6..bda3e927 100644
--- a/dedoc/structure_constructors/abstract_structure_constructor.py
+++ b/dedoc/structure_constructors/abstract_structure_constructor.py
@@ -19,13 +19,12 @@ class AbstractStructureConstructor(ABC):
     """
 
     @abstractmethod
-    def structure_document(self, document: UnstructuredDocument, version: str, structure_type: Optional[str] = None) -> ParsedDocument:
+    def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None) -> ParsedDocument:
         """
         Process unstructured
document and build parsed document representation on this basis. :param document: intermediate representation of the document received from some structure extractor \ (there should be filled hierarchy levels for all lines) - :param version: current version of the dedoc library :param structure_type: type of the structure that should be retrieved for the document :return: the structured representation of the given document """ diff --git a/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py b/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py index 311a7682..c7160d1f 100644 --- a/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py +++ b/dedoc/structure_constructors/concrete_structure_constructors/linear_constructor.py @@ -14,7 +14,7 @@ class LinearConstructor(AbstractStructureConstructor): The result contains the empty root node with the consecutive list of all document lines as its children. """ - def structure_document(self, document: UnstructuredDocument, version: str, structure_type: Optional[str] = None) -> ParsedDocument: + def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None) -> ParsedDocument: """ Build the linear structure representation for the given document intermediate representation. To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`. 
@@ -26,4 +26,4 @@ def structure_document(self, document: UnstructuredDocument, version: str, struc tree.merge_annotations() document_content = DocumentContent(tables=document.tables, structure=tree) metadata = DocumentMetadata(**document.metadata) - return ParsedDocument(content=document_content, metadata=metadata, version=version) + return ParsedDocument(content=document_content, metadata=metadata, warnings=document.warnings) diff --git a/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py b/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py index b90cfe7b..04b63fc0 100644 --- a/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py +++ b/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py @@ -33,7 +33,7 @@ class TreeConstructor(AbstractStructureConstructor): - **second child line (1, 0)** """ - def structure_document(self, document: UnstructuredDocument, version: str, structure_type: Optional[str] = None) -> ParsedDocument: + def structure_document(self, document: UnstructuredDocument, structure_type: Optional[str] = None) -> ParsedDocument: """ Build the tree structure representation for the given document intermediate representation. To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`. 
@@ -58,7 +58,7 @@ def structure_document(self, document: UnstructuredDocument, version: str, struc tree.merge_annotations() document_content = DocumentContent(tables=document.tables, structure=tree) metadata = DocumentMetadata(**document.metadata) - return ParsedDocument(content=document_content, metadata=metadata, version=version) + return ParsedDocument(content=document_content, metadata=metadata, warnings=document.warnings) def __get_document_name(self, lines: List[LineWithMeta]) -> Tuple[List[LineWithMeta], List[LineWithMeta]]: document_name = [] diff --git a/dedoc/structure_constructors/structure_constructor_composition.py b/dedoc/structure_constructors/structure_constructor_composition.py index d9b768b0..899070f5 100644 --- a/dedoc/structure_constructors/structure_constructor_composition.py +++ b/dedoc/structure_constructors/structure_constructor_composition.py @@ -24,7 +24,6 @@ def __init__(self, constructors: Dict[str, AbstractStructureConstructor], defaul def structure_document(self, document: UnstructuredDocument, - version: str, structure_type: Optional[str] = None, parameters: Optional[dict] = None) -> ParsedDocument: """ @@ -38,9 +37,9 @@ def structure_document(self, document = self.table_patcher.insert_table(document=document) if structure_type in self.constructors: - return self.constructors[structure_type].structure_document(document, structure_type) + return self.constructors[structure_type].structure_document(document) if structure_type is None or structure_type == "": - return self.default_constructor.structure_document(document, version, structure_type) + return self.default_constructor.structure_document(document) raise StructureExtractorException(f"Bad structure type {structure_type}, available structure types is: {' '.join(self.constructors.keys())}") diff --git a/dedoc/utils/annotation_merger.py b/dedoc/utils/annotation_merger.py index f105edc4..c9bdee84 100644 --- a/dedoc/utils/annotation_merger.py +++ b/dedoc/utils/annotation_merger.py @@ 
-82,7 +82,7 @@ def _merge_one_group(self, annotations: List[Annotation], spaces: List[Space]) - """ Merge one group annotations, assume that all annotations has the same name and value """ - if len(annotations) <= 1: + if len(annotations) <= 1 or not annotations[0].is_mergeable: return annotations self.__check_annotations_group(annotations) result = [] diff --git a/dedoc/utils/utils.py b/dedoc/utils/utils.py index 282068b5..e9f4b9d5 100644 --- a/dedoc/utils/utils.py +++ b/dedoc/utils/utils.py @@ -2,24 +2,26 @@ import difflib import gzip import hashlib +import json import mimetypes import os import random import re +import shutil import time -import requests -import json from os.path import splitext from typing import List, Optional, TypeVar, Tuple, Iterable, Iterator, Dict, Any +import requests from Levenshtein._levenshtein import ratio -from dateutil.parser import parse from charset_normalizer import from_bytes +from dateutil.parser import parse +from fastapi import UploadFile from dedoc.data_structures.document_content import DocumentContent +from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata from dedoc.data_structures.tree_node import TreeNode -from dedoc.data_structures.hierarchy_level import HierarchyLevel T = TypeVar("T") @@ -91,6 +93,19 @@ def get_unique_name(filename: str) -> str: return str(ts) + '_' + str(rnd) + ext +def save_upload_file(upload_file: UploadFile, output_dir: str) -> str: + file_name = upload_file.filename.split("/")[-1] + file_name = check_filename_length(file_name) + file_path = os.path.join(output_dir, file_name) + try: + with open(file_path, "wb") as buffer: + shutil.copyfileobj(upload_file.file, buffer) + finally: + upload_file.file.close() + + return file_path + + def save_data_to_unique_file(directory: str, filename: str, binary_data: bytes) -> str: """ Saving binary data into a unique name by the filename diff --git a/dedoc/version.py b/dedoc/version.py new 
file mode 100644 index 00000000..7602829c --- /dev/null +++ b/dedoc/version.py @@ -0,0 +1 @@ +__version__ = "" diff --git a/docker/Dockerfile b/docker/Dockerfile index fd0989d9..43e4be14 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -9,10 +9,12 @@ RUN pip3 install -r requirements.txt RUN mkdir /dedoc_root ADD dedoc /dedoc_root/dedoc +ADD VERSION /dedoc_root + +RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/version.py RUN python3 /dedoc_root/dedoc/download_models.py ADD tests /dedoc_root/tests ADD resources /dedoc_root/resources -ADD VERSION /dedoc_root CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"] \ No newline at end of file diff --git a/docs/source/_static/code_examples/dedoc_usage_tutorial.py b/docs/source/_static/code_examples/dedoc_usage_tutorial.py index 24d5a644..b15190e8 100644 --- a/docs/source/_static/code_examples/dedoc_usage_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_usage_tutorial.py @@ -86,7 +86,7 @@ metadata_extractor = DocxMetadataExtractor() metadata_extractor.can_extract(document, file_dir, file_name, file_name, file_name) # True -document = metadata_extractor.add_metadata(document, file_dir, file_name, file_name, file_name, "") +document = metadata_extractor.add_metadata(document, file_dir, file_name, file_name, file_name) print(document.metadata) # {'file_name': 'example.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'size': 373795, 'access_time': 1686825619, 'created_time': 1686825617, 'modified_time': 1686823541, 'other_fields': {'document_subject': '', 'keywords': '', 'category': '', 'comments': '', 'author': '', 'last_modified_by': '', 'created_date': 1568725611, 'modified_date': 1686752726, 'last_printed_date': None}} @@ -112,7 +112,7 @@ from dedoc.structure_constructors import TreeConstructor constructor = TreeConstructor() -parsed_document = constructor.structure_document(document, "") +parsed_document 
= constructor.structure_document(document) print(parsed_document) # print(list(vars(parsed_document))) # ['metadata', 'content', 'attachments', 'version', 'warnings'] @@ -122,12 +122,10 @@ """Run the whole pipeline""" -from dedoc.manager.dedoc_manager import DedocManager -from dedoc.config import _config as config -from dedoc.configuration_manager import get_manager_config +from dedoc import DedocManager -manager = DedocManager.from_config("", get_manager_config(config=config), config=config) -result = manager.parse_file(file_path=file_path, parameters={}) +manager = DedocManager() +result = manager.parse(file_path=file_path, parameters={}) print(result) # print(result.to_dict()) # OrderedDict([('version', ''), ('warnings', []), ('content', OrderedDict([('structure', OrderedDict([('node_id', '0'), ('text', ''), ('annotations', []), ('metadata', OrderedDict([('page_id', 0), ('line_id', 0), ('paragraph_type', 'root'), ('other_fields', {})])), ... diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 8602e89b..380b97c6 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,19 @@ Changelog ========= +v0.10.0 (2023-08-01) +-------------------- +Release note: `v0.10.0 `_ + +* Add ConfidenceAnnotation annotation for PdfImageReader. +* Remove version parameter from metadata extractors, structure constructors and parsed document methods. +* Add version file and version resolving for the library. +* Add recursive handling of attachments. +* Add parameter for saving attachments in a custom directory. +* Remove dedoc threaded manager. +* Improve PdfAutoReader. +* Add temporary file name to DocumentMetadata. 
+ v0.9.2 (2023-07-18) ------------------- Release note: `v0.9.2 `_ diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index 435a9448..5fcf9363 100644 --- a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -140,6 +140,11 @@ Api parameters description The encoded contents will be saved in the attachment's metadata in the `base64_encode` field. Use `true` value to enable this behaviour. + * - attachments_dir + - optional string with a valid path + - None + - The path to the directory where document's attached files can be saved instead of a temporary directory. + * - :cspan:`3` **Tables handling** * - insert_table diff --git a/docs/source/getting_started/usage.rst b/docs/source/getting_started/usage.rst index f51af12f..b5f36022 100644 --- a/docs/source/getting_started/usage.rst +++ b/docs/source/getting_started/usage.rst @@ -287,11 +287,12 @@ The description of :ref:`API output JSON format ` also may be usefu Run the whole pipeline ---------------------- -For running the whole pipeline with all readers, metadata and structure extractors, structure constructors, one may use manager class. +For running the whole pipeline with all readers, metadata and structure extractors, structure constructors, +one may use manager class (see :ref:`dedoc_manager` for more details). .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python - :lines: 125-133 + :lines: 125-131 Manager allows to run workflow (see :ref:`dedoc_workflow`) for a file of any format supported by dedoc (see :ref:`table_formats`). -One can also make a custom `config` and `manager_config` for more flexible usage of the library. \ No newline at end of file +One can also make a custom `config` and `manager_config` (parameters of the manager constructor) for more flexible usage of the library. 
\ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index ce0a0e29..8978f173 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -24,7 +24,7 @@ Workflow -------- .. image:: _static/workflow.png - :align: center + :width: 700px The main workflow consists of the following stages: 1. **Converting** document to one of the supported formats. @@ -234,6 +234,7 @@ For a document of unknown or unsupported domain there is an option to use defaul :maxdepth: 1 :caption: Package Reference + modules/manager modules/data_structures modules/converters modules/readers diff --git a/docs/source/modules/data_structures.rst b/docs/source/modules/data_structures.rst index a3ca2ca5..39628cff 100644 --- a/docs/source/modules/data_structures.rst +++ b/docs/source/modules/data_structures.rst @@ -201,3 +201,9 @@ Concrete annotations :special-members: __init__ .. autoattribute:: name + +.. autoclass:: dedoc.data_structures.ConfidenceAnnotation + :show-inheritance: + :special-members: __init__ + + .. autoattribute:: name diff --git a/docs/source/modules/manager.rst b/docs/source/modules/manager.rst new file mode 100644 index 00000000..53a5cb26 --- /dev/null +++ b/docs/source/modules/manager.rst @@ -0,0 +1,12 @@ +.. _dedoc_manager: + +Dedoc pipeline +============== + +.. autoclass:: dedoc.DedocManager + :special-members: __init__ + :members: + +.. 
autoclass:: dedoc.attachments_handler.AttachmentsHandler + :special-members: __init__ + :members: diff --git a/requirements.txt b/requirements.txt index 5b44f4d3..2f4ace74 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,6 @@ Cython==0.29.28 flake8==3.9.2 pyflakes==2.3.0 beautifulsoup4==4.10.0 -catboost==1.1 charset-normalizer==2.0.12 docx==0.2.4 huggingface-hub==0.14.1 diff --git a/resources/benchmarks/benchmarks_tl_correctness.json b/resources/benchmarks/benchmarks_tl_correctness.json new file mode 100644 index 00000000..445b658d --- /dev/null +++ b/resources/benchmarks/benchmarks_tl_correctness.json @@ -0,0 +1,24 @@ +{ + "version": "0.9.2", + "guessing_the_correctness_of_the_text": { + "percentage_of_guessed_correct_tl": 0.9699570815450643, + "list_of_file_with_incorrect_tl": [ + "access-the-vision-for-2013.pdf", + "afcea-spy.pdf", + "b96a__usmc-combat-camera-directory.pdf", + "HBG-JMP-CIM_1616126784_120.pdf", + "demystifying-nge-rock-ridge_1643518222_537.pdf", + "nifog-2009_1616541890_542.pdf", + "hogans-federal-motion-for-a-preliminary-injunction_1616093696_24.pdf" + ], + "percentage_of_guessed_incorrect_tl": 0.75, + "list_of_file_with_correct_tl": [ + "slides.pdf", + "╨º╨£╨£╨ñ_╨É╨▒╨░╨║╤â╨╝╨╛╨▓_╤â╤ç╨╡╨▒╨╜╨╕╨║.pdf", + "PE20_1616439522_1.pdf", + "EXTERNAL FORMS - SUPPORTING DOCUMENTATION-ESHS9615401 2017_07_27 11_22_39_1616049888_455.pdf", + "cu-spy-holes_1616346633_620.pdf", + "PE157_1616278053_181.pdf" + ] + } +} \ No newline at end of file diff --git a/resources/feature_importances/txtlayer_classifier_feature_importances.xlsx b/resources/feature_importances/txtlayer_classifier_feature_importances.xlsx new file mode 100644 index 00000000..9c7739ff Binary files /dev/null and b/resources/feature_importances/txtlayer_classifier_feature_importances.xlsx differ diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..9d4a5d4a --- /dev/null +++ b/setup.py @@ -0,0 +1,13 @@ +import os + +from setuptools import setup + + +if __name__ == 
"__main__": + with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "VERSION"), "r") as f: + version = f.read() + + # Dynamically set the __version__ attribute + with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "dedoc", "version.py"), "w", encoding="utf-8") as f: + f.write(f'__version__ = "{version}"\n') + setup(name="dedoc", version=version) diff --git a/tests/api_tests/test_api_format_pdf.py b/tests/api_tests/test_api_format_pdf.py index 2beb4593..2e49beba 100644 --- a/tests/api_tests/test_api_format_pdf.py +++ b/tests/api_tests/test_api_format_pdf.py @@ -2,6 +2,7 @@ from tests.api_tests.abstract_api_test import AbstractTestApiDocReader from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation +from dedoc.data_structures.concrete_annotations.confidence_annotation import ConfidenceAnnotation from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation from dedoc.utils import supported_image_types @@ -15,8 +16,10 @@ def __check_example_file(self, result: dict) -> None: content = result["content"]["structure"]["subparagraphs"] self._check_similarity("Пример документа", content[0]["text"].strip().split("\n")[0]) annotations = content[0]["annotations"] - self.assertIn(BoldAnnotation.name, [annotation["name"] for annotation in annotations]) - self.assertIn(SpacingAnnotation.name, [annotation["name"] for annotation in annotations]) + annotation_names = {annotation["name"] for annotation in annotations} + self.assertIn(BoldAnnotation.name, annotation_names) + self.assertIn(SpacingAnnotation.name, annotation_names) + self.assertIn(ConfidenceAnnotation.name, annotation_names) self._check_similarity("1.2.1 Поясним за непонятное", content[3]["subparagraphs"][0]["text"]) def __check_metainfo(self, metainfo: dict, actual_type: str, actual_name: str) -> None: diff --git a/tests/api_tests/test_api_misc_with_attachments.py b/tests/api_tests/test_api_misc_with_attachments.py index 
437d044d..479e2bb2 100644 --- a/tests/api_tests/test_api_misc_with_attachments.py +++ b/tests/api_tests/test_api_misc_with_attachments.py @@ -179,3 +179,17 @@ def __check_base64(self, with_base64: bool) -> dict: # check that attachment on cloud and looks fine metadata = attachment["metadata"] return metadata + + def test_attachments_recursion(self) -> None: + file_name = "with_attachments/with_attachments_0.docx" + + result = self._send_request(file_name=file_name, data=dict(with_attachments=True, need_content_analysis=True, recursion_deep_attachments=0)) + self.assertEqual(0, len(result["attachments"])) + + result = self._send_request(file_name=file_name, data=dict(with_attachments=True, need_content_analysis=True, recursion_deep_attachments=1)) + self.assertLess(0, len(result["attachments"])) + self.assertEqual(0, len(result["attachments"][1]["attachments"])) + + result = self._send_request(file_name=file_name, data=dict(with_attachments=True, need_content_analysis=True, recursion_deep_attachments=2)) + self.assertLess(0, len(result["attachments"])) + self.assertLess(0, len(result["attachments"][1]["attachments"])) diff --git a/tests/unit_tests/test_doctype_law_txt_reader.py b/tests/unit_tests/test_doctype_law_txt_reader.py index db543516..391075b5 100644 --- a/tests/unit_tests/test_doctype_law_txt_reader.py +++ b/tests/unit_tests/test_doctype_law_txt_reader.py @@ -20,7 +20,7 @@ def test_law_document_spaces_correctness(self) -> None: path = self._get_abs_path("коап_москвы_8_7_2015_utf.txt") directory, filename = os.path.split(path) document = self.txt_reader.read(path=path, document_type="law", parameters={}) - document = self.metadata_extractor.add_metadata(document, directory, filename, filename, filename, "") + document = self.metadata_extractor.add_metadata(document, directory, filename, filename, filename) document = self.law_extractor.extract_structure(document, {}) self.assertListEqual([], document.attachments) diff --git 
a/tests/unit_tests/test_misc_dedoc_manager.py b/tests/unit_tests/test_misc_dedoc_manager.py index 345d4715..d63c8fe2 100644 --- a/tests/unit_tests/test_misc_dedoc_manager.py +++ b/tests/unit_tests/test_misc_dedoc_manager.py @@ -2,28 +2,25 @@ from unittest import TestCase from dedoc.config import get_config -from dedoc.configuration_manager import get_manager_config -from dedoc.manager.dedoc_manager import DedocManager +from dedoc.manager_config import get_manager_config +from dedoc.dedoc_manager import DedocManager class TestDedocManager(TestCase): path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "csvs")) config = get_config() manager_config = get_manager_config(config=config) - dedoc_manager = DedocManager.from_config(version="tests", manager_config=manager_config, config=config) + dedoc_manager = DedocManager(manager_config=manager_config, config=config) def test_parse_file(self) -> None: filename = "csv_tab.tsv" - result = self.dedoc_manager.parse_file(os.path.join(self.path, "csv_tab.tsv"), {}) + result = self.dedoc_manager.parse(os.path.join(self.path, "csv_tab.tsv")) self.assertEqual(filename, result.metadata.file_name) self.assertEqual(filename, result.metadata.file_name) self.assertLessEqual(["1", "2", "3"], result.content.tables[0].cells[0]) self.assertLessEqual(["2", "1", "5"], result.content.tables[0].cells[1]) self.assertLessEqual(["5", "3", "1"], result.content.tables[0].cells[2]) - def test_version(self) -> None: - self.assertEqual("tests", self.dedoc_manager.version) - def test_file_not_exists(self) -> None: with self.assertRaises(FileNotFoundError): - self.dedoc_manager.parse_file("afagahcr", {}) + self.dedoc_manager.parse("afagahcr") diff --git a/tests/unit_tests/test_misc_tasker.py b/tests/unit_tests/test_misc_tasker.py index e558c6ee..64825354 100644 --- a/tests/unit_tests/test_misc_tasker.py +++ b/tests/unit_tests/test_misc_tasker.py @@ -9,11 +9,12 @@ from dedoc.attachments_handler.attachments_handler import 
AttachmentsHandler from dedoc.converters.file_converter import FileConverterComposition -from dedoc.manager.dedoc_manager import DedocManager +from dedoc.dedoc_manager import DedocManager from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition from dedoc.readers.docx_reader.docx_reader import DocxReader from dedoc.readers.reader_composition import ReaderComposition +from dedoc.readers.txt_reader.raw_text_reader import RawTextReader from dedoc.structure_constructors.concrete_structure_constructors.tree_constructor import TreeConstructor from dedoc.structure_constructors.structure_constructor_composition import StructureConstructorComposition from dedoc.structure_extractors.concrete_structure_extractors.classifying_law_structure_extractor import ClassifyingLawStructureExtractor @@ -21,13 +22,12 @@ from dedoc.structure_extractors.concrete_structure_extractors.foiv_law_structure_extractor import FoivLawStructureExtractor from dedoc.structure_extractors.concrete_structure_extractors.law_structure_excractor import LawStructureExtractor from dedoc.structure_extractors.structure_extractor_composition import StructureExtractorComposition -from dedoc.train_dataset.train_dataset_utils import get_path_original_documents -from dedoc.readers.txt_reader.raw_text_reader import RawTextReader -from tests.test_utils import get_test_config from dedoc.train_dataset.taskers.concrete_taskers.line_label_tasker import LineLabelTasker from dedoc.train_dataset.taskers.images_creators.concrete_creators.docx_images_creator import DocxImagesCreator from dedoc.train_dataset.taskers.images_creators.concrete_creators.txt_images_creator import TxtImagesCreator from dedoc.train_dataset.taskers.tasker import Tasker +from dedoc.train_dataset.train_dataset_utils import get_path_original_documents +from tests.test_utils import get_test_config class 
TestTasker(unittest.TestCase): @@ -117,16 +117,14 @@ def test_images_creators(self) -> None: files_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "images_creator")) images_creators = [DocxImagesCreator(path2docs, config=get_test_config()), TxtImagesCreator(path2docs, config=get_test_config())] - test_manager = DedocManager.from_config(version="0", manager_config=self.__create_test_manager_config(config), config=config) + test_manager = DedocManager(manager_config=self.__create_test_manager_config(config), config=config) for doc in os.listdir(files_dir): if not doc.endswith(('docx', 'txt')): continue with tempfile.TemporaryDirectory() as tmp_dir: with zipfile.ZipFile(os.path.join(tmp_dir, "archive.zip"), "w") as archive: - _ = test_manager.parse_file(file_path=os.path.join(files_dir, doc), - parameters=dict(document_type="law"), - original_file_name=doc) + _ = test_manager.parse(file_path=os.path.join(files_dir, doc), parameters=dict(document_type="law")) lines_path = os.path.join(config["intermediate_data_path"], "lines.jsonlines") self.assertTrue(os.path.isfile(lines_path)) with open(lines_path, "r") as f: @@ -162,5 +160,5 @@ def __create_test_manager_config(self, config: dict) -> dict: structure_extractor=StructureExtractorComposition(extractors=structure_extractors, default_key="other"), structure_constructor=StructureConstructorComposition(default_constructor=TreeConstructor(), constructors={"tree": TreeConstructor()}), document_metadata_extractor=MetadataExtractorComposition(extractors=metadata_extractors), - attachments_extractor=AttachmentsHandler(config=config) + attachments_handler=AttachmentsHandler(config=config) ) diff --git a/tests/unit_tests/test_module_attachment_extractor.py b/tests/unit_tests/test_module_attachment_extractor.py index 58cb8073..bc43a0e7 100644 --- a/tests/unit_tests/test_module_attachment_extractor.py +++ b/tests/unit_tests/test_module_attachment_extractor.py @@ -6,6 +6,7 @@ from 
dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor +from dedoc.dedoc_manager import DedocManager from dedoc.readers import ArchiveReader from tests.test_utils import get_test_config @@ -107,3 +108,15 @@ def __get_list_of_files_in_archive(self, file_name: str) -> List[str]: document = ArchiveReader(config=config).read(path=file_path, parameters={"with_attachment": True}) files = [file.original_name for file in document.attachments] return files + + def test_attachments_dir(self) -> None: + file_name = "with_attachments_0.docx" + manager = DedocManager() + + with tempfile.TemporaryDirectory() as tmpdir: + result = manager.parse(file_path=os.path.join(self.src_dir, file_name), + parameters=dict(with_attachments=True, need_content_analysis=False, attachments_dir=tmpdir)) + + attachment_names = os.listdir(tmpdir) + for attachment in result.attachments: + self.assertIn(attachment.metadata.temporary_file_name, attachment_names)