Skip to content

Commit

Permalink
TLDR-369 class for full dedoc pipeline running (#300)
Browse files Browse the repository at this point in the history
* DedocPipeline added (work in progress)

* TLDR-369_dedoc_manager

* TLDR-369 fix documentation and add test for attachments recursion

* TLDR-369 change version saving

* TLDR-369 review fixes

* TLDR-369 added temporary file name
  • Loading branch information
NastyBoget authored Aug 1, 2023
1 parent 28fd511 commit 31fb470
Show file tree
Hide file tree
Showing 44 changed files with 432 additions and 535 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
dedoc/version.py

### Python template
# Byte-compiled / optimized / DLL files
*__pycache__*
Expand Down
2 changes: 2 additions & 0 deletions dedoc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .dedoc_manager import DedocManager # noqa
from .version import __version__ # noqa
3 changes: 3 additions & 0 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class QueryParameters(BaseModel):
need_content_analysis: Optional[str]
recursion_deep_attachments: Optional[str]
return_base64: Optional[str]
attachments_dir: Optional[str]

insert_table: Optional[str]
need_pdf_table_analysis: Optional[str]
Expand Down Expand Up @@ -44,6 +45,7 @@ def __init__(self,
need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None),
recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None),
return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None),
attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None),

# tables handling
insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None),
Expand Down Expand Up @@ -79,6 +81,7 @@ def __init__(self,
self.need_content_analysis: str = need_content_analysis or 'false'
self.recursion_deep_attachments: str = recursion_deep_attachments or '10'
self.return_base64: str = return_base64 or 'false'
self.attachments_dir: str = attachments_dir

self.insert_table: str = insert_table or 'false'
self.need_pdf_table_analysis: str = need_pdf_table_analysis or 'true'
Expand Down
28 changes: 14 additions & 14 deletions dedoc/api/dedoc_api.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
import importlib
import os
import tempfile

import uvicorn
from fastapi import Response, FastAPI, Request, Depends, UploadFile, File
from fastapi.responses import UJSONResponse, ORJSONResponse
from fastapi.staticfiles import StaticFiles
from starlette.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse

import dedoc
from dedoc.api.api_args import QueryParameters
from dedoc.api.api_utils import json2html, json2tree, json2collapsed_tree
from dedoc.common.exceptions.dedoc_exception import DedocException
from dedoc.common.exceptions.missing_file_exception import MissingFileException
from dedoc.config import get_config
from dedoc.manager.dedoc_thread_manager import DedocThreadedManager
from dedoc.dedoc_manager import DedocManager
from dedoc.utils.utils import save_upload_file

config = get_config()
PORT = config["api_port"]
Expand All @@ -24,8 +27,7 @@

module_api_args = importlib.import_module(config['import_path_init_api_args'])
logger = config["logger"]
version_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "VERSION"))
manager = DedocThreadedManager.from_config(config=config, version=open(version_file_path).read().strip())
manager = DedocManager(config=config)


@app.get("/")
Expand All @@ -47,7 +49,7 @@ def get_static_file(request: Request) -> Response:

@app.get('/version')
def get_version() -> Response:
return PlainTextResponse(manager.version)
return PlainTextResponse(dedoc.__version__)


def _get_static_file_path(request: Request) -> str:
Expand All @@ -63,13 +65,14 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
parameters = query_params.dict(by_alias=True)

if not file or file.filename == "":
raise MissingFileException("Error: Missing content in request_post file parameter", version=manager.version)
raise MissingFileException("Error: Missing content in request_post file parameter", version=dedoc.__version__)
# check if the post request_post has the file part

logger.info("Get file {} with parameters {}".format(file.filename, parameters))
warnings = []
document_tree = manager.parse_file(file, parameters=dict(parameters))
document_tree.warnings.extend(warnings)
logger.info(f"Get file {file.filename} with parameters {parameters}")
with tempfile.TemporaryDirectory() as tmpdir:
file_path = save_upload_file(file, tmpdir)
document_tree = manager.parse(file_path, parameters=dict(parameters))

return_format = str(parameters.get("return_format", "json")).lower()
if return_format == "html":
html_content = json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0)
Expand All @@ -83,7 +86,7 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content, status_code=200)
else:
logger.info("Send result. File {} with parameters {}".format(file.filename, parameters))
logger.info(f"Send result. File {file.filename} with parameters {parameters}")
return ORJSONResponse(content=document_tree.to_dict(), status_code=200)


Expand All @@ -96,10 +99,7 @@ async def exception_handler(request: Request, exc: DedocException) -> Response:
result["dedoc_version"] = exc.version
if exc.metadata:
result["metadata"] = exc.metadata
return JSONResponse(
status_code=exc.code,
content=result,
)
return JSONResponse(status_code=exc.code, content=result)


def get_api() -> FastAPI:
Expand Down
5 changes: 2 additions & 3 deletions dedoc/api/train_dataset/api_collect_train_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from dedoc.api.train_dataset.api_args import TrainDatasetParameters
from dedoc.api.train_dataset.async_archive_handler import AsyncHandler
from dedoc.config import get_config
from dedoc.manager.dedoc_thread_manager import DedocThreadedManager
from dedoc.dedoc_manager import DedocManager
from dedoc.train_dataset.taskers.concrete_taskers.filtered_line_label_tasker import FilteredLineLabelTasker
from dedoc.train_dataset.taskers.concrete_taskers.header_footer_tasker import HeaderFooterTasker
from dedoc.train_dataset.taskers.concrete_taskers.line_label_tasker import LineLabelTasker
Expand All @@ -33,8 +33,7 @@
app.mount('/static', StaticFiles(directory=static_path), name="static")
templates = Jinja2Templates(directory=os.path.join(static_path, "train_dataset"))

version_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "VERSION"))
manager = DedocThreadedManager.from_config(config=config, version=open(version_file_path).read().strip())
manager = DedocManager(config=config)


project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
Expand Down
15 changes: 4 additions & 11 deletions dedoc/api/train_dataset/async_archive_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,13 @@
from fastapi import UploadFile

from dedoc.common.exceptions.bad_file_exception import BadFileFormatException
from dedoc.manager.dedoc_thread_manager import DedocThreadedManager
from dedoc.dedoc_manager import DedocManager
from dedoc.train_dataset.taskers.tasker import Tasker


class _ArchiveHandler(Thread):

def __init__(self,
queue: Queue,
results: dict,
progress: dict,
tasker: Tasker,
manager: DedocThreadedManager,
*,
config: dict) -> None:
def __init__(self, queue: Queue, results: dict, progress: dict, tasker: Tasker, manager: DedocManager, *, config: dict) -> None:
Thread.__init__(self)
self.progress = progress
self.config = config
Expand Down Expand Up @@ -77,15 +70,15 @@ def __handle_one_file(self, archive: zipfile.ZipFile, file: str, parameters: dic
if not path_out.endswith("/"):
with open(path_out, "wb") as file_out:
file_out.write(item.read())
self.manager.parse_existing_file(path=path_out, parameters=parameters)
self.manager.parse(file_path=path_out, parameters=parameters)
except BadFileFormatException as e:
self.logger.warning("Can't handle file {}, exception {}".format(file, str(e)))
self.logger.info("Finish handle {}".format(file))


class AsyncHandler:

def __init__(self, tasker: Tasker, manager: DedocThreadedManager, *, config: dict) -> None:
def __init__(self, tasker: Tasker, manager: DedocManager, *, config: dict) -> None:
super().__init__()
self.queue = Queue()
self.__results = {}
Expand Down
3 changes: 3 additions & 0 deletions dedoc/attachments_handler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .attachments_handler import AttachmentsHandler

__all__ = ["AttachmentsHandler"]
96 changes: 93 additions & 3 deletions dedoc/attachments_handler/attachments_handler.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,106 @@
import copy
import logging
import os
import shutil
import tempfile
import time
from typing import List

from dedoc.attachments_extractors import AbstractAttachmentsExtractor
from dedoc.common.exceptions.dedoc_exception import DedocException
from dedoc.data_structures import ParsedDocument, DocumentMetadata, AttachedFile
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.utils.utils import get_empty_content


class AttachmentsHandler:
    """
    This class is used for handling attached files:

    - they may be stored in the custom directory (use `attachments_dir` key in the parameters to set output directory path);
    - they may be ignored (if the option `with_attachments=false` in parameters);
    - the metadata of the attachments may be added without files parsing (if `with_attachments=true, need_content_analysis=false` in parameters);
    - they may be parsed (if `with_attachments=true, need_content_analysis=true` in parameters),
      the parsing recursion may be set via `recursion_deep_attachments` parameter.
    """

    def __init__(self, *, config: dict) -> None:
        """
        :param config: configuration of the handler, e.g. logger for logging
        """
        self.config = config
        # fall back to the root logger when the config does not provide one
        self.logger = self.config.get("logger", logging.getLogger())

    def handle_attachments(self, document_parser: "DedocManager", document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]:  # noqa
        """
        Handle attachments of the document in the intermediate representation.

        :param document_parser: class with `parse` method for parsing attachments if needed;
        :param document: intermediate representation of the document whose attachments need to be handled;
        :param parameters: parameters for attachments handling (with_attachments, need_content_analysis, recursion_deep_attachments,
            attachments_dir are important, look to the API parameters documentation for more details).
        :return: list of parsed document attachments
        """
        parsed_attachment_files = []
        # every nesting level consumes one unit of the allowed recursion depth
        recursion_deep_attachments = int(parameters.get("recursion_deep_attachments", 10)) - 1

        if not AbstractAttachmentsExtractor.with_attachments(parameters) or recursion_deep_attachments < 0:
            return parsed_attachment_files

        self._handle_attachments(document=document, parameters=parameters)

        previous_log_time = time.time()

        for i, attachment in enumerate(document.attachments):
            current_time = time.time()
            if current_time - previous_log_time > 3:
                previous_log_time = current_time  # not log too often
                self.logger.info(f"Handle attachment {i} of {len(document.attachments)}")

            original_filename = attachment.get_original_filename()
            if not original_filename:  # TODO check for docx https://jira.ispras.ru/browse/TLDR-185
                continue

            # the nested parse call gets its own parameters, so the caller's dict is never modified
            parameters_copy = copy.deepcopy(parameters)
            parameters_copy["is_attached"] = True
            parameters_copy["attachment"] = attachment
            parameters_copy["recursion_deep_attachments"] = str(recursion_deep_attachments)

            try:
                if attachment.need_content_analysis:
                    with tempfile.TemporaryDirectory() as tmpdir:
                        # The original name comes from the parsed document and is not trusted:
                        # keep only its last path component so that the copy cannot leave tmpdir
                        # (absolute names or names with directories would otherwise escape it or fail to copy).
                        safe_name = os.path.basename(original_filename)
                        if safe_name in ("", ".", ".."):
                            safe_name = os.path.basename(attachment.get_filename_in_path())
                        attachment_path = os.path.join(tmpdir, safe_name)
                        shutil.copy(attachment.get_filename_in_path(), attachment_path)
                        parsed_file = document_parser.parse(attachment_path, parameters=parameters_copy)
                else:
                    parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)
            except DedocException as error:
                # parsing failure of one attachment must not break the whole document:
                # report it and return empty ParsedDocument with Meta information
                self.logger.warning(f"Can't parse attachment {original_filename}: {error}")
                parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)

            parsed_file.metadata.set_uid(attachment.uid)
            parsed_attachment_files.append(parsed_file)
        return parsed_attachment_files

    def _handle_attachments(self, document: UnstructuredDocument, parameters: dict) -> None:
        """
        Handle attached files, for example save it on disk or S3 storage.
        This method can be redefined by other AttachmentHandler class.

        If `attachments_dir` is set in the parameters, each attachment file is moved there
        (the directory is created when it is missing) and `tmp_file_path` of the attachment is updated.
        Nothing is done when `attachments_dir` is missing or empty.
        """
        attachments_dir = parameters.get("attachments_dir")
        if not attachments_dir:
            return

        # shutil.move does not create missing destination directories
        os.makedirs(attachments_dir, exist_ok=True)

        for attachment in document.attachments:
            current_path = attachment.get_filename_in_path()
            new_path = os.path.join(attachments_dir, os.path.split(current_path)[1])
            shutil.move(current_path, new_path)
            attachment.tmp_file_path = new_path

    def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument:  # noqa
        """
        Build a document with empty content for the attachment: only its metadata is filled
        (using the metadata extractor of the given document parser).
        """
        unstructured_document = UnstructuredDocument(lines=[], tables=[], attachments=[])
        attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path())
        unstructured_document = document_parser.document_metadata_extractor.add_metadata(document=unstructured_document, directory=attachment_dir,
                                                                                         filename=attachment_name, converted_filename=attachment_name,
                                                                                         original_filename=attachment.get_original_filename(),
                                                                                         parameters=parameters)
        metadata = DocumentMetadata(**unstructured_document.metadata)
        return ParsedDocument(content=get_empty_content(), metadata=metadata)
4 changes: 3 additions & 1 deletion dedoc/common/exceptions/dedoc_exception.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from typing import Optional

import dedoc


class DedocException(Exception):
def __init__(self,
Expand All @@ -12,7 +14,7 @@ def __init__(self,
self.msg = msg
self.msg_api = msg if msg_api is None else msg_api
self.filename = filename
self.version = version
self.version = version if version is not None else dedoc.__version__
self.metadata = metadata

def __str__(self) -> str:
Expand Down
35 changes: 0 additions & 35 deletions dedoc/configuration_manager.py

This file was deleted.

5 changes: 5 additions & 0 deletions dedoc/data_structures/document_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class DocumentMetadata(Serializable):

def __init__(self,
file_name: str,
temporary_file_name: str,
size: int,
modified_time: int,
created_time: int,
Expand All @@ -24,6 +25,7 @@ def __init__(self,
"""
:param uid: document unique identifier (useful for attached files)
:param file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on)
:param temporary_file_name: file name during parsing (unique name after rename and conversion);
:param size: size of the original file in bytes
:param modified_time: time of the last modification in unix time format (seconds since the epoch)
:param created_time: time of the creation in unixtime
Expand All @@ -32,6 +34,7 @@ def __init__(self,
:param other_fields: additional fields of user metadata
"""
self.file_name = file_name
self.temporary_file_name = temporary_file_name
self.size = size
self.modified_time = modified_time
self.created_time = created_time
Expand Down Expand Up @@ -62,6 +65,7 @@ def to_dict(self) -> dict:
res = OrderedDict()
res["uid"] = self.uid
res["file_name"] = self.file_name
res["temporary_file_name"] = self.temporary_file_name
res["size"] = self.size
res["modified_time"] = self.modified_time
res["created_time"] = self.created_time
Expand All @@ -78,6 +82,7 @@ def get_api_dict(api: Api) -> Model:
return api.model('DocumentMetadata', {
"uid": fields.String(description='unique document identifier', example="doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0"),
'file_name': fields.String(description='file name', example="example.odt"),
'temporary_file_name': fields.String(description='file name', example="123.odt"),
'size': fields.Integer(description='file size in bytes', example="20060"),
'modified_time': fields.Integer(description='modification time of the document in the format UnixTime', example="1590579805"),
'created_time': fields.Integer(description='creation time of the document in the format UnixTime', example="1590579805"),
Expand Down
Loading

0 comments on commit 31fb470

Please sign in to comment.