Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update master #303

Merged
merged 5 commits into from
Aug 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/test_on_push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,16 @@ on:
branches:
- develop
- master
paths-ignore:
- 'VERSION'
- 'docs/source/changelog.rst'
push:
branches:
- develop
- master
paths-ignore:
- 'VERSION'
- 'docs/source/changelog.rst'
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
dedoc/version.py

### Python template
# Byte-compiled / optimized / DLL files
*__pycache__*
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.9.2
0.10.0
2 changes: 2 additions & 0 deletions dedoc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .dedoc_manager import DedocManager # noqa
from .version import __version__ # noqa
3 changes: 3 additions & 0 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class QueryParameters(BaseModel):
need_content_analysis: Optional[str]
recursion_deep_attachments: Optional[str]
return_base64: Optional[str]
attachments_dir: Optional[str]

insert_table: Optional[str]
need_pdf_table_analysis: Optional[str]
Expand Down Expand Up @@ -44,6 +45,7 @@ def __init__(self,
need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None),
recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None),
return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None),
attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None),

# tables handling
insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None),
Expand Down Expand Up @@ -79,6 +81,7 @@ def __init__(self,
self.need_content_analysis: str = need_content_analysis or 'false'
self.recursion_deep_attachments: str = recursion_deep_attachments or '10'
self.return_base64: str = return_base64 or 'false'
self.attachments_dir: str = attachments_dir

self.insert_table: str = insert_table or 'false'
self.need_pdf_table_analysis: str = need_pdf_table_analysis or 'true'
Expand Down
28 changes: 14 additions & 14 deletions dedoc/api/dedoc_api.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
import importlib
import os
import tempfile

import uvicorn
from fastapi import Response, FastAPI, Request, Depends, UploadFile, File
from fastapi.responses import UJSONResponse, ORJSONResponse
from fastapi.staticfiles import StaticFiles
from starlette.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse

import dedoc
from dedoc.api.api_args import QueryParameters
from dedoc.api.api_utils import json2html, json2tree, json2collapsed_tree
from dedoc.common.exceptions.dedoc_exception import DedocException
from dedoc.common.exceptions.missing_file_exception import MissingFileException
from dedoc.config import get_config
from dedoc.manager.dedoc_thread_manager import DedocThreadedManager
from dedoc.dedoc_manager import DedocManager
from dedoc.utils.utils import save_upload_file

config = get_config()
PORT = config["api_port"]
Expand All @@ -24,8 +27,7 @@

module_api_args = importlib.import_module(config['import_path_init_api_args'])
logger = config["logger"]
version_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "VERSION"))
manager = DedocThreadedManager.from_config(config=config, version=open(version_file_path).read().strip())
manager = DedocManager(config=config)


@app.get("/")
Expand All @@ -47,7 +49,7 @@ def get_static_file(request: Request) -> Response:

@app.get('/version')
def get_version() -> Response:
return PlainTextResponse(manager.version)
return PlainTextResponse(dedoc.__version__)


def _get_static_file_path(request: Request) -> str:
Expand All @@ -63,13 +65,14 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
parameters = query_params.dict(by_alias=True)

if not file or file.filename == "":
raise MissingFileException("Error: Missing content in request_post file parameter", version=manager.version)
raise MissingFileException("Error: Missing content in request_post file parameter", version=dedoc.__version__)
# check if the post request_post has the file part

logger.info("Get file {} with parameters {}".format(file.filename, parameters))
warnings = []
document_tree = manager.parse_file(file, parameters=dict(parameters))
document_tree.warnings.extend(warnings)
logger.info(f"Get file {file.filename} with parameters {parameters}")
with tempfile.TemporaryDirectory() as tmpdir:
file_path = save_upload_file(file, tmpdir)
document_tree = manager.parse(file_path, parameters=dict(parameters))

return_format = str(parameters.get("return_format", "json")).lower()
if return_format == "html":
html_content = json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0)
Expand All @@ -83,7 +86,7 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content, status_code=200)
else:
logger.info("Send result. File {} with parameters {}".format(file.filename, parameters))
logger.info(f"Send result. File {file.filename} with parameters {parameters}")
return ORJSONResponse(content=document_tree.to_dict(), status_code=200)


Expand All @@ -96,10 +99,7 @@ async def exception_handler(request: Request, exc: DedocException) -> Response:
result["dedoc_version"] = exc.version
if exc.metadata:
result["metadata"] = exc.metadata
return JSONResponse(
status_code=exc.code,
content=result,
)
return JSONResponse(status_code=exc.code, content=result)


def get_api() -> FastAPI:
Expand Down
5 changes: 2 additions & 3 deletions dedoc/api/train_dataset/api_collect_train_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from dedoc.api.train_dataset.api_args import TrainDatasetParameters
from dedoc.api.train_dataset.async_archive_handler import AsyncHandler
from dedoc.config import get_config
from dedoc.manager.dedoc_thread_manager import DedocThreadedManager
from dedoc.dedoc_manager import DedocManager
from dedoc.train_dataset.taskers.concrete_taskers.filtered_line_label_tasker import FilteredLineLabelTasker
from dedoc.train_dataset.taskers.concrete_taskers.header_footer_tasker import HeaderFooterTasker
from dedoc.train_dataset.taskers.concrete_taskers.line_label_tasker import LineLabelTasker
Expand All @@ -33,8 +33,7 @@
app.mount('/static', StaticFiles(directory=static_path), name="static")
templates = Jinja2Templates(directory=os.path.join(static_path, "train_dataset"))

version_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "VERSION"))
manager = DedocThreadedManager.from_config(config=config, version=open(version_file_path).read().strip())
manager = DedocManager(config=config)


project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
Expand Down
15 changes: 4 additions & 11 deletions dedoc/api/train_dataset/async_archive_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,13 @@
from fastapi import UploadFile

from dedoc.common.exceptions.bad_file_exception import BadFileFormatException
from dedoc.manager.dedoc_thread_manager import DedocThreadedManager
from dedoc.dedoc_manager import DedocManager
from dedoc.train_dataset.taskers.tasker import Tasker


class _ArchiveHandler(Thread):

def __init__(self,
queue: Queue,
results: dict,
progress: dict,
tasker: Tasker,
manager: DedocThreadedManager,
*,
config: dict) -> None:
def __init__(self, queue: Queue, results: dict, progress: dict, tasker: Tasker, manager: DedocManager, *, config: dict) -> None:
Thread.__init__(self)
self.progress = progress
self.config = config
Expand Down Expand Up @@ -77,15 +70,15 @@ def __handle_one_file(self, archive: zipfile.ZipFile, file: str, parameters: dic
if not path_out.endswith("/"):
with open(path_out, "wb") as file_out:
file_out.write(item.read())
self.manager.parse_existing_file(path=path_out, parameters=parameters)
self.manager.parse(file_path=path_out, parameters=parameters)
except BadFileFormatException as e:
self.logger.warning("Can't handle file {}, exception {}".format(file, str(e)))
self.logger.info("Finish handle {}".format(file))


class AsyncHandler:

def __init__(self, tasker: Tasker, manager: DedocThreadedManager, *, config: dict) -> None:
def __init__(self, tasker: Tasker, manager: DedocManager, *, config: dict) -> None:
super().__init__()
self.queue = Queue()
self.__results = {}
Expand Down
3 changes: 3 additions & 0 deletions dedoc/attachments_handler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .attachments_handler import AttachmentsHandler

__all__ = ["AttachmentsHandler"]
96 changes: 93 additions & 3 deletions dedoc/attachments_handler/attachments_handler.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,106 @@
import copy
import logging
import os
import shutil
import tempfile
import time
from typing import List

from dedoc.attachments_extractors import AbstractAttachmentsExtractor
from dedoc.common.exceptions.dedoc_exception import DedocException
from dedoc.data_structures import ParsedDocument, DocumentMetadata, AttachedFile
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.utils.utils import get_empty_content


class AttachmentsHandler:
    """
    This class is used for handling attached files:

    - they may be stored in the custom directory (use `attachments_dir` key in the parameters to set output directory path);
    - they may be ignored (if the option `with_attachments=false` in parameters);
    - the metadata of the attachments may be added without files parsing (if `with_attachments=true, need_content_analysis=false` in parameters)
    - they may be parsed (if `with_attachments=true, need_content_analysis=true` in parameters), \
    the parsing recursion may be set via `recursion_deep_attachments` parameter.
    """

    def __init__(self, *, config: dict) -> None:
        """
        :param config: configuration of the handler, e.g. logger for logging
        """
        self.config = config
        # fall back to the root logger when the configuration does not provide one
        self.logger = self.config.get("logger", logging.getLogger())

    def handle_attachments(self, document_parser: "DedocManager", document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]:  # noqa
        """
        Handle attachments of the document in the intermediate representation.

        Attachments without an original file name are skipped.
        If parsing of an attachment raises DedocException, a warning is logged and a document with empty content
        and filled metadata is returned for this attachment instead.
        The uid of each attachment is assigned to the metadata of the corresponding parsed document.

        :param document_parser: class with `parse` method for parsing attachments if needed;
        :param document: intermediate representation of the document whose attachments need to be handled;
        :param parameters: parameters for attachments handling (with_attachments, need_content_analysis, recursion_deep_attachments, attachments_dir \
        are important, look to the API parameters documentation for more details).
        :return: list of parsed document attachments
        """
        parsed_attachment_files = []
        # each nesting level consumes one unit of the allowed recursion depth
        recursion_deep_attachments = int(parameters.get("recursion_deep_attachments", 10)) - 1

        if not AbstractAttachmentsExtractor.with_attachments(parameters) or recursion_deep_attachments < 0:
            return parsed_attachment_files

        self._handle_attachments(document=document, parameters=parameters)

        previous_log_time = time.time()

        for i, attachment in enumerate(document.attachments):
            current_time = time.time()
            if current_time - previous_log_time > 3:
                previous_log_time = current_time  # not log too often
                self.logger.info(f"Handle attachment {i} of {len(document.attachments)}")

            if not attachment.get_original_filename():  # TODO check for docx https://jira.ispras.ru/browse/TLDR-185
                continue

            # the attachment gets its own copy of the parameters, so the caller's dictionary is not modified
            parameters_copy = copy.deepcopy(parameters)
            parameters_copy["is_attached"] = True
            parameters_copy["attachment"] = attachment
            parameters_copy["recursion_deep_attachments"] = str(recursion_deep_attachments)

            try:
                if attachment.need_content_analysis:
                    # parse a copy named as the original file, the copy is removed together with the temporary directory
                    with tempfile.TemporaryDirectory() as tmpdir:
                        attachment_path = os.path.join(tmpdir, attachment.get_original_filename())
                        shutil.copy(attachment.get_filename_in_path(), attachment_path)
                        parsed_file = document_parser.parse(attachment_path, parameters=parameters_copy)
                else:
                    parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)
            except DedocException as error:
                # do not lose the reason of the failure: report it and return empty ParsedDocument with Meta information
                self.logger.warning(f"Can't parse attachment {attachment.get_original_filename()}: {error}. Only its metadata is returned")
                parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)

            parsed_file.metadata.set_uid(attachment.uid)
            parsed_attachment_files.append(parsed_file)
        return parsed_attachment_files

    def _handle_attachments(self, document: UnstructuredDocument, parameters: dict) -> None:
        """
        Handle attached files, for example save it on disk or S3 storage.
        This method can be redefined by other AttachmentHandler class.

        If `attachments_dir` is set in the parameters, the directory is created when it is missing,
        each attachment file is moved there and `tmp_file_path` of the attachment is updated to the new location.
        Nothing is done when `attachments_dir` is absent or empty.

        :param document: document whose attachments should be stored
        :param parameters: parameters with the optional `attachments_dir` key
        """
        attachments_dir = parameters.get("attachments_dir")
        if not attachments_dir:
            return

        # moving files into a directory that does not exist fails, so make sure it is present
        os.makedirs(attachments_dir, exist_ok=True)

        for attachment in document.attachments:
            current_path = attachment.get_filename_in_path()
            new_path = os.path.join(attachments_dir, os.path.split(current_path)[1])
            shutil.move(current_path, new_path)
            attachment.tmp_file_path = new_path

    def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument:  # noqa
        """
        Make a parsed document with empty content for the attachment, only the metadata is filled
        (using the metadata extractor of the given document parser).

        :param document_parser: object whose `document_metadata_extractor` is used for adding the metadata
        :param attachment: attached file for which the document is made
        :param parameters: parameters passed to the metadata extractor
        :return: document with empty content and the metadata of the attachment
        """
        unstructured_document = UnstructuredDocument(lines=[], tables=[], attachments=[])
        attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path())
        unstructured_document = document_parser.document_metadata_extractor.add_metadata(document=unstructured_document, directory=attachment_dir,
                                                                                         filename=attachment_name, converted_filename=attachment_name,
                                                                                         original_filename=attachment.get_original_filename(),
                                                                                         parameters=parameters)
        metadata = DocumentMetadata(**unstructured_document.metadata)
        return ParsedDocument(content=get_empty_content(), metadata=metadata)
4 changes: 3 additions & 1 deletion dedoc/common/exceptions/dedoc_exception.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from typing import Optional

import dedoc


class DedocException(Exception):
def __init__(self,
Expand All @@ -12,7 +14,7 @@ def __init__(self,
self.msg = msg
self.msg_api = msg if msg_api is None else msg_api
self.filename = filename
self.version = version
self.version = version if version is not None else dedoc.__version__
self.metadata = metadata

def __str__(self) -> str:
Expand Down
35 changes: 0 additions & 35 deletions dedoc/configuration_manager.py

This file was deleted.

2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class PDFConverter(AbstractConverter):
"""
def __init__(self, *, config: dict) -> None:
super().__init__(config=config)
self.timeout = 20
self.timeout = 60

def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
"""
Expand Down
4 changes: 3 additions & 1 deletion dedoc/data_structures/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class Annotation(Serializable):
Look to the concrete kind of annotations to get more examples.
"""

def __init__(self, start: int, end: int, name: str, value: str) -> None:
def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bool = True) -> None:
"""
Some kind of text information about symbols between start and end.
For example Annotation(1, 13, "italic", "True") says that text between 1st and 13th symbol was written in italic.
Expand All @@ -21,11 +21,13 @@ def __init__(self, start: int, end: int, name: str, value: str) -> None:
:param end: end of the annotated text (end isn't included)
:param name: annotation's name
:param value: information about annotated text
:param is_mergeable: is it possible to merge annotations with the same value
"""
self.start = start
self.end = end
self.name = name
self.value = value
self.is_mergeable = is_mergeable

def __eq__(self, o: object) -> bool:
if not isinstance(o, Annotation):
Expand Down
Loading
Loading