Skip to content

Commit

Permalink
update master (#303)
Browse files Browse the repository at this point in the history
* TLDR-386 pdf auto reader bug (#298)

* TLDR-386 Added features importances

* TLDR-386 added script for txtlayer dataset generation

* TLDR-386 move all data to the cloud

* Review fixes

* exclude version and changelog files (#299)

* TLDR-419 add confidence annotation (#301)

* add new annotation

* add confidence extracting

* add test for confidence annotation

* add confidence annotation to documentation

* fix flake

* add mergeable field for annotation

* review fixes

* TLDR-369 class for full dedoc pipeline running (#300)

* DedocPipeline added (work in progress)

* TLDR-369_dedoc_manager

* TLDR-369 fix documentation and add test for attachments recursion

* TLDR-369 change version saving

* TLDR-369 review fixes

* TLDR-369 added temporary file name

* new version 0.10.0 (#302)

---------

Co-authored-by: Bogatenkova Anastasiya <[email protected]>
  • Loading branch information
dronperminov and NastyBoget authored Aug 1, 2023
1 parent 0b19ba4 commit bfa506a
Show file tree
Hide file tree
Showing 67 changed files with 1,001 additions and 758 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/test_on_push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,16 @@ on:
branches:
- develop
- master
paths-ignore:
- 'VERSION'
- 'docs/source/changelog.rst'
push:
branches:
- develop
- master
paths-ignore:
- 'VERSION'
- 'docs/source/changelog.rst'
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
dedoc/version.py

### Python template
# Byte-compiled / optimized / DLL files
*__pycache__*
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.9.2
0.10.0
2 changes: 2 additions & 0 deletions dedoc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .dedoc_manager import DedocManager # noqa
from .version import __version__ # noqa
3 changes: 3 additions & 0 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class QueryParameters(BaseModel):
need_content_analysis: Optional[str]
recursion_deep_attachments: Optional[str]
return_base64: Optional[str]
attachments_dir: Optional[str]

insert_table: Optional[str]
need_pdf_table_analysis: Optional[str]
Expand Down Expand Up @@ -44,6 +45,7 @@ def __init__(self,
need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None),
recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None),
return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None),
attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None),

# tables handling
insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None),
Expand Down Expand Up @@ -79,6 +81,7 @@ def __init__(self,
self.need_content_analysis: str = need_content_analysis or 'false'
self.recursion_deep_attachments: str = recursion_deep_attachments or '10'
self.return_base64: str = return_base64 or 'false'
self.attachments_dir: str = attachments_dir

self.insert_table: str = insert_table or 'false'
self.need_pdf_table_analysis: str = need_pdf_table_analysis or 'true'
Expand Down
28 changes: 14 additions & 14 deletions dedoc/api/dedoc_api.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
import importlib
import os
import tempfile

import uvicorn
from fastapi import Response, FastAPI, Request, Depends, UploadFile, File
from fastapi.responses import UJSONResponse, ORJSONResponse
from fastapi.staticfiles import StaticFiles
from starlette.responses import FileResponse, HTMLResponse, JSONResponse, PlainTextResponse

import dedoc
from dedoc.api.api_args import QueryParameters
from dedoc.api.api_utils import json2html, json2tree, json2collapsed_tree
from dedoc.common.exceptions.dedoc_exception import DedocException
from dedoc.common.exceptions.missing_file_exception import MissingFileException
from dedoc.config import get_config
from dedoc.manager.dedoc_thread_manager import DedocThreadedManager
from dedoc.dedoc_manager import DedocManager
from dedoc.utils.utils import save_upload_file

config = get_config()
PORT = config["api_port"]
Expand All @@ -24,8 +27,7 @@

module_api_args = importlib.import_module(config['import_path_init_api_args'])
logger = config["logger"]
version_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "VERSION"))
manager = DedocThreadedManager.from_config(config=config, version=open(version_file_path).read().strip())
manager = DedocManager(config=config)


@app.get("/")
Expand All @@ -47,7 +49,7 @@ def get_static_file(request: Request) -> Response:

@app.get('/version')
def get_version() -> Response:
return PlainTextResponse(manager.version)
return PlainTextResponse(dedoc.__version__)


def _get_static_file_path(request: Request) -> str:
Expand All @@ -63,13 +65,14 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
parameters = query_params.dict(by_alias=True)

if not file or file.filename == "":
raise MissingFileException("Error: Missing content in request_post file parameter", version=manager.version)
raise MissingFileException("Error: Missing content in request_post file parameter", version=dedoc.__version__)
# check if the post request_post has the file part

logger.info("Get file {} with parameters {}".format(file.filename, parameters))
warnings = []
document_tree = manager.parse_file(file, parameters=dict(parameters))
document_tree.warnings.extend(warnings)
logger.info(f"Get file {file.filename} with parameters {parameters}")
with tempfile.TemporaryDirectory() as tmpdir:
file_path = save_upload_file(file, tmpdir)
document_tree = manager.parse(file_path, parameters=dict(parameters))

return_format = str(parameters.get("return_format", "json")).lower()
if return_format == "html":
html_content = json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0)
Expand All @@ -83,7 +86,7 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content, status_code=200)
else:
logger.info("Send result. File {} with parameters {}".format(file.filename, parameters))
logger.info(f"Send result. File {file.filename} with parameters {parameters}")
return ORJSONResponse(content=document_tree.to_dict(), status_code=200)


Expand All @@ -96,10 +99,7 @@ async def exception_handler(request: Request, exc: DedocException) -> Response:
result["dedoc_version"] = exc.version
if exc.metadata:
result["metadata"] = exc.metadata
return JSONResponse(
status_code=exc.code,
content=result,
)
return JSONResponse(status_code=exc.code, content=result)


def get_api() -> FastAPI:
Expand Down
5 changes: 2 additions & 3 deletions dedoc/api/train_dataset/api_collect_train_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from dedoc.api.train_dataset.api_args import TrainDatasetParameters
from dedoc.api.train_dataset.async_archive_handler import AsyncHandler
from dedoc.config import get_config
from dedoc.manager.dedoc_thread_manager import DedocThreadedManager
from dedoc.dedoc_manager import DedocManager
from dedoc.train_dataset.taskers.concrete_taskers.filtered_line_label_tasker import FilteredLineLabelTasker
from dedoc.train_dataset.taskers.concrete_taskers.header_footer_tasker import HeaderFooterTasker
from dedoc.train_dataset.taskers.concrete_taskers.line_label_tasker import LineLabelTasker
Expand All @@ -33,8 +33,7 @@
app.mount('/static', StaticFiles(directory=static_path), name="static")
templates = Jinja2Templates(directory=os.path.join(static_path, "train_dataset"))

version_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..", "VERSION"))
manager = DedocThreadedManager.from_config(config=config, version=open(version_file_path).read().strip())
manager = DedocManager(config=config)


project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
Expand Down
15 changes: 4 additions & 11 deletions dedoc/api/train_dataset/async_archive_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,13 @@
from fastapi import UploadFile

from dedoc.common.exceptions.bad_file_exception import BadFileFormatException
from dedoc.manager.dedoc_thread_manager import DedocThreadedManager
from dedoc.dedoc_manager import DedocManager
from dedoc.train_dataset.taskers.tasker import Tasker


class _ArchiveHandler(Thread):

def __init__(self,
queue: Queue,
results: dict,
progress: dict,
tasker: Tasker,
manager: DedocThreadedManager,
*,
config: dict) -> None:
def __init__(self, queue: Queue, results: dict, progress: dict, tasker: Tasker, manager: DedocManager, *, config: dict) -> None:
Thread.__init__(self)
self.progress = progress
self.config = config
Expand Down Expand Up @@ -77,15 +70,15 @@ def __handle_one_file(self, archive: zipfile.ZipFile, file: str, parameters: dic
if not path_out.endswith("/"):
with open(path_out, "wb") as file_out:
file_out.write(item.read())
self.manager.parse_existing_file(path=path_out, parameters=parameters)
self.manager.parse(file_path=path_out, parameters=parameters)
except BadFileFormatException as e:
self.logger.warning("Can't handle file {}, exception {}".format(file, str(e)))
self.logger.info("Finish handle {}".format(file))


class AsyncHandler:

def __init__(self, tasker: Tasker, manager: DedocThreadedManager, *, config: dict) -> None:
def __init__(self, tasker: Tasker, manager: DedocManager, *, config: dict) -> None:
super().__init__()
self.queue = Queue()
self.__results = {}
Expand Down
3 changes: 3 additions & 0 deletions dedoc/attachments_handler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .attachments_handler import AttachmentsHandler

__all__ = ["AttachmentsHandler"]
96 changes: 93 additions & 3 deletions dedoc/attachments_handler/attachments_handler.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,106 @@
import copy
import logging
import os
import shutil
import tempfile
import time
from typing import List

from dedoc.attachments_extractors import AbstractAttachmentsExtractor
from dedoc.common.exceptions.dedoc_exception import DedocException
from dedoc.data_structures import ParsedDocument, DocumentMetadata, AttachedFile
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.utils.utils import get_empty_content


class AttachmentsHandler:
"""
This class is used for handling attached files:
- they may be stored in the custom directory (use `attachments_dir` key in the parameters to set output directory path);
- they may be ignored (if the option `with_attachments=false` in parameters);
- the metadata of the attachments may be added without files parsing (if `with_attachments=true, need_content_analysis=false` in parameters)
- they may be parsed (if `with_attachments=true, need_content_analysis=true` in parameters), \
the parsing recursion may be set via `recursion_deep_attachments` parameter.
"""

def __init__(self, *, config: dict) -> None:
"""
:param config: configuration of the handler, e.g. logger for logging
"""
self.config = config
self.logger = self.config.get("logger", logging.getLogger())

def handle_attachments(self, document: UnstructuredDocument, parameters: dict) -> None:
def handle_attachments(self, document_parser: "DedocManager", document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]: # noqa
"""
Handle attached files, for example save it on disk or S3 storage
Handle attachments of the document in the intermediate representation.
:param document_parser: class with `parse` method for parsing attachments if needed;
:param document: intermediate representation of the document whose attachments need to be handled;
:param parameters: parameters for attachments handling (with_attachments, need_content_analysis, recursion_deep_attachments, attachments_dir \
are important; see the API parameters documentation for more details).
:return: list of parsed document attachments
"""
pass
parsed_attachment_files = []
recursion_deep_attachments = int(parameters.get("recursion_deep_attachments", 10)) - 1

if not AbstractAttachmentsExtractor.with_attachments(parameters) or recursion_deep_attachments < 0:
return parsed_attachment_files

self._handle_attachments(document=document, parameters=parameters)

previous_log_time = time.time()

for i, attachment in enumerate(document.attachments):
current_time = time.time()
if current_time - previous_log_time > 3:
previous_log_time = current_time # not log too often
self.logger.info(f"Handle attachment {i} of {len(document.attachments)}")

if not attachment.get_original_filename(): # TODO check for docx https://jira.ispras.ru/browse/TLDR-185
continue

parameters_copy = copy.deepcopy(parameters)
parameters_copy["is_attached"] = True
parameters_copy["attachment"] = attachment
parameters_copy["recursion_deep_attachments"] = str(recursion_deep_attachments)

try:
if attachment.need_content_analysis:
with tempfile.TemporaryDirectory() as tmpdir:
attachment_path = os.path.join(tmpdir, attachment.get_original_filename())
shutil.copy(attachment.get_filename_in_path(), attachment_path)
parsed_file = document_parser.parse(attachment_path, parameters=parameters_copy)
else:
parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)
except DedocException:
# return empty ParsedDocument with Meta information
parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)

parsed_file.metadata.set_uid(attachment.uid)
parsed_attachment_files.append(parsed_file)
return parsed_attachment_files

def _handle_attachments(self, document: UnstructuredDocument, parameters: dict) -> None:
"""
Handle attached files, for example save them on disk or in S3 storage.
This method can be overridden in subclasses of AttachmentsHandler.
"""
attachments_dir = parameters.get("attachments_dir")
if not attachments_dir:
return

for attachment in document.attachments:
new_path = os.path.join(attachments_dir, os.path.split(attachment.get_filename_in_path())[1])
shutil.move(attachment.get_filename_in_path(), new_path)
attachment.tmp_file_path = new_path

def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa
unstructured_document = UnstructuredDocument(lines=[], tables=[], attachments=[])
attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path())
unstructured_document = document_parser.document_metadata_extractor.add_metadata(document=unstructured_document, directory=attachment_dir,
filename=attachment_name, converted_filename=attachment_name,
original_filename=attachment.get_original_filename(),
parameters=parameters)
metadata = DocumentMetadata(**unstructured_document.metadata)
return ParsedDocument(content=get_empty_content(), metadata=metadata)
4 changes: 3 additions & 1 deletion dedoc/common/exceptions/dedoc_exception.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from typing import Optional

import dedoc


class DedocException(Exception):
def __init__(self,
Expand All @@ -12,7 +14,7 @@ def __init__(self,
self.msg = msg
self.msg_api = msg if msg_api is None else msg_api
self.filename = filename
self.version = version
self.version = version if version is not None else dedoc.__version__
self.metadata = metadata

def __str__(self) -> str:
Expand Down
35 changes: 0 additions & 35 deletions dedoc/configuration_manager.py

This file was deleted.

2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class PDFConverter(AbstractConverter):
"""
def __init__(self, *, config: dict) -> None:
super().__init__(config=config)
self.timeout = 20
self.timeout = 60

def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
"""
Expand Down
4 changes: 3 additions & 1 deletion dedoc/data_structures/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class Annotation(Serializable):
Look at the concrete kinds of annotations to get more examples.
"""

def __init__(self, start: int, end: int, name: str, value: str) -> None:
def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bool = True) -> None:
"""
Some kind of text information about symbols between start and end.
For example, Annotation(1, 13, "italic", "True") says that the text between the 1st and 13th symbols was written in italic.
Expand All @@ -21,11 +21,13 @@ def __init__(self, start: int, end: int, name: str, value: str) -> None:
:param end: end of the annotated text (end isn't included)
:param name: annotation's name
:param value: information about annotated text
:param is_mergeable: whether annotations with the same value may be merged
"""
self.start = start
self.end = end
self.name = name
self.value = value
self.is_mergeable = is_mergeable

def __eq__(self, o: object) -> bool:
if not isinstance(o, Annotation):
Expand Down
Loading

0 comments on commit bfa506a

Please sign in to comment.