-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* TLDR-386 pdf auto reader bug (#298)
* TLDR-386 Added features importances
* TLDR-386 added script for txtlayer dataset generation
* TLDR-386 move all data to the cloud
* Review fixes
* exclude version and changelog files (#299)
* TLDR-419 add confidence annotation (#301)
* add new annotation
* add confidence extracting
* add test for confidence annotation
* add confidence annotation to documentation
* fix flake
* add mergeable field for annotation
* review fixes
* TLDR-369 class for full dedoc pipeline running (#300)
* DedocPipeline added (work in progress)
* TLDR-369_dedoc_manager
* TLDR-369 fix documentation and add test for attachments recursion
* TLDR-369 change version saving
* TLDR-369 review fixes
* TLDR-369 added temporary file name
* new version 0.10.0 (#302)
---------
Co-authored-by: Bogatenkova Anastasiya <[email protected]>
- Loading branch information
1 parent
0b19ba4
commit bfa506a
Showing
67 changed files
with
1,001 additions
and
758 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
dedoc/version.py | ||
|
||
### Python template | ||
# Byte-compiled / optimized / DLL files | ||
*__pycache__* | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
0.9.2 | ||
0.10.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .dedoc_manager import DedocManager # noqa | ||
from .version import __version__ # noqa |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from .attachments_handler import AttachmentsHandler | ||
|
||
__all__ = ["AttachmentsHandler"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,106 @@ | ||
import copy | ||
import logging | ||
import os | ||
import shutil | ||
import tempfile | ||
import time | ||
from typing import List | ||
|
||
from dedoc.attachments_extractors import AbstractAttachmentsExtractor | ||
from dedoc.common.exceptions.dedoc_exception import DedocException | ||
from dedoc.data_structures import ParsedDocument, DocumentMetadata, AttachedFile | ||
from dedoc.data_structures.unstructured_document import UnstructuredDocument | ||
from dedoc.utils.utils import get_empty_content | ||
|
||
|
||
class AttachmentsHandler: | ||
    """ | ||
    This class is used for handling attached files: | ||
    - they may be stored in the custom directory (use `attachments_dir` key in the parameters to set output directory path); | ||
    - they may be ignored (if the option `with_attachments=false` in parameters); | ||
    - the metadata of the attachments may be added without files parsing (if `with_attachments=true, need_content_analysis=false` in parameters) | ||
    - they may be parsed (if `with_attachments=true, need_content_analysis=true` in parameters), \ | ||
      the parsing recursion may be set via `recursion_deep_attachments` parameter. | ||
    """ | ||
|
||
    # NOTE(review): this listing looks like a rendered diff whose +/- markers were lost, so removed and added lines | ||
    # appear side by side (e.g. the two consecutive `handle_attachments` signatures below) - confirm against the repository. | ||
    def __init__(self, *, config: dict) -> None: | ||
        """ | ||
        :param config: configuration of the handler, e.g. logger for logging | ||
        """ | ||
        self.config = config | ||
        # Use the logger supplied in the config, otherwise fall back to the root logger. | ||
        self.logger = self.config.get("logger", logging.getLogger()) | ||
|
||
    def handle_attachments(self, document: UnstructuredDocument, parameters: dict) -> None: | ||
    def handle_attachments(self, document_parser: "DedocManager", document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]: # noqa | ||
        """ | ||
        Handle attached files, for example save it on disk or S3 storage | ||
        Handle attachments of the document in the intermediate representation. | ||
        :param document_parser: class with `parse` method for parsing attachments if needed; | ||
        :param document: intermediate representation of the document whose attachments need to be handled; | ||
        :param parameters: parameters for attachments handling (with_attachments, need_content_analysis, recursion_deep_attachments, attachments_dir \ | ||
            are important, look to the API parameters documentation for more details). | ||
        :return: list of parsed document attachments | ||
        """ | ||
        pass | ||
        parsed_attachment_files = [] | ||
        # One level of nesting is consumed by this call; the decremented value is forwarded to nested parses below. | ||
        recursion_deep_attachments = int(parameters.get("recursion_deep_attachments", 10)) - 1 | ||
|
||
        # Return the empty list when the extractor's `with_attachments` check fails or the recursion depth is exhausted. | ||
        if not AbstractAttachmentsExtractor.with_attachments(parameters) or recursion_deep_attachments < 0: | ||
            return parsed_attachment_files | ||
|
||
        # Run the storage hook (`_handle_attachments`) first, before any attachment is parsed. | ||
        self._handle_attachments(document=document, parameters=parameters) | ||
|
||
        previous_log_time = time.time() | ||
|
||
        for i, attachment in enumerate(document.attachments): | ||
            current_time = time.time() | ||
            # Emit a progress message only when more than 3 seconds passed since the previous one. | ||
            if current_time - previous_log_time > 3: | ||
                previous_log_time = current_time # not log too often | ||
                self.logger.info(f"Handle attachment {i} of {len(document.attachments)}") | ||
|
||
            # Attachments without an original file name are skipped and produce no ParsedDocument. | ||
            if not attachment.get_original_filename(): # TODO check for docx https://jira.ispras.ru/browse/TLDR-185 | ||
                continue | ||
|
||
            # Work on a deep copy so the per-attachment keys set below do not leak into the caller's `parameters`. | ||
            parameters_copy = copy.deepcopy(parameters) | ||
            parameters_copy["is_attached"] = True | ||
            parameters_copy["attachment"] = attachment | ||
            parameters_copy["recursion_deep_attachments"] = str(recursion_deep_attachments) | ||
|
||
            try: | ||
                if attachment.need_content_analysis: | ||
                    # Parse a copy stored under the original file name; the temporary directory is removed when the `with` block exits. | ||
                    # NOTE(review): the original file name is joined to `tmpdir` as is - confirm it cannot contain path separators or `..`. | ||
                    with tempfile.TemporaryDirectory() as tmpdir: | ||
                        attachment_path = os.path.join(tmpdir, attachment.get_original_filename()) | ||
                        shutil.copy(attachment.get_filename_in_path(), attachment_path) | ||
                        parsed_file = document_parser.parse(attachment_path, parameters=parameters_copy) | ||
                else: | ||
                    parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy) | ||
            except DedocException: | ||
                # return empty ParsedDocument with Meta information | ||
                # NOTE(review): the exception is not logged here, so a failed attachment parse leaves no trace - confirm this is intended. | ||
                parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy) | ||
|
||
            # The parsed attachment takes over the uid of the attachment it was produced from. | ||
            parsed_file.metadata.set_uid(attachment.uid) | ||
            parsed_attachment_files.append(parsed_file) | ||
        return parsed_attachment_files | ||
|
||
    def _handle_attachments(self, document: UnstructuredDocument, parameters: dict) -> None: | ||
        """ | ||
        Handle attached files, for example save it on disk or S3 storage. | ||
        This method can be redefined by other AttachmentHandler class. | ||
        """ | ||
        attachments_dir = parameters.get("attachments_dir") | ||
        # Without a target directory the files stay where they are. | ||
        if not attachments_dir: | ||
            return | ||
|
||
        # NOTE(review): `attachments_dir` is not created here - presumably it must already exist; confirm. | ||
        for attachment in document.attachments: | ||
            # Keep only the file name of the current path and move the file under `attachments_dir`. | ||
            new_path = os.path.join(attachments_dir, os.path.split(attachment.get_filename_in_path())[1]) | ||
            shutil.move(attachment.get_filename_in_path(), new_path) | ||
            # Record the new location on the attachment. | ||
            attachment.tmp_file_path = new_path | ||
|
||
    # Build a ParsedDocument with empty content for an attachment that is not parsed: the metadata is filled by | ||
    # `document_parser.document_metadata_extractor.add_metadata` from the attachment's path and original file name. | ||
    def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa | ||
        unstructured_document = UnstructuredDocument(lines=[], tables=[], attachments=[]) | ||
        attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path()) | ||
        unstructured_document = document_parser.document_metadata_extractor.add_metadata(document=unstructured_document, directory=attachment_dir, | ||
                                                                                                 filename=attachment_name, converted_filename=attachment_name, | ||
                                                                                                 original_filename=attachment.get_original_filename(), | ||
                                                                                                 parameters=parameters) | ||
        metadata = DocumentMetadata(**unstructured_document.metadata) | ||
        return ParsedDocument(content=get_empty_content(), metadata=metadata) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.