update master (#376)

* Use older pydantic version (#364) * Added rtf format to docx convertor (#366) Co-authored-by: Alexander Golodkov <[email protected]> * fix small bugs with docx reader such as non-integer sizes in docx sty… (#367) * fix small bugs with docx reader such as non-integer sizes in docx style and filename with dots and spaces * Rename test --------- Co-authored-by: Nasty <[email protected]> * TLDR-462 gpu for 1.1 (#365) * TLDR-462 - test on GPU work * TLDR-354 images attachments extraction from PDF (#368) * Benchmarks before changes * Add image extraction to tabby * Fix document partial parsing * Use start_page, end_page in java tabby execution * Fix txtlayer classification tests * Fixes in partial parsing * Fix tests * TLDR-518: Fix tabby partially read (#372) * Fix tabby partially read * Add more tests * Fix tabby page slice parameters * Fix extract table in tabby with page range parameter --------- Co-authored-by: Nasty <[email protected]> * TLDR-514 creating document classes tutorial (#369) * TLDR-517 attachments_dir (#370) * TLDR-533 extract images from PDF to attachments_dir (#374) * new version 1.1.1 (#375) --------- Co-authored-by: Alexander Golodkov <[email protected]> Co-authored-by: Alexander Golodkov <[email protected]> Co-authored-by: IlyaKozlov <[email protected]> Co-authored-by: raxtemur <[email protected]> Co-authored-by: Andrey Mikhailov <[email protected]> Co-authored-by: Nikita Shevtsov <[email protected]>
ispras · Nov 24, 2023 · d83bf23 · d83bf23
1 parent b79dd4c
commit d83bf23
Show file tree

Hide file tree

Showing 63 changed files with 7,076 additions and 161 deletions.
diff --git a/.flake8 b/.flake8
@@ -19,6 +19,7 @@ exclude =
     venv,
     build,
     dedoc.egg-info
+    docs/_build
 
 # ANN101 - type annotations for self
 ignore =

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.1.0
+1.1.1
diff --git a/dedoc/api/schema/document_metadata.py b/dedoc/api/schema/document_metadata.py
@@ -1,13 +1,14 @@
 from typing import Optional
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, Extra, Field
 
 
 class DocumentMetadata(BaseModel):
     """
     Document metadata like its name, size, author, etc.
     """
-    model_config = ConfigDict(extra="allow")
+    class Config:
+        extra = Extra.allow
 
     uid: str = Field(description="Document unique identifier (useful for attached files)", example="doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0")
     file_name: str = Field(description="Original document name before rename and conversion", example="example.odt")

diff --git a/dedoc/api/schema/line_metadata.py b/dedoc/api/schema/line_metadata.py
@@ -1,13 +1,14 @@
 from typing import Optional
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, Extra, Field
 
 
 class LineMetadata(BaseModel):
     """
     Holds information about document node/line metadata, such as page number or line type.
     """
-    model_config = ConfigDict(extra="allow")
+    class Config:
+        extra = Extra.allow
 
     paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text")
     page_id: int = Field(description="Page number of the line/paragraph beginning", example=0)

diff --git a/dedoc/attachments_extractors/abstract_attachment_extractor.py b/dedoc/attachments_extractors/abstract_attachment_extractor.py
@@ -48,11 +48,14 @@ def with_attachments(parameters: dict) -> bool:
         """
         return str(parameters.get("with_attachments", "false")).lower() == "true"
 
-    def _content2attach_file(self, content: List[Tuple[str, bytes]], tmpdir: str, need_content_analysis: bool) -> List[AttachedFile]:
+    def _content2attach_file(self, content: List[Tuple[str, bytes]], tmpdir: str, need_content_analysis: bool, parameters: dict) -> List[AttachedFile]:
         attachments = []
+        attachments_dir = parameters.get("attachments_dir", None)
+        attachments_dir = tmpdir if attachments_dir is None else attachments_dir
+
         for original_name, contents in content:
-            tmp_file_name = save_data_to_unique_file(directory=tmpdir, filename=original_name, binary_data=contents)
-            tmp_file_path = os.path.join(tmpdir, tmp_file_name)
+            tmp_file_name = save_data_to_unique_file(directory=attachments_dir, filename=original_name, binary_data=contents)
+            tmp_file_path = os.path.join(attachments_dir, tmp_file_name)
             file = AttachedFile(original_name=original_name,
                                 tmp_file_path=tmp_file_path,
                                 uid=f"attach_{uuid.uuid4()}",

diff --git a/...ments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py b/...ments_extractors/concrete_attachments_extractors/abstract_office_attachments_extractor.py
@@ -94,5 +94,5 @@ def _get_attachments(self, tmpdir: str, filename: str, parameters: dict, attachm
                     # TODO: process any OLE files other than \x01Ole10Native and PDF (this looks like an impossible task)
 
             need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
-            attachments = self._content2attach_file(content=result, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
+            attachments = self._content2attach_file(content=result, tmpdir=tmpdir, need_content_analysis=need_content_analysis, parameters=parameters)
             return attachments
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/docx_attachments_extractor.py
@@ -35,7 +35,8 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[
             with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile:
                 diagram_attachments = self.__extract_diagrams(zfile)
                 need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
-                result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
+                result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis,
+                                                    parameters=parameters)
 
             result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word")
 

diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/json_attachment_extractor.py
@@ -58,7 +58,7 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[
             attachments.append((attached_filename, binary_data))
 
         need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
-        return self._content2attach_file(content=attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
+        return self._content2attach_file(content=attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis, parameters=parameters)
 
     def __get_value_by_keys(self, data: dict, keys: List[str]) -> dict:
         value = data

diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py
@@ -55,7 +55,7 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[
                 self.logger.warning(f"{filename} is broken")
 
         need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
-        return self._content2attach_file(content=attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
+        return self._content2attach_file(content=attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis, parameters=parameters)
 
     def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
         attachments = []

diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py
@@ -1,8 +1,6 @@
 import copy
 import logging
 import os
-import shutil
-import tempfile
 import time
 from typing import List
 
@@ -47,8 +45,6 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct
         if not AbstractAttachmentsExtractor.with_attachments(parameters) or recursion_deep_attachments < 0:
             return parsed_attachment_files
 
-        self._handle_attachments(document=document, parameters=parameters)
-
         previous_log_time = time.time()
 
         for i, attachment in enumerate(document.attachments):
@@ -66,12 +62,12 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct
 
             try:
                 if attachment.need_content_analysis:
-                    with tempfile.TemporaryDirectory() as tmpdir:
-                        attachment_path = os.path.join(tmpdir, attachment.get_original_filename())
-                        shutil.copy(attachment.get_filename_in_path(), attachment_path)
-                        parsed_file = document_parser.parse(attachment_path, parameters=parameters_copy)
+                    parsed_file = document_parser.parse(attachment.get_filename_in_path(), parameters=parameters_copy)
                 else:
                     parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)
+
+                parsed_file.metadata.file_name = attachment.original_name  # initial name of the attachment
+                parsed_file.metadata.temporary_file_name = os.path.split(attachment.get_filename_in_path())[-1]  # actual name in the file system
             except DedocError:
                 # return empty ParsedDocument with Meta information
                 parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)
@@ -80,20 +76,6 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct
             parsed_attachment_files.append(parsed_file)
         return parsed_attachment_files
 
-    def _handle_attachments(self, document: UnstructuredDocument, parameters: dict) -> None:
-        """
-        Handle attached files, for example save it on disk or S3 storage.
-        This method can be redefined by other AttachmentHandler class.
-        """
-        attachments_dir = parameters.get("attachments_dir")
-        if not attachments_dir:
-            return
-
-        for attachment in document.attachments:
-            new_path = os.path.join(attachments_dir, os.path.split(attachment.get_filename_in_path())[1])
-            shutil.move(attachment.get_filename_in_path(), new_path)
-            attachment.tmp_file_path = new_path
-
     def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument:  # noqa
         attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path())
         metadata = document_parser.document_metadata_extractor.extract_metadata(directory=attachment_dir,

diff --git a/dedoc/config.py b/dedoc/config.py
@@ -22,6 +22,10 @@
     # number of parallel jobs in some tasks as OCR
     n_jobs=1,
 
+    # --------------------------------------------GPU SETTINGS-------------------------------------------------------
+    # set gpu in XGBoost and torch models
+    on_gpu=False,
+
     # ---------------------------------------------API SETTINGS---------------------------------------------------------
     # max file size in bytes
     max_content_length=512 * 1024 * 1024,

diff --git a/dedoc/converters/concrete_converters/docx_converter.py b/dedoc/converters/concrete_converters/docx_converter.py
@@ -15,7 +15,7 @@ def __init__(self, *, config: dict) -> None:
 
     def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
         """
-        Checks if the document is docx-like, e.g. it has .doc or .odt extension.
+        Checks if the document is docx-like, e.g. it has .doc, .rtf or .odt extension.
         """
         return extension.lower() in converted_extensions.docx_like_format or mime in converted_mimes.docx_like_format
 

diff --git a/dedoc/dedoc_manager.py b/dedoc/dedoc_manager.py
@@ -86,7 +86,7 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str])
         :return: parsed document
         """
         if not os.path.isfile(path=file_path):
-            raise FileNotFoundError()
+            raise FileNotFoundError(file_path)
         self.logger.info(f"Start handle {file_path}")
         file_dir, file_name = os.path.split(file_path)
         unique_filename = get_unique_name(file_name)

diff --git a/dedoc/extensions.py b/dedoc/extensions.py
@@ -15,7 +15,7 @@
 
 converted_extensions = Extensions(
     excel_like_format=[".ods", "xls"],
-    docx_like_format=[".odt", ".doc"],
+    docx_like_format=[".odt", ".doc", ".rtf"],
     pptx_like_format=[".odp", ".ppt"],
     archive_like_format=[],
     image_like_format=[".pcx", ".webp", ".sgi", ".hdr", ".sr", ".pic", ".dib", ".jfif", ".j2k"],
@@ -26,7 +26,7 @@
 # .sgi, .hdr, .sr, .ras are not registered as MIME types
 converted_mimes = Extensions(
     excel_like_format=["application/vnd.oasis.opendocument.spreadsheet", "application/vnd.ms-excel"],
-    docx_like_format=["application/msword", "application/vnd.oasis.opendocument.text"],
+    docx_like_format=["application/msword", "application/vnd.oasis.opendocument.text", "application/rtf"],
     pptx_like_format=[
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
         "application/vnd.ms-powerpoint",

diff --git a/dedoc/readers/archive_reader/archive_reader.py b/dedoc/readers/archive_reader/archive_reader.py
@@ -41,58 +41,67 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
         The method return empty content of archive, all content will be placed inside attachments.
         Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
         """
-        attachments = self.__get_attachments(path=path)
+        parameters = {} if parameters is None else parameters
+
+        with_attachments = str(parameters.get("with_attachments", "false")).lower() == "true"
+        if not with_attachments:
+            return UnstructuredDocument(lines=[], tables=[], attachments=[])
+
+        attachments_dir = parameters.get("attachments_dir", None)
+        attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir
+
+        need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
+        attachments = self.__get_attachments(path=path, tmp_dir=attachments_dir, need_content_analysis=need_content_analysis)
         return UnstructuredDocument(lines=[], tables=[], attachments=attachments)
 
-    def __get_attachments(self, path: str) -> List[AttachedFile]:
-        tmp_dir = os.path.dirname(path)
+    def __get_attachments(self, path: str, tmp_dir: str, need_content_analysis: bool) -> List[AttachedFile]:
         mime = get_file_mime_type(path)
         if zipfile.is_zipfile(path) and mime == "application/zip":
-            return list(self.__read_zip_archive(path=path, tmp_dir=tmp_dir))
+            return list(self.__read_zip_archive(path=path, tmp_dir=tmp_dir, need_content_analysis=need_content_analysis))
         if tarfile.is_tarfile(path):
-            return list(self.__read_tar_archive(path=path, tmp_dir=tmp_dir))
+            return list(self.__read_tar_archive(path=path, tmp_dir=tmp_dir, need_content_analysis=need_content_analysis))
         if rarfile.is_rarfile(path):
-            return list(self.__read_rar_archive(path=path, tmp_dir=tmp_dir))
+            return list(self.__read_rar_archive(path=path, tmp_dir=tmp_dir, need_content_analysis=need_content_analysis))
         if mime == "application/x-7z-compressed":
-            return list(self.__read_7z_archive(path=path, tmp_dir=tmp_dir))
+            return list(self.__read_7z_archive(path=path, tmp_dir=tmp_dir, need_content_analysis=need_content_analysis))
         # if none of the checks above can handle this archive, raise an exception
         raise BadFileFormatError(f"bad archive {path}")
 
-    def __read_zip_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]:
+    def __read_zip_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
         try:
             with zipfile.ZipFile(path, "r") as arch_file:
                 names = [member.filename for member in arch_file.infolist() if member.file_size > 0]
                 for name in names:
                     with arch_file.open(name) as file:
-                        yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file)
+                        yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
         except (zipfile.BadZipFile, zlib.error) as e:
             self.logger.warning(f"Can't read file {path} ({e})")
             raise BadFileFormatError(f"Can't read file {path} ({e})")
 
-    def __read_tar_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]:
+    def __read_tar_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
         with tarfile.open(path, "r") as arch_file:
             names = [member.name for member in arch_file.getmembers() if member.isfile()]
             for name in names:
                 file = arch_file.extractfile(name)
-                yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file)
+                yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
                 file.close()
 
-    def __read_rar_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]:
+    def __read_rar_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
         with rarfile.RarFile(path, "r") as arch_file:
             names = [item.filename for item in arch_file.infolist() if item.compress_size > 0]
             for name in names:
                 with arch_file.open(name) as file:
-                    yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file)
+                    yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
 
-    def __read_7z_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]:
+    def __read_7z_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
         with open(path, "rb") as content:
             arch_file = py7zlib.Archive7z(content)
             names = arch_file.getnames()
             for name in names:
                 file = arch_file.getmember(name)
-                yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file)
+                yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
 
-    def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes]) -> AttachedFile:
+    def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes], need_content_analysis: bool) -> AttachedFile:
         file_name = os.path.basename(file_name)
         binary_data = file.read()
         if isinstance(binary_data, str):
@@ -101,7 +110,7 @@ def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes]) ->
         attachment = AttachedFile(
             original_name=file_name,
             tmp_file_path=os.path.join(tmp_dir, tmp_path),
-            need_content_analysis=True,
+            need_content_analysis=need_content_analysis,
             uid=f"attach_{uuid.uuid1()}"
         )
         return attachment
diff --git a/dedoc/readers/docx_reader/properties_extractor.py b/dedoc/readers/docx_reader/properties_extractor.py
@@ -74,7 +74,10 @@ def change_indent(old_properties: BaseProperties, tree: Tag) -> None:
     if not tree.ind:
         return
 
-    attributes = {attribute: 0 for attribute in ["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"]}
+    attributes = {
+        attribute: 0 for attribute in
+        ["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"]
+    }
     for attribute in attributes:
         attributes[attribute] = float(tree.ind.get(f"w:{attribute}", 0))
 
@@ -106,7 +109,8 @@ def change_size(old_properties: BaseProperties, tree: Tag) -> None:
     :param tree: BeautifulSoup tree with properties
     """
     if tree.sz:
-        old_properties.size = int(tree.sz.get("w:val", old_properties.size))
+        new_size = float(tree.sz.get("w:val", old_properties.size))
+        old_properties.size = int(new_size)
 
 
 def change_jc(old_properties: BaseProperties, tree: Tag) -> None:
@@ -176,19 +180,19 @@ def change_spacing(old_properties: BaseProperties, tree: Tag) -> None:
 
     if not before_autospacing:
         before_lines = tree.spacing.get("w:beforeLines", False)
-        before_lines = int(before_lines) if before_lines else before_lines
+        before_lines = int(float(before_lines)) if before_lines else before_lines
         if not before_lines:
             before_tag = tree.spacing.get("w:before", False)
-            before = int(before_tag) if before_tag else before
+            before = int(float(before_tag)) if before_tag else before
         else:
             before = before_lines
 
     if not after_autospacing:
         after_lines = tree.spacing.get("w:afterLines", False)
-        after_lines = int(after_lines) if after_lines else after_lines
+        after_lines = int(float(after_lines)) if after_lines else after_lines
         if not after_lines:
             after_tag = tree.spacing.get("w:after", False)
-            after = int(after_tag) if after_tag else after
+            after = int(float(after_tag)) if after_tag else after
         else:
             after = after_lines