Skip to content

Commit

Permalink
update master (#376)
Browse files Browse the repository at this point in the history
* Use older pydantic version (#364)

* Added rtf format to docx converter (#366)

Co-authored-by: Alexander Golodkov <[email protected]>

* fix small bugs with docx reader such as non-integer sizes in docx sty… (#367)

* fix small bugs with docx reader such as non-integer sizes in docx style and filename with dots and spaces

* Rename test

---------

Co-authored-by: Nasty <[email protected]>

* TLDR-462 gpu for 1.1 (#365)

* TLDR-462 - test on GPU work

* TLDR-354 images attachments extraction from PDF (#368)

* Benchmarks before changes

* Add image extraction to tabby

* Fix document partial parsing

* Use start_page, end_page in java tabby execution

* Fix txtlayer classification tests

* Fixes in partial parsing

* Fix tests

* TLDR-518: Fix tabby partially read (#372)

* Fix tabby partially read

* Add more tests

* Fix tabby page slice parameters

* Fix extract table in tabby with page range parameter

---------

Co-authored-by: Nasty <[email protected]>

* TLDR-514 creating document classes tutorial (#369)

* TLDR-517 attachments_dir (#370)

* TLDR-533 extract images from PDF to attachments_dir (#374)

* new version 1.1.1 (#375)

---------

Co-authored-by: Alexander Golodkov <[email protected]>
Co-authored-by: Alexander Golodkov <[email protected]>
Co-authored-by: IlyaKozlov <[email protected]>
Co-authored-by: raxtemur <[email protected]>
Co-authored-by: Andrey Mikhailov <[email protected]>
Co-authored-by: Nikita Shevtsov <[email protected]>
  • Loading branch information
7 people authored Nov 24, 2023
1 parent b79dd4c commit d83bf23
Show file tree
Hide file tree
Showing 63 changed files with 7,076 additions and 161 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ exclude =
venv,
build,
dedoc.egg-info
docs/_build
# ANN101 - type annotations for self
ignore =
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.1.0
1.1.1
5 changes: 3 additions & 2 deletions dedoc/api/schema/document_metadata.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, Extra, Field


class DocumentMetadata(BaseModel):
"""
Document metadata like its name, size, author, etc.
"""
model_config = ConfigDict(extra="allow")
class Config:
extra = Extra.allow

uid: str = Field(description="Document unique identifier (useful for attached files)", example="doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0")
file_name: str = Field(description="Original document name before rename and conversion", example="example.odt")
Expand Down
5 changes: 3 additions & 2 deletions dedoc/api/schema/line_metadata.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field
from pydantic import BaseModel, Extra, Field


class LineMetadata(BaseModel):
"""
Holds information about document node/line metadata, such as page number or line type.
"""
model_config = ConfigDict(extra="allow")
class Config:
extra = Extra.allow

paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text")
page_id: int = Field(description="Page number of the line/paragraph beginning", example=0)
Expand Down
9 changes: 6 additions & 3 deletions dedoc/attachments_extractors/abstract_attachment_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,14 @@ def with_attachments(parameters: dict) -> bool:
"""
return str(parameters.get("with_attachments", "false")).lower() == "true"

def _content2attach_file(self, content: List[Tuple[str, bytes]], tmpdir: str, need_content_analysis: bool) -> List[AttachedFile]:
def _content2attach_file(self, content: List[Tuple[str, bytes]], tmpdir: str, need_content_analysis: bool, parameters: dict) -> List[AttachedFile]:
attachments = []
attachments_dir = parameters.get("attachments_dir", None)
attachments_dir = tmpdir if attachments_dir is None else attachments_dir

for original_name, contents in content:
tmp_file_name = save_data_to_unique_file(directory=tmpdir, filename=original_name, binary_data=contents)
tmp_file_path = os.path.join(tmpdir, tmp_file_name)
tmp_file_name = save_data_to_unique_file(directory=attachments_dir, filename=original_name, binary_data=contents)
tmp_file_path = os.path.join(attachments_dir, tmp_file_name)
file = AttachedFile(original_name=original_name,
tmp_file_path=tmp_file_path,
uid=f"attach_{uuid.uuid4()}",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,5 +94,5 @@ def _get_attachments(self, tmpdir: str, filename: str, parameters: dict, attachm
# TODO process any ole files except \x01Ole10Native and PDF (looks like impossible task)

need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
attachments = self._content2attach_file(content=result, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
attachments = self._content2attach_file(content=result, tmpdir=tmpdir, need_content_analysis=need_content_analysis, parameters=parameters)
return attachments
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[
with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile:
diagram_attachments = self.__extract_diagrams(zfile)
need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
result += self._content2attach_file(content=diagram_attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis,
parameters=parameters)

result += self._get_attachments(tmpdir=tmpdir, filename=filename, parameters=parameters, attachments_dir="word")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[
attachments.append((attached_filename, binary_data))

need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
return self._content2attach_file(content=attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
return self._content2attach_file(content=attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis, parameters=parameters)

def __get_value_by_keys(self, data: dict, keys: List[str]) -> dict:
value = data
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[
self.logger.warning(f"{filename} is broken")

need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
return self._content2attach_file(content=attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)
return self._content2attach_file(content=attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis, parameters=parameters)

def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
attachments = []
Expand Down
26 changes: 4 additions & 22 deletions dedoc/attachments_handler/attachments_handler.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import copy
import logging
import os
import shutil
import tempfile
import time
from typing import List

Expand Down Expand Up @@ -47,8 +45,6 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct
if not AbstractAttachmentsExtractor.with_attachments(parameters) or recursion_deep_attachments < 0:
return parsed_attachment_files

self._handle_attachments(document=document, parameters=parameters)

previous_log_time = time.time()

for i, attachment in enumerate(document.attachments):
Expand All @@ -66,12 +62,12 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct

try:
if attachment.need_content_analysis:
with tempfile.TemporaryDirectory() as tmpdir:
attachment_path = os.path.join(tmpdir, attachment.get_original_filename())
shutil.copy(attachment.get_filename_in_path(), attachment_path)
parsed_file = document_parser.parse(attachment_path, parameters=parameters_copy)
parsed_file = document_parser.parse(attachment.get_filename_in_path(), parameters=parameters_copy)
else:
parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)

parsed_file.metadata.file_name = attachment.original_name # initial name of the attachment
parsed_file.metadata.temporary_file_name = os.path.split(attachment.get_filename_in_path())[-1] # actual name in the file system
except DedocError:
# return empty ParsedDocument with Meta information
parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)
Expand All @@ -80,20 +76,6 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct
parsed_attachment_files.append(parsed_file)
return parsed_attachment_files

def _handle_attachments(self, document: UnstructuredDocument, parameters: dict) -> None:
"""
Handle attached files, for example save it on disk or S3 storage.
This method can be redefined by other AttachmentHandler class.
"""
attachments_dir = parameters.get("attachments_dir")
if not attachments_dir:
return

for attachment in document.attachments:
new_path = os.path.join(attachments_dir, os.path.split(attachment.get_filename_in_path())[1])
shutil.move(attachment.get_filename_in_path(), new_path)
attachment.tmp_file_path = new_path

def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument: # noqa
attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path())
metadata = document_parser.document_metadata_extractor.extract_metadata(directory=attachment_dir,
Expand Down
4 changes: 4 additions & 0 deletions dedoc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
# number of parallel jobs in some tasks as OCR
n_jobs=1,

# --------------------------------------------GPU SETTINGS-------------------------------------------------------
# set gpu in XGBoost and torch models
on_gpu=False,

# ---------------------------------------------API SETTINGS---------------------------------------------------------
# max file size in bytes
max_content_length=512 * 1024 * 1024,
Expand Down
2 changes: 1 addition & 1 deletion dedoc/converters/concrete_converters/docx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def __init__(self, *, config: dict) -> None:

def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
"""
Checks if the document is docx-like, e.g. it has .doc or .odt extension.
Checks if the document is docx-like, e.g. it has .doc, .rtf or .odt extension.
"""
return extension.lower() in converted_extensions.docx_like_format or mime in converted_mimes.docx_like_format

Expand Down
2 changes: 1 addition & 1 deletion dedoc/dedoc_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str])
:return: parsed document
"""
if not os.path.isfile(path=file_path):
raise FileNotFoundError()
raise FileNotFoundError(file_path)
self.logger.info(f"Start handle {file_path}")
file_dir, file_name = os.path.split(file_path)
unique_filename = get_unique_name(file_name)
Expand Down
4 changes: 2 additions & 2 deletions dedoc/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

converted_extensions = Extensions(
excel_like_format=[".ods", "xls"],
docx_like_format=[".odt", ".doc"],
docx_like_format=[".odt", ".doc", ".rtf"],
pptx_like_format=[".odp", ".ppt"],
archive_like_format=[],
image_like_format=[".pcx", ".webp", ".sgi", ".hdr", ".sr", ".pic", ".dib", ".jfif", ".j2k"],
Expand All @@ -26,7 +26,7 @@
# .sgi, .hdr, .sr, .ras - not registered as MIME types
converted_mimes = Extensions(
excel_like_format=["application/vnd.oasis.opendocument.spreadsheet", "application/vnd.ms-excel"],
docx_like_format=["application/msword", "application/vnd.oasis.opendocument.text"],
docx_like_format=["application/msword", "application/vnd.oasis.opendocument.text", "application/rtf"],
pptx_like_format=[
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.ms-powerpoint",
Expand Down
43 changes: 26 additions & 17 deletions dedoc/readers/archive_reader/archive_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,58 +41,67 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
The method returns empty content for the archive; all of its content is placed inside attachments.
See the documentation of :meth:`~dedoc.readers.BaseReader.read` for information about the method's parameters.
"""
attachments = self.__get_attachments(path=path)
parameters = {} if parameters is None else parameters

with_attachments = str(parameters.get("with_attachments", "false")).lower() == "true"
if not with_attachments:
return UnstructuredDocument(lines=[], tables=[], attachments=[])

attachments_dir = parameters.get("attachments_dir", None)
attachments_dir = os.path.dirname(path) if attachments_dir is None else attachments_dir

need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
attachments = self.__get_attachments(path=path, tmp_dir=attachments_dir, need_content_analysis=need_content_analysis)
return UnstructuredDocument(lines=[], tables=[], attachments=attachments)

def __get_attachments(self, path: str) -> List[AttachedFile]:
tmp_dir = os.path.dirname(path)
def __get_attachments(self, path: str, tmp_dir: str, need_content_analysis: bool) -> List[AttachedFile]:
mime = get_file_mime_type(path)
if zipfile.is_zipfile(path) and mime == "application/zip":
return list(self.__read_zip_archive(path=path, tmp_dir=tmp_dir))
return list(self.__read_zip_archive(path=path, tmp_dir=tmp_dir, need_content_analysis=need_content_analysis))
if tarfile.is_tarfile(path):
return list(self.__read_tar_archive(path=path, tmp_dir=tmp_dir))
return list(self.__read_tar_archive(path=path, tmp_dir=tmp_dir, need_content_analysis=need_content_analysis))
if rarfile.is_rarfile(path):
return list(self.__read_rar_archive(path=path, tmp_dir=tmp_dir))
return list(self.__read_rar_archive(path=path, tmp_dir=tmp_dir, need_content_analysis=need_content_analysis))
if mime == "application/x-7z-compressed":
return list(self.__read_7z_archive(path=path, tmp_dir=tmp_dir))
return list(self.__read_7z_archive(path=path, tmp_dir=tmp_dir, need_content_analysis=need_content_analysis))
# if no one can handle this archive raise exception
raise BadFileFormatError(f"bad archive {path}")

def __read_zip_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]:
def __read_zip_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
try:
with zipfile.ZipFile(path, "r") as arch_file:
names = [member.filename for member in arch_file.infolist() if member.file_size > 0]
for name in names:
with arch_file.open(name) as file:
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file)
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
except (zipfile.BadZipFile, zlib.error) as e:
self.logger.warning(f"Can't read file {path} ({e})")
raise BadFileFormatError(f"Can't read file {path} ({e})")

def __read_tar_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]:
def __read_tar_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
with tarfile.open(path, "r") as arch_file:
names = [member.name for member in arch_file.getmembers() if member.isfile()]
for name in names:
file = arch_file.extractfile(name)
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file)
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
file.close()

def __read_rar_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]:
def __read_rar_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
with rarfile.RarFile(path, "r") as arch_file:
names = [item.filename for item in arch_file.infolist() if item.compress_size > 0]
for name in names:
with arch_file.open(name) as file:
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file)
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)

def __read_7z_archive(self, path: str, tmp_dir: str) -> Iterator[AttachedFile]:
def __read_7z_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
with open(path, "rb") as content:
arch_file = py7zlib.Archive7z(content)
names = arch_file.getnames()
for name in names:
file = arch_file.getmember(name)
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file)
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)

def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes]) -> AttachedFile:
def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes], need_content_analysis: bool) -> AttachedFile:
file_name = os.path.basename(file_name)
binary_data = file.read()
if isinstance(binary_data, str):
Expand All @@ -101,7 +110,7 @@ def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes]) ->
attachment = AttachedFile(
original_name=file_name,
tmp_file_path=os.path.join(tmp_dir, tmp_path),
need_content_analysis=True,
need_content_analysis=need_content_analysis,
uid=f"attach_{uuid.uuid1()}"
)
return attachment
16 changes: 10 additions & 6 deletions dedoc/readers/docx_reader/properties_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,10 @@ def change_indent(old_properties: BaseProperties, tree: Tag) -> None:
if not tree.ind:
return

attributes = {attribute: 0 for attribute in ["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"]}
attributes = {
attribute: 0 for attribute in
["firstLine", "firstLineChars", "hanging", "hangingChars", "start", "startChars", "left"]
}
for attribute in attributes:
attributes[attribute] = float(tree.ind.get(f"w:{attribute}", 0))

Expand Down Expand Up @@ -106,7 +109,8 @@ def change_size(old_properties: BaseProperties, tree: Tag) -> None:
:param tree: BeautifulSoup tree with properties
"""
if tree.sz:
old_properties.size = int(tree.sz.get("w:val", old_properties.size))
new_size = float(tree.sz.get("w:val", old_properties.size))
old_properties.size = int(new_size)


def change_jc(old_properties: BaseProperties, tree: Tag) -> None:
Expand Down Expand Up @@ -176,19 +180,19 @@ def change_spacing(old_properties: BaseProperties, tree: Tag) -> None:

if not before_autospacing:
before_lines = tree.spacing.get("w:beforeLines", False)
before_lines = int(before_lines) if before_lines else before_lines
before_lines = int(float(before_lines)) if before_lines else before_lines
if not before_lines:
before_tag = tree.spacing.get("w:before", False)
before = int(before_tag) if before_tag else before
before = int(float(before_tag)) if before_tag else before
else:
before = before_lines

if not after_autospacing:
after_lines = tree.spacing.get("w:afterLines", False)
after_lines = int(after_lines) if after_lines else after_lines
after_lines = int(float(after_lines)) if after_lines else after_lines
if not after_lines:
after_tag = tree.spacing.get("w:after", False)
after = int(after_tag) if after_tag else after
after = int(float(after_tag)) if after_tag else after
else:
after = after_lines

Expand Down
Loading

0 comments on commit d83bf23

Please sign in to comment.