diff --git a/.flake8 b/.flake8 index 401f544b..c0511db7 100644 --- a/.flake8 +++ b/.flake8 @@ -28,6 +28,7 @@ exclude = *__init__.py, resources, venv, + .venv, build, dedoc.egg-info, docs/_build, @@ -48,5 +49,5 @@ per-file-ignores = scripts/*:T201 scripts/benchmark_pdf_performance*:JS101 tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802 - docs/source/_static/code_examples/*:I251 + docs/source/_static/code_examples/*:I251,T201 docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251 diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 4cb468e8..262c3ba5 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -33,3 +33,4 @@ jobs: python dedoc_usage_tutorial.py python dedoc_add_new_doc_type_tutorial.py python dedoc_add_new_structure_type_tutorial.py + python dedoc_using_patterns_tutorial.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 09231202..2b5eae7a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: 5.0.4 hooks: - id: flake8 - exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py + exclude: \.github|.*__init__\.py|resources|docs|venv|\.venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py args: - "--config=.flake8" additional_dependencies: [ diff --git a/README.md b/README.md index f4a8c726..a14ad28b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # Dedoc +[![Telegram](https://img.shields.io/badge/chat-on%20Telegram-2ba2d9.svg)](https://t.me/dedoc_chat) [![image](https://img.shields.io/pypi/pyversions/dedoc.svg)](https://pypi.python.org/pypi/dedoc) [![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/) [![PyPI version](https://badge.fury.io/py/dedoc.svg)](https://badge.fury.io/py/dedoc) @@ -94,6 +95,12 @@ Relevant documentation of dedoc is available 
[here](https://dedoc.readthedocs.io * Article on habr.com [Dedoc: как автоматически извлечь из текстового документа всё и даже немного больше](https://habr.com/ru/companies/isp_ras/articles/779390/) in Russian (2023) * Article [Dedoc: A Universal System for Extracting Content and Logical Structure From Textual Documents](https://ieeexplore.ieee.org/abstract/document/10508151/) in English (2023) +# Join Our Community + +Have questions or want to discuss Dedoc? Join our [Telegram chat](https://t.me/dedoc_chat) and connect with the community and the developers. + +Join our [Telegram channel](https://t.me/dedoc_channel) to get notifications about the most recent updates. + # Installation instructions This project has a REST api and you can run it in Docker container. diff --git a/VERSION b/VERSION index b539adea..c0943d3e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.7 \ No newline at end of file +2.3 \ No newline at end of file diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py index 8f3e1415..d1f7d5cf 100644 --- a/dedoc/api/api_args.py +++ b/dedoc/api/api_args.py @@ -8,6 +8,7 @@ class QueryParameters: # type of document structure parsing document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain") + patterns: str = Form("", description='Patterns for default document type (when document_type="")') structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type") return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"], description="Response representation, most types (except json) are used for debug purposes only") @@ -39,6 +40,7 @@ class QueryParameters: '"no_change" - set vertical orientation of the document without using an orientation classifier') need_header_footer_analysis: str = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result") need_binarization: 
str = Form("false", enum=["true", "false"], description="Binarize document pages (for images or PDF without a textual layer)") + need_gost_frame_analysis: str = Form("false", enum=["true", "false"], description="Parameter for detecting and ignoring GOST frame of the document") # other formats handling delimiter: Optional[str] = Form(None, description="Column separator for CSV files") diff --git a/dedoc/api/schema/annotation.py b/dedoc/api/schema/annotation.py index 9add75dd..225de396 100644 --- a/dedoc/api/schema/annotation.py +++ b/dedoc/api/schema/annotation.py @@ -5,6 +5,16 @@ class Annotation(BaseModel): """ The piece of information about the text line: it's appearance or links to another document object. For example Annotation(1, 13, "italic", "True") says that text between 1st and 13th symbol was written in italic. + + :ivar start: start of the annotated text + :ivar end: end of the annotated text (end isn't included) + :ivar name: annotation's name, specific for each type of annotation + :ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc. + + :vartype start: int + :vartype end: int + :vartype name: str + :vartype value: str """ start: int = Field(description="Start of the annotated text", example=0) end: int = Field(description="End of the annotated text (end isn't included)", example=5) diff --git a/dedoc/api/schema/cell_with_meta.py b/dedoc/api/schema/cell_with_meta.py index efeb0fdf..05cb6f66 100644 --- a/dedoc/api/schema/cell_with_meta.py +++ b/dedoc/api/schema/cell_with_meta.py @@ -8,6 +8,16 @@ class CellWithMeta(BaseModel): """ Holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible). 
+ + :ivar lines: list of textual lines of the cell + :ivar colspan: number of columns to span (for cells merged horizontally) + :ivar rowspan: number of rows to span (for cells merged vertically) + :ivar invisible: indicator for displaying or hiding cell text - cells that are merged with others are hidden (for HTML display) + + :vartype lines: List[LineWithMeta] + :vartype colspan: int + :vartype rowspan: int + :vartype invisible: bool """ lines: List[LineWithMeta] = Field(description="Textual lines of the cell with annotations") rowspan: int = Field(description="Number of rows to span like in HTML format", example=1) diff --git a/dedoc/api/schema/document_content.py b/dedoc/api/schema/document_content.py index 5127650e..e9d8a47c 100644 --- a/dedoc/api/schema/document_content.py +++ b/dedoc/api/schema/document_content.py @@ -9,6 +9,12 @@ class DocumentContent(BaseModel): """ Content of the document - structured text and tables. + + :ivar tables: list of document tables + :ivar structure: tree structure of the document nodes with text and additional metadata + + :vartype tables: List[Table] + :vartype structure: TreeNode """ structure: TreeNode = Field(description="Tree structure where content of the document is organized") tables: List[Table] = Field(description="List of document tables") diff --git a/dedoc/api/schema/document_metadata.py b/dedoc/api/schema/document_metadata.py index 4d814fc3..fb45c075 100644 --- a/dedoc/api/schema/document_metadata.py +++ b/dedoc/api/schema/document_metadata.py @@ -4,6 +4,26 @@ class DocumentMetadata(BaseModel): """ Document metadata like its name, size, author, etc. 
+ + :ivar file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on) + :ivar temporary_file_name: file name during parsing (unique name after rename and conversion) + :ivar size: size of the original file in bytes + :ivar modified_time: time of the last modification in unix time format (seconds since the epoch) + :ivar created_time: time of the creation in unixtime + :ivar access_time: time of the last access to the file in unixtime + :ivar file_type: mime type of the file + :ivar uid: document unique identifier (useful for attached files) + + :vartype file_name: str + :vartype temporary_file_name: str + :vartype size: int + :vartype modified_time: int + :vartype created_time: int + :vartype access_time: int + :vartype file_type: str + :vartype uid: str + + Additional variables may be added with other file metadata. """ class Config: extra = Extra.allow diff --git a/dedoc/api/schema/line_metadata.py b/dedoc/api/schema/line_metadata.py index 37e893d8..e123f28d 100644 --- a/dedoc/api/schema/line_metadata.py +++ b/dedoc/api/schema/line_metadata.py @@ -6,10 +6,20 @@ class LineMetadata(BaseModel): """ Holds information about document node/line metadata, such as page number or line type. + + :ivar paragraph_type: type of the document line/paragraph (header, list_item, list, etc.) + :ivar page_id: page number where paragraph starts, the numeration starts from page 0 + :ivar line_id: line number inside the entire document, the numeration starts from line 0 + + :vartype paragraph_type: str + :vartype page_id: int + :vartype line_id: Optional[int] + + Additional variables may be added with other line metadata. 
""" class Config: extra = Extra.allow - paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text") + paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list, etc.)", example="raw_text") page_id: int = Field(description="Page number of the line/paragraph beginning", example=0) line_id: Optional[int] = Field(description="Line number", example=1) diff --git a/dedoc/api/schema/line_with_meta.py b/dedoc/api/schema/line_with_meta.py index 1c155ab5..a8f61b1d 100644 --- a/dedoc/api/schema/line_with_meta.py +++ b/dedoc/api/schema/line_with_meta.py @@ -8,6 +8,12 @@ class LineWithMeta(BaseModel): """ Textual line with text annotations. + + :ivar text: text of the line + :ivar annotations: text annotations (font, size, bold, italic, etc.) + + :vartype text: str + :vartype annotations: List[Annotation] """ text: str = Field(description="Text of the line", example="Some text") - annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic and etc)") + annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic, etc.)") diff --git a/dedoc/api/schema/parsed_document.py b/dedoc/api/schema/parsed_document.py index 076540a4..d4a7d846 100644 --- a/dedoc/api/schema/parsed_document.py +++ b/dedoc/api/schema/parsed_document.py @@ -9,6 +9,18 @@ class ParsedDocument(BaseModel): """ Holds information about the document content, metadata and attachments. + + :ivar content: document text (hierarchy of nodes) and tables + :ivar attachments: result of analysis of attached files (empty if with_attachments=False) + :ivar metadata: document metadata such as size, creation date and so on. 
+ :ivar warnings: list of warnings and possible errors, arising in the process of document parsing + :ivar version: version of the program that parsed this document + + :vartype content: DocumentContent + :vartype attachments: List[ParsedDocument] + :vartype metadata: DocumentMetadata + :vartype warnings: List[str] + :vartype version: str """ content: DocumentContent = Field(description="Document text and tables") metadata: DocumentMetadata = Field(description="Document metadata such as size, creation date and so on") diff --git a/dedoc/api/schema/table.py b/dedoc/api/schema/table.py index 52b2b59c..e834f1bf 100644 --- a/dedoc/api/schema/table.py +++ b/dedoc/api/schema/table.py @@ -11,6 +11,12 @@ class Table(BaseModel): Holds information about tables in the document. We assume that a table has rectangle form (has the same number of columns in each row). Table representation is row-based i.e. external list contains list of rows. + + :ivar metadata: table metadata as location, title and so on + :ivar cells: a list of lists of table cells (cell has text lines, colspan and rowspan attributes) + + :vartype metadata: TableMetadata + :vartype cells: List[List[CellWithMeta]] """ cells: List[List[CellWithMeta]] = Field(description="List of lists of table cells (cell has text, colspan and rowspan attributes)") metadata: TableMetadata = Field(description="Table meta information") diff --git a/dedoc/api/schema/table_metadata.py b/dedoc/api/schema/table_metadata.py index 53299a16..b75dbc21 100644 --- a/dedoc/api/schema/table_metadata.py +++ b/dedoc/api/schema/table_metadata.py @@ -6,6 +6,16 @@ class TableMetadata(BaseModel): """ Holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
+ + :ivar page_id: number of the page where table starts + :ivar uid: unique identifier of the table (used for linking table to text) + :ivar rotated_angle: value of the rotation angle by which the table was rotated during recognition + :ivar title: table's title + + :vartype page_id: Optional[int] + :vartype uid: str + :vartype rotated_angle: float + :vartype title: str """ page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0) uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f") diff --git a/dedoc/api/schema/tree_node.py b/dedoc/api/schema/tree_node.py index 5eeedd42..2aeeccae 100644 --- a/dedoc/api/schema/tree_node.py +++ b/dedoc/api/schema/tree_node.py @@ -10,6 +10,18 @@ class TreeNode(BaseModel): """ Helps to represent document as recursive tree structure. It has list of children `TreeNode` nodes (empty list for a leaf node). + + :ivar node_id: unique node identifier + :ivar text: text of the node (may contain several lines) + :ivar annotations: some metadata related to the part of the text (as font size) + :ivar metadata: metadata that refers to the entire node (as node type) + :ivar subparagraphs: list of children of this node + + :vartype node_id: str + :vartype text: str + :vartype annotations: List[Annotation] + :vartype metadata: LineMetadata + :vartype subparagraphs: List[TreeNode] """ node_id: str = Field(description="Document element identifier. It is unique within a document content tree. " "The identifier consists of numbers separated by dots where each number " diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index 423dbcfe..4362832a 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -28,7 +28,7 @@

Parameters configuration

Type of document structure parsing

-
document_type, structure_type, return_format +
document_type, patterns, structure_type, return_format

Type of document structure parsing

+

+

+ Patterns for default structure extractor (document_type="other")
+
+ +
+

+

+

+ +

@@ -213,4 +224,18 @@

Useful links

+ + + \ No newline at end of file diff --git a/dedoc/data_structures/annotation.py b/dedoc/data_structures/annotation.py index 2820f5a4..3cede658 100644 --- a/dedoc/data_structures/annotation.py +++ b/dedoc/data_structures/annotation.py @@ -7,6 +7,18 @@ class Annotation(Serializable): Base class for text annotations of all kinds. Annotation is the piece of information about the text line: it's appearance or links to another document object. Look to the concrete kind of annotations to get mode examples. + + :ivar start: start of the annotated text + :ivar end: end of the annotated text (end isn't included) + :ivar name: annotation's name, specific for each type of annotation + :ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc. + :ivar is_mergeable: is it possible to merge annotations with the same value + + :vartype start: int + :vartype end: int + :vartype name: str + :vartype value: str + :vartype is_mergeable: bool """ def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bool = True) -> None: @@ -20,11 +32,11 @@ def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bo :param value: information about annotated text :param is_mergeable: is it possible to merge annotations with the same value """ - self.start = start - self.end = end - self.name = name - self.value = value - self.is_mergeable = is_mergeable + self.start: int = start + self.end: int = end + self.name: str = name + self.value: str = value + self.is_mergeable: bool = is_mergeable def __eq__(self, o: object) -> bool: if not isinstance(o, Annotation): @@ -35,7 +47,7 @@ def __str__(self) -> str: return f"{self.name.capitalize()}({self.start}:{self.end}, {self.value})" def __repr__(self) -> str: - return f"{self.name.capitalize()}(...)" + return self.__str__() def to_api_schema(self) -> ApiAnnotation: return ApiAnnotation(start=self.start, end=self.end, name=self.name, value=self.value) diff 
--git a/dedoc/data_structures/attached_file.py b/dedoc/data_structures/attached_file.py index c838acd6..9aac4d09 100644 --- a/dedoc/data_structures/attached_file.py +++ b/dedoc/data_structures/attached_file.py @@ -1,18 +1,28 @@ class AttachedFile: """ Holds information about files, attached to the parsed document. + + :ivar original_name: original name of the attached file if it was possible to extract it + :ivar tmp_file_path: path to the attached file on disk - its name is different from original_name + :ivar need_content_analysis: does the attached file need parsing (enable recursive parsing in :class:`~dedoc.DedocManager`) + :ivar uid: unique identifier of the attached file + + :vartype original_name: str + :vartype tmp_file_path: str + :vartype need_content_analysis: bool + :vartype uid: str """ def __init__(self, original_name: str, tmp_file_path: str, need_content_analysis: bool, uid: str) -> None: """ - :param original_name: Name of the file from which the attachments are extracted - :param tmp_file_path: path to the attachment file. 
+ :param original_name: original name of the attached file + :param tmp_file_path: path to the attachment file :param need_content_analysis: indicator should we parse the attachment's content or simply save it without parsing :param uid: unique identifier of the attachment """ - self.original_name = original_name - self.tmp_file_path = tmp_file_path - self.need_content_analysis = need_content_analysis - self.uid = uid + self.original_name: str = original_name + self.tmp_file_path: str = tmp_file_path + self.need_content_analysis: bool = need_content_analysis + self.uid: str = uid def get_filename_in_path(self) -> str: return self.tmp_file_path diff --git a/dedoc/data_structures/cell_with_meta.py b/dedoc/data_structures/cell_with_meta.py index c2ff7cac..133d69bf 100644 --- a/dedoc/data_structures/cell_with_meta.py +++ b/dedoc/data_structures/cell_with_meta.py @@ -9,6 +9,16 @@ class CellWithMeta(Serializable): """ This class holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible). 
+ + :ivar lines: list of textual lines of the cell + :ivar colspan: number of columns to span (for cells merged horizontally) + :ivar rowspan: number of rows to span (for cells merged vertically) + :ivar invisible: indicator for displaying or hiding cell text - cells that are merged with others are hidden (for HTML display) + + :vartype lines: List[LineWithMeta] + :vartype colspan: int + :vartype rowspan: int + :vartype invisible: bool """ def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1, invisible: bool = False) -> None: """ @@ -17,10 +27,10 @@ def __init__(self, lines: List[LineWithMeta], colspan: int = 1, rowspan: int = 1 :param rowspan: number of rows to span like in HTML format :param invisible: indicator for displaying or hiding cell text """ - self.lines = lines - self.colspan = colspan - self.rowspan = rowspan - self.invisible = invisible + self.lines: List[LineWithMeta] = lines + self.colspan: int = colspan + self.rowspan: int = rowspan + self.invisible: bool = invisible def __repr__(self) -> str: return f"CellWithMeta({self.get_text()[:65]})" diff --git a/dedoc/data_structures/document_content.py b/dedoc/data_structures/document_content.py index ad4fa81e..4b420249 100644 --- a/dedoc/data_structures/document_content.py +++ b/dedoc/data_structures/document_content.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional from dedoc.api.schema.document_content import DocumentContent as ApiDocumentContent from dedoc.data_structures.serializable import Serializable @@ -9,16 +9,24 @@ class DocumentContent(Serializable): """ This class holds the document content - structured text and tables. 
+ + :ivar tables: list of document tables + :ivar structure: tree structure of the document nodes with text and additional metadata + :ivar warnings: list of warnings, obtained in the process of the document parsing + + :vartype tables: List[Table] + :vartype structure: TreeNode + :vartype warnings: List[str] """ - def __init__(self, tables: List[Table], structure: TreeNode, warnings: List[str] = None) -> None: + def __init__(self, tables: List[Table], structure: TreeNode, warnings: Optional[List[str]] = None) -> None: """ :param tables: list of document tables :param structure: tree structure in which content of the document is organized - :param warnings: list of warnings, obtained in the process of the document structure constructing + :param warnings: list of warnings """ - self.tables = tables - self.structure = structure - self.warnings = warnings if warnings is not None else [] + self.tables: List[Table] = tables + self.structure: TreeNode = structure + self.warnings: List[str] = warnings if warnings is not None else [] def to_api_schema(self) -> ApiDocumentContent: structure = self.structure.to_api_schema() diff --git a/dedoc/data_structures/document_metadata.py b/dedoc/data_structures/document_metadata.py index ec51d143..23f8a11f 100644 --- a/dedoc/data_structures/document_metadata.py +++ b/dedoc/data_structures/document_metadata.py @@ -1,4 +1,4 @@ -from typing import Dict, Union +from typing import Dict, Optional, Union from dedoc.api.schema.document_metadata import DocumentMetadata as ApiDocumentMetadata from dedoc.data_structures.serializable import Serializable @@ -7,6 +7,26 @@ class DocumentMetadata(Serializable): """ This class holds information about document metadata. 
+ + :ivar file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on) + :ivar temporary_file_name: file name during parsing (unique name after rename and conversion) + :ivar size: size of the original file in bytes + :ivar modified_time: time of the last modification in unix time format (seconds since the epoch) + :ivar created_time: time of the creation in unixtime + :ivar access_time: time of the last access to the file in unixtime + :ivar file_type: mime type of the file + :ivar uid: document unique identifier (useful for attached files) + + :vartype file_name: str + :vartype temporary_file_name: str + :vartype size: int + :vartype modified_time: int + :vartype created_time: int + :vartype access_time: int + :vartype file_type: str + :vartype uid: str + + Additional variables may be added with other file metadata. """ def __init__(self, @@ -17,30 +37,30 @@ def __init__(self, created_time: int, access_time: int, file_type: str, - uid: str = None, + uid: Optional[str] = None, **kwargs: Dict[str, Union[str, int, float]]) -> None: """ - :param uid: document unique identifier (useful for attached files) - :param file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on) - :param temporary_file_name: file name during parsing (unique name after rename and conversion); + :param uid: document unique identifier + :param file_name: original document name + :param temporary_file_name: file name during parsing :param size: size of the original file in bytes - :param modified_time: time of the last modification in unix time format (seconds since the epoch) + :param modified_time: time of the last modification in unix time format :param created_time: time of the creation in unixtime :param access_time: time of the last access to the file in unixtime :param file_type: mime type of the file """ import uuid - self.file_name = file_name - 
self.temporary_file_name = temporary_file_name - self.size = size - self.modified_time = modified_time - self.created_time = created_time - self.access_time = access_time - self.file_type = file_type + self.file_name: str = file_name + self.temporary_file_name: str = temporary_file_name + self.size: int = size + self.modified_time: int = modified_time + self.created_time: int = created_time + self.access_time: int = access_time + self.file_type: str = file_type for key, value in kwargs.items(): self.add_attribute(key, value) - self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid + self.uid: str = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid def add_attribute(self, key: str, value: Union[str, int, float]) -> None: setattr(self, key, value) diff --git a/dedoc/data_structures/hierarchy_level.py b/dedoc/data_structures/hierarchy_level.py index ab2ea053..f7964bc9 100644 --- a/dedoc/data_structures/hierarchy_level.py +++ b/dedoc/data_structures/hierarchy_level.py @@ -15,6 +15,16 @@ class HierarchyLevel: For the least important lines (line_type=raw_text) both levels are None. Look to the :ref:`hierarchy level description ` to get more details. + + :ivar level_1: value of a line's primary importance + :ivar level_2: level of the line inside specific class + :ivar can_be_multiline: is used to unify lines inside tree node, if line can be multiline, it can be joined with another line + :ivar line_type: type of the line, e.g. raw text, list item, header, etc. 
+ + :vartype level_1: Optional[int] + :vartype level_2: Optional[int] + :vartype can_be_multiline: bool + :vartype line_type: str """ root = "root" toc = "toc" @@ -33,14 +43,14 @@ def __init__(self, level_1: Optional[int], level_2: Optional[int], can_be_multil :param level_1: value of a line's primary importance :param level_2: level of the line inside specific class :param can_be_multiline: is used to unify lines inside tree node, if line can be multiline, it can be joined with another line - :param line_type: type of the line, e.g. raw text, list item, header, etc. + :param line_type: type of the line """ assert level_1 is None or level_1 >= 0 assert level_2 is None or level_2 >= 0 - self.level_1 = level_1 - self.level_2 = level_2 - self.can_be_multiline = can_be_multiline - self.line_type = line_type + self.level_1: Optional[int] = level_1 + self.level_2: Optional[int] = level_2 + self.can_be_multiline: bool = can_be_multiline + self.line_type: str = line_type def __is_defined(self, other: "HierarchyLevel") -> bool: return self.level_1 is not None and self.level_2 is not None and other.level_1 is not None and other.level_2 is not None diff --git a/dedoc/data_structures/line_metadata.py b/dedoc/data_structures/line_metadata.py index e9be87a3..9b255d5d 100644 --- a/dedoc/data_structures/line_metadata.py +++ b/dedoc/data_structures/line_metadata.py @@ -8,6 +8,20 @@ class LineMetadata(Serializable): """ This class holds information about document node (and document line) metadata, such as page number or line level in a document hierarchy. + + :ivar tag_hierarchy_level: the hierarchy level of the line with its type directly extracted by some of the readers + (usually information got from tags e.g. in docx or html readers) + :ivar hierarchy_level: the hierarchy level of the line extracted by some of the structure extractors - the result type and level of the line. 
+ The lower the level of the hierarchy, the closer it is to the root, it's used to construct document tree. + :ivar page_id: page number where paragraph starts, the numeration starts from page 0 + :ivar line_id: line number inside the entire document, the numeration starts from line 0 + + :vartype tag_hierarchy_level: HierarchyLevel + :vartype hierarchy_level: Optional[HierarchyLevel] + :vartype page_id: int + :vartype line_id: Optional[int] + + Additional variables may be added with other line metadata. """ def __init__(self, @@ -20,15 +34,12 @@ def __init__(self, :param page_id: page number where paragraph starts, the numeration starts from page 0 :param line_id: line number inside the entire document, the numeration starts from line 0 :param tag_hierarchy_level: the hierarchy level of the line with its type directly extracted by some of the readers - (usually information got from tags e.g. in docx or html readers) :param hierarchy_level: the hierarchy level of the line extracted by some of the structure extractors - the result type and level of the line. - The lower the level of the hierarchy, the closer it is to the root, it's used to construct document tree. 
""" - self.tag_hierarchy_level = HierarchyLevel(None, None, can_be_multiline=True, line_type=HierarchyLevel.unknown) \ - if tag_hierarchy_level is None else tag_hierarchy_level - self.hierarchy_level = hierarchy_level - self.page_id = page_id - self.line_id = line_id + self.tag_hierarchy_level: HierarchyLevel = HierarchyLevel.create_unknown() if tag_hierarchy_level is None else tag_hierarchy_level + self.hierarchy_level: Optional[HierarchyLevel] = hierarchy_level + self.page_id: int = page_id + self.line_id: Optional[int] = line_id for key, value in kwargs.items(): setattr(self, key, value) diff --git a/dedoc/data_structures/line_with_meta.py b/dedoc/data_structures/line_with_meta.py index 18f64e23..4642c8ff 100644 --- a/dedoc/data_structures/line_with_meta.py +++ b/dedoc/data_structures/line_with_meta.py @@ -119,18 +119,30 @@ def __extract_annotations_by_slice(self, start: int, stop: int) -> List[Annotati @property def line(self) -> str: + """ + Raw text of the document line + """ return self._line @property def metadata(self) -> LineMetadata: + """ + Line metadata related to the entire line, as line or page number, hierarchy level + """ return self._metadata @property def annotations(self) -> List[Annotation]: + """ + Metadata that refers to some part of the text, for example, font size, font type, etc. + """ return self._annotations @property def uid(self) -> str: + """ + Unique identifier of the line + """ return self._uid def set_line(self, line: str) -> None: @@ -140,8 +152,12 @@ def set_metadata(self, metadata: LineMetadata) -> None: self._metadata = metadata def __repr__(self) -> str: - return (f"LineWithMeta({self.line[:65]}, " - f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})") + text = self.line if len(self.line) < 65 else self.line[:62] + "..." 
+ tag_hl = "None" if self.metadata.tag_hierarchy_level is None else \ + f"{self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type}" + hl = "None" if self.metadata.hierarchy_level is None else \ + f"{self.metadata.hierarchy_level.level_1, self.metadata.hierarchy_level.level_2, self.metadata.hierarchy_level.line_type}" + return f"LineWithMeta({text.strip()}, tagHL={tag_hl}, HL={hl})" def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta": from dedoc.utils.annotation_merger import AnnotationMerger @@ -164,3 +180,12 @@ def __add__(self, other: Union["LineWithMeta", str]) -> "LineWithMeta": def to_api_schema(self) -> ApiLineWithMeta: annotations = [annotation.to_api_schema() for annotation in self.annotations] return ApiLineWithMeta(text=self._line, annotations=annotations) + + def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None: + import json + from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation + for annotation in self.annotations: + if annotation.name == "bounding box": + bbox, page_width, page_height = BBoxAnnotation.get_bbox_from_value(annotation.value) + bbox.shift(shift_x, shift_y) + annotation.value = json.dumps(bbox.to_relative_dict(image_width, image_height)) diff --git a/dedoc/data_structures/parsed_document.py b/dedoc/data_structures/parsed_document.py index 862b87d0..67a700cb 100644 --- a/dedoc/data_structures/parsed_document.py +++ b/dedoc/data_structures/parsed_document.py @@ -9,11 +9,21 @@ class ParsedDocument(Serializable): """ This class holds information about the document content, metadata and attachments. + + :ivar content: document text (hierarchy of nodes) and tables + :ivar attachments: result of analysis of attached files (empty if with_attachments=False) + :ivar metadata: document metadata such as size, creation date and so on. 
+ :ivar warnings: list of warnings and possible errors, arising in the process of document parsing + + :vartype content: DocumentContent + :vartype attachments: List[ParsedDocument] + :vartype metadata: DocumentMetadata + :vartype warnings: List[str] """ def __init__(self, metadata: DocumentMetadata, - content: Optional[DocumentContent], - warnings: List[str] = None, + content: DocumentContent, + warnings: Optional[List[str]] = None, attachments: Optional[List["ParsedDocument"]] = None) -> None: """ :param metadata: document metadata such as size, creation date and so on. @@ -21,10 +31,10 @@ def __init__(self, :param attachments: result of analysis of attached files :param warnings: list of warnings and possible errors, arising in the process of document parsing """ - self.metadata = metadata - self.content = content - self.attachments = [] if attachments is None else attachments - self.warnings = warnings if warnings is not None else [] + self.metadata: DocumentMetadata = metadata + self.content: DocumentContent = content + self.attachments: List["ParsedDocument"] = [] if attachments is None else attachments + self.warnings: List[str] = warnings if warnings is not None else [] def add_attachments(self, new_attachment: List["ParsedDocument"]) -> None: if self.attachments is None: diff --git a/dedoc/data_structures/table.py b/dedoc/data_structures/table.py index 65ac6d49..1f53bf55 100644 --- a/dedoc/data_structures/table.py +++ b/dedoc/data_structures/table.py @@ -10,15 +10,22 @@ class Table(Serializable): """ This class holds information about tables in the document. We assume that a table has rectangle form (has the same number of columns in each row). + If some cells are merged, they are duplicated and information about merge is stored in rowspan and colspan. Table representation is row-based i.e. external list contains list of rows. 
+ + :ivar metadata: table metadata as location, title and so on + :ivar cells: a list of lists of table cells (cell has text lines, colspan and rowspan attributes) + + :vartype metadata: TableMetadata + :vartype cells: List[List[CellWithMeta]] """ def __init__(self, cells: List[List[CellWithMeta]], metadata: TableMetadata) -> None: """ - :param cells: a list of lists of cells (cell has text, colspan and rowspan attributes) - :param metadata: some table metadata as location, size and so on + :param cells: a list of lists of cells + :param metadata: table metadata """ - self.metadata = metadata - self.cells = cells + self.metadata: TableMetadata = metadata + self.cells: List[List[CellWithMeta]] = cells def to_api_schema(self) -> ApiTable: cells = [[cell.to_api_schema() for cell in row] for row in self.cells] diff --git a/dedoc/data_structures/table_metadata.py b/dedoc/data_structures/table_metadata.py index e85c747e..cdcab2fb 100644 --- a/dedoc/data_structures/table_metadata.py +++ b/dedoc/data_structures/table_metadata.py @@ -7,20 +7,30 @@ class TableMetadata(Serializable): """ This class holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
+ + :ivar page_id: number of the page where table starts + :ivar uid: unique identifier of the table (used for linking table to text) + :ivar rotated_angle: value of the rotation angle by which the table was rotated during recognition + :ivar title: table's title + + :vartype page_id: Optional[int] + :vartype uid: str + :vartype rotated_angle: float + :vartype title: str """ def __init__(self, page_id: Optional[int], uid: Optional[str] = None, rotated_angle: float = 0.0, title: str = "") -> None: """ :param page_id: number of the page where table starts :param uid: unique identifier of the table - :param rotated_angle: value of the rotation angle by which the table was rotated during recognition + :param rotated_angle: rotation angle by which the table was rotated during recognition :param title: table's title """ import uuid - self.page_id = page_id - self.uid = str(uuid.uuid4()) if not uid else uid - self.rotated_angle = rotated_angle - self.title = title + self.page_id: Optional[int] = page_id + self.uid: str = str(uuid.uuid4()) if not uid else uid + self.rotated_angle: float = rotated_angle + self.title: str = title def to_api_schema(self) -> ApiTableMetadata: return ApiTableMetadata(uid=self.uid, page_id=self.page_id, rotated_angle=self.rotated_angle, title=self.title) diff --git a/dedoc/data_structures/tree_node.py b/dedoc/data_structures/tree_node.py index 6cde3554..de380584 100644 --- a/dedoc/data_structures/tree_node.py +++ b/dedoc/data_structures/tree_node.py @@ -12,6 +12,20 @@ class TreeNode(Serializable): """ TreeNode helps to represent document as recursive tree structure. It has parent node (None for root ot the tree) and list of children nodes (empty list for list node). 
+ + :ivar node_id: unique node identifier + :ivar text: text of the node (may contain several lines) + :ivar annotations: some metadata related to the part of the text (as font size) + :ivar metadata: metadata refers to entire node (as node type) + :ivar subparagraphs: list of child of this node + :ivar parent: parent node (None for root, not none for other nodes) + + :vartype node_id: str + :vartype text: str + :vartype annotations: List[Annotation] + :vartype metadata: LineMetadata + :vartype subparagraphs: List[TreeNode] + :vartype parent: TreeNode """ def __init__(self, node_id: str, @@ -23,17 +37,17 @@ def __init__(self, """ :param node_id: node id is unique in one document :param text: text of the node - :param annotations: some metadata related to the part of the text (as font size) - :param metadata: metadata refers to entire node (as node type) + :param annotations: metadata related to the part of the text + :param metadata: metadata refers to entire node :param subparagraphs: list of child of this node - :param parent: parent node (None for root, not none for other nodes) + :param parent: parent node """ - self.node_id = node_id - self.text = text - self.annotations = annotations - self.metadata = metadata - self.subparagraphs = subparagraphs - self.parent = parent + self.node_id: str = node_id + self.text: str = text + self.annotations: List[Annotation] = annotations + self.metadata: LineMetadata = metadata + self.subparagraphs: List["TreeNode"] = subparagraphs + self.parent: "TreeNode" = parent def to_api_schema(self) -> ApiTreeNode: annotations = [annotation.to_api_schema() for annotation in self.annotations] diff --git a/dedoc/data_structures/unstructured_document.py b/dedoc/data_structures/unstructured_document.py index 94197e2e..256ef8db 100644 --- a/dedoc/data_structures/unstructured_document.py +++ b/dedoc/data_structures/unstructured_document.py @@ -9,12 +9,24 @@ class UnstructuredDocument: """ This class holds information about raw document 
content: its text, tables and attachments, that have been procured using one of the readers. Text is represented as a flat list of lines, hierarchy level of each line isn't defined (only tag hierarchy level may exist). + + :ivar lines: list of textual lines with metadata returned by a reader + :ivar tables: list of document tables returned by a reader + :ivar attachments: list of document attached files + :ivar metadata: information about the document (like in :class:`~dedoc.data_structures.DocumentMetadata`) + :ivar warnings: list of warnings, obtained in the process of the document parsing + + :vartype lines: List[LineWithMeta] + :vartype tables: List[Table] + :vartype attachments: List[AttachedFile] + :vartype metadata: dict + :vartype warnings: List[str] """ def __init__(self, tables: List[Table], lines: List[LineWithMeta], attachments: List[AttachedFile], - warnings: List[str] = None, + warnings: Optional[List[str]] = None, metadata: Optional[dict] = None) -> None: """ :param tables: list of document tables @@ -23,11 +35,11 @@ def __init__(self, :param warnings: list of warnings, obtained in the process of the document parsing :param metadata: additional data """ - self.tables = tables - self.lines = lines - self.attachments = attachments - self.warnings = warnings if warnings else [] - self.metadata = metadata if metadata is not None else {} + self.tables: List[Table] = tables + self.lines: List[LineWithMeta] = lines + self.attachments: List[AttachedFile] = attachments + self.warnings: List[str] = warnings if warnings else [] + self.metadata: dict = metadata if metadata is not None else {} def get_text(self) -> str: return LineWithMeta.join(self.lines).line diff --git a/dedoc/readers/docx_reader/line_with_meta_converter.py b/dedoc/readers/docx_reader/line_with_meta_converter.py index eb65b3eb..ead068d7 100644 --- a/dedoc/readers/docx_reader/line_with_meta_converter.py +++ b/dedoc/readers/docx_reader/line_with_meta_converter.py @@ -64,4 +64,4 @@ def 
__get_tag(self, paragraph: Paragraph) -> HierarchyLevel: if paragraph.list_level is not None: return HierarchyLevel(2, paragraph.list_level, False, HierarchyLevel.list_item) - return HierarchyLevel(None, None, True, HierarchyLevel.unknown) + return HierarchyLevel.create_unknown() diff --git a/dedoc/readers/docx_reader/numbering_extractor.py b/dedoc/readers/docx_reader/numbering_extractor.py index 4378adab..35b7e85d 100644 --- a/dedoc/readers/docx_reader/numbering_extractor.py +++ b/dedoc/readers/docx_reader/numbering_extractor.py @@ -104,6 +104,7 @@ def __get_list_item_text(self, ilvl: str, num_id: str) -> str: if abstract_num_id in self.state.prev_ilvl_dict: prev_ilvl = self.state.prev_ilvl_dict[abstract_num_id] + restarted = False # startOverride: if lvl_info.restart: if abstract_num_id in self.state.prev_num_id_dict: @@ -112,13 +113,15 @@ def __get_list_item_text(self, ilvl: str, num_id: str) -> str: prev_num_id = None if prev_num_id and prev_num_id != num_id: self.state.numerations_dict[(abstract_num_id, ilvl)] = lvl_info.start + restarted = True - # it's a new deeper level - if prev_ilvl < ilvl and lvl_info.lvl_restart or (abstract_num_id, ilvl) not in self.state.numerations_dict: - self.state.numerations_dict[(abstract_num_id, ilvl)] = lvl_info.start - # it's a continue of the old level (current level <= previous level) - else: - self.state.numerations_dict[(abstract_num_id, ilvl)] += 1 + if not restarted: + # it's a new deeper level + if prev_ilvl < ilvl and lvl_info.lvl_restart or (abstract_num_id, ilvl) not in self.state.numerations_dict: + self.state.numerations_dict[(abstract_num_id, ilvl)] = lvl_info.start + # it's a continue of the old level (current level <= previous level) + else: + self.state.numerations_dict[(abstract_num_id, ilvl)] += 1 # there isn't the information about this list else: diff --git a/dedoc/readers/pdf_reader/data_classes/line_with_location.py b/dedoc/readers/pdf_reader/data_classes/line_with_location.py index 190c1d87..9babf0d6 
100644 --- a/dedoc/readers/pdf_reader/data_classes/line_with_location.py +++ b/dedoc/readers/pdf_reader/data_classes/line_with_location.py @@ -13,9 +13,13 @@ def __init__(self, line: str, metadata: LineMetadata, annotations: List[Annotati self.order = order super().__init__(line, metadata, annotations, uid) + def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None: + super().shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) + self.location.shift(shift_x, shift_y) + def __repr__(self) -> str: - text = self.line if len(self.line) < 65 else self.line[:62] + "..." - return f"LineWithLocation({text[:65]})" + parent_repr = super().__repr__() + return parent_repr.replace("LineWithMeta", "LineWithLocation") def __str__(self) -> str: return self.__repr__() diff --git a/dedoc/readers/pdf_reader/data_classes/tables/cell.py b/dedoc/readers/pdf_reader/data_classes/tables/cell.py index 9308fded..dd7ece41 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/cell.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/cell.py @@ -30,6 +30,17 @@ def copy_from(cell: "Cell", uid=cell.cell_uid, contour_coord=cell.con_coord) + def shift(self, shift_x: int, shift_y: int, image_width: int, image_height: int) -> None: + if self.lines: + for line in self.lines: + line.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) + self.x_top_left += shift_x + self.x_bottom_right += shift_x + self.y_top_left += shift_y + self.y_bottom_right += shift_y + if self.con_coord: + self.con_coord.shift(shift_x=shift_x, shift_y=shift_y) + def __init__(self, x_top_left: int, x_bottom_right: int, diff --git a/dedoc/readers/pdf_reader/data_classes/tables/location.py b/dedoc/readers/pdf_reader/data_classes/tables/location.py index 27d053a5..42bc468d 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/location.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/location.py @@ -12,6 +12,9 @@ 
def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle: self.name = name self.rotated_angle = rotated_angle + def shift(self, shift_x: int, shift_y: int) -> None: + self.bbox.shift(shift_x, shift_y) + def to_dict(self) -> Dict[str, Any]: from collections import OrderedDict diff --git a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py index eb7c933d..284f9437 100644 --- a/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py +++ b/dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py @@ -11,7 +11,7 @@ class PdfAutoReader(BaseReader): :class:`~dedoc.readers.PdfAutoReader` is used for automatic detection of a correct textual layer in the given PDF file: - * if PDF document has a correct textual layer then :class:`~dedoc.readers.PdfTxtLayerReader` or :class:`~dedoc.readers.PdfTabbyReader` is used \ + * if PDF document has a correct textual layer then :class:`~dedoc.readers.PdfTxtlayerReader` or :class:`~dedoc.readers.PdfTabbyReader` is used \ for document content extraction; * if PDF document doesn't have a correct textual layer then :class:`~dedoc.readers.PdfImageReader` is used for document content extraction. 
diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 839b5006..0b34ce72 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -1,7 +1,9 @@ from abc import abstractmethod from collections import namedtuple -from typing import Iterator, List, Optional, Set, Tuple +from typing import Dict, Iterator, List, Optional, Set, Tuple +import numpy as np +from dedocutils.data_structures.bbox import BBox from numpy import ndarray from dedoc.common.exceptions.bad_file_error import BadFileFormatError @@ -11,6 +13,7 @@ from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer ParametersForParseDoc = namedtuple("ParametersForParseDoc", [ "orient_analysis_cells", @@ -26,7 +29,9 @@ "table_type", "with_attachments", "attachments_dir", - "need_content_analysis" + "need_content_analysis", + "need_gost_frame_analysis", + "pdf_with_txt_layer" ]) @@ -50,11 +55,13 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti self.attachment_extractor = PDFAttachmentsExtractor(config=self.config) self.linker = LineObjectLinker(config=self.config) self.paragraph_extractor = ScanParagraphClassifierExtractor(config=self.config) + self.gost_frame_recognizer = GOSTFrameRecognizer(config=self.config) def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. - This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. 
+ This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata` + (``can_be_multiline`` attribute is important for paragraph extraction). Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. @@ -78,7 +85,9 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure table_type=param_utils.get_param_table_type(parameters), with_attachments=param_utils.get_param_with_attachments(parameters), attachments_dir=param_utils.get_param_attachments_dir(parameters, file_path), - need_content_analysis=param_utils.get_param_need_content_analysis(parameters) + need_content_analysis=param_utils.get_param_need_content_analysis(parameters), + need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters), + pdf_with_txt_layer=param_utils.get_param_pdf_with_txt_layer(parameters) ) lines, scan_tables, attachments, warnings, metadata = self._parse_document(file_path, params_for_parse) @@ -94,18 +103,23 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( Tuple)[List[LineWithMeta], List[ScanTable], List[PdfImageAttachment], List[str], Optional[dict]]: import math from joblib import Parallel, delayed + from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.readers.pdf_reader.utils.header_footers_analysis import footer_header_analysis - from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor from dedoc.utils.pdf_utils import get_pdf_page_count + from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader from dedoc.utils.utils import flatten first_page = 0 if 
parameters.first_page is None or parameters.first_page < 0 else parameters.first_page last_page = math.inf if parameters.last_page is None else parameters.last_page images = self._get_images(path, first_page, last_page) - result = Parallel(n_jobs=self.config["n_jobs"])( - delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, image in enumerate(images, start=first_page) - ) + if parameters.need_gost_frame_analysis and isinstance(self, (PdfImageReader, PdfTxtlayerReader)): + result, gost_analyzed_images = self._process_document_with_gost_frame(images=images, first_page=first_page, parameters=parameters, path=path) + else: + result = Parallel(n_jobs=self.config["n_jobs"])( + delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, image in enumerate(images, start=first_page) + ) page_count = get_pdf_page_count(path) page_count = math.inf if page_count is None else page_count @@ -126,19 +140,64 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> ( lines = [lines for lines, _, _, _ in result] lines, headers, footers = footer_header_analysis(lines) all_lines = list(flatten(lines)) + if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader): + self._shift_all_contents(lines=all_lines, unref_tables=unref_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images) mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines) all_lines_with_links = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=attachments) - prev_line = None for line in all_lines_with_links: - line.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line) - prev_line = line + line.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown() all_lines_with_paragraphs = self.paragraph_extractor.extract(all_lines_with_links) if page_angles: metadata["rotated_page_angles"] = page_angles 
return all_lines_with_paragraphs, mp_tables, attachments, warnings, metadata + def _process_document_with_gost_frame(self, images: Iterator[np.ndarray], first_page: int, parameters: ParametersForParseDoc, path: str) -> \ + Tuple[Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]], Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]]: + from joblib import Parallel, delayed + from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader + + gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images) + page_range = range(first_page, first_page + len(gost_analyzed_images)) + gost_analyzed_images = dict(zip(page_range, gost_analyzed_images)) + if isinstance(self, PdfTxtlayerReader): + self.gost_frame_boxes = dict(zip(page_range, [item[1] for item in gost_analyzed_images.values()])) + result = Parallel(n_jobs=self.config["n_jobs"])( + delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in + gost_analyzed_images.items() + ) + return result, gost_analyzed_images + + def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment], + gost_analyzed_images: Dict[int, Tuple[np.ndarray, BBox, Tuple[int, ...]]]) -> None: + # shift unref_tables + for scan_table in unref_tables: + for location in scan_table.locations: + table_page_number = location.page_number + location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left) + page_number = scan_table.locations[0].page_number + for row in scan_table.matrix_cells: + for cell in row: + image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0] + shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, 
gost_analyzed_images[page_number][1].y_top_left) + cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) + + # shift attachments + for attachment in attachments: + attachment_page_number = attachment.location.page_number + shift_x, shift_y = gost_analyzed_images[attachment_page_number][1].x_top_left, gost_analyzed_images[attachment_page_number][1].y_top_left + attachment.location.shift(shift_x, shift_y) + + # shift lines + for line in lines: + page_number = line.metadata.page_id + image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0] + line.shift(shift_x=gost_analyzed_images[page_number][1].x_top_left, + shift_y=gost_analyzed_images[page_number][1].y_top_left, + image_width=image_width, + image_height=image_height) + @abstractmethod def _process_one_page(self, image: ndarray, parameters: ParametersForParseDoc, page_number: int, path: str) \ -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]: diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/gost_frame_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/gost_frame_recognizer.py new file mode 100644 index 00000000..53201a1a --- /dev/null +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/gost_frame_recognizer.py @@ -0,0 +1,43 @@ +import logging +from typing import Optional, Tuple + +import cv2 +import numpy as np +from dedocutils.data_structures import BBox + +from dedoc.readers.pdf_reader.data_classes.tables.table_tree import TableTree +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_utils.img_processing import detect_horizontal_and_vertical_lines as detect_lines + +MIN_FRAME_CONTENT_AREA = 0.65 + + +class GOSTFrameRecognizer: + def __init__(self, *, config: dict = None) -> None: + self.logger = config.get("logger", logging.getLogger()) + self.config = config + + def rec_and_clean_frame(self, image: 
np.ndarray) -> Tuple[np.ndarray, BBox, Tuple[int, ...]]: + if len(image.shape) < 3: # check if an image is already converted to grayscale + thresh, img_bin = cv2.threshold(image, 225, 255, cv2.THRESH_BINARY) + else: + thresh, img_bin = cv2.threshold(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), 225, 255, cv2.THRESH_BINARY) + lines_bin = detect_lines(255 - img_bin, self.config, "tables") + contours, hierarchy = cv2.findContours(lines_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_TC89_KCOS) + tree_table = TableTree.parse_contours_to_tree(contours=contours, hierarchy=hierarchy, config=self.config) + + img_area = image.shape[0] * image.shape[1] + has_gost_frame, main_box = self._analyze_cells_on_frame(tree_table, img_area) + if has_gost_frame: + return BBox.crop_image_by_box(image, main_box), main_box, (int(image.shape[0]), int(image.shape[1])) + return image, BBox(0, 0, image.shape[1], image.shape[0]), (int(image.shape[0]), int(image.shape[1])) + + def _analyze_cells_on_frame(self, tree_table: "TableTree", img_area: "int") -> Tuple[bool, Optional[BBox]]: + try: + sub_bboxes = tree_table.children[0].children + for box in sub_bboxes: + if box.cell_box.square / img_area > MIN_FRAME_CONTENT_AREA: + return True, box.cell_box + return False, None + except Exception as ex: + self.logger.warning(ex) + return False, None diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py index f8cda475..c060d9d6 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_utils/img_processing.py @@ -83,7 +83,7 @@ def get_contours_cells(img: np.ndarray, table_type: str, *, config: dict) -> [An if config.get("debug_mode", False): cv2.imwrite(os.path.join(get_path_param(config, "path_detect"), "image_bin.jpg"), img_bin) # step 2 - img_final_bin = 
__detect_horizontal_and_vertical_lines(img_bin, config, "tables") + img_final_bin = detect_horizontal_and_vertical_lines(img_bin, config, "tables") # step 3 img_final_bin_houph, angle_alignment = __apply_houph_lines_and_detect_angle(img_final_bin, config) @@ -182,7 +182,7 @@ def __apply_houph_lines_and_detect_angle(image: np.ndarray, config: dict) -> [np return img_final_bin_houph, angle_alignment -def __detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray: +def detect_horizontal_and_vertical_lines(img_bin: np.ndarray, config: dict, task: str) -> np.ndarray: # Defining a kernel length if task == "orientation": diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index 4eaed54c..5d55496b 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -56,10 +56,18 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments. 
""" + import tempfile from dedoc.utils.parameter_utils import get_param_with_attachments parameters = {} if parameters is None else parameters warnings = [] - lines, tables, tables_on_images, attachments, document_metadata = self.__extract(path=file_path, parameters=parameters, warnings=warnings) + + with tempfile.TemporaryDirectory() as tmp_dir: + lines, tables, tables_on_images, attachments, document_metadata = self.__extract( + path=file_path, + parameters=parameters, + warnings=warnings, + tmp_dir=tmp_dir + ) lines = self.linker.link_objects(lines=lines, tables=tables_on_images, images=attachments) if get_param_with_attachments(parameters) and self.attachment_extractor.can_extract(file_path): @@ -71,7 +79,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure return self._postprocess(result) - def __extract(self, path: str, parameters: dict, warnings: list)\ + def __extract(self, path: str, parameters: dict, warnings: list, tmp_dir: str)\ -> Tuple[List[LineWithMeta], List[Table], List[ScanTable], List[PdfImageAttachment], Optional[dict]]: import math from dedoc.utils.pdf_utils import get_pdf_page_count @@ -102,7 +110,7 @@ def __extract(self, path: str, parameters: dict, warnings: list)\ first_tabby_page = first_page + 1 if first_page is not None else 1 last_tabby_page = page_count if (last_page is None) or (last_page is not None and last_page > page_count) else last_page self.logger.info(f"Reading PDF pages from {first_tabby_page} to {last_tabby_page}") - document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page) + document = self.__process_pdf(path=path, start_page=first_tabby_page, end_page=last_tabby_page, tmp_dir=tmp_dir) pages = document.get("pages", []) for page in pages: @@ -215,7 +223,6 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith lines = [] page_number, page_width, page_height = page["number"], int(page["width"]), int(page["height"]) - prev_line = 
None labeling_mode = self.config.get("labeling_mode", False) for block in page["blocks"]: @@ -261,15 +268,13 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith uid=uid, location=Location(bbox=bbox, page_number=page_number), order=order) - line_with_location.metadata.tag_hierarchy_level = self.__get_tag(line_with_location, prev_line, meta) - prev_line = line_with_location + line_with_location.metadata.tag_hierarchy_level = self.__get_tag(line_with_location, meta) lines.append(line_with_location) return lines - def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_type: str) -> HierarchyLevel: - from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor + def __get_tag(self, line: LineWithMeta, line_type: str) -> HierarchyLevel: from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth if line_type == HierarchyLevel.header: @@ -278,18 +283,18 @@ def __get_tag(self, line: LineWithMeta, prev_line: Optional[LineWithMeta], line_ return HierarchyLevel(1, header_level, False, line_type) if line_type == "litem": # TODO automatic list depth and merge list items from multiple lines - return DefaultStructureExtractor.get_hl_list_using_regexp(line, prev_line) + return HierarchyLevel(None, None, False, HierarchyLevel.list_item) - return HierarchyLevel(None, None, True, line_type) + return HierarchyLevel.create_unknown() def __jar_path(self) -> str: import os return os.environ.get("TABBY_JAR", self.default_config["JAR_PATH"]) - def __run(self, path: str = None, encoding: str = "utf-8", start_page: int = None, end_page: int = None) -> bytes: + def __run(self, path: str, tmp_dir: str, encoding: str = "utf-8", start_page: int = None, end_page: int = None) -> bytes: import subprocess - args = ["java"] + ["-jar", self.__jar_path(), "-i", path] + args = ["java"] + ["-jar", self.__jar_path(), "-i", path, "-tmp", 
f"{tmp_dir}/"] if start_page is not None and end_page is not None: args += ["-sp", str(start_page), "-ep", str(end_page)] try: @@ -302,12 +307,14 @@ def __run(self, path: str = None, encoding: str = "utf-8", start_page: int = Non except subprocess.CalledProcessError as e: raise TabbyPdfError(e.stderr.decode(encoding)) - def __process_pdf(self, path: str, start_page: int = None, end_page: int = None) -> dict: + def __process_pdf(self, path: str, tmp_dir: str, start_page: int = None, end_page: int = None) -> dict: import json + import os + + self.__run(path=path, start_page=start_page, end_page=end_page, tmp_dir=tmp_dir) + with open(os.path.join(tmp_dir, "data.json"), "r") as response: + document = json.load(response) - output = self.__run(path=path, start_page=start_page, end_page=end_page) - response = output.decode("UTF-8") - document = json.loads(response) if response else {} return document def _process_one_page(self, diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index d7bb2b6a..46528fcd 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -4,6 +4,7 @@ from numpy import ndarray from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation +from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader @@ -58,6 +59,11 @@ def _process_one_page(self, page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number, parameters=parameters) if page is None: return [], [], [], [] + if parameters.need_gost_frame_analysis: + page_shift = 
self.gost_frame_boxes[page_number] + self._move_table_cells(tables=tables, page_shift=page_shift, page=page) + readable_block = page_shift # bbox representing the content of the gost frame + page.bboxes = [bbox for bbox in page.bboxes if self._inside_any_unreadable_block(bbox.bbox, [readable_block])] # exclude boxes outside the frame unreadable_blocks = [location.bbox for table in tables for location in table.locations] page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)] lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False) @@ -65,6 +71,19 @@ def _process_one_page(self, return lines, tables, page.attachments, [] + def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: PageWithBBox) -> None: + """ + Move tables back to original coordinates when parsing a document containing a gost frame + """ + for table in tables: + shift_x, shift_y = page_shift.x_top_left, page_shift.y_top_left # shift tables to original coordinates + for location in table.locations: + location.bbox.shift(shift_x=shift_x, shift_y=shift_y) + for row in table.matrix_cells: + for cell in row: + image_width, image_height = page.pdf_page_width, page.pdf_page_height + cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height) + def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None: """ Change table boxes' width height into pdf space like textual lines diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar index 61612f8b..5dc90bf1 100644 Binary files a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/ispras_tbl_extr.jar differ diff --git 
a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-annotations-2.17.2.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-annotations-2.17.2.jar new file mode 100644 index 00000000..c13bcb91 Binary files /dev/null and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-annotations-2.17.2.jar differ diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-core-2.17.2.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-core-2.17.2.jar new file mode 100644 index 00000000..34be9026 Binary files /dev/null and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-core-2.17.2.jar differ diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-databind-2.17.2.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-databind-2.17.2.jar new file mode 100644 index 00000000..3750b8c1 Binary files /dev/null and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/jackson-databind-2.17.2.jar differ diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/plexus-utils-1.1.jar b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/plexus-utils-1.1.jar new file mode 100644 index 00000000..5c50e177 Binary files /dev/null and b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/tabbypdf/jars/plexus-utils-1.1.jar differ diff --git a/dedoc/readers/pdf_reader/utils/line_object_linker.py b/dedoc/readers/pdf_reader/utils/line_object_linker.py index 2ddfc064..8c612e24 100644 --- a/dedoc/readers/pdf_reader/utils/line_object_linker.py +++ b/dedoc/readers/pdf_reader/utils/line_object_linker.py @@ -34,7 +34,7 @@ def link_objects(self, lines: List[LineWithLocation], tables: List[ScanTable], i @return: """ if len(lines) == 0: - metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_raw_text(), page_id=0, line_id=0) + metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel.create_unknown(), page_id=0, 
line_id=0) lines = [LineWithLocation(line="", metadata=metadata, annotations=[], location=Location(page_number=0, bbox=BBox(0, 0, 1, 1)))] last_page_line = self._get_last_page_line(lines) all_objects = list(lines + tables + images) diff --git a/dedoc/readers/pptx_reader/paragraph.py b/dedoc/readers/pptx_reader/paragraph.py index 129ac3a3..6953454c 100644 --- a/dedoc/readers/pptx_reader/paragraph.py +++ b/dedoc/readers/pptx_reader/paragraph.py @@ -33,7 +33,7 @@ def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta: text = "" paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level) - hierarchy_level = HierarchyLevel.create_raw_text() + hierarchy_level = HierarchyLevel.create_unknown() if is_title or paragraph_properties.title: hierarchy_level = HierarchyLevel(line_type=HierarchyLevel.header, level_1=1, level_2=self.level, can_be_multiline=False) diff --git a/dedoc/readers/txt_reader/raw_text_reader.py b/dedoc/readers/txt_reader/raw_text_reader.py index 2cb13f6d..01c6caa2 100644 --- a/dedoc/readers/txt_reader/raw_text_reader.py +++ b/dedoc/readers/txt_reader/raw_text_reader.py @@ -33,7 +33,7 @@ def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ - This method returns only document lines, some types of the lines (e.g. `list_item`) may be found using regular expressions. + This method returns only document lines. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. 
""" parameters = {} if parameters is None else parameters @@ -54,15 +54,14 @@ def __get_encoding(self, path: str, parameters: dict) -> str: def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]: import time from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation + from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_metadata import LineMetadata - from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor from dedoc.utils.utils import calculate_file_hash lines = [] file_hash = calculate_file_hash(path=path) number_of_empty_lines = 0 previous_log_time = time.time() - prev_line = None for line_id, line in self.__get_lines(path=path, encoding=encoding): if time.time() - previous_log_time > 5: @@ -76,14 +75,10 @@ def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]: indent_annotation = self.__get_indent_annotation(line) line_with_meta = LineWithMeta(line=line, metadata=metadata, annotations=[spacing_annotation, indent_annotation], uid=uid) - line_with_meta.metadata.tag_hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line_with_meta, prev_line) - prev_line = line_with_meta + line_with_meta.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown() lines.append(line_with_meta) - if line.isspace(): - number_of_empty_lines += 1 - else: - number_of_empty_lines = 0 + number_of_empty_lines = number_of_empty_lines + 1 if line.isspace() else 0 return lines @@ -113,15 +108,9 @@ def __get_starting_spacing(self, line: Optional[LineWithMeta]) -> int: return space_this.end() - space_this.start() def __is_paragraph(self, line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> bool: - from dedoc.data_structures.hierarchy_level import HierarchyLevel - - if not line.metadata.tag_hierarchy_level.can_be_multiline and \ - line.metadata.tag_hierarchy_level.line_type not 
in (HierarchyLevel.raw_text, HierarchyLevel.unknown): - return True space_this = self.__get_starting_spacing(line) space_prev = self.__get_starting_spacing(previous_line) - return line.metadata.tag_hierarchy_level.line_type in (HierarchyLevel.raw_text, HierarchyLevel.unknown) \ - and not line.line.isspace() and space_this - space_prev >= 2 + return not line.line.isspace() and space_this - space_prev >= 2 def _postprocess(self, document: UnstructuredDocument) -> UnstructuredDocument: previous_line = None diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py index 3bfaeb21..8b92f88f 100644 --- a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py +++ b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py @@ -1,9 +1,11 @@ -from typing import List, Optional +from typing import Optional +from dedoc.common.exceptions.structure_extractor_error import StructureExtractorError from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.line_with_meta import LineWithMeta from dedoc.data_structures.unstructured_document import UnstructuredDocument from dedoc.structure_extractors.abstract_structure_extractor import AbstractStructureExtractor +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern +from dedoc.structure_extractors.patterns.pattern_composition import PatternComposition class DefaultStructureExtractor(AbstractStructureExtractor): @@ -12,82 +14,68 @@ class DefaultStructureExtractor(AbstractStructureExtractor): You can find the description of this type of structure in the section :ref:`other_structure`. 
""" - from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.prefix import LinePrefix - document_type = "other" - prefix_list: List[LinePrefix] = [DottedPrefix, BracketPrefix, AnyLetterPrefix, BulletPrefix] - def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument: """ Extract basic structure from the given document and add additional information to the lines' metadata. To get the information about the method's parameters look at the documentation of the class \ :class:`~dedoc.structure_extractors.AbstractStructureExtractor`. + + ``parameters`` parameter can contain patterns for configuring lines types and their levels in the output document tree ("patterns" key). + Please see :ref:`dedoc_structure_extractors_patterns` and :ref:`using_patterns` to get information how to use patterns for making your custom structure. 
""" - previous_line = None + parameters = {} if parameters is None else parameters + pattern_composition = self.__get_pattern_composition(parameters) for line in document.lines: - if line.metadata.tag_hierarchy_level is None: - line.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown() - - if line.metadata.tag_hierarchy_level.line_type == HierarchyLevel.unknown: - line.metadata.hierarchy_level = self.get_hl_list_using_regexp(line, previous_line) - else: - line.metadata.hierarchy_level = self.__get_hl_with_tag(line) - - assert line.metadata.hierarchy_level is not None - if line.metadata.hierarchy_level.line_type != HierarchyLevel.raw_text: - previous_line = line - + line.metadata.hierarchy_level = pattern_composition.get_hierarchy_level(line=line) return document - def __get_hl_with_tag(self, line: LineWithMeta) -> HierarchyLevel: - assert line.metadata.tag_hierarchy_level is not None - level_1, level_2 = line.metadata.tag_hierarchy_level.level_1, line.metadata.tag_hierarchy_level.level_2 - - if level_1 is None or level_2 is None: - return line.metadata.tag_hierarchy_level - - if line.metadata.tag_hierarchy_level.line_type == HierarchyLevel.header: - return HierarchyLevel(level_1=1, level_2=level_2, can_be_multiline=False, line_type=HierarchyLevel.header) - - if line.metadata.tag_hierarchy_level.line_type == HierarchyLevel.list_item: - return HierarchyLevel(level_1=level_1, level_2=level_2, can_be_multiline=False, line_type=HierarchyLevel.list_item) - - return line.metadata.tag_hierarchy_level - - @staticmethod - def get_hl_list_using_regexp(line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> HierarchyLevel: - from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_prefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix - from 
dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix - from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix - - prefix = get_prefix(DefaultStructureExtractor.prefix_list, line) - - # TODO dotted list without space after numbering, like "1.Some text" - if prefix.name == DottedPrefix.name: # list like 1.1.1 - depth = len(prefix.numbers) - if all((n <= 1900 for n in prefix.numbers)) and depth <= 9: - return HierarchyLevel(2, depth, False, line_type=HierarchyLevel.list_item) - return HierarchyLevel.create_raw_text() - - if prefix.name == BracketPrefix.name: # list like 1) - # check if tesseract recognize russian б as 6 (bi as six) - if prefix.prefix_num == 6 and previous_line is not None and previous_line.line.lower().strip().startswith(("a)", "а)")): - return HierarchyLevel(4, 1, False, line_type=HierarchyLevel.list_item) # here is russian and english letters - return HierarchyLevel(3, 1, False, line_type=HierarchyLevel.list_item) - - if prefix.name == AnyLetterPrefix.name: # list like a) - return HierarchyLevel(4, 1, False, line_type=HierarchyLevel.list_item) - - if prefix.name == BulletPrefix.name: # bullet list - return HierarchyLevel(5, 1, False, line_type=HierarchyLevel.list_item) # TODO make bullet list + def __get_pattern_composition(self, parameters: dict) -> PatternComposition: + patterns = parameters.get("patterns") + if not patterns: + from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern + from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern + from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern + from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern + from dedoc.structure_extractors.patterns.roman_list_pattern import RomanListPattern + from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern + 
from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern + from dedoc.structure_extractors.patterns.tag_pattern import TagPattern + + return PatternComposition( + patterns=[ + TagHeaderPattern(line_type=HierarchyLevel.header, level_1=1, can_be_multiline=False), + TagListPattern(line_type=HierarchyLevel.list_item, default_level_1=2, can_be_multiline=False), + DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False), + RomanListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False), + BracketListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False), + LetterListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False), + BulletListPattern(line_type=HierarchyLevel.list_item, level_1=6, level_2=1, can_be_multiline=False), + TagPattern(default_line_type=HierarchyLevel.raw_text) + ] + ) + + import ast + from dedoc.structure_extractors.patterns.utils import get_pattern + + if isinstance(patterns, str): + try: + patterns = ast.literal_eval(patterns) + except ValueError as e: + raise StructureExtractorError(msg=f"Bad syntax for patterns: {str(e)}") + + if not isinstance(patterns, list): + raise StructureExtractorError(msg="Patterns parameter should contain a list of patterns") + + pattern_classes = [] + for pattern in patterns: + if isinstance(pattern, dict): + pattern_classes.append(get_pattern(pattern)) + elif isinstance(pattern, AbstractPattern): + pattern_classes.append(pattern) + else: + raise StructureExtractorError(msg="Pattern should be dict or `AbstractPattern`") - # no match for any list has been found - return HierarchyLevel(None, None, line.metadata.tag_hierarchy_level.can_be_multiline, HierarchyLevel.raw_text) + return PatternComposition(patterns=pattern_classes) diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py 
b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py index 8089d53e..bb6f6b79 100644 --- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py +++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/bracket_roman_prefix.py @@ -15,7 +15,7 @@ class BracketRomanPrefix(LinePrefix): iv) forth item """ - regexp = re.compile(r"^\s*[ivxl]\)") + regexp = re.compile(r"^\s*[ivxlcdm]\)") name = "roman" def __init__(self, prefix: str, indent: float) -> None: @@ -30,4 +30,4 @@ def is_valid(prefix_str: str) -> bool: if len(prefix_str) <= 1 or not prefix_str.endswith(")"): return False prefix_set = set(prefix_str[:-1]) - return prefix_set.issubset(set("ivxl")) + return prefix_set.issubset(set("ivxlcdm")) diff --git a/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py b/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py index 02264584..f312ce45 100644 --- a/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py +++ b/dedoc/structure_extractors/feature_extractors/list_features/prefix/roman_prefix.py @@ -15,7 +15,7 @@ class RomanPrefix(LinePrefix): IV. 
forth item """ - regexp = re.compile(r"^\s*[ivxl]\.") + regexp = re.compile(r"^\s*[ivxlcdm]\.") name = "roman" def __init__(self, prefix: str, indent: float) -> None: @@ -32,4 +32,4 @@ def is_valid(prefix_str: str) -> bool: if len(prefix_str) <= 1 or not prefix_str.endswith("."): return False prefix_set = set(prefix_str[:-1]) - return prefix_set.issubset(set("ivxl")) + return prefix_set.issubset(set("ivxlcdm")) diff --git a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py index 91b67965..bb71a235 100644 --- a/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py +++ b/dedoc/structure_extractors/hierarchy_level_builders/diploma_builder/body_builder.py @@ -3,20 +3,31 @@ from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.hierarchy_level import HierarchyLevel from dedoc.data_structures.line_with_meta import LineWithMeta -from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth from dedoc.structure_extractors.hierarchy_level_builders.abstract_hierarchy_level_builder import AbstractHierarchyLevelBuilder from dedoc.structure_extractors.hierarchy_level_builders.law_builders.body_builder.abstract_body_hierarchy_level_builder import \ AbstractBodyHierarchyLevelBuilder from dedoc.structure_extractors.hierarchy_level_builders.utils_reg import regexps_digits_with_dots +from dedoc.structure_extractors.patterns import BracketListPattern, BulletListPattern, DottedListPattern, LetterListPattern, TagListPattern, TagPattern +from dedoc.structure_extractors.patterns.pattern_composition import PatternComposition class DiplomaBodyBuilder(AbstractHierarchyLevelBuilder): named_item_keywords = ("введение", 
"заключение", "библиографический список", "список литературы", "глава", "приложение", "приложения") - def __int__(self) -> None: + def __init__(self) -> None: super().__init__() self.digits_with_dots_regexp = regexps_digits_with_dots + self.pattern_composition = PatternComposition( + [ + TagListPattern(line_type=HierarchyLevel.list_item, default_level_1=2, can_be_multiline=False), + DottedListPattern(line_type=HierarchyLevel.list_item, level_1=2, can_be_multiline=False), + BracketListPattern(line_type=HierarchyLevel.list_item, level_1=3, level_2=1, can_be_multiline=False), + LetterListPattern(line_type=HierarchyLevel.list_item, level_1=4, level_2=1, can_be_multiline=False), + BulletListPattern(line_type=HierarchyLevel.list_item, level_1=5, level_2=1, can_be_multiline=False), + TagPattern(line_type=HierarchyLevel.raw_text) + ] + ) def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, str]], init_hl_depth: int) -> List[LineWithMeta]: if len(lines_with_labels) > 0: @@ -27,7 +38,6 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s result = [body_line] else: result = [AbstractBodyHierarchyLevelBuilder.get_body_line(init_hl_depth=init_hl_depth)] - previous_raw_text_line = None previous_named_item_line = None for line, prediction in lines_with_labels: @@ -44,8 +54,7 @@ def get_lines_with_hierarchy(self, lines_with_labels: List[Tuple[LineWithMeta, s elif prediction == "raw_text": line = self.__postprocess_raw_text(line, init_hl_depth) if not (line.metadata.hierarchy_level is not None and line.metadata.hierarchy_level.line_type == "named_item"): - line.metadata.hierarchy_level = DefaultStructureExtractor.get_hl_list_using_regexp(line, previous_raw_text_line) - previous_raw_text_line = line + line.metadata.hierarchy_level = self.pattern_composition.get_hierarchy_level(line) else: line.metadata.hierarchy_level = HierarchyLevel.create_raw_text() line.metadata.hierarchy_level.line_type = prediction @@ -60,10 +69,10 @@ 
def __handle_named_item(self, init_hl_depth: int, line: LineWithMeta, prediction if text.startswith(self.named_item_keywords): hierarchy_level = HierarchyLevel(init_hl_depth, 0, True, prediction) elif item_depth == -1: - if previous_named_item_line and previous_named_item_line.metadata.hierarchy_level.line_type == "named_item": + if previous_named_item_line: hierarchy_level = previous_named_item_line.metadata.hierarchy_level else: - hierarchy_level = HierarchyLevel(init_hl_depth + 1, 0, True, prediction) + hierarchy_level = HierarchyLevel(init_hl_depth, 0, True, prediction) else: hierarchy_level = HierarchyLevel(init_hl_depth, item_depth - 1, True, prediction) line.metadata.hierarchy_level = hierarchy_level diff --git a/dedoc/structure_extractors/patterns/__init__.py b/dedoc/structure_extractors/patterns/__init__.py new file mode 100644 index 00000000..dad35f6c --- /dev/null +++ b/dedoc/structure_extractors/patterns/__init__.py @@ -0,0 +1,14 @@ +from dedoc.structure_extractors.patterns.bracket_list_pattern import BracketListPattern +from dedoc.structure_extractors.patterns.bracket_roman_list_pattern import BracketRomanListPattern +from dedoc.structure_extractors.patterns.bullet_list_pattern import BulletListPattern +from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern +from dedoc.structure_extractors.patterns.letter_list_pattern import LetterListPattern +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern +from dedoc.structure_extractors.patterns.roman_list_pattern import RomanListPattern +from dedoc.structure_extractors.patterns.start_word_pattern import StartWordPattern +from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern +from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern +from dedoc.structure_extractors.patterns.tag_pattern import TagPattern + +__all__ = [BracketListPattern, BracketRomanListPattern, BulletListPattern, DottedListPattern, 
LetterListPattern, RegexpPattern, RomanListPattern, + StartWordPattern, TagHeaderPattern, TagListPattern, TagPattern] diff --git a/dedoc/structure_extractors/patterns/abstract_pattern.py b/dedoc/structure_extractors/patterns/abstract_pattern.py new file mode 100644 index 00000000..21856dbd --- /dev/null +++ b/dedoc/structure_extractors/patterns/abstract_pattern.py @@ -0,0 +1,58 @@ +from abc import ABC, abstractmethod +from typing import Optional, Union + +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta + + +class AbstractPattern(ABC): + """ + Base class for all patterns to configure structure extraction by :class:`~dedoc.structure_extractors.DefaultStructureExtractor`. + """ + _name = "" + + def __init__(self, line_type: Optional[str], level_1: Optional[int], level_2: Optional[int], can_be_multiline: Optional[Union[bool, str]]) -> None: + """ + Initialize pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes. + They can be used in :meth:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern.get_hierarchy_level` + according to specific pattern logic. + + :param line_type: type of the line, e.g. "header", "bullet_list_item", "chapter", etc. + :param level_1: value of a line primary importance + :param level_2: level of the line inside specific class + :param can_be_multiline: is used to unify lines inside tree node by :class:`~dedoc.structure_constructors.TreeConstructor`, + if line can be multiline, it can be joined with another line. If ``None`` is given, can_be_multiline is set to ``True``. 
+ """ + from dedoc.utils.parameter_utils import get_bool_value + + self._line_type = line_type + self._level_1 = level_1 + self._level_2 = level_2 + self._can_be_multiline = get_bool_value(can_be_multiline, default_value=True) + + @classmethod + def name(cls: "AbstractPattern") -> str: + """ + Returns ``_name`` attribute, is used in parameters configuration to choose a specific pattern. + Each pattern has a unique non-empty name. + """ + return cls._name + + @abstractmethod + def match(self, line: LineWithMeta) -> bool: + """ + Check if the given line satisfies to the pattern requirements. + Line text, annotations or metadata (``metadata.tag_hierarchy_level``) can be used to decide, if the line matches the pattern or not. + """ + pass + + @abstractmethod + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + """ + This method should be applied only when :meth:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern.match` + returned ``True`` for the given line. + + Get :class:`~dedoc.data_structures.HierarchyLevel` for initialising ``line.metadata.hierarchy_level`` attribute. + Please see :ref:`add_structure_type_hierarchy_level` to get more information about :class:`~dedoc.data_structures.HierarchyLevel`. + """ + pass diff --git a/dedoc/structure_extractors/patterns/bracket_list_pattern.py b/dedoc/structure_extractors/patterns/bracket_list_pattern.py new file mode 100644 index 00000000..35a78d98 --- /dev/null +++ b/dedoc/structure_extractors/patterns/bracket_list_pattern.py @@ -0,0 +1,44 @@ +from typing import Optional, Union + +from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern + + +class BracketListPattern(RegexpPattern): + """ + Pattern for matching numbered lists with brackets, e.g. + + :: + + 1) first element + 2) second element + + Example of library usage: + + .. 
code-block:: python + + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import BracketListPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [BracketListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "bracket_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ + _name = "bracket_list" + + def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: + super().__init__(regexp=BracketPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py b/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py new file mode 100644 index 00000000..281299e1 --- /dev/null +++ b/dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py @@ -0,0 +1,50 @@ +from typing import Optional, Union + +from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_roman_prefix import BracketRomanPrefix +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern + + +class BracketRomanListPattern(RegexpPattern): + """ + Pattern for matching roman lists with brackets, e.g. + + :: + + i) first item + ii) second item + iii) third item + iv) forth item + + .. note:: + + The pattern is case-insensitive (lower and upper letters are not differed). 
+ + Example of library usage: + + .. code-block:: python + + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import BracketRomanListPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [BracketRomanListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "bracket_roman_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ + _name = "bracket_roman_list" + + def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: + super().__init__(regexp=BracketRomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/bullet_list_pattern.py b/dedoc/structure_extractors/patterns/bullet_list_pattern.py new file mode 100644 index 00000000..0695abc1 --- /dev/null +++ b/dedoc/structure_extractors/patterns/bullet_list_pattern.py @@ -0,0 +1,46 @@ +from typing import Optional, Union + +from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern + + +class BulletListPattern(RegexpPattern): + """ + Pattern for matching bulleted lists, e.g. + + :: + + - first item + - second item + + or with other bullet markers ``-, —, −, –, ®, ., •, ,, ‚, ©, ⎯, °, *, >, ●, ♣, ①, ▪, *, +``. + + Example of library usage: + + .. 
code-block:: python + + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import BulletListPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [BulletListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "bullet_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ + _name = "bullet_list" + + def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: + super().__init__(regexp=BulletPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/dotted_list_pattern.py b/dedoc/structure_extractors/patterns/dotted_list_pattern.py new file mode 100644 index 00000000..d085ddb2 --- /dev/null +++ b/dedoc/structure_extractors/patterns/dotted_list_pattern.py @@ -0,0 +1,72 @@ +from typing import Optional, Union + +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern + + +class DottedListPattern(RegexpPattern): + """ + Pattern for matching numbered lists with dots, e.g. + + :: + + 1. first element + 1.1. first sub-element + 1.2. second sub-element + 2. 
second element + + The number of dots is unlimited. + There is no ``level_2`` parameter in this pattern, ``level_2`` is calculated as the number of numbers between dots, e.g. + + * ``1.`` → ``level_2=1`` + * ``1.1`` or ``1.1.`` → ``level_2=2`` + * ``1.2.3.4`` or ``1.2.3.4.`` → ``level_2=4`` + + Example of library usage: + + .. code-block:: python + + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import DottedListPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [DottedListPattern(line_type="list_item", level_1=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "dotted_list", "line_type": "list_item", "level_1": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ + _name = "dotted_list" + + def __init__(self, line_type: str, level_1: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: + super().__init__(regexp=DottedPrefix.regexp, line_type=line_type, level_1=level_1, level_2=None, can_be_multiline=can_be_multiline) + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + return HierarchyLevel( + line_type=self._line_type, + level_1=self._level_1, + level_2=self.__get_list_depth(line=line), + can_be_multiline=self._can_be_multiline + ) + + def __get_list_depth(self, line: LineWithMeta) -> int: + text = line.line.strip().lower() + match = self._regexp.match(text) + if match is None: + raise ValueError(f'Line text "{text}" does not match dotted list pattern regexp') + + prefix = match.group().strip() + return len([number for number in 
prefix.split(".") if len(number) > 0]) diff --git a/dedoc/structure_extractors/patterns/letter_list_pattern.py b/dedoc/structure_extractors/patterns/letter_list_pattern.py new file mode 100644 index 00000000..b9c39591 --- /dev/null +++ b/dedoc/structure_extractors/patterns/letter_list_pattern.py @@ -0,0 +1,56 @@ +from typing import Optional, Union + +from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern + + +class LetterListPattern(RegexpPattern): + """ + Pattern for matching lists with letters and brackets, e.g. + + :: + + a) first element + b) second element + + or (example for Armenian language) + + :: + + ա) տեղաբաշխել + բ) Հայաստանի Հանրապետության + գ) սահմանապահ վերակարգերի + + .. note:: + + The pattern is case-insensitive (lower and upper letters are not differed). + + Example of library usage: + + .. code-block:: python + + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import LetterListPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [LetterListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. 
code-block:: python + + import requests + + patterns = [{"name": "letter_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ + _name = "letter_list" + + def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: + super().__init__(regexp=AnyLetterPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/pattern_composition.py b/dedoc/structure_extractors/patterns/pattern_composition.py new file mode 100644 index 00000000..cb263d2e --- /dev/null +++ b/dedoc/structure_extractors/patterns/pattern_composition.py @@ -0,0 +1,56 @@ +from typing import List + +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +class PatternComposition: + """ + Class for applying patterns to get line's hierarchy level. + + Example of usage: + + .. code-block:: python + + from dedoc.data_structures.line_with_meta import LineWithMeta + from dedoc.structure_extractors.patterns import TagListPattern, TagPattern + from dedoc.structure_extractors.patterns.pattern_composition import PatternComposition + + + pattern_composition = PatternComposition( + patterns=[ + TagListPattern(line_type="list_item", default_level_1=2, can_be_multiline=False), + TagPattern(default_line_type="raw_text") + ] + ) + line = LineWithMeta(line="Some text") + line.metadata.hierarchy_level = pattern_composition.get_hierarchy_level(line=line) + """ + def __init__(self, patterns: List[AbstractPattern]) -> None: + """ + Set the list of patterns to apply to lines. 
+ + **Note:** the order of the patterns is important. More specific patterns should go first. + Otherwise, they may be ignored because of the patterns which also are applicable to the given line. + + :param patterns: list of patterns to apply to lines. + """ + self.patterns = patterns + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + """ + Choose the suitable pattern from the list of patterns for applying to the given line. + The first applicable pattern will be chosen. + If no applicable pattern was found, the default ``raw_text`` :class:`~dedoc.data_structures.HierarchyLevel` is used as result. + + :param line: line to get hierarchy level for. + """ + line_pattern = None + + for pattern in self.patterns: + if pattern.match(line): + line_pattern = pattern + break + + return line_pattern.get_hierarchy_level(line) if line_pattern else HierarchyLevel.create_raw_text() diff --git a/dedoc/structure_extractors/patterns/regexp_pattern.py b/dedoc/structure_extractors/patterns/regexp_pattern.py new file mode 100644 index 00000000..82741bd0 --- /dev/null +++ b/dedoc/structure_extractors/patterns/regexp_pattern.py @@ -0,0 +1,90 @@ +import re +from typing import Optional, Union + +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +class RegexpPattern(AbstractPattern): + """ + Pattern for matching line text by a regular expression. + + .. note:: + + The pattern is case-insensitive (lower and upper letters are not differed). + Before regular expression matching, the line text is stripped (space symbols are deleted from both sides). + + .. seealso:: + + Syntax for writing regular expressions is described in the `Python documentation <https://docs.python.org/3/library/re.html>`_. + + Example of library usage: + + ..
code-block:: python + + import re + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import RegexpPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [ + RegexpPattern(regexp="^chapter\s\d+\.", line_type="chapter", level_1=1, can_be_multiline=False), + RegexpPattern(regexp=re.compile(r"^part\s\d+\.\d+\."), line_type="part", level_1=2, can_be_multiline=False) + ] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "regexp", "regexp": "^chapter\s\d+\.", "line_type": "chapter", "level_1": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ # noqa + _name = "regexp" + + def __init__(self, + regexp: str or re.Pattern, + line_type: str, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[Union[bool, str]] = None) -> None: + """ + Initialize pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes. + + :param regexp: regular expression for checking, if the line text matches the pattern. + Note that regular expression is used on the lowercase and stripped line. + :param line_type: type of the line, e.g. "header", "bullet_list_item", "chapter", etc. + :param level_1: value of a line primary importance + :param level_2: level of the line inside specific class + :param can_be_multiline: is used to unify lines inside tree node by :class:`~dedoc.structure_constructors.TreeConstructor`, + if line can be multiline, it can be joined with another line. If ``None`` is given, can_be_multiline is set to ``True``. 
+ """ + super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) + self._regexp = re.compile(regexp) if isinstance(regexp, str) else regexp + + def match(self, line: LineWithMeta) -> bool: + """ + Check if the pattern is suitable for the given line. + Line text is checked by applying pattern's regular expression, text is stripped and made lowercase beforehand. + """ + text = line.line.strip().lower() + match = self._regexp.match(text) + return match is not None + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + """ + This method should be applied only when :meth:`~dedoc.structure_extractors.patterns.RegexpPattern.match` + returned ``True`` for the given line. + + Return :class:`~dedoc.data_structures.HierarchyLevel` for initialising ``line.metadata.hierarchy_level``. + The attributes ``line_type``, ``level_1``, ``level_2``, ``can_be_multiline`` are equal to values given during class initialisation. + """ + return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/roman_list_pattern.py b/dedoc/structure_extractors/patterns/roman_list_pattern.py new file mode 100644 index 00000000..03e8761a --- /dev/null +++ b/dedoc/structure_extractors/patterns/roman_list_pattern.py @@ -0,0 +1,50 @@ +from typing import Optional, Union + +from dedoc.structure_extractors.feature_extractors.list_features.prefix.roman_prefix import RomanPrefix +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern + + +class RomanListPattern(RegexpPattern): + """ + Pattern for matching roman lists with dots, e.g. + + :: + + I. first item + II. second item + III. third item + IV. fourth item + + .. note:: + + The pattern is case-insensitive (lower and upper letters are not differed). + + Example of library usage: + + ..
code-block:: python + + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import RomanListPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [RomanListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "roman_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ + _name = "roman_list" + + def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: + super().__init__(regexp=RomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/start_word_pattern.py b/dedoc/structure_extractors/patterns/start_word_pattern.py new file mode 100644 index 00000000..afce7124 --- /dev/null +++ b/dedoc/structure_extractors/patterns/start_word_pattern.py @@ -0,0 +1,82 @@ +from typing import Optional, Union + +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +class StartWordPattern(AbstractPattern): + """ + Pattern for lines that begin with some specific text (e.g. Introduction, Chapter, etc.). + + .. note:: + + The pattern is case-insensitive (lower and upper letters are not differed). 
+ Before matching, the line text is stripped (space symbols are deleted from both sides). + Start word for matching is also stripped and made lowercase. + + Example of library usage: + + .. code-block:: python + + import re + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import StartWordPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [StartWordPattern(start_word="chapter", line_type="chapter", level_1=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "start_word", "start_word": "chapter", "line_type": "chapter", "level_1": 1, "can_be_multiline": "false"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ + _name = "start_word" + + def __init__(self, + start_word: str, + line_type: str, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[Union[bool, str]] = None) -> None: + """ + Initialize pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes. + + :param start_word: string for checking of line text beginning. + Note that start_word will be stripped and made lowercase, and will be used on the lowercase and stripped line. + :param line_type: type of the line, e.g. "header", "bullet_list_item", "chapter", etc. + :param level_1: value of a line primary importance + :param level_2: level of the line inside specific class + :param can_be_multiline: is used to unify lines inside tree node by :class:`~dedoc.structure_constructors.TreeConstructor`, + if line can be multiline, it can be joined with another line.
If ``None`` is given, can_be_multiline is set to ``True``. + """ + super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) + self.__start_word = start_word.strip().lower() + + def match(self, line: LineWithMeta) -> bool: + """ + Check if the pattern is suitable for the given line. + Line text is checked if it starts with the given ``start_word``, text is stripped and made lowercase beforehand. + """ + text = line.line.strip().lower() + return text.startswith(self.__start_word) + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + """ + This method should be applied only when :meth:`~dedoc.structure_extractors.patterns.StartWordPattern.match` + returned ``True`` for the given line. + + Return :class:`~dedoc.data_structures.HierarchyLevel` for initialising ``line.metadata.hierarchy_level``. + The attributes ``line_type``, ``level_1``, ``level_2``, ``can_be_multiline`` are equal to values given during class initialisation. + """ + return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline) diff --git a/dedoc/structure_extractors/patterns/tag_header_pattern.py b/dedoc/structure_extractors/patterns/tag_header_pattern.py new file mode 100644 index 00000000..692be791 --- /dev/null +++ b/dedoc/structure_extractors/patterns/tag_header_pattern.py @@ -0,0 +1,75 @@ +from typing import Optional, Union + +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.patterns.tag_pattern import TagPattern + + +class TagHeaderPattern(TagPattern): + """ + Pattern for using information about heading lines (header) from readers saved in ``line.metadata.tag_hierarchy_level``. 
+ Also allows to calculate ``level_2`` based on dotted list depth (same as in :class:`~dedoc.structure_extractors.patterns.DottedListPattern`) + **if level_2, tag_hierarchy_level.level_2, default_level_2 are empty**. + + .. seealso:: + + Please see :ref:`readers_line_types` to find out which readers can extract lines with type "header". + + Example of library usage: + + .. code-block:: python + + import re + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import TagHeaderPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [TagHeaderPattern(line_type="header", level_1=1, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "tag_header", "line_type": "header", "level_1": 1, "can_be_multiline": "False"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ + _name = "tag_header" + + def __init__(self, + line_type: Optional[str] = None, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[Union[bool, str]] = None, + default_line_type: str = HierarchyLevel.header, + default_level_1: int = 1, + default_level_2: Optional[int] = None) -> None: + super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline, default_line_type=default_line_type, + default_level_1=default_level_1, default_level_2=default_level_2) + + def match(self, line: LineWithMeta) -> bool: + """ + Check if the pattern is suitable for the given line: + + * ``line.metadata.tag_hierarchy_level`` should not be empty; + * ``line.metadata.tag_hierarchy_level.line_type == 
"header"`` + + ``line.metadata.tag_hierarchy_level`` is filled during reading step, + please see :ref:`readers_line_types` to find out which readers can extract lines with type "header". + """ + return line.metadata.tag_hierarchy_level is not None and line.metadata.tag_hierarchy_level.line_type == HierarchyLevel.header + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + return HierarchyLevel( + line_type=self._get_line_type(line), + level_1=self._get_level_1(line), + level_2=self._get_regexp_level_2(line), + can_be_multiline=self._get_can_be_multiline(line) + ) diff --git a/dedoc/structure_extractors/patterns/tag_list_pattern.py b/dedoc/structure_extractors/patterns/tag_list_pattern.py new file mode 100644 index 00000000..992dbfdc --- /dev/null +++ b/dedoc/structure_extractors/patterns/tag_list_pattern.py @@ -0,0 +1,75 @@ +from typing import Optional, Union + +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.patterns.tag_pattern import TagPattern + + +class TagListPattern(TagPattern): + """ + Pattern for using information about list item lines (list_item) from readers saved in ``line.metadata.tag_hierarchy_level``. + Also allows to calculate ``level_2`` based on dotted list depth (same as in :class:`~dedoc.structure_extractors.patterns.DottedListPattern`) + **if level_2, tag_hierarchy_level.level_2, default_level_2 are empty**. + + .. seealso:: + + Please see :ref:`readers_line_types` to find out which readers can extract lines with type "list_item". + + Example of library usage: + + .. code-block:: python + + import re + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import TagListPattern + + reader = ... 
+ structure_extractor = DefaultStructureExtractor() + patterns = [TagListPattern(line_type="list_item", default_level_1=2, can_be_multiline=False)] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. code-block:: python + + import requests + + patterns = [{"name": "tag_list", "line_type": "list_item", "default_level_1": 2, "can_be_multiline": "False"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ + _name = "tag_list" + + def __init__(self, + line_type: Optional[str] = None, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[Union[bool, str]] = None, + default_line_type: str = HierarchyLevel.list_item, + default_level_1: int = 2, + default_level_2: Optional[int] = None) -> None: + super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline, default_line_type=default_line_type, + default_level_1=default_level_1, default_level_2=default_level_2) + + def match(self, line: LineWithMeta) -> bool: + """ + Check if the pattern is suitable for the given line: + + * ``line.metadata.tag_hierarchy_level`` should not be empty; + * ``line.metadata.tag_hierarchy_level.line_type == "list_item"`` + + ``line.metadata.tag_hierarchy_level`` is filled during reading step, + please see :ref:`readers_line_types` to find out which readers can extract lines with type "list_item". 
+ """ + return line.metadata.tag_hierarchy_level is not None and line.metadata.tag_hierarchy_level.line_type == HierarchyLevel.list_item + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + return HierarchyLevel( + line_type=self._get_line_type(line), + level_1=self._get_level_1(line), + level_2=self._get_regexp_level_2(line), + can_be_multiline=self._get_can_be_multiline(line) + ) diff --git a/dedoc/structure_extractors/patterns/tag_pattern.py b/dedoc/structure_extractors/patterns/tag_pattern.py new file mode 100644 index 00000000..350bf0a4 --- /dev/null +++ b/dedoc/structure_extractors/patterns/tag_pattern.py @@ -0,0 +1,143 @@ +from typing import Optional, Union + +from dedoc.data_structures.hierarchy_level import HierarchyLevel +from dedoc.data_structures.line_with_meta import LineWithMeta +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +class TagPattern(AbstractPattern): + """ + Pattern for using information from readers saved in ``line.metadata.tag_hierarchy_level``. + Can be useful for paragraph extraction in PDF documents and images, + because PDF and image readers save information about paragraphs in ``line.metadata.tag_hierarchy_level.can_be_multiline``. + + .. seealso:: + + Please see :ref:`readers_line_types` if you need information, which line types can be extracted by each reader. + + Example of library usage: + + .. code-block:: python + + import re + from dedoc.structure_extractors import DefaultStructureExtractor + from dedoc.structure_extractors.patterns import TagPattern + + reader = ... + structure_extractor = DefaultStructureExtractor() + patterns = [TagPattern(default_line_type="raw_text")] + document = reader.read(file_path=file_path) + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + Example of API usage: + + .. 
code-block:: python + + import requests + + patterns = [{"name": "tag", "default_line_type": "raw_text"}] + parameters = {"patterns": str(patterns)} + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + """ + _name = "tag" + + def __init__(self, + line_type: Optional[str] = None, + level_1: Optional[int] = None, + level_2: Optional[int] = None, + can_be_multiline: Optional[Union[bool, str]] = None, + default_line_type: str = HierarchyLevel.raw_text, + default_level_1: Optional[int] = None, + default_level_2: Optional[int] = None) -> None: + """ + Initialize pattern for configuring values of :class:`~dedoc.data_structures.HierarchyLevel` attributes. + It is recommended to configure ``default_*`` values in case ``line.metadata.tag_hierarchy_level`` miss some values. + If you want to use values from ``line.metadata.tag_hierarchy_level``, it is recommended to leave + ``line_type``, ``level_1``, ``level_2``, ``can_be_multiline`` empty. + + ``can_be_multiline`` is filled in PDF and images readers during paragraph detection, so if you want to extract paragraphs, + you shouldn't set ``can_be_multiline`` during pattern initialization. + + :param line_type: type of the line, replaces line_type from tag_hierarchy_level if non-empty. + :param level_1: value of a line primary importance, replaces level_1 from tag_hierarchy_level if non-empty. + :param level_2: level of the line inside specific class, replaces level_2 from tag_hierarchy_level if non-empty. + :param can_be_multiline: is used to unify lines inside tree node by :class:`~dedoc.structure_constructors.TreeConstructor`, + if line can be multiline, it can be joined with another line. If not None, replaces can_be_multiline from tag_hierarchy_level. + :param default_line_type: type of the line, is used when tag_hierarchy_level.line_type == "unknown". 
+ :param default_level_1: value of a line primary importance, is used when tag_hierarchy_level.level_1 is None. + :param default_level_2: level of the line inside specific class, is used when tag_hierarchy_level.level_2 is None. + """ + super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) + self._can_be_multiline_none = can_be_multiline is None + self._default_line_type = default_line_type + self._default_level_1 = default_level_1 + self._default_level_2 = default_level_2 + + def match(self, line: LineWithMeta) -> bool: + """ + Check if the pattern is suitable for the given line: ``line.metadata.tag_hierarchy_level`` should not be empty. + ``line.metadata.tag_hierarchy_level`` is filled during reading step, some readers can skip ``tag_hierarchy_level`` initialisation. + """ + return line.metadata.tag_hierarchy_level is not None + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + """ + This method should be applied only when :meth:`~dedoc.structure_extractors.patterns.TagPattern.match` + returned ``True`` for the given line. + + Return :class:`~dedoc.data_structures.HierarchyLevel` for initialising ``line.metadata.hierarchy_level``. + The attribute ``line_type`` is initialized according to the following rules: + + * if non-empty ``line_type`` is given during pattern initialisation, then its value is used in the result; + * if ``line_type`` is not given (or ``None`` is given) and ``line.metadata.tag_hierarchy_level`` is not ``unknown``, \ + the ``line_type`` value from ``line.metadata.tag_hierarchy_level`` is used in the result; + * otherwise (``line_type`` is empty and ``line.metadata.tag_hierarchy_level`` is ``unknown``) ``default_line_type`` value is used in the result. + + Similar rules work for ``level_1`` and ``level_2`` with comparing with ``None`` instead of ``unknown``. 
+ + The ``can_be_multiline`` attribute is initialized according to the following rules: + + * if non-empty ``can_be_multiline`` is given during pattern initialisation, then its value is used in the result; + * otherwise ``can_be_multiline`` value from ``line.metadata.tag_hierarchy_level`` is used in the result. + """ + return HierarchyLevel( + line_type=self._get_line_type(line), + level_1=self._get_level_1(line), + level_2=self._get_level_2(line), + can_be_multiline=self._get_can_be_multiline(line) + ) + + def _get_line_type(self, line: LineWithMeta) -> str: + if self._line_type is not None: + return self._line_type + + return self._default_line_type if line.metadata.tag_hierarchy_level.is_unknown() else line.metadata.tag_hierarchy_level.line_type + + def _get_level_1(self, line: LineWithMeta) -> Optional[int]: + if self._level_1 is not None: + return self._level_1 + + return self._default_level_1 if line.metadata.tag_hierarchy_level.level_1 is None else line.metadata.tag_hierarchy_level.level_1 + + def _get_level_2(self, line: LineWithMeta) -> Optional[int]: + if self._level_2 is not None: + return self._level_2 + + return self._default_level_2 if line.metadata.tag_hierarchy_level.level_2 is None else line.metadata.tag_hierarchy_level.level_2 + + def _get_regexp_level_2(self, line: LineWithMeta) -> int: + if self._level_2 is not None: + return self._level_2 + elif line.metadata.tag_hierarchy_level.level_2 is not None: + return line.metadata.tag_hierarchy_level.level_2 + elif self._default_level_2 is not None: + return self._default_level_2 + + from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth + depth = get_dotted_item_depth(line.line.strip()) + return depth if depth > 0 else 1 + + def _get_can_be_multiline(self, line: LineWithMeta) -> bool: + return line.metadata.tag_hierarchy_level.can_be_multiline if self._can_be_multiline_none else self._can_be_multiline diff --git 
a/dedoc/structure_extractors/patterns/utils.py b/dedoc/structure_extractors/patterns/utils.py new file mode 100644 index 00000000..e720a8a8 --- /dev/null +++ b/dedoc/structure_extractors/patterns/utils.py @@ -0,0 +1,25 @@ +from copy import deepcopy + +from dedoc.common.exceptions.structure_extractor_error import StructureExtractorError +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +def get_pattern(pattern_parameters: dict) -> AbstractPattern: + import dedoc.structure_extractors.patterns as patterns_module + + if "name" not in pattern_parameters: + raise StructureExtractorError(msg="Pattern parameter missing 'name'") + + supported_patterns = {pattern.name(): pattern for pattern in patterns_module.__all__} + pattern_class = supported_patterns.get(pattern_parameters["name"]) + + if pattern_class is None: + raise StructureExtractorError(msg=f"Pattern {pattern_parameters['name']} is not found in supported patterns: {supported_patterns.keys()}") + + pattern_parameters_copy = deepcopy(pattern_parameters) + pattern_parameters_copy.pop("name") + try: + pattern = pattern_class(**pattern_parameters_copy) + except TypeError as e: + raise StructureExtractorError(msg=str(e)) + return pattern diff --git a/dedoc/utils/parameter_utils.py b/dedoc/utils/parameter_utils.py index 461f3654..3df9f6ca 100644 --- a/dedoc/utils/parameter_utils.py +++ b/dedoc/utils/parameter_utils.py @@ -6,6 +6,13 @@ from dedoc.config import get_config +def get_bool_value(parameter: Optional[bool or str], default_value: bool = False) -> bool: + if parameter is None: + return default_value + + return parameter if isinstance(parameter, bool) else str(parameter).lower() == "true" + + def get_param_language(parameters: Optional[dict]) -> str: if parameters is None: return "rus+eng" @@ -59,6 +66,13 @@ def get_param_need_pdf_table_analysis(parameters: Optional[dict]) -> bool: return need_pdf_table_analysis +def get_param_need_gost_frame_analysis(parameters: Optional[dict]) 
-> bool: + if parameters is None: + return False + need_gost_frame_analysis = str(parameters.get("need_gost_frame_analysis", "False")).lower() == "true" + return need_gost_frame_analysis + + def get_param_need_binarization(parameters: Optional[dict]) -> bool: if parameters is None: return False diff --git a/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py b/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py new file mode 100644 index 00000000..5c8e81b9 --- /dev/null +++ b/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py @@ -0,0 +1,122 @@ +import re +from typing import List + +import html2text + +from dedoc.api.api_utils import json2html +from dedoc.data_structures import BoldAnnotation, HierarchyLevel, LineWithMeta, UnstructuredDocument +from dedoc.metadata_extractors import DocxMetadataExtractor, PdfMetadataExtractor +from dedoc.readers import DocxReader, PdfTabbyReader +from dedoc.structure_constructors import TreeConstructor +from dedoc.structure_extractors import DefaultStructureExtractor +from dedoc.structure_extractors.patterns import DottedListPattern, LetterListPattern, RegexpPattern, TagHeaderPattern, TagListPattern +from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + +# example for docx + +docx_reader = DocxReader() +docx_metadata_extractor = DocxMetadataExtractor() +structure_extractor = DefaultStructureExtractor() +structure_constructor = TreeConstructor() + +docx_file_path = "test_dir/with_tags.docx" + +docx_document = docx_reader.read(file_path=docx_file_path) +print("\n\nDocument lines\n") +for document_line in docx_document.lines: + print(document_line) + +patterns = [ + TagHeaderPattern(line_type="custom_header", level_1=1, can_be_multiline=False), + TagListPattern(line_type="custom_list", level_1=2), +] +docx_document = structure_extractor.extract(document=docx_document, parameters={"patterns": patterns}) + +docx_document.metadata = 
docx_metadata_extractor.extract(file_path=docx_file_path) +docx_parsed_document = structure_constructor.construct(document=docx_document) +html = json2html( + paragraph=docx_parsed_document.content.structure, + attachments=docx_parsed_document.attachments, + tables=docx_parsed_document.content.tables, + text="" +) +print(f"\n\nDocument tree\n{html2text.html2text(html)}") + + +def print_document_tree(document: UnstructuredDocument, patterns: List[AbstractPattern]) -> None: + document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) + parsed_document = structure_constructor.construct(document=document) + html = json2html(paragraph=parsed_document.content.structure, attachments=parsed_document.attachments, tables=parsed_document.content.tables, text="") + print(f"\n\nDocument tree\n{html2text.html2text(html)}") + + +patterns = [ + TagHeaderPattern(line_type="custom_header", level_1=1, can_be_multiline=False), + TagListPattern(line_type="custom_list", level_1=2), + DottedListPattern(line_type="custom_list", level_1=2, can_be_multiline=False), # for lists like 1. 
+ LetterListPattern(line_type="custom_list", level_1=3, level_2=1, can_be_multiline=False), # for lists like a) + RegexpPattern(regexp=re.compile(r"^header\s+\d+\.\d+"), line_type="custom_header", level_1=1, level_2=2, can_be_multiline=False), + RegexpPattern(regexp=re.compile(r"^header\s+\d+"), line_type="custom_header", level_1=1, level_2=1, can_be_multiline=False) +] +print_document_tree(document=docx_document, patterns=patterns) + +# example for pdf + +pdf_reader = PdfTabbyReader() +pdf_metadata_extractor = PdfMetadataExtractor() +pdf_file_path = "test_dir/law.pdf" + +pdf_document = pdf_reader.read(file_path=pdf_file_path) +pdf_document.metadata = pdf_metadata_extractor.extract(file_path=pdf_file_path) +print("\n\nDocument lines\n") +for document_line in pdf_document.lines[:10]: + print(document_line) + +patterns = [ + RegexpPattern(regexp=re.compile(r"^part\s+\d+$"), line_type="part", level_1=1, level_2=1, can_be_multiline=False), + RegexpPattern(regexp=re.compile(r"^chapter\s+\d+$"), line_type="chapter", level_1=1, level_2=2, can_be_multiline=False), + DottedListPattern(line_type="point", level_1=2, can_be_multiline=False), # for lists like 1. 
+ RegexpPattern(regexp=re.compile(r"^\(\d+\)\s"), line_type="item", level_1=3, level_2=1, can_be_multiline=False), # for lists like (1) + RegexpPattern(regexp=re.compile(r"^\(\w\)\s"), line_type="sub_item", level_1=3, level_2=2, can_be_multiline=False) # for lists like (a) +] +print_document_tree(document=pdf_document, patterns=patterns) + + +print("\n\nDocument lines\n") +for document_line in pdf_document.lines[:10]: + print(document_line, document_line.annotations) + + +class SubHeaderPattern(AbstractPattern): + _name = "sub_header" + + def match(self, line: LineWithMeta) -> bool: + return self._is_bold(line) + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline) + + def _is_bold(self, line: LineWithMeta) -> bool: + bold_annotations = [annotation for annotation in line.annotations if annotation.name == BoldAnnotation.name and annotation.value == "True"] + bold_character_number = sum([annotation.end - annotation.start for annotation in bold_annotations]) + return bool(line.line) and bold_character_number / len(line.line) > 0.5 # an empty line is never bold; the guard avoids ZeroDivisionError + + +class TitlePattern(SubHeaderPattern): + _name = "title" + + def match(self, line: LineWithMeta) -> bool: + return line.line.isupper() and self._is_bold(line) + + +patterns = [ + RegexpPattern(regexp=re.compile(r"^part\s+\d+$"), line_type="part", level_1=1, level_2=2, can_be_multiline=False), + RegexpPattern(regexp=re.compile(r"^chapter\s+\d+$"), line_type="chapter", level_1=1, level_2=3, can_be_multiline=False), + DottedListPattern(line_type="point", level_1=2, can_be_multiline=False), + RegexpPattern(regexp=re.compile(r"^\(\d+\)\s"), line_type="item", level_1=3, level_2=1, can_be_multiline=False), + RegexpPattern(regexp=re.compile(r"^\(\w\)\s"), line_type="sub_item", level_1=3, level_2=2, can_be_multiline=False), + TitlePattern(line_type="title", level_1=1, level_2=2, can_be_multiline=False), + 
SubHeaderPattern(line_type="sub_header", level_1=1, level_2=4, can_be_multiline=True) +] +print_document_tree(document=pdf_document, patterns=patterns) diff --git a/docs/source/_static/code_examples/langchain/dedoc_loader.py b/docs/source/_static/code_examples/langchain/dedoc_loader.py index 2e037694..c11786c7 100644 --- a/docs/source/_static/code_examples/langchain/dedoc_loader.py +++ b/docs/source/_static/code_examples/langchain/dedoc_loader.py @@ -76,6 +76,7 @@ def __init__( result for parsing PDF and images need_binarization: clean pages background (binarize) for PDF without a textual layer and images + need_gost_frame_analysis: detect and ignore GOST frame of the document need_pdf_table_analysis: parse tables for PDF without a textual layer and images delimiter: column separator for CSV, TSV files @@ -374,6 +375,7 @@ def __init__( result for parsing PDF and images need_binarization: clean pages background (binarize) for PDF without a textual layer and images + need_gost_frame_analysis: detect and ignore GOST frame need_pdf_table_analysis: parse tables for PDF without a textual layer and images delimiter: column separator for CSV, TSV files diff --git a/docs/source/_static/code_examples/langchain/pdf.py b/docs/source/_static/code_examples/langchain/pdf.py index 336bfbcd..d851ce93 100644 --- a/docs/source/_static/code_examples/langchain/pdf.py +++ b/docs/source/_static/code_examples/langchain/pdf.py @@ -28,6 +28,7 @@ class DedocPDFLoader(DedocBaseLoader): need_header_footer_analysis: remove headers and footers from the output result need_binarization: clean pages background (binarize) for PDF without a textual layer + need_gost_frame_analysis: detect and ignore GOST frame need_pdf_table_analysis: parse tables for PDF without a textual layer Examples diff --git a/docs/source/_static/code_examples/test_dir/law.pdf b/docs/source/_static/code_examples/test_dir/law.pdf new file mode 100644 index 00000000..1575f13c Binary files /dev/null and 
b/docs/source/_static/code_examples/test_dir/law.pdf differ diff --git a/docs/source/_static/code_examples/test_dir/law.png b/docs/source/_static/code_examples/test_dir/law.png new file mode 100644 index 00000000..b588168a Binary files /dev/null and b/docs/source/_static/code_examples/test_dir/law.png differ diff --git a/docs/source/_static/code_examples/test_dir/with_tags.docx b/docs/source/_static/code_examples/test_dir/with_tags.docx new file mode 100644 index 00000000..f50bf345 Binary files /dev/null and b/docs/source/_static/code_examples/test_dir/with_tags.docx differ diff --git a/docs/source/_static/code_examples/test_dir/with_tags.png b/docs/source/_static/code_examples/test_dir/with_tags.png new file mode 100644 index 00000000..96a620aa Binary files /dev/null and b/docs/source/_static/code_examples/test_dir/with_tags.png differ diff --git a/docs/source/_static/notebooks_data/doc_example.jpeg b/docs/source/_static/notebooks_data/doc_example.jpeg new file mode 100644 index 00000000..25915a03 Binary files /dev/null and b/docs/source/_static/notebooks_data/doc_example.jpeg differ diff --git a/docs/source/_static/notebooks_data/doc_tables.pdf b/docs/source/_static/notebooks_data/doc_tables.pdf new file mode 100644 index 00000000..db96312e Binary files /dev/null and b/docs/source/_static/notebooks_data/doc_tables.pdf differ diff --git a/docs/source/_static/notebooks_data/doc_tables_1.jpeg b/docs/source/_static/notebooks_data/doc_tables_1.jpeg new file mode 100644 index 00000000..ad211018 Binary files /dev/null and b/docs/source/_static/notebooks_data/doc_tables_1.jpeg differ diff --git a/docs/source/_static/notebooks_data/doc_tables_2.jpeg b/docs/source/_static/notebooks_data/doc_tables_2.jpeg new file mode 100644 index 00000000..1d299c04 Binary files /dev/null and b/docs/source/_static/notebooks_data/doc_tables_2.jpeg differ diff --git a/docs/source/_static/notebooks_data/table_1.png b/docs/source/_static/notebooks_data/table_1.png new file mode 100644 index 
00000000..c585e440 Binary files /dev/null and b/docs/source/_static/notebooks_data/table_1.png differ diff --git a/docs/source/_static/notebooks_data/table_2.png b/docs/source/_static/notebooks_data/table_2.png new file mode 100644 index 00000000..5728f069 Binary files /dev/null and b/docs/source/_static/notebooks_data/table_2.png differ diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst index 361f5ac2..ec38c249 100644 --- a/docs/source/changelog.rst +++ b/docs/source/changelog.rst @@ -1,6 +1,17 @@ Changelog ========= +v2.3 (2024-09-19) +----------------- +Release note: `v2.3 <https://github.com/ispras/dedoc/releases/tag/v2.3>`_ + +* `Dedoc telegram chat <https://t.me/dedoc_chat>`_ created. +* Added `patterns` parameter for configuring default structure type (:ref:`using_patterns`). +* Added notebooks with Dedoc usage :ref:`table_notebooks` (see `issue 484 <https://github.com/ispras/dedoc/issues/484>`_). +* Fix bug `OutOfMemoryError: Java heap space` in `PdfTabbyReader` (see `issue 489 <https://github.com/ispras/dedoc/issues/489>`_). +* Fix bug with numeration in `DocxReader` (see `issue 494 <https://github.com/ispras/dedoc/issues/494>`_). +* Added GOST (Russian government standard) frame recognition in `PdfImageReader` and `PdfTxtlayerReader` (`need_gost_frame_analysis` parameter). 
+ v2.2.7 (2024-08-16) ------------------- Release note: `v2.2.7 `_ diff --git a/docs/source/conf.py b/docs/source/conf.py index cd56e3cf..edd4500c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -31,6 +31,24 @@ exclude_patterns = [] highlight_language = "python3" +# -- Options for the nitpicky mode ------------------------------------------- + +nitpicky = True +nitpick_ignore = [ + ("py:class", "abc.ABC"), + ("py:class", "pydantic.main.BaseModel"), + ("py:class", "scipy.stats._multivariate.dirichlet_multinomial_gen.cov"), + ("py:class", "pandas.core.series.Series"), + ("py:class", "numpy.ndarray"), + ("py:class", "pandas.core.frame.DataFrame"), + ("py:class", "dedoc.structure_extractors.feature_extractors.toc_feature_extractor.TocItem"), + ("py:class", "logging.Logger"), + ("py:class", "train_dataset.data_structures.line_with_label.LineWithLabel"), + ("py:class", "xgboost.sklearn.XGBClassifier"), + ("py:class", "collections.Counter"), + ("py:obj", "typing.Pattern") +] + # -- Options for HTML output ------------------------------------------------- html_theme = "sphinx_rtd_theme" diff --git a/docs/source/dedoc_api_usage/api.rst b/docs/source/dedoc_api_usage/api.rst index 13cd0eaf..c357ac78 100644 --- a/docs/source/dedoc_api_usage/api.rst +++ b/docs/source/dedoc_api_usage/api.rst @@ -42,10 +42,10 @@ Post-requests should be sent to ``http://localhost:1231/upload``. "is_one_column_document": "true", "return_format": 'html' } - with open(filename, 'rb') as file: - files = {'file': (filename, file)} + with open(filename, "rb") as file: + files = {"file": (filename, file)} r = requests.post("http://localhost:1231/upload", files=files, data=data) - result = r.content.decode('utf-8') + result = r.content.decode("utf-8") The ``data`` dictionary in the example contains some parameters to parse the given file. They are described in the section :ref:`api_parameters`. 
@@ -85,6 +85,12 @@ Api parameters description This type is used for choosing a specific structure extractor (and, in some cases, a specific reader). + * - patterns + - list of patterns dictionaries converted to string + - None + - This parameter is used only when ``document_type="other"``. + Configuration of default document structure, please see :ref:`using_patterns` for more details. + * - structure_type - tree, linear - tree @@ -218,6 +224,11 @@ Api parameters description * **true** -- if any text is detected in a PDF file, Dedoc assumes that textual layer is detected and it is correct. Much faster but less accurate. * **false** -- use the textual layer classifier to detect textual layer and prove its correctness. + * - need_gost_frame_analysis + - true, false + - false + - This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images. + The GOST frame recognizer is used to recognize and ignore the GOST frame on images and PDF documents. * - language - rus, eng, rus+eng, fra, spa diff --git a/docs/source/dedoc_api_usage/api_schema.rst b/docs/source/dedoc_api_usage/api_schema.rst index ea8d5b8a..adb3d48a 100644 --- a/docs/source/dedoc_api_usage/api_schema.rst +++ b/docs/source/dedoc_api_usage/api_schema.rst @@ -8,69 +8,20 @@ Json schema of the output is also available during dedoc application running on .. autoclass:: dedoc.api.schema.ParsedDocument - .. autoattribute:: content - .. autoattribute:: metadata - .. autoattribute:: version - .. autoattribute:: warnings - .. autoattribute:: attachments - .. autoclass:: dedoc.api.schema.DocumentContent - .. autoattribute:: structure - .. autoattribute:: tables - .. autoclass:: dedoc.api.schema.DocumentMetadata - .. autoattribute:: uid - .. autoattribute:: file_name - .. autoattribute:: temporary_file_name - .. autoattribute:: size - .. autoattribute:: modified_time - .. autoattribute:: created_time - .. autoattribute:: access_time - .. autoattribute:: file_type - .. 
autoclass:: dedoc.api.schema.TreeNode - .. autoattribute:: node_id - .. autoattribute:: text - .. autoattribute:: annotations - .. autoattribute:: metadata - .. autoattribute:: subparagraphs - .. autoclass:: dedoc.api.schema.LineWithMeta - .. autoattribute:: text - .. autoattribute:: annotations - .. autoclass:: dedoc.api.schema.LineMetadata - .. autoattribute:: paragraph_type - .. autoattribute:: page_id - .. autoattribute:: line_id - .. autoclass:: dedoc.api.schema.Table - .. autoattribute:: cells - .. autoattribute:: metadata - .. autoclass:: dedoc.api.schema.TableMetadata - .. autoattribute:: page_id - .. autoattribute:: uid - .. autoattribute:: rotated_angle - .. autoattribute:: title - .. autoclass:: dedoc.api.schema.CellWithMeta - .. autoattribute:: lines - .. autoattribute:: rowspan - .. autoattribute:: colspan - .. autoattribute:: invisible - .. autoclass:: dedoc.api.schema.Annotation - - .. autoattribute:: start - .. autoattribute:: end - .. autoattribute:: name - .. autoattribute:: value diff --git a/docs/source/getting_started/usage.rst b/docs/source/getting_started/usage.rst index 781d5ef9..60344b5c 100644 --- a/docs/source/getting_started/usage.rst +++ b/docs/source/getting_started/usage.rst @@ -25,7 +25,7 @@ For this purpose one can use :class:`~dedoc.converters.DocxConverter` class: :language: python :lines: 10 -Method :meth:`~dedoc.converters.DocxConverter.can_convert` allows to check if the converter can convert the given file: +Method :meth:`~dedoc.converters.AbstractConverter.can_convert` allows to check if the converter can convert the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python @@ -70,7 +70,7 @@ one can use :class:`~dedoc.readers.DocxReader` class: :language: python :lines: 17 -Method :meth:`~dedoc.readers.DocxReader.can_read` allows to check if the reader can parse the given file: +Method :meth:`~dedoc.readers.BaseReader.can_read` allows to check if the reader can parse the given file: .. 
literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python @@ -196,7 +196,7 @@ we can add some metadata using :class:`~dedoc.metadata_extractors.DocxMetadataEx :language: python :lines: 64 -Method :meth:`~dedoc.metadata_extractors.DocxMetadataExtractor.can_extract` allows to check if +Method :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.can_extract` allows to check if the metadata extractor can extract metadata from the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py @@ -228,7 +228,7 @@ For example, in the :ref:`docx_example_image` we can use :class:`~dedoc.attachme :language: python :lines: 74 -Method :meth:`~dedoc.attachments_extractors.DocxAttachmentsExtractor.can_extract` allows to check if the attachments extractor can extract attachments from the given file: +Method :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract` allows to check if the attachments extractor can extract attachments from the given file: .. literalinclude:: ../_static/code_examples/dedoc_usage_tutorial.py :language: python diff --git a/docs/source/index.rst b/docs/source/index.rst index 0f2aed1d..fade9141 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -213,6 +213,7 @@ Currently the following domains can be handled: For a document of unknown or unsupported domain there is an option to use default structure extractor (``document_type=other`` at :ref:`api_parameters`), the default document structure described :ref:`here `. +This type of structure is configurable (see :ref:`using_patterns`). .. toctree:: @@ -223,14 +224,17 @@ For a document of unknown or unsupported domain there is an option to use defaul getting_started/usage parameters/parameters + .. 
toctree:: :maxdepth: 1 :caption: Tutorials + tutorials/notebooks tutorials/add_new_doc_format tutorials/add_new_structure_type - tutorials/creating_document_classes tutorials/add_new_language + tutorials/creating_document_classes + tutorials/using_patterns .. toctree:: diff --git a/docs/source/modules/data_structures.rst b/docs/source/modules/data_structures.rst index 35efcbc8..d6785ac7 100644 --- a/docs/source/modules/data_structures.rst +++ b/docs/source/modules/data_structures.rst @@ -7,33 +7,28 @@ Main classes defining a document -------------------------------- .. autoclass:: dedoc.data_structures.UnstructuredDocument - :special-members: __init__ .. autoclass:: dedoc.data_structures.ParsedDocument :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.DocumentContent :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.DocumentMetadata :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.TreeNode :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.LineWithMeta :show-inheritance: :special-members: __init__, __lt__ :members: - :undoc-members: line, uid, metadata, annotations + :undoc-members: set_line, set_metadata .. automethod:: __len__ .. automethod:: __getitem__ @@ -41,26 +36,22 @@ Main classes defining a document .. autoclass:: dedoc.data_structures.LineMetadata :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.HierarchyLevel - :special-members: __init__, __eq__, __lt__ + :special-members: __eq__, __lt__ :members: .. autoclass:: dedoc.data_structures.Table :show-inheritance: - :special-members: __init__ :members: .. autoclass:: dedoc.data_structures.TableMetadata :show-inheritance: - :special-members: __init__ :members: .. 
autoclass:: dedoc.data_structures.CellWithMeta :show-inheritance: - :special-members: __init__ :members: @@ -83,7 +74,6 @@ Helper classes .. autoattribute:: height .. autoclass:: dedoc.data_structures.AttachedFile - :special-members: __init__ :members: .. _annotations: @@ -93,7 +83,6 @@ Annotations of the text lines .. autoclass:: dedoc.data_structures.Annotation :show-inheritance: - :special-members: __init__ Concrete annotations ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/modules/structure_extractors.rst b/docs/source/modules/structure_extractors.rst index 86ded2c3..3d60c593 100644 --- a/docs/source/modules/structure_extractors.rst +++ b/docs/source/modules/structure_extractors.rst @@ -64,3 +64,101 @@ dedoc.structure_extractors :members: .. autoattribute:: document_type + + +.. _dedoc_structure_extractors_patterns: + +Patterns for :class:`~dedoc.structure_extractors.DefaultStructureExtractor` +--------------------------------------------------------------------------- + +Structure patterns are used for a more flexible configuring of lines types and levels during structure extraction step. +They are useful only for :class:`~dedoc.structure_extractors.DefaultStructureExtractor` (in API when "document_type"="other"). +Please see :ref:`using_patterns` to get examples of patterns usage. + + +.. autoclass:: dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.pattern_composition.PatternComposition + :special-members: __init__ + :members: + +.. autoclass:: dedoc.structure_extractors.patterns.RegexpPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.StartWordPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. 
autoclass:: dedoc.structure_extractors.patterns.TagPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.BracketListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.BracketRomanListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.BulletListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.DottedListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.LetterListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.RomanListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.TagHeaderPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. autoattribute:: _name + +.. autoclass:: dedoc.structure_extractors.patterns.TagListPattern + :show-inheritance: + :special-members: __init__ + :members: + + .. 
autoattribute:: _name diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst index 3323c2de..d8788089 100644 --- a/docs/source/parameters/pdf_handling.rst +++ b/docs/source/parameters/pdf_handling.rst @@ -62,7 +62,7 @@ PDF and images handling - rus, eng, rus+eng, fra, spa - rus+eng - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` * :meth:`dedoc.readers.ReaderComposition.read` * :meth:`dedoc.structure_extractors.FintocStructureExtractor.extract` - Language of the document without a textual layer. The following values are available: @@ -77,7 +77,7 @@ PDF and images handling - :, start:, :end, start:end - : - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read`, :meth:`dedoc.readers.PdfTabbyReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - If you need to read a part of the PDF document, you can use page slice to define the reading range. If the range is set like ``start_page:end_page``, document will be processed from ``start_page`` to ``end_page`` @@ -96,7 +96,7 @@ PDF and images handling - true, false, auto - auto - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to set the number of columns if the PDF document is without a textual layer in case it's known beforehand. 
The following values are available: @@ -111,7 +111,7 @@ PDF and images handling - auto, no_change - auto - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to control document orientation analysis for PDF documents without a textual layer. The following values are available: @@ -125,7 +125,7 @@ PDF and images handling - True, False - False - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to **remove** headers and footers of PDF documents from the output result. If ``need_header_footer_analysis=False``, header and footer lines will present in the output as well as all other document lines. @@ -134,7 +134,7 @@ PDF and images handling - True, False - False - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to clean background (binarize) for pages of PDF documents without a textual layer. If the document's background is heterogeneous, this option may help to improve the result of document text recognition. 
@@ -144,18 +144,29 @@ PDF and images handling - True, False - True - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used to enable table recognition for PDF documents or images. The table recognition method is used in :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader`. If the document has a textual layer, it is recommended to use :class:`dedoc.readers.PdfTabbyReader`, in this case tables will be parsed much easier and faster. + * - need_gost_frame_analysis + - True, False + - False + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - This option is used to enable GOST (Russian government standard) frame recognition for PDF documents or images. + The GOST frame recognizer is used in :meth:`dedoc.readers.PdfBaseReader.read`. Its main function is to recognize and + ignore the GOST frame on the document. It allows :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader` + to properly process the content of the document containing GOST frame. + * - orient_analysis_cells - True, False - False - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used for a table recognition for PDF documents or images. It is ignored when ``need_pdf_table_analysis=False``. 
@@ -166,7 +177,7 @@ PDF and images handling - 90, 270 - 90 - * :meth:`dedoc.DedocManager.parse` - * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfBaseReader.read` * :meth:`dedoc.readers.ReaderComposition.read` - This option is used for a table recognition for PDF documents or images. It is ignored when ``need_pdf_table_analysis=False`` or ``orient_analysis_cells=False``. diff --git a/docs/source/parameters/structure_type.rst b/docs/source/parameters/structure_type.rst index 09b592c2..bb4205e9 100644 --- a/docs/source/parameters/structure_type.rst +++ b/docs/source/parameters/structure_type.rst @@ -37,6 +37,16 @@ Structure type configuring If you use your custom configuration, look to the documentation of :class:`~dedoc.structure_extractors.StructureExtractorComposition` + * - patterns + - list of patterns based on :class:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern`, + or list of patterns dicts, or list of dictionaries converted to string + - None + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.structure_extractors.StructureExtractorComposition.extract` + * :meth:`dedoc.structure_extractors.DefaultStructureExtractor.extract` + - This parameter is used only by :class:`~dedoc.structure_extractors.DefaultStructureExtractor` (``document_type="other"``). + Configuration of default document structure, please see :ref:`using_patterns` for more details. + * - structure_type - tree, linear - tree diff --git a/docs/source/readers_output/line_types.rst b/docs/source/readers_output/line_types.rst index 666a8d35..6ae464fc 100644 --- a/docs/source/readers_output/line_types.rst +++ b/docs/source/readers_output/line_types.rst @@ -4,8 +4,8 @@ Types of textual lines ====================== Each reader returns :class:`~dedoc.data_structures.UnstructuredDocument` with textual lines. 
-Readers don't fill ``hierarchy_level`` metadata field (structure extractors do this), but they can fill ``hierarchy_level_tag`` with information about line types. -Below the readers are enlisted that can return non-empty ``hierarchy_level_tag`` in document lines metadata: +Readers don't fill ``hierarchy_level`` metadata field (structure extractors do this), but they can fill ``tag_hierarchy_level`` with information about line types. +Below the readers are enlisted that can return non-empty ``tag_hierarchy_level`` in document lines metadata: * `+` means that the reader can return lines of this type. * `-` means that the reader doesn't return lines of this type due to complexity of the task or lack of information provided by the format. @@ -19,7 +19,7 @@ Below the readers are enlisted that can return non-empty ``hierarchy_level_tag`` * - **Reader** - **header** - **list_item** - - **raw_text, unknown** + - **unknown** - **key** * - :class:`~dedoc.readers.DocxReader` @@ -42,7 +42,7 @@ Below the readers are enlisted that can return non-empty ``hierarchy_level_tag`` * - :class:`~dedoc.readers.RawTextReader` - `-` - - `+` + - `-` - `+` - `-` @@ -54,7 +54,7 @@ Below the readers are enlisted that can return non-empty ``hierarchy_level_tag`` * - :class:`~dedoc.readers.PdfImageReader` - `-` - - `+` + - `-` - `+` - `-` @@ -66,6 +66,6 @@ Below the readers are enlisted that can return non-empty ``hierarchy_level_tag`` * - :class:`~dedoc.readers.PdfTxtlayerReader` - `-` - - `+` + - `-` - `+` - `-` diff --git a/docs/source/structure_types/other.rst b/docs/source/structure_types/other.rst index 13a4e716..1f6a2d62 100644 --- a/docs/source/structure_types/other.rst +++ b/docs/source/structure_types/other.rst @@ -3,6 +3,11 @@ Default document structure type =============================== +.. note:: + + This structure type is configurable: you can change lines types and levels in the tree hierarchy. + Please see :ref:`using_patterns` for more details. 
+ Below we will consider document lines as nodes of the document tree. In some cases document lines are paragraphs of the text (e.g. in docx). @@ -50,7 +55,7 @@ The detailed description of each line type: Its text is an empty string. This type of node is optional, it occurs only if lists are found in the given document. - For each list type (dotted, bracket, bullet) the new list node is created. + For each list type (dotted, roman, bracket, bullet) the new list node is created. This type of node is more important than list_item and raw_text. List nodes for less important lists are are nested into list items of more important list types. For example, list node for bullet list beginning is less important than a list item of a dotted list. diff --git a/docs/source/tutorials/add_new_doc_format.rst b/docs/source/tutorials/add_new_doc_format.rst index a1e6ec1a..4fb0e1ca 100644 --- a/docs/source/tutorials/add_new_doc_format.rst +++ b/docs/source/tutorials/add_new_doc_format.rst @@ -164,8 +164,7 @@ You should implement the following methods: * :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract()`: use file extension or mime to check if we could read the given file. You can learn more about extensions and mime using file ``dedoc/extensions.py`` * :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.extract()` : use information about file path and file name to extract attachments from the given file. -The method returns the list of :class:`~dedoc.data_structures.attached_file.AttachedFile` using -:meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor._content2attach_file` method. +The method returns the list of :class:`~dedoc.data_structures.attached_file.AttachedFile` using ``_content2attach_file`` method. This method is inherited from the abstract class, it makes the list of :class:`~dedoc.data_structures.attached_file.AttachedFile` from the list of tuples: the name of the attached file and binary content of the file. 
diff --git a/docs/source/tutorials/add_new_structure_type/features_extraction.rst b/docs/source/tutorials/add_new_structure_type/features_extraction.rst index 584710b3..3c3330ed 100644 --- a/docs/source/tutorials/add_new_structure_type/features_extraction.rst +++ b/docs/source/tutorials/add_new_structure_type/features_extraction.rst @@ -29,9 +29,6 @@ Let's implement the basic methods of the parent class: * :meth:`~dedoc.structure_extractors.feature_extractors.abstract_extractor.AbstractFeatureExtractor.parameters` -- we don't plan to use any parameters in the ``__init__`` method, so an empty dictionary can be returned; - * :meth:`~dedoc.structure_extractors.feature_extractors.abstract_extractor.AbstractFeatureExtractor.fit` -- - we don't need to train our feature extractor, so the method can be empty; - * :meth:`~dedoc.structure_extractors.feature_extractors.abstract_extractor.AbstractFeatureExtractor.transform` -- here we implement a basic scheme of features extraction from each document and their concatenation. diff --git a/docs/source/tutorials/notebooks.rst b/docs/source/tutorials/notebooks.rst new file mode 100644 index 00000000..9abea57f --- /dev/null +++ b/docs/source/tutorials/notebooks.rst @@ -0,0 +1,32 @@ +Notebooks with examples of Dedoc usage +====================================== + +.. _table_notebooks: + +.. flat-table:: Notebooks with Dedoc usage examples + :widths: 70 30 + :header-rows: 1 + :class: tight-table + + * - Task description + - Link to the notebook + + * - Document text preprocessing for the following document classification: + * automatic detection of document format: DOC, DOCX, PDF or any image format; + * text extraction and its structuring; + * saving the result to JSON file. 
+ - `Notebook 1 `_ + + * - Tables text and structure extraction from images of scanned documents: + * automatic detection of document format: PDF or any image format; + * tables extraction including multi-paged tables; + * grouping tables by document page where they are located; + * saving each page to CSV file. + - `Notebook 2 `_ + + * - ADVANCED: Extract text from scanned documents and get its location on the document image: + * automatic detection of image format; + * text extraction from image; + * text location visualization; + * text recognition confidence visualization. + - `Notebook 3 `_ diff --git a/docs/source/tutorials/using_patterns.rst b/docs/source/tutorials/using_patterns.rst new file mode 100644 index 00000000..e2ea2d71 --- /dev/null +++ b/docs/source/tutorials/using_patterns.rst @@ -0,0 +1,478 @@ +.. _using_patterns: + +Configure structure extraction using patterns +============================================= + +It is possible to configure structure type in Dedoc: option ``document_type`` in the ``parameters`` dictionary +(:ref:`api_parameters`, :ref:`structure_type_parameters`). +The default structure type (when ``document_type="other"``, see :ref:`other_structure`) allows to get a basic document structure which is fixed. +If you want to change this structure, e.g. names of line types (nodes) or their levels in the tree hierarchy, you can use structure patterns. + +Use patterns in Dedoc library +----------------------------- + +If you use Dedoc as a library, you can use existing pattern classes :ref:`dedoc_structure_extractors_patterns` +or implement your own custom pattern based on :class:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern`. + +Let's see some examples. First of all, we enlist all the required imports: + +.. 
literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py + :language: python + :lines: 1-13 + +Using information from readers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Assume we need to parse file :download:`with_tags.docx <../_static/code_examples/test_dir/with_tags.docx>`, which looks as follows: + +.. _docx_with_tags_image: + +.. figure:: ../_static/code_examples/test_dir/with_tags.png + :width: 400 + + DOCX document example + +In this document, there are headers that are marked as heading lines by the user, as well as headers highlighted by formatting +(bold font of a larger size, e.g. Header 2 or Header 2.1). +Also, there are automatic lists (list items, bullet list items) and raw text list items (custom list items). + +Let's read the document with :class:`~dedoc.readers.DocxReader` and see the result: + +.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py + :language: python + :lines: 18-28 + +.. code-block:: text + + Document lines + + LineWithMeta(Title, tagHL=(1, 1, 'header'), HL=None) + LineWithMeta(Header 1, tagHL=(None, None, 'unknown'), HL=None) + LineWithMeta(Header 1.1, tagHL=(1, 3, 'header'), HL=None) + LineWithMeta(Text, tagHL=(None, None, 'unknown'), HL=None) + LineWithMeta(· bullet_list_item1, tagHL=(2, 1, 'list_item'), HL=None) + LineWithMeta(◦ subitem1, tagHL=(2, 1, 'list_item'), HL=None) + LineWithMeta(◦ subitem2, tagHL=(2, 1, 'list_item'), HL=None) + ... + LineWithMeta(Header 2, tagHL=(None, None, 'unknown'), HL=None) + LineWithMeta(1. Custom item, tagHL=(None, None, 'unknown'), HL=None) + LineWithMeta(a) custom subitem, tagHL=(None, None, 'unknown'), HL=None) + LineWithMeta(2. Custom item 2, tagHL=(None, None, 'unknown'), HL=None) + LineWithMeta(3. Custom item 3, tagHL=(None, None, 'unknown'), HL=None) + ... + + +For each line, its text and hierarchy level information is printed (``tagHL``, ``HL``). + +..
seealso:: + + * Documentation of classes :class:`~dedoc.data_structures.LineWithMeta`, :class:`~dedoc.data_structures.LineMetadata`, :class:`~dedoc.data_structures.HierarchyLevel` may be helpful; + * :ref:`add_structure_type_hierarchy_level` may be useful for understanding :class:`~dedoc.data_structures.HierarchyLevel`. + * :ref:`readers_line_types` can be helpful to find out which readers are able to extract certain types of lines. + +As we see, the reader filled the ``metadata.tag_hierarchy_level`` field (``tagHL``): + + * some lines have types ``header`` or ``list_item`` and non-empty values of ``level_1`` and ``level_2``; + * some lines have ``unknown`` type and empty values of levels - it means that the reader couldn't extract any information for these lines. + +To extract structure and construct a document tree, we need: + + 1. add metadata to the document (call metadata extractor); + 2. fill ``metadata.hierarchy_level`` (call structure extractor); + 3. construct a document tree (call structure constructor). + +Let's use information from :class:`~dedoc.readers.DocxReader` about headers and list items during structure extraction step. +For this purpose, we initialize :class:`~dedoc.structure_extractors.patterns.TagHeaderPattern` and +:class:`~dedoc.structure_extractors.patterns.TagListPattern` classes. +These patterns are given to the :meth:`~dedoc.structure_extractors.DefaultStructureExtractor.extract` method, +which applies patterns if lines match them, else line becomes simple raw text line. + +.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py + :language: python + :lines: 30-37 + +Let's see the resulting tree. In the code below we use an auxiliary function to convert :class:`~dedoc.data_structures.ParsedDocument` +to the HTML representation and print it: + +.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py + :language: python + :lines: 38-44 + +.. 
code-block:: text + + Document tree + **** id = 0 ; type = root + + **Title** id = 0.0 ; type = custom_header + + **Header 1** id = 0.0.0 ; type = raw_text + + **Header 1.1** id = 0.0.1 ; type = custom_header + + Text id = 0.0.1.0 ; type = raw_text + + · bullet_list_item1 id = 0.0.1.1 ; type = custom_list + + ◦ subitem1 id = 0.0.1.2 ; type = custom_list + + ◦ subitem2 id = 0.0.1.3 ; type = custom_list + + ... + + **Header 2 + **1\. Custom item + a) custom subitem + 2\. Custom item 2 + 3\. Custom item 3 + ... id = 0.0.2.4.0 ; type = raw_text + ... + +As we see, lines with types ``header`` and ``list_item`` from ``tagHL`` became ``custom_header`` and ``custom_list`` according to the pattern settings. +But `Header 2` and custom items became ``raw_text`` and were merged into one node (``can_be_multiline=True`` for ``raw_text`` lines). + +Using regular expressions +~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this section, we'll transform `Header 2` and custom items to the lines with types ``header`` and ``list_item``. + +First of all, we introduce an auxiliary function ``print_document_tree`` to avoid code duplication: + +.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py + :language: python + :lines: 47-51 + +To handle lines based on their text only (without ``tagHL`` information), we can use patterns based on regular expressions. +For this purpose, we can use class :class:`~dedoc.structure_extractors.patterns.RegexpPattern` and classes that are based on it, +e.g. :class:`~dedoc.structure_extractors.patterns.DottedListPattern` and :class:`~dedoc.structure_extractors.patterns.LetterListPattern`. + +.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py + :language: python + :lines: 54-62 + +..
code-block:: text + + Document tree + **** id = 0 ; type = root + + **Title** id = 0.0 ; type = custom_header + + **Header 1** id = 0.1 ; type = custom_header + + **Header 1.1** id = 0.1.0 ; type = custom_header + + Text id = 0.1.0.0 ; type = raw_text + + · bullet_list_item1 id = 0.1.0.1 ; type = custom_list + + ◦ subitem1 id = 0.1.0.2 ; type = custom_list + + ◦ subitem2 id = 0.1.0.3 ; type = custom_list + + ... + + **Header 2 + ********** id = 0.2 ; type = custom_header + + 1\. Custom item + id = 0.2.0 ; type = custom_list + + a) custom subitem + id = 0.2.0.0 ; type = custom_list + + 2\. Custom item 2 + id = 0.2.1 ; type = custom_list + + 3\. Custom item 3 + id = 0.2.2 ; type = custom_list + ... + +In this case, `Header 2` and custom items became ``custom_header`` and ``custom_list`` as well as `Header 1` and bullet list items. + + +.. note:: + + The order of the patterns is important: if you place regexp patterns before tag patterns, then tag patterns will be ignored. + It happens because some lines match both regexp and tag patterns. + +.. seealso:: + + You can see the full list of patterns with their descriptions here: :ref:`dedoc_structure_extractors_patterns`. + +The next section contains a more realistic example. + +Practical example: get structured PDF +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Assume we need to parse file :download:`law.pdf <../_static/code_examples/test_dir/law.pdf>`, the first page of which looks as follows: + +.. _pdf_law_image: + +.. figure:: ../_static/code_examples/test_dir/law.png + :width: 400 + + PDF document example + +This document has a certain structure with parts, chapters and numbered lists. +Let's read the document using :class:`~dedoc.readers.PdfTabbyReader` and see the result: + + +.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py + :language: python + :lines: 66-74 + +..
code-block:: text + + Document lines + + LineWithLocation(S T A T U T O R Y I N S T R U M E N T S, tagHL=(None, None, 'unknown'), HL=None) + LineWithLocation(2024 No. 853, tagHL=(None, None, 'unknown'), HL=None) + LineWithLocation(EXITING THE EUROPEAN UNION, tagHL=(None, None, 'unknown'), HL=None) + LineWithLocation(The Windsor Framework (Retail Movement Scheme: Plant and, tagHL=(None, None, 'unknown'), HL=None) + LineWithLocation(Animal Health) (Amendment etc.) Regulations 2024, tagHL=(None, None, 'unknown'), HL=None) + LineWithLocation(- - - - Made 8th August 2024, tagHL=(None, None, 'unknown'), HL=None) + LineWithLocation(Laid before Parliament 9th August 2024, tagHL=(None, None, 'unknown'), HL=None) + LineWithLocation(Coming into force in accordance with regulation 1(2), tagHL=(None, None, 'unknown'), HL=None) + LineWithLocation(The Secretary of State makes these Regulations in exercise of ..., tagHL=(None, None, 'unknown'), HL=None) + LineWithLocation(8C(1) and (2) of, and paragraph 21 of Schedule 7 to, the Europ..., tagHL=(None, None, 'unknown'), HL=None) + + +Here we consider class ``LineWithLocation`` almost the same as :class:`~dedoc.data_structures.LineWithMeta`. +As we see, ``tagHL=(None, None, 'unknown')`` for each line: +this means that the reader couldn't extract any useful information about lines types and levels. +So, :class:`~dedoc.structure_extractors.patterns.TagHeaderPattern` and +:class:`~dedoc.structure_extractors.patterns.TagListPattern` are useless in this case. + +.. note:: + + :class:`~dedoc.readers.PdfTabbyReader` is able to extract information about headers and list items from PDF if possible. + But, in reality, most PDF documents don't contain information about headers and list items. + +Let's use regexp-based patterns to extract a simple structure and see the result: + +.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py + :language: python + :lines: 76-83 + +.. 
code-block:: text + + Document tree + **** id = 0 ; type = root + + S T A T U T O R Y I N S T R U M E N T S + **2024 No. 853 + EXITING THE EUROPEAN UNION** + The Windsor Framework (Retail Movement Scheme: Plant and + Animal Health) (Amendment etc.) Regulations 2024 + _-_ \- - _\- Made 8th August 2024 + Laid before Parliament 9th August 2024 + Coming into force in accordance with regulation 1(2)_ + The Secretary of State makes these Regulations in exercise of the powers conferred by section + 8C(1) and (2) of, and paragraph 21 of Schedule 7 to, the European Union (Withdrawal) Act + 2018(a). + In making these Regulations, the Secretary of State has had special regard to the matters listed + in section 46 of the United Kingdom Internal Market Act 2020. + id = 0.0 ; type = raw_text + + PART 1 + id = 0.1 ; type = part + + Introductory + **Citation, commencement and extent** + id = 0.1.0 ; type = raw_text + + **1.—(1)** These Regulations may be cited as the Windsor Framework (Retail Movement Scheme: + id = 0.1.1 ; type = point + + Plant and Animal Health) (Amendment etc.) Regulations 2024. + id = 0.1.1.0 ; type = raw_text + + (2) These Regulations come into force— + id = 0.1.1.1 ; type = item + + (a) for the purposes of regulation 3, on 26th April 2025; + id = 0.1.1.1.0 ; type = sub_item + + (b) for all other purposes, on 2nd September 2024. + id = 0.1.1.1.1 ; type = sub_item + + ... + +As we see, parts and list items were extracted successfully, but headers highlighted in bold became raw text lines. +Information about bold font can be found in the ``annotations`` attribute of :class:`~dedoc.data_structures.LineWithMeta`: + +.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py + :language: python + :lines: 86-88 + +In the result below, some lines contain ``Bold`` annotation among others. + +.. 
code-block:: text + + LineWithLocation(S T A T U T O R Y I N S T R U M E N T S, tagHL=(None, None, 'unknown'), HL=(None, None, 'raw_text')) [Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bold(...), Bounding box(...), Size(...), Style(...), Bold(...), Bounding box(...), Size(...), Style(...), Bold(...), Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bold(...), Bounding box(...), Size(...), Style(...), Bold(...), Bounding box(...), Size(...), Style(...), Bold(...), Bounding box(...), Size(...), Style(...), Bold(...), Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), 
Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), 
Style(...), Bounding box(...), Size(...), Style(...), Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding 
box(...), Size(...), Style(...)] + LineWithLocation(2024 No. 853, tagHL=(None, None, 'unknown'), HL=(None, None, 'raw_text')) [Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bold(...), Bounding box(...), Size(...), Style(...), Bold(...), Bounding box(...), Size(...), Style(...), Bold(...)] + LineWithLocation(EXITING THE EUROPEAN UNION, tagHL=(None, None, 'unknown'), HL=(None, None, 'raw_text')) [Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bold(...), Bounding box(...), Size(...), Style(...), Bold(...), Bounding box(...), Size(...), Style(...), Bold(...), Bounding box(...), Size(...), Style(...), Bold(...)] + LineWithLocation(The Windsor Framework (Retail Movement Scheme: Plant and, tagHL=(None, None, 'unknown'), HL=(None, None, 'raw_text')) [Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...)] + LineWithLocation(Animal Health) (Amendment etc.) 
Regulations 2024, tagHL=(None, None, 'unknown'), HL=(None, None, 'raw_text')) [Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...)] + LineWithLocation(- - - - Made 8th August 2024, tagHL=(None, None, 'unknown'), HL=(None, None, 'raw_text')) [Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...)] + LineWithLocation(Laid before Parliament 9th August 2024, tagHL=(None, None, 'unknown'), HL=(None, None, 'raw_text')) [Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...)] + LineWithLocation(Coming into force in accordance with regulation 1(2), tagHL=(None, None, 'unknown'), HL=(None, None, 'raw_text')) [Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), Italic(...), Bounding box(...), Size(...), Style(...), 
Italic(...)] + LineWithLocation(The Secretary of State makes these Regulations in exercise of ..., tagHL=(None, None, 'unknown'), HL=(None, None, 'raw_text')) [Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...)] + LineWithLocation(8C(1) and (2) of, and paragraph 21 of Schedule 7 to, the Europ..., tagHL=(None, None, 'unknown'), HL=(None, None, 'raw_text')) [Indentation(...), Spacing(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...), Bounding box(...), Size(...), Style(...)] + +.. seealso:: + + * More information about each type of annotation, e.g. :class:`~dedoc.data_structures.BoldAnnotation` can be found here: :ref:`annotations`. + * :ref:`readers_annotations` can be helpful to find out which readers are able to extract certain types of annotations. 
+ + +Let's use the information about bold font for titles and headers detection. +There is no pattern in Dedoc that uses line annotations. +Don't worry! We can write such patterns from scratch. + +Each pattern should be based on the class :class:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern` +and implement all its methods: + +* :meth:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern.match` to check if the line matches the pattern; +* :meth:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern.get_hierarchy_level` to get \ + line type (``line_type``) and hierarchy levels (``level_1``, ``level_2``), and if it can be merged with other lines (``can_be_multiline``); +* ``_name`` attribute to differentiate this pattern from others. + +.. code-block:: python + + from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern + + class CustomPattern(AbstractPattern): + _name = "custom_pattern" + + def match(self, line: LineWithMeta) -> bool: + pass + + def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel: + pass + +.. seealso:: + + * Documentation of classes :class:`~dedoc.structure_extractors.patterns.abstract_pattern.AbstractPattern` and :class:`~dedoc.data_structures.HierarchyLevel` may be helpful; + * :ref:`add_structure_type_hierarchy_level` may be useful for understanding :class:`~dedoc.data_structures.HierarchyLevel`; + * You can also see the source code of other patterns through :ref:`dedoc_structure_extractors_patterns`. + + +The code below shows implementation of two patterns for titles and sub-headers: + +.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py + :language: python + :lines: 91-110 + + +Now we can use all the patterns together and see the resulting document tree. + +.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py + :language: python + :lines: 113-122 + +..
code-block:: text + + Document tree + **** id = 0 ; type = root + + S T A T U T O R Y I N S T R U M E N T S + **_**<**i> <**/**i> <**i**>_ >**>**__<** i**>_**_____________________________ id = 0.0 ; type = title + + **2024 No. 853** + id = 0.0.0 ; type = sub_header + + **EXITING THE EUROPEAN UNION** + id = 0.1 ; type = title + + The Windsor Framework (Retail Movement Scheme: Plant and + Animal Health) (Amendment etc.) Regulations 2024 + _-_ \- - _\- Made 8th August 2024 + Laid before Parliament 9th August 2024 + Coming into force in accordance with regulation 1(2)_ + The Secretary of State makes these Regulations in exercise of the powers conferred by section + 8C(1) and (2) of, and paragraph 21 of Schedule 7 to, the European Union (Withdrawal) Act + 2018(a). + In making these Regulations, the Secretary of State has had special regard to the matters listed + in section 46 of the United Kingdom Internal Market Act 2020. + id = 0.1.0 ; type = raw_text + + PART 1 + id = 0.2 ; type = part + + Introductory + **Citation, commencement and extent** + id = 0.2.0 ; type = sub_header + + **1.—(1)** These Regulations may be cited as the Windsor Framework (Retail Movement Scheme: + id = 0.2.0.0 ; type = point + + Plant and Animal Health) (Amendment etc.) Regulations 2024. + id = 0.2.0.0.0 ; type = raw_text + + (2) These Regulations come into force— + id = 0.2.0.0.1 ; type = item + + (a) for the purposes of regulation 3, on 26th April 2025; + id = 0.2.0.0.1.0 ; type = sub_item + + (b) for all other purposes, on 2nd September 2024. + id = 0.2.0.0.1.1 ; type = sub_item + +As a result, we extracted basic information using simple regular expressions and information about document formatting. +One can come up with more complicated patterns to enrich this document representation by new line types and node depths. 
+ + +Conclusions +~~~~~~~~~~~ + +In this tutorial, we used Dedoc as a library and extracted a configurable structure using patterns: + +* tag-based and regexp-based patterns for a DOCX document with information about headers and automatic lists; +* regexp-based and custom patterns for a real-life PDF document. + +The full script with the code above can be downloaded here: :download:`dedoc_using_patterns_tutorial.py <../_static/code_examples/dedoc_using_patterns_tutorial.py>`. + +Use patterns in Dedoc API +------------------------- + +Patterns are configurable via API; each pattern is represented by: + +* a dictionary with parameters for pattern class initialisation (they may differ, see documentation :ref:`dedoc_structure_extractors_patterns`); +* a name of the required pattern, each pattern has a unique name that can be found in the ``_name`` attribute (see :ref:`dedoc_structure_extractors_patterns`). + +The example below shows how to use patterns via API for the aforementioned real-life PDF document: + +.. 
code-block:: python + + import requests + + file_path = "test_dir/law.pdf" + file_name = "law.pdf" + patterns = [ + {"name": "regexp", "regexp": "^part\s+\d+$", "line_type": "part", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}, + {"name": "regexp", "regexp": "^chapter\s+\d+$", "line_type": "chapter", "level_1": 1, "level_2": 2, "can_be_multiline": "false"}, + {"name": "dotted_list", "line_type": "point", "level_1": 2, "can_be_multiline": "false"}, + {"name": "regexp", "regexp": "^\(\d+\)\s", "line_type": "item", "level_1": 3, "level_2": 1, "can_be_multiline": "false"}, + {"name": "regexp", "regexp": "^\(\w\)\s", "line_type": "sub_item", "level_1": 3, "level_2": 2, "can_be_multiline": "false"} + ] + parameters = {"patterns": str(patterns)} + + with open(file_path, "rb") as file: + files = {"file": (file_name, file)} + r = requests.post("http://localhost:1231/upload", files=files, data=parameters) + +Using your own custom pattern is complicated: + +* clone the repository: + + .. code-block:: bash + + git clone https://github.com/ispras/dedoc + +* implement the required patterns and place the files with code in the ``dedoc/structure_extractors/patterns`` directory; +* add patterns imports and classes (``__all__`` list) to the file ``dedoc/structure_extractors/patterns/__init__.py``; +* run Dedoc API, e.g. using Docker: + + .. 
code-block:: bash + + docker compose up --build diff --git a/pyproject.toml b/pyproject.toml index 307b4797..9b0b640d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ docs = [ "sphinx-togglebutton==0.3.2", # for using toggle button "linuxdoc==20230506", # for using flat-table "tabula-py==2.8.1", # for adding new doc type tutorial + "html2text==2024.2.26" # for using patterns tutorial ] lint = [ "flake8==5.0.4", diff --git a/requirements.txt b/requirements.txt index 9ddd1350..392895af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ beautifulsoup4>=4.10.0,<=4.12.2 charset-normalizer>=2.0.12,<=3.2.0 Cython>=0.29.28,<=3.0.2 -dedoc-utils==0.3.7 +dedoc-utils==0.3.8 fastapi>=0.77.0,<1.0 huggingface-hub>=0.14.1,<1.0 imutils==0.5.4 diff --git a/tests/api_tests/test_api_doctype_default.py b/tests/api_tests/test_api_doctype_default.py new file mode 100644 index 00000000..fcf4a2c6 --- /dev/null +++ b/tests/api_tests/test_api_doctype_default.py @@ -0,0 +1,47 @@ +from tests.api_tests.abstract_api_test import AbstractTestApiDocReader + + +class TestApiDefaultStructure(AbstractTestApiDocReader): + + def test_patterns(self) -> None: + file_name = "docx/without_numbering.docx" + patterns = [ + {"name": "regexp", "regexp": "^глава\s\d+\.", "line_type": "глава", "level_1": 1}, # noqa + {"name": "start_word", "start_word": "статья", "level_1": 2, "line_type": "статья"}, + {"name": "dotted_list", "level_1": 3, "line_type": "list_item", "can_be_multiline": False}, + {"name": "bracket_list", "level_1": 4, "level_2": 1, "line_type": "bracket_list_item", "can_be_multiline": "false"} + ] + result = self._send_request(file_name, {"patterns": str(patterns)}) + structure = result["content"]["structure"] + + node = self._get_by_tree_path(structure, "0.1") + self.assertEqual(node["text"].strip(), "Глава 1. 
Общие положения") + self.assertEqual(node["metadata"]["paragraph_type"], "глава") + node = self._get_by_tree_path(structure, "0.1.1") + self.assertIn("Статья 1.1.", node["text"]) + self.assertEqual(node["metadata"]["paragraph_type"], "статья") + node = self._get_by_tree_path(structure, "0.1.1.0") + self.assertEqual(node["metadata"]["paragraph_type"], "list") + node = self._get_by_tree_path(structure, "0.1.1.0.0") + self.assertIn("1. Законодательство", node["text"]) + self.assertEqual(node["metadata"]["paragraph_type"], "list_item") + node = self._get_by_tree_path(structure, "0.1.2.0.0.0") + self.assertEqual(node["text"].strip(), "1) предупреждение;") + self.assertEqual(node["metadata"]["paragraph_type"], "bracket_list_item") + node = self._get_by_tree_path(structure, "0.2") + self.assertEqual(node["text"].strip(), "Глава 2. Административные правонарушения, посягающие на права граждан и здоровье населения") + self.assertEqual(node["metadata"]["paragraph_type"], "глава") + + def test_empty_patterns(self) -> None: + file_name = "docx/example.docx" + self._send_request(file_name, {"patterns": ""}) + self._send_request(file_name, {"patterns": "[]"}) + + def test_wrong_patterns(self) -> None: + file_name = "docx/example.docx" + self._send_request(file_name, {"patterns": str([{"regexp": "^глава\s\d+\.", "line_type": "глава", "level_1": 1}])}, expected_code=400) # noqa + self._send_request(file_name, {"patterns": str([{"name": "start_word", "line_type": "глава", "level_1": 1}])}, expected_code=400) + self._send_request(file_name, {"patterns": str([{"name": "unknown", "line_type": "глава", "level_1": 1}])}, expected_code=400) + self._send_request(file_name, {"patterns": "{1: blabla}"}, expected_code=400) + self._send_request(file_name, {"patterns": "{1: 2}"}, expected_code=400) + self._send_request(file_name, {"patterns": "[1]"}, expected_code=400) diff --git a/tests/api_tests/test_api_format_txt.py b/tests/api_tests/test_api_format_txt.py index 3be5b0e4..aa38ab9e 100644 
--- a/tests/api_tests/test_api_format_txt.py +++ b/tests/api_tests/test_api_format_txt.py @@ -41,7 +41,7 @@ def test_text2(self) -> None: result = self._send_request(file_name, data={"structure_type": "tree"}) content = result["content"]["structure"] self.assertIn("УТВЕРЖДЕНЫ", get_by_tree_path(content, "0.0")["text"]) - self.assertIn("1. Настоящие Требования разработаны в соответствии с Федеральным законом", get_by_tree_path(content, "0.1.0")["text"]) + self.assertIn("1. Настоящие Требования разработаны в соответствии с Федеральным законом", get_by_tree_path(content, "0.2.0")["text"]) def test_special_symbols(self) -> None: file_name = "special_symbol.txt" diff --git a/tests/api_tests/test_api_module_table_recognizer.py b/tests/api_tests/test_api_module_table_recognizer.py index f340073c..a1e48a78 100644 --- a/tests/api_tests/test_api_module_table_recognizer.py +++ b/tests/api_tests/test_api_module_table_recognizer.py @@ -213,3 +213,29 @@ def test_detect_small_table(self) -> None: result = self._send_request(file_name, data={"language": "rus"}) tables = result["content"]["tables"] self.assertEqual(2, len(tables)) + + def test_multipage_gost_table(self) -> None: + file_name = "gost_multipage_table.pdf" + result = self._send_request(file_name, data={"need_gost_frame_analysis": "True"}) # don't pass pdf_with_text_layer to check condition in PDFBaseReader + self.assertTrue(len(result["content"]["tables"][0]["cells"]) > 35) + self.assertTrue("KR13" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"]) # check the last row of multipage table + self.assertTrue("R13.1" in result["content"]["tables"][0]["cells"][-1][1]["lines"][0]["text"]) # check that it belongs to first and only table + self.assertTrue("Испытание по проверке" in result["content"]["tables"][0]["cells"][-1][2]["lines"][0]["text"]) + self.assertTrue("3.6" in result["content"]["tables"][0]["cells"][-1][3]["lines"][0]["text"]) + self.assertTrue("7.4.9" in 
result["content"]["tables"][0]["cells"][-1][4]["lines"][0]["text"]) + + def test_multipage_gost_table_with_text_layer(self) -> None: + file_name = "gost_multipage_table_2.pdf" + result = self._send_request(file_name, data={"need_gost_frame_analysis": "True", "pdf_with_text_layer": "True"}) + self.assertEqual(len(result["content"]["tables"][0]["cells"]), 14) + self.assertTrue("SAMPLE TEXT" in result["content"]["tables"][0]["cells"][0][0]["lines"][0]["text"]) + self.assertTrue("2" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"]) + self.assertEqual(len(result["content"]["tables"]), 1) + + def test_multipage_gost_table_with_text_layer_and_pages_param(self) -> None: + file_name = "gost_multipage_table_2.pdf" + result = self._send_request(file_name, data={"need_gost_frame_analysis": "True", "pdf_with_text_layer": "True", "pages": "2:"}) + self.assertEqual(len(result["content"]["tables"]), 1) + self.assertEqual(len(result["content"]["tables"][0]["cells"]), 5) + self.assertTrue("SAMPLE TEXT" in result["content"]["tables"][0]["cells"][0][0]["lines"][0]["text"]) + self.assertTrue("2" in result["content"]["tables"][0]["cells"][-1][0]["lines"][0]["text"]) diff --git a/tests/data/tables/gost_frame_1.jpg b/tests/data/tables/gost_frame_1.jpg new file mode 100644 index 00000000..5cae5141 Binary files /dev/null and b/tests/data/tables/gost_frame_1.jpg differ diff --git a/tests/data/tables/gost_frame_2.png b/tests/data/tables/gost_frame_2.png new file mode 100644 index 00000000..a78d5c81 Binary files /dev/null and b/tests/data/tables/gost_frame_2.png differ diff --git a/tests/data/tables/gost_frame_3.jpg b/tests/data/tables/gost_frame_3.jpg new file mode 100644 index 00000000..5b3a8f66 Binary files /dev/null and b/tests/data/tables/gost_frame_3.jpg differ diff --git a/tests/data/tables/gost_multipage_table.pdf b/tests/data/tables/gost_multipage_table.pdf new file mode 100644 index 00000000..0adbe3bb Binary files /dev/null and 
b/tests/data/tables/gost_multipage_table.pdf differ diff --git a/tests/data/tables/gost_multipage_table_2.pdf b/tests/data/tables/gost_multipage_table_2.pdf new file mode 100644 index 00000000..295df746 Binary files /dev/null and b/tests/data/tables/gost_multipage_table_2.pdf differ diff --git a/tests/data/tables/not_gost_frame.jpg b/tests/data/tables/not_gost_frame.jpg new file mode 100644 index 00000000..1602a8c2 Binary files /dev/null and b/tests/data/tables/not_gost_frame.jpg differ diff --git a/tests/unit_tests/test_doctype_default_structure_extractor.py b/tests/unit_tests/test_doctype_default_structure_extractor.py new file mode 100644 index 00000000..05f6c56d --- /dev/null +++ b/tests/unit_tests/test_doctype_default_structure_extractor.py @@ -0,0 +1,84 @@ +import os +import re +import unittest + +from dedoc.readers.docx_reader.docx_reader import DocxReader +from dedoc.readers.reader_composition import ReaderComposition +from dedoc.readers.txt_reader.raw_text_reader import RawTextReader +from dedoc.structure_extractors.concrete_structure_extractors.default_structure_extractor import DefaultStructureExtractor +from dedoc.structure_extractors.patterns.dotted_list_pattern import DottedListPattern +from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern +from dedoc.structure_extractors.patterns.roman_list_pattern import RomanListPattern +from dedoc.structure_extractors.patterns.tag_header_pattern import TagHeaderPattern +from dedoc.structure_extractors.patterns.tag_list_pattern import TagListPattern +from tests.test_utils import get_test_config + + +class TestDefaultStructureExtractor(unittest.TestCase): + data_directory_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data")) + structure_extractor = DefaultStructureExtractor(config=get_test_config()) + reader = ReaderComposition(readers=[RawTextReader(), DocxReader()]) + + def test_tag_patterns(self) -> None: + file_path = os.path.join(self.data_directory_path, "docx", 
"with_tags.docx") + patterns = [ + TagHeaderPattern(line_type="custom_header", level_1=1, can_be_multiline=False), + TagListPattern(line_type="custom_list", level_1=2), + ] + document = self.reader.read(file_path=file_path) + document = self.structure_extractor.extract(document=document, parameters={"patterns": patterns}) + self.assertEqual(document.lines[0].metadata.hierarchy_level.line_type, "custom_header") + self.assertEqual(document.lines[0].metadata.hierarchy_level.level_1, 1) + self.assertEqual(document.lines[0].metadata.hierarchy_level.level_2, 1) + self.assertFalse(document.lines[0].metadata.hierarchy_level.can_be_multiline) + + self.assertEqual(document.lines[1].metadata.hierarchy_level.line_type, "custom_header") + self.assertEqual(document.lines[1].metadata.hierarchy_level.level_1, 1) + self.assertEqual(document.lines[1].metadata.hierarchy_level.level_2, 2) + + self.assertEqual(document.lines[3].metadata.hierarchy_level.line_type, "raw_text") + self.assertTrue(document.lines[3].metadata.hierarchy_level.can_be_multiline) + + self.assertEqual(document.lines[4].metadata.hierarchy_level.line_type, "custom_list") + self.assertEqual(document.lines[4].metadata.hierarchy_level.level_1, 2) + self.assertEqual(document.lines[4].metadata.hierarchy_level.level_2, 1) + self.assertFalse(document.lines[4].metadata.hierarchy_level.can_be_multiline) + + def test_list_patterns(self) -> None: + file_path = os.path.join(self.data_directory_path, "txt", "pr_17.txt") + patterns = [ + RomanListPattern(line_type="chapter", level_1=1, level_2=1, can_be_multiline=False), + DottedListPattern(line_type="dotted_list", level_1=2, can_be_multiline=False), + ] + document = self.reader.read(file_path=file_path) + document = self.structure_extractor.extract(document=document, parameters={"patterns": patterns}) + + self.assertEqual(document.lines[0].metadata.hierarchy_level.line_type, "raw_text") + self.assertEqual(document.lines[12].metadata.hierarchy_level.line_type, "chapter") + 
self.assertEqual(document.lines[14].metadata.hierarchy_level.line_type, "dotted_list") + + def test_regexp_patterns(self) -> None: + file_path = os.path.join(self.data_directory_path, "docx", "without_numbering.docx") + patterns = [ + RegexpPattern(regexp="^глава\s\d+\.", line_type="глава", level_1=1), # noqa + RegexpPattern(regexp=re.compile(r"^статья\s\d+\.\d+\."), line_type="статья", level_1=2) + ] + document = self.reader.read(file_path=file_path) + document = self.structure_extractor.extract(document=document, parameters={"patterns": patterns}) + self.assertEqual(document.lines[0].metadata.hierarchy_level.line_type, "raw_text") + self.assertEqual(document.lines[9].metadata.hierarchy_level.line_type, "глава") + self.assertEqual(document.lines[11].metadata.hierarchy_level.line_type, "статья") + self.assertEqual(document.lines[15].metadata.hierarchy_level.line_type, "статья") + self.assertEqual(document.lines[83].metadata.hierarchy_level.line_type, "глава") + + def test_start_word_patterns(self) -> None: + file_path = os.path.join(self.data_directory_path, "docx", "example.docx") + patterns = [ + {"name": "start_word", "start_word": "глава", "level_1": 1, "line_type": "глава"}, + {"name": "start_word", "start_word": "статья", "level_1": 2, "line_type": "статья"}, + ] + document = self.reader.read(file_path=file_path) + document = self.structure_extractor.extract(document=document, parameters={"patterns": patterns}) + self.assertEqual(document.lines[1].metadata.hierarchy_level.line_type, "глава") + self.assertEqual(document.lines[3].metadata.hierarchy_level.line_type, "статья") + self.assertEqual(document.lines[5].metadata.hierarchy_level.line_type, "статья") diff --git a/tests/unit_tests/test_module_gost_frame_recognizer.py b/tests/unit_tests/test_module_gost_frame_recognizer.py new file mode 100644 index 00000000..d3f35938 --- /dev/null +++ b/tests/unit_tests/test_module_gost_frame_recognizer.py @@ -0,0 +1,88 @@ +import os.path +import unittest +from typing 
import Optional + +import cv2 +import numpy as np + +import dedoc.utils.parameter_utils as param_utils +from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader +from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc +from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader +from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer +from tests.test_utils import get_test_config + + +class TestGOSTFrameRecognizer(unittest.TestCase): + + gost_frame_recognizer = GOSTFrameRecognizer(config=get_test_config()) + test_data_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "tables")) + pdf_image_reader = PdfImageReader(config=get_test_config()) + pdf_auto_reader = PdfAutoReader(config=get_test_config()) + + def _get_params_for_parse(self, parameters: Optional[dict], file_path: Optional[str]) -> ParametersForParseDoc: + parameters = parameters if parameters else {} + file_path = file_path if file_path else "" + params_for_parse = ParametersForParseDoc( + language=param_utils.get_param_language(parameters), + orient_analysis_cells=param_utils.get_param_orient_analysis_cells(parameters), + orient_cell_angle=param_utils.get_param_orient_cell_angle(parameters), + is_one_column_document=param_utils.get_param_is_one_column_document(parameters), + document_orientation=param_utils.get_param_document_orientation(parameters), + need_header_footers_analysis=param_utils.get_param_need_header_footers_analysis(parameters), + need_pdf_table_analysis=param_utils.get_param_need_pdf_table_analysis(parameters), + first_page=0, + last_page=0, + need_binarization=param_utils.get_param_need_binarization(parameters), + table_type=param_utils.get_param_table_type(parameters), + with_attachments=param_utils.get_param_with_attachments(parameters), + attachments_dir=param_utils.get_param_attachments_dir(parameters, file_path), + 
need_content_analysis=param_utils.get_param_need_content_analysis(parameters), + need_gost_frame_analysis=param_utils.get_param_need_gost_frame_analysis(parameters), + pdf_with_txt_layer=param_utils.get_param_pdf_with_txt_layer(parameters) + ) + return params_for_parse + + def test_gost_frame_recognition(self) -> None: + image_names = [ + "gost_frame_1.jpg", "gost_frame_2.png", "gost_frame_3.jpg", "example_with_table6.png", "example_with_table5.png", "example_with_table3.png" + ] + gt = [True, True, True, False, False, False] + for index, image_name in enumerate(image_names): + path_image = os.path.join(self.test_data_folder, image_name) + image = cv2.imread(path_image) + result_image, result_bbox, original_image_shape = self.gost_frame_recognizer.rec_and_clean_frame(image) + self.assertEqual(not np.array_equal(result_image, image), gt[index]) # check if we cut something from original image or not + + def test_not_gost_frame(self) -> None: + path_image = os.path.join(self.test_data_folder, "not_gost_frame.jpg") + image = cv2.imread(path_image) + result_image, result_bbox, original_image_shape = self.gost_frame_recognizer.rec_and_clean_frame(image) + self.assertTrue(abs(result_bbox.x_top_left - 26) < 10) + self.assertTrue(abs(result_bbox.y_top_left - 26) < 10) + self.assertTrue(abs(result_bbox.width - 722) < 10) + self.assertTrue(abs(result_bbox.height - 969) < 10) + + def test_coordinates_shift(self) -> None: + file_path = os.path.join(self.test_data_folder, "gost_frame_2.png") + parameters = {"need_gost_frame_analysis": "True"} + params_for_parse = self._get_params_for_parse(parameters=parameters, file_path=file_path) + result = self.pdf_image_reader._parse_document(path=file_path, parameters=params_for_parse) + self.assertTrue(len(result[0]) > 0) + self.assertTrue(abs(result[0][0].location.bbox.x_top_left - 365) < 10) + self.assertTrue(abs(result[0][0].location.bbox.y_top_left - 37) < 10) + self.assertTrue(abs(result[0][1].location.bbox.x_top_left - 84) < 10) + 
self.assertTrue(abs(result[0][1].location.bbox.y_top_left - 580) < 10) + self.assertTrue(len(result[1]) > 0) + self.assertTrue(abs(result[1][0].location.bbox.x_top_left - 81) < 10) + self.assertTrue(abs(result[1][0].location.bbox.y_top_left - 49) < 10) + + def test_pdf_auto_reader(self) -> None: + file_path = os.path.join(self.test_data_folder, "gost_frame_2.png") + parameters = {"need_gost_frame_analysis": "True"} + result = self.pdf_auto_reader.read(file_path=file_path, parameters=parameters) + self.assertTrue(len(result.tables) == 1) + self.assertEqual(result.tables[0].cells[0][1].get_text(), "Колонка 2") + self.assertEqual(result.tables[0].cells[0][2].get_text(), "Колонка 3") + self.assertEqual(len(result.tables[0].cells), 22) + self.assertTrue("Название таблицы (продолжение)" in result.lines[0].line)