update master (#311)

* TLDR-429 flake8 style testing added (#306)
* TLDR-429 flake8 style testing added
* Fix tests
* Docs fix
* Fix tests
* Review fixes
* TLDR-420 word bbox annotation (#307)
* add bbox annotations for words
* update tests
* fix lint
* fix confidence annotation and save relative bbox in BBoxAnnotation
* add docstrings for page_width and page_height
* new version 0.11.0 (#310)
---------
Co-authored-by: Bogatenkova Anastasiya <[email protected]>
Co-authored-by: Andrew Perminov <[email protected]>
ispras · Aug 22, 2023 · 9a1f7ff · 9a1f7ff
1 parent 73b7c3c
commit 9a1f7ff
Show file tree

Hide file tree

Showing 292 changed files with 2,487 additions and 3,211 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,26 @@
+[flake8]
+
+max-line-length = 160
+max-complexity = 13
+inline-quotes = "
+
+application-import-names = dedoc, tests
+import-order-style = pycharm
+
+exclude =
+    .git,
+    __pycache__,
+    .idea,
+    .github,
+    *__init__.py,
+    resources,
+    dedoc/scripts,
+    examples,
+    docs,
+    venv,
+    build,
+    dedoc.egg-info
+
+# ANN101 - missing type annotation for self in method (not required in this project)
+ignore =
+    ANN101
diff --git a/.github/workflows/test_on_push.yaml b/.github/workflows/test_on_push.yaml
@@ -29,11 +29,11 @@ jobs:
       uses: actions/setup-python@v2
       with:
         python-version: '3.8'
-    - name: Install dependencies
+    - name: Run lint
       run: |
         python3 -m pip install --upgrade pip
-        pip3 install pycodestyle==2.7.0 flake8==3.9.2 flake8-annotations==2.6.2 pyflakes==2.3.1
+        pip3 install .[lint]
+        flake8 .
     - name: Run tests
       run: |
-        python3 -m unittest -v -f tests/test_style.py
         test="true" docker-compose up --build --exit-code-from test
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,20 @@
+repos:
+-   repo: https://github.com/PyCQA/flake8
+    rev: 5.0.4
+    hooks:
+    -   id: flake8
+        exclude: \.github|.*__init__\.py|resources|dedoc/scripts|examples|docs|venv|build|dedoc\.egg-info
+        args:
+            - "--config=.flake8"
+        additional_dependencies: [
+            flake8-absolute-import==1.0.0.1,
+            flake8-annotations==2.9.1,
+            flake8-bugbear==23.3.12,
+            flake8-builtins==2.1.0,
+            flake8-import-order==0.18.2,
+            flake8-print==5.0.0,
+            flake8-quotes==3.3.2,
+            flake8-use-fstring==1.4,
+            pycodestyle==2.9.0,
+            pep8-naming==0.13.3
+        ]
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.10.0
+0.11.0
diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
@@ -1,5 +1,5 @@
-# noqa
 from typing import Any, Optional
+
 from fastapi import Body
 from pydantic import BaseModel
 
@@ -36,68 +36,68 @@ class QueryParameters(BaseModel):
 
     def __init__(self,
                  # type of document structure parsing
-                 document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None),
-                 structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None),
-                 return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None),
+                 document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None),  # noqa
+                 structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None),  # noqa
+                 return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None),  # noqa
 
                  # attachments handling
-                 with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None),
-                 need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None),
-                 recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None),
-                 return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None),
-                 attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None),
+                 with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None),  # noqa
+                 need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None),  # noqa
+                 recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None),  # noqa
+                 return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None),  # noqa
+                 attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None),  # noqa
 
                  # tables handling
-                 insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None),
-                 need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None),
-                 table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None),
-                 orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None),
-                 orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None),
+                 insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None),  # noqa
+                 need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None),  # noqa
+                 table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None),  # noqa
+                 orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None),  # noqa
+                 orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None),  # noqa
 
                  # pdf handling
-                 pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None),
-                 language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None),
-                 pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None),
-                 is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None),
-                 document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None),
-                 need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None),
-                 need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None),
+                 pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None),  # noqa
+                 language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None),  # noqa
+                 pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None),  # noqa
+                 is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None),  # noqa
+                 document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None),  # noqa
+                 need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None),  # noqa
+                 need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None),  # noqa
 
                  # other formats handling
-                 delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None),
-                 encoding: Optional[str] = Body(description="a document encoding", default=None),
-                 html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None),
-                 handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None),
+                 delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None),  # noqa
+                 encoding: Optional[str] = Body(description="a document encoding", default=None),  # noqa
+                 html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None),  # noqa
+                 handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None),  # noqa
 
 
-                 **data: Any) -> None:
+                 **data: Any) -> None:  # noqa
 
         super().__init__(**data)
-        self.document_type: str                 = document_type or ""
-        self.structure_type: str                = structure_type or 'tree'
-        self.return_format: str                 = return_format or 'json'
-
-        self.with_attachments: str              = with_attachments or 'false'
-        self.need_content_analysis: str         = need_content_analysis or 'false'
-        self.recursion_deep_attachments: str    = recursion_deep_attachments or '10'
-        self.return_base64: str                 = return_base64 or 'false'
-        self.attachments_dir: str               = attachments_dir
-
-        self.insert_table: str                  = insert_table or 'false'
-        self.need_pdf_table_analysis: str       = need_pdf_table_analysis or 'true'
-        self.table_type: str                    = table_type or ''
-        self.orient_analysis_cells: str         = orient_analysis_cells or 'false'
-        self.orient_cell_angle: str             = orient_cell_angle or "90"
-
-        self.pdf_with_text_layer: str           = pdf_with_text_layer or 'auto_tabby'
-        self.language: str                      = language or "rus+eng"
-        self.pages: str                         = pages or ':'
-        self.is_one_column_document: str        = is_one_column_document or 'auto'
-        self.document_orientation: str          = document_orientation or "auto"
-        self.need_header_footer_analysis: str   = need_header_footer_analysis or 'false'
-        self.need_binarization: str             = need_binarization or 'false'
-
-        self.delimiter: str                     = delimiter
-        self.encoding: str                      = encoding
-        self.html_fields: str                   = html_fields or ''
-        self.handle_invisible_table: str        = handle_invisible_table or 'false'
+        self.document_type: str = document_type or ""
+        self.structure_type: str = structure_type or "tree"
+        self.return_format: str = return_format or "json"
+
+        self.with_attachments: str = with_attachments or "false"
+        self.need_content_analysis: str = need_content_analysis or "false"
+        self.recursion_deep_attachments: str = recursion_deep_attachments or "10"
+        self.return_base64: str = return_base64 or "false"
+        self.attachments_dir: str = attachments_dir
+
+        self.insert_table: str = insert_table or "false"
+        self.need_pdf_table_analysis: str = need_pdf_table_analysis or "true"
+        self.table_type: str = table_type or ""
+        self.orient_analysis_cells: str = orient_analysis_cells or "false"
+        self.orient_cell_angle: str = orient_cell_angle or "90"
+
+        self.pdf_with_text_layer: str = pdf_with_text_layer or "auto_tabby"
+        self.language: str = language or "rus+eng"
+        self.pages: str = pages or ":"
+        self.is_one_column_document: str = is_one_column_document or "auto"
+        self.document_orientation: str = document_orientation or "auto"
+        self.need_header_footer_analysis: str = need_header_footer_analysis or "false"
+        self.need_binarization: str = need_binarization or "false"
+
+        self.delimiter: str = delimiter
+        self.encoding: str = encoding
+        self.html_fields: str = html_fields or ""
+        self.handle_invisible_table: str = handle_invisible_table or "false"