Skip to content

Commit

Permalink
update master (#311)
Browse files (browse the repository at this point in the history)
* TLDR-429 flake8 style testing added (#306)

* TLDR-429 flake8 style testing added

* Fix tests

* Docs fix

* Fix tests

* Review fixes

* TLDR-420 word bbox annotation (#307)

* add bbox annotations for words

* update tests

* fix lint

* fix confidence annotation and save relative bbox in BBoxAnnotation

* add docstrings for page_width and page_height

* new version 0.11.0 (#310)

---------

Co-authored-by: Bogatenkova Anastasiya <[email protected]>
Co-authored-by: Andrew Perminov <[email protected]>
  • Loading branch information
3 people authored Aug 22, 2023
1 parent 73b7c3c commit 9a1f7ff
Show file tree
Hide file tree
Showing 292 changed files with 2,487 additions and 3,211 deletions.
26 changes: 26 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
[flake8]

max-line-length = 160
max-complexity = 13
inline-quotes = "
application-import-names = dedoc, tests
import-order-style = pycharm
exclude =
.git,
__pycache__,
.idea,
.github,
*__init__.py,
resources,
dedoc/scripts,
examples,
docs,
venv,
build,
dedoc.egg-info
# ANN101 - missing type annotation for self in a method (not required in this project)
ignore =
ANN101
6 changes: 3 additions & 3 deletions .github/workflows/test_on_push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: '3.8'
- name: Install dependencies
- name: Run lint
run: |
python3 -m pip install --upgrade pip
pip3 install pycodestyle==2.7.0 flake8==3.9.2 flake8-annotations==2.6.2 pyflakes==2.3.1
pip3 install .[lint]
flake8 .
- name: Run tests
run: |
python3 -m unittest -v -f tests/test_style.py
test="true" docker-compose up --build --exit-code-from test
20 changes: 20 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
repos:
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
exclude: \.github|.*__init__\.py|resources|dedoc/scripts|examples|docs|venv|build|dedoc\.egg-info
args:
- "--config=.flake8"
additional_dependencies: [
flake8-absolute-import==1.0.0.1,
flake8-annotations==2.9.1,
flake8-bugbear==23.3.12,
flake8-builtins==2.1.0,
flake8-import-order==0.18.2,
flake8-print==5.0.0,
flake8-quotes==3.3.2,
flake8-use-fstring==1.4,
pycodestyle==2.9.0,
pep8-naming==0.13.3
]
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.10.0
0.11.0
108 changes: 54 additions & 54 deletions dedoc/api/api_args.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# noqa
from typing import Any, Optional

from fastapi import Body
from pydantic import BaseModel

Expand Down Expand Up @@ -36,68 +36,68 @@ class QueryParameters(BaseModel):

def __init__(self,
# type of document structure parsing
document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None),
structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None),
return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None),
document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None), # noqa
structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None), # noqa
return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None), # noqa

# attachments handling
with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None),
need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None),
recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None),
return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None),
attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None),
with_attachments: Optional[str] = Body(description="an option to enable the analysis of attached files. Default: 'false'", default=None), # noqa
need_content_analysis: Optional[str] = Body(description="turn on if you need parse the contents of the document attachments. Default: 'false'", default=None), # noqa
recursion_deep_attachments: Optional[str] = Body(description="the depth on which nested attachments will be parsed if need_content_analysis=true. Default: '10'", default=None), # noqa
return_base64: Optional[str] = Body(description="returns images in base64 format. Default: 'false'", default=None), # noqa
attachments_dir: Optional[str] = Body(description="path to the directory where to save files' attachments", default=None), # noqa

# tables handling
insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None),
need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None),
table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None),
orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None),
orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None),
insert_table: Optional[str] = Body(description="Insert table into the result tree's content or not. Default: 'false'", default=None), # noqa
need_pdf_table_analysis: Optional[str] = Body(description="include a table analysis into pdfs. Default: 'true'", default=None), # noqa
table_type: Optional[str] = Body(description="a pipeline mode for a table recognition. Default: ''", default=None), # noqa
orient_analysis_cells: Optional[str] = Body(description="a table recognition option enables analysis of rotated cells in table headers. Default: 'false'", default=None), # noqa
orient_cell_angle: Optional[str] = Body(description="an option to set orientation of cells in table headers. \"270\" - cells are rotated 90 degrees clockwise, \"90\" - cells are rotated 90 degrees counterclockwise (or 270 clockwise)", default=None), # noqa

# pdf handling
pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None),
language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None),
pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None),
is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None),
document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None),
need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None),
need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None),
pdf_with_text_layer: Optional[str] = Body(description="an option to extract text from a text layer to PDF or using OCR methods for image-documents. Default: 'auto_tabby'", enum=["true", "false", "auto", "auto_tabby", "tabby"], default=None), # noqa
language: Optional[str] = Body(description="a recognition language. Default: 'rus+eng'", enum=["rus+eng", "rus", "eng"], default=None), # noqa
pages: Optional[str] = Body(description="an option to limit page numbers in pdf, archives with images. left:right, read pages from left to right. Default: ':'", default=None), # noqa
is_one_column_document: Optional[str] = Body(description="an option to set one or multiple column document. \"auto\" - system predict number of columns in document pages, \"true\" - is one column documents, \"false\" - is multiple column documents. Default: 'auto'", default=None), # noqa
document_orientation: Optional[str] = Body(description="an option to set vertical orientation of the document without using an orientation classifier \"auto\" - system predict angle (0, 90, 180, 270) and rotate document, \"no_change\" - do not predict orientation. Default: 'auto'", enum=["auto", "no_change"], default=None), # noqa
need_header_footer_analysis: Optional[str] = Body(description="include header-footer analysis into pdf with text layer. Default: 'false'", default=None), # noqa
need_binarization: Optional[str] = Body(description="include an adaptive binarization into pdf without a text layer. Default: 'false'", default=None), # noqa

# other formats handling
delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None),
encoding: Optional[str] = Body(description="a document encoding", default=None),
html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None),
handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None),
delimiter: Optional[str] = Body(description="a column separator for csv-files", default=None), # noqa
encoding: Optional[str] = Body(description="a document encoding", default=None), # noqa
html_fields: Optional[str] = Body(description="a list of fields for JSON documents to be parsed as HTML documents. It is written as a json string of a list, where each list item is a list of keys to get the field. Default: ''", default=None), # noqa
handle_invisible_table: Optional[str] = Body(description="handle table without visible borders as tables in html. Default: 'false'", default=None), # noqa


**data: Any) -> None:
**data: Any) -> None: # noqa

super().__init__(**data)
self.document_type: str = document_type or ""
self.structure_type: str = structure_type or 'tree'
self.return_format: str = return_format or 'json'

self.with_attachments: str = with_attachments or 'false'
self.need_content_analysis: str = need_content_analysis or 'false'
self.recursion_deep_attachments: str = recursion_deep_attachments or '10'
self.return_base64: str = return_base64 or 'false'
self.attachments_dir: str = attachments_dir

self.insert_table: str = insert_table or 'false'
self.need_pdf_table_analysis: str = need_pdf_table_analysis or 'true'
self.table_type: str = table_type or ''
self.orient_analysis_cells: str = orient_analysis_cells or 'false'
self.orient_cell_angle: str = orient_cell_angle or "90"

self.pdf_with_text_layer: str = pdf_with_text_layer or 'auto_tabby'
self.language: str = language or "rus+eng"
self.pages: str = pages or ':'
self.is_one_column_document: str = is_one_column_document or 'auto'
self.document_orientation: str = document_orientation or "auto"
self.need_header_footer_analysis: str = need_header_footer_analysis or 'false'
self.need_binarization: str = need_binarization or 'false'

self.delimiter: str = delimiter
self.encoding: str = encoding
self.html_fields: str = html_fields or ''
self.handle_invisible_table: str = handle_invisible_table or 'false'
self.document_type: str = document_type or ""
self.structure_type: str = structure_type or "tree"
self.return_format: str = return_format or "json"

self.with_attachments: str = with_attachments or "false"
self.need_content_analysis: str = need_content_analysis or "false"
self.recursion_deep_attachments: str = recursion_deep_attachments or "10"
self.return_base64: str = return_base64 or "false"
self.attachments_dir: str = attachments_dir

self.insert_table: str = insert_table or "false"
self.need_pdf_table_analysis: str = need_pdf_table_analysis or "true"
self.table_type: str = table_type or ""
self.orient_analysis_cells: str = orient_analysis_cells or "false"
self.orient_cell_angle: str = orient_cell_angle or "90"

self.pdf_with_text_layer: str = pdf_with_text_layer or "auto_tabby"
self.language: str = language or "rus+eng"
self.pages: str = pages or ":"
self.is_one_column_document: str = is_one_column_document or "auto"
self.document_orientation: str = document_orientation or "auto"
self.need_header_footer_analysis: str = need_header_footer_analysis or "false"
self.need_binarization: str = need_binarization or "false"

self.delimiter: str = delimiter
self.encoding: str = encoding
self.html_fields: str = html_fields or ""
self.handle_invisible_table: str = handle_invisible_table or "false"
Loading

0 comments on commit 9a1f7ff

Please sign in to comment.