diff --git a/tests/api_tests/content_checker.py b/tests/api_tests/content_checker.py index 549ec375..025397ee 100644 --- a/tests/api_tests/content_checker.py +++ b/tests/api_tests/content_checker.py @@ -77,7 +77,7 @@ def __check_metadata(self, metadata: dict) -> None: if "other_fields" in metadata: self.assertIsInstance(metadata["other_fields"], dict) - def check_english_doc(self, result: dict) -> None: + def _check_english_doc(self, result: dict) -> None: content = result["content"] structure = content["structure"] self._check_tree_sanity(structure) diff --git a/tests/api_tests/test_api_doctype_law.py b/tests/api_tests/test_api_doctype_law.py index 3c9807d5..6fe3079c 100644 --- a/tests/api_tests/test_api_doctype_law.py +++ b/tests/api_tests/test_api_doctype_law.py @@ -616,16 +616,6 @@ def test_foiv_html(self) -> None: self.assertEqual("4.2.1.", node["text"].strip()) self.assertEqual("item", node['metadata']['paragraph_type']) - @unittest.skip("TODO fix this") - def test_number_not_part(self) -> None: - file_name = "31(1).txt" - result = self._send_request(file_name, dict(document_type="law"), expected_code=200) - document_tree = result["content"]["structure"] - self.__test_law_tree_sanity(document_tree) - node = self._get_by_tree_path(document_tree, "0.0.3.5.0.0") - self.assertTrue(node["text"].strip().endswith("2 настоящей статьи.")) - self.assertEqual("raw_text", node['metadata']['paragraph_type']) - def test_html_invisible_table(self) -> None: file_name = "invisibly_table4.html" result = self._send_request(file_name, dict(document_type="law"), expected_code=200) diff --git a/tests/api_tests/test_api_doctype_tz.py b/tests/api_tests/test_api_doctype_tz.py index 294b6994..11247e2c 100644 --- a/tests/api_tests/test_api_doctype_tz.py +++ b/tests/api_tests/test_api_doctype_tz.py @@ -6,7 +6,7 @@ from tests.test_utils import tree2linear -class TestLawApiDocReader(AbstractTestApiDocReader): +class TestTZApiDocReader(AbstractTestApiDocReader): def test_doc_tz(self) -> None: file_name = "alpaca_tz.doc" diff --git a/tests/api_tests/test_api_format_archives.py b/tests/api_tests/test_api_format_archives.py index 48f740d5..0fffa991 100644 --- a/tests/api_tests/test_api_format_archives.py +++ b/tests/api_tests/test_api_format_archives.py @@ -12,7 +12,7 @@ def _check_archive_with_english_doc(self, file_name: str) -> None: result = self._send_request(file_name, dict(with_attachments="True")) self.assertEqual(len(result['attachments']), 4) english_doc = [doc for doc in result['attachments'] if doc["metadata"]["file_name"].startswith("english_doc")][0] - self.check_english_doc(english_doc) + self._check_english_doc(english_doc) def test_zip(self) -> None: file_name = "arch_with_attachs.zip" @@ -77,4 +77,4 @@ def test_broken_archive(self) -> None: result = self._send_request(file_name, dict(with_attachments="True")) self.assertEqual(len(result['attachments']), 7) english_doc = [doc for doc in result['attachments'] if doc["metadata"]["file_name"].startswith("english_doc")][0] - self.check_english_doc(english_doc) + self._check_english_doc(english_doc) diff --git a/tests/api_tests/test_api_format_docx.py b/tests/api_tests/test_api_format_docx.py index a6f2cff2..e7c9a5b4 100644 --- a/tests/api_tests/test_api_format_docx.py +++ b/tests/api_tests/test_api_format_docx.py @@ -102,18 +102,12 @@ def test_tricky_doc(self) -> None: file_name = "doc.docx" _ = self._send_request(file_name) - def test_broken_docx(self) -> None: - self._send_request("broken.docx", expected_code=415) - def test_not_stripped_xml(self) -> None: self._send_request("not_stripped_xml.docx", expected_code=200) def test_docx_with_comments(self) -> None: _ = self._send_request("with_comments.docx", expected_code=200) - def test_send_wo_file(self) -> None: - self._send_request_wo_file(expected_code=422) - def test_return_html(self) -> None: file_name = "example.doc" result = self._send_request(file_name, data={"structure_type": "tree", "return_format": "html"}) diff --git a/tests/api_tests/test_api_doctype_email.py b/tests/api_tests/test_api_format_email.py similarity index 100% rename from tests/api_tests/test_api_doctype_email.py rename to tests/api_tests/test_api_format_email.py diff --git a/tests/api_tests/test_api_format_json.py b/tests/api_tests/test_api_format_json.py index ce8ba13e..99968320 100644 --- a/tests/api_tests/test_api_format_json.py +++ b/tests/api_tests/test_api_format_json.py @@ -5,7 +5,7 @@ from tests.api_tests.abstract_api_test import AbstractTestApiDocReader -class TestApiCSVReader(AbstractTestApiDocReader): +class TestApiJSONReader(AbstractTestApiDocReader): def _get_abs_path(self, file_name: str) -> str: return os.path.abspath(os.path.join(self.data_directory_path, "json", file_name)) diff --git a/tests/api_tests/test_api_format_pdf_auto_text_layer.py b/tests/api_tests/test_api_format_pdf_auto_text_layer.py index b5dc6264..b232798f 100644 --- a/tests/api_tests/test_api_format_pdf_auto_text_layer.py +++ b/tests/api_tests/test_api_format_pdf_auto_text_layer.py @@ -44,13 +44,13 @@ def test_auto_pdf_with_text_layer(self) -> None: file_name = os.path.join("..", "pdf_with_text_layer", "english_doc.pdf") result = self._send_request(file_name, dict(pdf_with_text_layer="auto")) self.assertIn("Assume document has a correct textual layer", result["warnings"]) - self.check_english_doc(result) + self._check_english_doc(result) def test_auto_pdf_with_wrong_text_layer(self) -> None: file_name = "english_doc_bad_text.pdf" result = self._send_request(file_name, dict(pdf_with_text_layer="auto")) self.assertIn("Assume document has incorrect textual layer", result["warnings"]) - self.check_english_doc(result) + self._check_english_doc(result) def test_auto_document_mixed(self) -> None: file_name = "mixed_pdf.pdf" @@ -58,7 +58,7 @@ def test_auto_document_mixed(self) -> None: result = self._send_request(file_name, dict(pdf_with_text_layer=pdf_with_text_layer)) self.assertIn("Assume document has a correct textual layer", result["warnings"]) self.assertIn("Assume the first page hasn't a textual layer", result["warnings"]) - self.check_english_doc(result) + self._check_english_doc(result) structure = result["content"]["structure"] list_items = structure["subparagraphs"][1]["subparagraphs"] self.assertEqual("3) продолжаем список\n", list_items[2]["text"]) diff --git a/tests/api_tests/test_api_format_pdf_page_limit.py b/tests/api_tests/test_api_format_pdf_page_limit.py index 1729ce9d..d2533d64 100644 --- a/tests/api_tests/test_api_format_pdf_page_limit.py +++ b/tests/api_tests/test_api_format_pdf_page_limit.py @@ -3,7 +3,7 @@ from tests.api_tests.abstract_api_test import AbstractTestApiDocReader -class TestApiPdfReader(AbstractTestApiDocReader): +class TestApiPdfPageLimit(AbstractTestApiDocReader): def _get_abs_path(self, file_name: str) -> str: return os.path.join(self.data_directory_path, "pdf_with_text_layer", file_name) diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py index 6921b4d6..b273fef8 100644 --- a/tests/api_tests/test_api_format_pdf_tabby_reader.py +++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py @@ -16,7 +16,7 @@ def __filter_by_name(self, annotations: List[dict], name: str) -> List[dict]: def test_example_file(self) -> None: file_name = "english_doc.pdf" result = self._send_request(file_name, data=dict(pdf_with_text_layer="tabby")) - self.check_english_doc(result) + self._check_english_doc(result) @unittest.skip("TODO: add two layers output order support, e.g footnotes and main text.") def test_former_txt_file(self) -> None: @@ -231,93 +231,6 @@ def test_pdf_with_tables(self) -> None: self.assertEqual("", table[1][0]) self.assertEqual("Прогноз", table[1][1]) self.assertEqual("Прогноз бюджета", table[1][2]) - self.assertEqual("Прогноз бюджета", table[1][3]) - self.assertEqual("Прогноз бюджета", table[1][4]) - self.assertEqual("Расходы", table[2][0]) - self.assertEqual("19,8", table[2][1]) - self.assertEqual("18,6", table[2][2]) - self.assertEqual("17,3", table[2][3]) - self.assertEqual("16,1", table[2][4]) - self.assertEqual("Доходы", table[3][0]) - self.assertEqual("16,1", table[3][1]) - self.assertEqual("15,4", table[3][2]) - self.assertEqual("15,1", table[3][3]) - self.assertEqual("15,0", table[3][4]) - self.assertEqual("Нефтегазовые\nдоходы", table[4][0]) - self.assertEqual("5,8", table[4][1]) - self.assertEqual("5,8", table[4][2]) - self.assertEqual("5,5", table[4][3]) - self.assertEqual("5,4", table[4][4]) - self.assertEqual("Ненефтегазов\nые доходы", table[5][0]) - self.assertEqual("10,4", table[5][1]) - self.assertEqual("9,6", table[5][2]) - self.assertEqual("9,6", table[5][3]) - self.assertEqual("9,6", table[5][4]) - self.assertEqual("Сальдо\nбюджета", table[6][0]) - self.assertEqual("-3,7", table[6][1]) - self.assertEqual("-3,2", table[6][2]) - self.assertEqual("-2,2", table[6][3]) - self.assertEqual("-1,2", table[6][4]) - self.assertEqual("", table[7][0]) - self.assertEqual("2016", table[7][1]) - self.assertEqual("2017", table[7][2]) - self.assertEqual("2018", table[7][3]) - self.assertEqual("2019", table[7][4]) - self.assertEqual("", table[8][0]) - self.assertEqual("Прогноз", table[8][1]) - self.assertEqual("Прогноз бюджета", table[8][2]) - self.assertEqual("Прогноз бюджета", table[8][3]) - self.assertEqual("Прогноз бюджета", table[8][4]) - self.assertEqual("Расходы", table[9][0]) - self.assertEqual("19,8", table[9][1]) - self.assertEqual("18,6", table[9][2]) - self.assertEqual("17,3", table[9][3]) - self.assertEqual("16,1", table[9][4]) - self.assertEqual("Доходы", table[10][0]) - self.assertEqual("16,1", table[10][1]) - self.assertEqual("15,4", table[10][2]) - self.assertEqual("15,1", table[10][3]) - self.assertEqual("15,0", table[10][4]) - self.assertEqual("Нефтегазовые\nдоходы", table[11][0]) - self.assertEqual("5,8", table[11][1]) - self.assertEqual("5,8", table[11][2]) - self.assertEqual("5,5", table[11][3]) - self.assertEqual("5,4", table[11][4]) - self.assertEqual("Ненефтегазов\nые доходы", table[12][0]) - self.assertEqual("10,4", table[12][1]) - self.assertEqual("9,6", table[12][2]) - self.assertEqual("9,6", table[12][3]) - self.assertEqual("9,6", table[12][4]) - self.assertEqual("Сальдо\nбюджета", table[13][0]) - self.assertEqual("-3,7", table[13][1]) - self.assertEqual("-3,2", table[13][2]) - self.assertEqual("-2,2", table[13][3]) - self.assertEqual("-1,2", table[13][4]) - self.assertEqual("", table[14][0]) - self.assertEqual("2016", table[14][1]) - self.assertEqual("2017", table[14][2]) - self.assertEqual("2018", table[14][3]) - self.assertEqual("2019", table[14][4]) - self.assertEqual("", table[15][0]) - self.assertEqual("Прогноз", table[15][1]) - self.assertEqual("Прогноз бюджета", table[15][2]) - self.assertEqual("Прогноз бюджета", table[15][3]) - self.assertEqual("Прогноз бюджета", table[15][4]) - self.assertEqual("Расходы", table[16][0]) - self.assertEqual("19,8", table[16][1]) - self.assertEqual("18,6", table[16][2]) - self.assertEqual("17,3", table[16][3]) - self.assertEqual("16,1", table[16][4]) - self.assertEqual("Доходы", table[17][0]) - self.assertEqual("16,1", table[17][1]) - self.assertEqual("15,4", table[17][2]) - self.assertEqual("15,1", table[17][3]) - self.assertEqual("15,0", table[17][4]) - self.assertEqual("Нефтегазовые\nдоходы", table[18][0]) - self.assertEqual("5,8", table[18][1]) - self.assertEqual("5,8", table[18][2]) - self.assertEqual("5,5", table[18][3]) - self.assertEqual("5,4", table[18][4]) self.assertEqual("Ненефтегазов\nые доходы", table[19][0]) self.assertEqual("10,4", table[19][1]) self.assertEqual("9,6", table[19][2]) diff --git a/tests/api_tests/test_api_format_pdf_with_text.py b/tests/api_tests/test_api_format_pdf_with_text.py index e4bc8d64..54265beb 100644 --- a/tests/api_tests/test_api_format_pdf_with_text.py +++ b/tests/api_tests/test_api_format_pdf_with_text.py @@ -5,7 +5,7 @@ from tests.api_tests.abstract_api_test import AbstractTestApiDocReader -class TestApiPdfReader(AbstractTestApiDocReader): +class TestApiPdfWithText(AbstractTestApiDocReader): def _get_abs_path(self, file_name: str) -> str: return os.path.join(self.data_directory_path, "pdf_with_text_layer", file_name) diff --git a/tests/api_tests/test_api_format_pptx.py b/tests/api_tests/test_api_format_pptx.py index 22e029a1..14d0921a 100644 --- a/tests/api_tests/test_api_format_pptx.py +++ b/tests/api_tests/test_api_format_pptx.py @@ -3,7 +3,7 @@ from tests.api_tests.abstract_api_test import AbstractTestApiDocReader -class TestApiExcelReader(AbstractTestApiDocReader): +class TestApiPPTXReader(AbstractTestApiDocReader): data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "pptx") diff --git a/tests/api_tests/test_api_misc_language.py b/tests/api_tests/test_api_misc_language.py index 63978578..29980924 100644 --- a/tests/api_tests/test_api_misc_language.py +++ b/tests/api_tests/test_api_misc_language.py @@ -3,26 +3,26 @@ from tests.api_tests.abstract_api_test import AbstractTestApiDocReader -class TestApiDocReader(AbstractTestApiDocReader): +class TestLanguage(AbstractTestApiDocReader): data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "docx") def test_en_doc(self) -> None: file_name = "english_doc.doc" result = self._send_request(file_name, dict(language="eng", structure_type="tree")) - self.check_english_doc(result) + self._check_english_doc(result) def test_en_docx(self) -> None: file_name = "english_doc.docx" result = self._send_request(file_name, dict(language="eng", structure_type="tree")) - self.check_english_doc(result) + self._check_english_doc(result) def test_en_odt(self) -> None: file_name = "english_doc.odt" result = self._send_request(file_name, dict(language="eng", structure_type="tree")) - self.check_english_doc(result) + self._check_english_doc(result) def test_en_pdf(self) -> None: file_name = "../pdf_with_text_layer/english_doc.pdf" result = self._send_request(file_name, dict(language="eng")) - self.check_english_doc(result) + self._check_english_doc(result) diff --git a/tests/api_tests/test_api_misc_list_patching.py b/tests/api_tests/test_api_misc_list_patching.py index 1f73ce05..18e5eb54 100644 --- a/tests/api_tests/test_api_misc_list_patching.py +++ b/tests/api_tests/test_api_misc_list_patching.py @@ -1,7 +1,7 @@ from tests.api_tests.abstract_api_test import AbstractTestApiDocReader -class TestApiDocReader(AbstractTestApiDocReader): +class TestListPatching(AbstractTestApiDocReader): def test_list_patching(self) -> None: file_name = "docx/13_moloko_1_polug.docx" diff --git a/tests/api_tests/test_api_misc_main.py b/tests/api_tests/test_api_misc_main.py index 4302e119..1527c915 100644 --- a/tests/api_tests/test_api_misc_main.py +++ b/tests/api_tests/test_api_misc_main.py @@ -4,9 +4,6 @@ from tests.api_tests.abstract_api_test import AbstractTestApiDocReader -# test_structure -# тесты на ошибки вынести в отдельный файл -# вести файлик с темами тестов class TestApi(AbstractTestApiDocReader): @@ -45,6 +42,3 @@ def test_text(self) -> None: self.assertEqual(content["subparagraphs"][1]["subparagraphs"][0]["text"].strip(), '1. Элемент нумерованного списка') self.assertEqual(content["subparagraphs"][1]["subparagraphs"][0]["metadata"]['paragraph_type'], 'list_item') self._check_metainfo(result['metadata'], 'text/plain', file_name) - - def test_bin_file(self) -> None: - self._send_request("file.bin", expected_code=415) diff --git a/tests/api_tests/test_api_misc_multipage_table.py b/tests/api_tests/test_api_misc_multipage_table.py index 306adff7..5531b265 100644 --- a/tests/api_tests/test_api_misc_multipage_table.py +++ b/tests/api_tests/test_api_misc_multipage_table.py @@ -4,7 +4,7 @@ from tests.api_tests.abstract_api_test import AbstractTestApiDocReader -class TestRecognizedTable(AbstractTestApiDocReader): +class TestMultipageTable(AbstractTestApiDocReader): def _get_abs_path(self, file_name: str) -> str: return os.path.join(self.data_directory_path, "tables", file_name) diff --git a/tests/api_tests/test_api_misc_nesting_list.py b/tests/api_tests/test_api_misc_nesting_list.py index 527fa9dd..ecc0c96d 100644 --- a/tests/api_tests/test_api_misc_nesting_list.py +++ b/tests/api_tests/test_api_misc_nesting_list.py @@ -1,7 +1,7 @@ from tests.api_tests.abstract_api_test import AbstractTestApiDocReader -class TestApiDocReader(AbstractTestApiDocReader): +class TestNestingList(AbstractTestApiDocReader): def test_list_nesting_content(self) -> None: file_name = "docx/pr14tz_v5_2007_03_01.docx" diff --git a/tests/api_tests/test_api_misc_structure.py b/tests/api_tests/test_api_misc_structure.py index ce8cafaa..978e8ec4 100644 --- a/tests/api_tests/test_api_misc_structure.py +++ b/tests/api_tests/test_api_misc_structure.py @@ -3,7 +3,7 @@ from tests.api_tests.abstract_api_test import AbstractTestApiDocReader -class TestApiDocReader(AbstractTestApiDocReader): +class TestStructure(AbstractTestApiDocReader): data_directory_path = os.path.join(AbstractTestApiDocReader.data_directory_path, "docx") diff --git a/tests/api_tests/test_api_misc_with_attachments.py b/tests/api_tests/test_api_misc_with_attachments.py index 8060ba3a..437d044d 100644 --- a/tests/api_tests/test_api_misc_with_attachments.py +++ b/tests/api_tests/test_api_misc_with_attachments.py @@ -11,18 +11,6 @@ class TestApiAttachmentsReader(AbstractTestApiDocReader): data_directory_path = AbstractTestApiDocReader.data_directory_path - def check_pdf_1(self, pdf: dict) -> None: - content = pdf["content"]['structure'] - self.assertEqual("Глава 543\n", content["subparagraphs"][0]["text"]) - self.assertEqual("Какой-то текст.\n", content["subparagraphs"][0]["subparagraphs"][0]["text"]) - self.assertEqual(content["subparagraphs"][0]["subparagraphs"][1]["subparagraphs"][0]['text'], '1.\n') - self.assertEqual(content["subparagraphs"][0]["subparagraphs"][1]["subparagraphs"][1]['text'], '2.\n') - self.assertEqual(content["subparagraphs"][0]["subparagraphs"][1]["subparagraphs"][2]['text'], '3.\n') - - def check_pdf_2(self, pdf: dict) -> None: - content = pdf["content"] - self.assertEqual("Пример документа\n", content['structure']['subparagraphs'][0]["text"]) - def _check_attachments(self, attachments: List[dict]) -> None: for attachment in attachments: self.assertTrue(attachment["attachments"] is not None) @@ -172,7 +160,7 @@ def test_docx_images_base64(self) -> None: with open(path, "wb") as file_out: file_out.write(base64.decodebytes(base64_encode.encode())) result_english = self._send_request(file_name=path, data={}) - self.check_english_doc(result_english) + self._check_english_doc(result_english) def test_docx_images_no_base64(self) -> None: metadata = self.__check_base64(False) diff --git a/tests/unit_tests/abstract_converter_test.py b/tests/unit_tests/abstract_converter_test.py index 2eab1c93..30644646 100644 --- a/tests/unit_tests/abstract_converter_test.py +++ b/tests/unit_tests/abstract_converter_test.py @@ -7,9 +7,6 @@ class AbstractConverterTest(TestCase): - """ - Class for testing abstract converter - """ path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data")) def setUp(self) -> None: @@ -27,9 +24,6 @@ def tearDown(self) -> None: self.tmp_dir.cleanup() def _convert(self, filename: str, extension: str, converter: AbstractConverter) -> None: - """ - Method for converting file into another extension - """ filename_with_extension = filename + extension file = os.path.join(self.path, filename_with_extension) tmp_file = os.path.join(self.tmp_dir.name, filename_with_extension) diff --git a/tests/unit_tests/test_doctype_law_dynamic_classifier.py b/tests/unit_tests/test_doctype_law_dynamic_classifier.py index 37570f01..46c763b4 100644 --- a/tests/unit_tests/test_doctype_law_dynamic_classifier.py +++ b/tests/unit_tests/test_doctype_law_dynamic_classifier.py @@ -18,7 +18,7 @@ class TestFoivApiDocreader(unittest.TestCase): def _get_abs_path(self, file_name: str) -> str: return os.path.join(self.data_path, "doctypes", file_name) - def _test_classifier_type(self, file_name: str, expected_type: str) -> None: + def _test_document_type(self, file_name: str, expected_type: str) -> None: config = {} base_reader = RawTextReader(config=config) unstructured_document = base_reader.read(path=self._get_abs_path(file_name), @@ -31,44 +31,44 @@ def _test_classifier_type(self, file_name: str, expected_type: str) -> None: def test_law(self) -> None: file_name = 'закон.txt' expected_type = 'law' - self._test_classifier_type(file_name, expected_type) + self._test_document_type(file_name, expected_type) def test_instruction(self) -> None: file_name = 'инструкция.txt' expected_type = 'foiv_law' - self._test_classifier_type(file_name, expected_type) + self._test_document_type(file_name, expected_type) def test_codex(self) -> None: file_name = 'кодекс.txt' expected_type = 'law' - self._test_classifier_type(file_name, expected_type) + self._test_document_type(file_name, expected_type) def test_definition(self) -> None: file_name = 'определение.txt' expected_type = 'law' - self._test_classifier_type(file_name, expected_type) + self._test_document_type(file_name, expected_type) def test_resolution(self) -> None: file_name = 'постановление.txt' expected_type = 'law' - self._test_classifier_type(file_name, expected_type) + self._test_document_type(file_name, expected_type) def test_order(self) -> None: file_name = 'приказ.txt' expected_type = 'foiv_law' - self._test_classifier_type(file_name, expected_type) + self._test_document_type(file_name, expected_type) def test_disposal(self) -> None: file_name = 'распоряжение.txt' expected_type = 'law' - self._test_classifier_type(file_name, expected_type) + self._test_document_type(file_name, expected_type) def test_decree(self) -> None: file_name = 'указ.txt' expected_type = 'law' - self._test_classifier_type(file_name, expected_type) + self._test_document_type(file_name, expected_type) def test_fz(self) -> None: file_name = 'федеральный_закон.txt' expected_type = 'law' - self._test_classifier_type(file_name, expected_type) + self._test_document_type(file_name, expected_type) diff --git a/tests/unit_tests/test_doctype_law_structure_extractor.py b/tests/unit_tests/test_doctype_law_structure_extractor.py index 3b0f3ce7..e73ab545 100644 --- a/tests/unit_tests/test_doctype_law_structure_extractor.py +++ b/tests/unit_tests/test_doctype_law_structure_extractor.py @@ -1,4 +1,3 @@ -import os import random import time import unittest @@ -61,7 +60,7 @@ def test_begin_application(self) -> None: for application_start in application_starts: self.assertIsNotNone(self.structure_extractor.classifier.regexp_application_begin.match(application_start.lower())) - def test_number_regexp(self) -> None: + def test_string_number_correctness_with_regexp(self) -> None: lines = ['03.06.2009 № 17, от 07.10.2009 № 42, от 10.03.2010 № 6, от 14.04.2010 № 11, от', 'правонарушениях. (В редакции Закона Москвы от 24.06.2015 г. № 39)', '2. Нарушение административного регламента', @@ -82,34 +81,34 @@ def test_number_ends(self) -> None: res = AbstractFeatureExtractor.ends_of_number.search(number) self.assertEqual(number[:res.start()], without_ends[num]) - def __get_line(self, hierarchy_level: HierarchyLevel, text: str) -> LineWithMeta: + def __get_line_with_meta(self, hierarchy_level: HierarchyLevel, text: str) -> LineWithMeta: metadata = LineMetadata(page_id=0, line_id=0, hierarchy_level=hierarchy_level) return LineWithMeta(line=text, metadata=metadata, annotations=[]) - def __check_one_postprocess(self, text: str, text_expected: str) -> None: + def __check_postprocess_of_one_string_w_roman_numeral(self, text: str, text_expected: str) -> None: hierarchy_level = HierarchyLevel(4, 0, True, "subsection") - line = self.__get_line(hierarchy_level=hierarchy_level, text=text) + line = self.__get_line_with_meta(hierarchy_level=hierarchy_level, text=text) result = self.structure_extractor._postprocess_roman(hierarchy_level=hierarchy_level, line=line) self.assertEqual(text_expected, result.line) - def test_postprocess_roman(self) -> None: - self.__check_one_postprocess("I. Общие положения", "I. Общие положения") - self.__check_one_postprocess("Т. Общие положения", "I. Общие положения") - self.__check_one_postprocess("Г. Общие положения", "I. Общие положения") - self.__check_one_postprocess("T. Общие положения", "I. Общие положения") - self.__check_one_postprocess("П. Общие положения", "II. Общие положения") - self.__check_one_postprocess("Ш. Общие положения", "III. Общие положения") - self.__check_one_postprocess("ТУ. Общие положения", "IV. Общие положения") - self.__check_one_postprocess("УТ. Общие положения", "VI. Общие положения") - self.__check_one_postprocess(" УТ. Общие положения", " VI. Общие положения") - self.__check_one_postprocess("У. Общие положения", "V. Общие положения") - self.__check_one_postprocess("V. Общие положения", "V. Общие положения") - self.__check_one_postprocess("Общие положения", "Общие положения") + def test_postprocessing_of_strings_with_roman_numerals(self) -> None: + self.__check_postprocess_of_one_string_w_roman_numeral("I. Общие положения", "I. Общие положения") + self.__check_postprocess_of_one_string_w_roman_numeral("Т. Общие положения", "I. Общие положения") + self.__check_postprocess_of_one_string_w_roman_numeral("Г. Общие положения", "I. Общие положения") + self.__check_postprocess_of_one_string_w_roman_numeral("T. Общие положения", "I. Общие положения") + self.__check_postprocess_of_one_string_w_roman_numeral("П. Общие положения", "II. Общие положения") + self.__check_postprocess_of_one_string_w_roman_numeral("Ш. Общие положения", "III. Общие положения") + self.__check_postprocess_of_one_string_w_roman_numeral("ТУ. Общие положения", "IV. Общие положения") + self.__check_postprocess_of_one_string_w_roman_numeral("УТ. Общие положения", "VI. Общие положения") + self.__check_postprocess_of_one_string_w_roman_numeral(" УТ. Общие положения", " VI. Общие положения") + self.__check_postprocess_of_one_string_w_roman_numeral("У. Общие положения", "V. Общие положения") + self.__check_postprocess_of_one_string_w_roman_numeral("V. Общие положения", "V. Общие положения") + self.__check_postprocess_of_one_string_w_roman_numeral("Общие положения", "Общие положения") def test_empty_document(self) -> None: self.assertListEqual([], self.structure_extractor.classifier.predict([])) - def test__fix_labels(self) -> None: + def test_fix_labels(self) -> None: labels = ["title", "raw_text", "title", "structure_unit", "title", "cellar", "structure_unit", "cellar", "application"] labels_expected = ["title", "title", "title", "structure_unit", "raw_text", "raw_text", "structure_unit", diff --git a/tests/unit_tests/test_doctype_law_text_features_regexps.py b/tests/unit_tests/test_doctype_law_text_features_regexps.py index 7233845b..84c1c6ad 100644 --- a/tests/unit_tests/test_doctype_law_text_features_regexps.py +++ b/tests/unit_tests/test_doctype_law_text_features_regexps.py @@ -12,7 +12,7 @@ def test_roman_regexp(self) -> None: self.assertTrue(self.features.roman_regexp.fullmatch(' XI.') is None) self.assertTrue(self.features.roman_regexp.fullmatch('\tIII. ')) - def test_application_begin_regexp(self) -> None: + def test_application_beginnings_with_regexp(self) -> None: self.assertTrue(self.features.regexp_application_begin.fullmatch('приложение')) self.assertTrue(self.features.regexp_application_begin.fullmatch('Приложение')) self.assertTrue(self.features.regexp_application_begin.fullmatch('утверждены')) @@ -21,7 +21,7 @@ def test_application_begin_regexp(self) -> None: self.assertTrue(self.features.regexp_application_begin.fullmatch('постановление') is None) self.assertTrue(self.features.regexp_application_begin.fullmatch('к приказу') is None) - def test_glava(self) -> None: + def test_chapter_beginnings(self) -> None: # note to rewrites this test if we change the num of regexps self.assertEqual(1, len(LawTextFeatures.named_regexp)) diff --git a/tests/unit_tests/test_doctype_law_txt_reader.py b/tests/unit_tests/test_doctype_law_txt_reader.py index 7da510cc..db543516 100644 --- a/tests/unit_tests/test_doctype_law_txt_reader.py +++ b/tests/unit_tests/test_doctype_law_txt_reader.py @@ -16,7 +16,7 @@ class TestLawTxtReader(AbstractTestApiDocReader): def _get_abs_path(self, file_name: str) -> str: return os.path.join(self.data_directory_path, "laws", file_name) - def test_spaces(self) -> None: + def test_law_document_spaces_correctness(self) -> None: path = self._get_abs_path("коап_москвы_8_7_2015_utf.txt") directory, filename = os.path.split(path) document = self.txt_reader.read(path=path, document_type="law", parameters={}) diff --git a/tests/unit_tests/test_format_csv_reader.py b/tests/unit_tests/test_format_csv_reader.py index 226ed9c0..8befd3f4 100644 --- a/tests/unit_tests/test_format_csv_reader.py +++ b/tests/unit_tests/test_format_csv_reader.py @@ -6,15 +6,9 @@ class TestCSVReader(TestCase): - """ - Class with implemented tests for CSVReader - """ path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "csvs")) - def test__get_lines_with_meta(self) -> None: - """ - Tests .csv file parsing correctness - """ + def test_get_lines_with_meta(self) -> None: file = os.path.join(self.path, "books_2.csv") reader = CSVReader() document = reader.read(path=file, parameters={}, document_type=None) diff --git a/tests/unit_tests/test_format_docx_reader.py b/tests/unit_tests/test_format_docx_reader.py index 60a81091..f0ad4cfd 100644 --- a/tests/unit_tests/test_format_docx_reader.py +++ b/tests/unit_tests/test_format_docx_reader.py @@ -10,9 +10,6 @@ class TestDocxReader(unittest.TestCase): - """ - Class with implemented tests for DocxReader - """ directory = os.path.join(os.path.dirname(__file__), "..", "data", "docx") tmpdir = None @@ -32,9 +29,6 @@ def tearDown(self) -> None: super().tearDown() def test_docx_with_table(self) -> None: - """ - Tests parsing correctness for .docx file with tables - """ docx_reader = DocxReader(config=get_config()) path = self._get_path("example.docx") result = docx_reader.read(path) @@ -71,9 +65,6 @@ def test_docx_with_table(self) -> None: self.assertTrue(found) def test_docx_without_tables(self) -> None: - """ - Tests parsing correctness for .docx file without tables - """ docx_reader = DocxReader(config=get_config()) path = self._get_path("header_test.docx") result = docx_reader.read(path) @@ -95,9 +86,6 @@ def test_docx_without_tables(self) -> None: self.assertEqual("4.6. п", lines[11].line.strip()) def test_tz_file(self) -> None: - """ - Tests parsing correctness for .docx tz file - """ docx_reader = DocxReader(config=get_config()) path = self._get_path("tz.docx") result = docx_reader.read(path) @@ -115,9 +103,6 @@ def test_docx_without_numbering(self) -> None: self.assertTrue(result is not None) def test_caps_letters1(self) -> None: - """ - Tests parsing correctness for .docx with caps letters - """ docx_reader = DocxReader(config=get_config()) path = self._get_path("caps_1.docx") result = docx_reader.read(path) @@ -125,9 +110,6 @@ def test_caps_letters1(self) -> None: self.assertEqual('АНАСТАСИЯ АЙГУЗИНА', result.lines[3].line) def test_caps_letters2(self) -> None: - """ - Tests parsing correctness for .docx with caps letters and complex structure - """ docx_reader = DocxReader(config=get_config()) path = self._get_path("caps_2.docx") result = docx_reader.read(path) @@ -135,9 +117,6 @@ def test_caps_letters2(self) -> None: self.assertEqual('I глава\n', result.lines[2].line) def test_justification(self) -> None: - """ - Tests justification of text in .docx file - """ docx_reader = DocxReader(config=get_config()) path = self._get_path("justification.docx") result = docx_reader.read(path) @@ -148,9 +127,6 @@ def test_justification(self) -> None: self.assertEqual(answer[1], annotation.value) def test_numeration(self) -> None: - """ - Tests numeration in .docx file - """ docx_reader = DocxReader(config=get_config()) path = self._get_path("numeration.docx") result = docx_reader.read(path) @@ -166,10 +142,7 @@ def test_numeration(self) -> None: self.assertEqual("5.4.\tlist", lines[11].line) self.assertEqual("5.5.\tlist", lines[13].line) - def test_tables(self) -> None: - """ - Tests table parsing in .docx file - """ + def test_table_parsing_correctness(self) -> None: docx_reader = DocxReader(config=get_config()) path = self._get_path("merged_cells.docx") result = docx_reader.read(path) @@ -245,10 +218,7 @@ def test_tables(self) -> None: self.assertEqual(result.tables[1].metadata.cell_properties[1][0].rowspan, 2) self.assertEqual(result.tables[1].metadata.cell_properties[1][0].colspan, 2) - def test_merged_tables(self) -> None: - """ - Tests parsing of .docx file containing big table with merged cells - """ + def test_tables_with_merged_cells(self) -> None: docx_reader = DocxReader(config=get_config()) path = self._get_path("big_table_with_merged_cells.docx") result = docx_reader.read(path) @@ -270,9 +240,6 @@ def test_merged_tables(self) -> None: self.assertEqual(result.tables[0].metadata.cell_properties[3][0].colspan, 4) def test_diagram_annotation(self) -> None: - """ - Tests parsing of diagram annotation in .docx file - """ docx_reader = DocxReader(config=get_config()) path = self._get_path("diagram_1.docx") result = docx_reader.read(path) @@ -293,9 +260,6 @@ def test_diagram_annotation(self) -> None: self.assertTrue(annotation_found) def test_tags(self) -> None: - """ - Tests parsing of tags in .docx file - """ docx_reader = DocxReader(config=get_config()) path = self._get_path("with_tags.docx") result = docx_reader.read(path) diff --git a/tests/unit_tests/test_format_image_metadata_extractor.py b/tests/unit_tests/test_format_image_metadata_extractor.py index 8e299330..348b0a2c 100644 --- a/tests/unit_tests/test_format_image_metadata_extractor.py +++ b/tests/unit_tests/test_format_image_metadata_extractor.py @@ -6,17 +6,11 @@ class TestImageMetadataExtractor(unittest.TestCase): - """ - Class with implemented tests for image metadata extractor - """ extractor = ImageMetadataExtractor(config=get_test_config()) data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data")) assert os.path.isdir(data_path) - def test_exif(self) -> None: - """ - Test for metadata extraction from broken image - """ + def test_broken_image_metadata_extraction(self) -> None: file = os.path.join(self.data_path, "exif_nan.jpg") exif = self.extractor._get_exif(file) self.assertIsNone(exif.get("digital_zoom_ratio")) diff --git a/tests/unit_tests/test_format_image_reader_bbox.py b/tests/unit_tests/test_format_image_reader_bbox.py index cfa720fb..882b6b6c 100644 --- a/tests/unit_tests/test_format_image_reader_bbox.py +++ b/tests/unit_tests/test_format_image_reader_bbox.py @@ -8,16 +8,10 @@ class TestImageReaderWithBBox(unittest.TestCase): - """ - Class with implemented tests for OCR line extractor - """ abs_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "scanned")) reader = OCRLineExtractor(config=get_test_config()) def test_line_order(self) -> None: - """ - Test for extracted line order correctness - """ image = cv2.imread(os.path.join(self.abs_path, "part.png")) page = self.reader.split_image2lines(image=image, page_num=1, is_one_column_document=True) bboxes = [bbox for bbox in page.bboxes if bbox.text.strip() != ""] diff --git a/tests/unit_tests/test_format_pdf_reader.py b/tests/unit_tests/test_format_pdf_reader.py index 603a84e6..4bf2cb4a 100644 --- a/tests/unit_tests/test_format_pdf_reader.py +++ b/tests/unit_tests/test_format_pdf_reader.py @@ -111,7 +111,7 @@ def test_header_footer_search_3(self) -> None: self.assertEqual(len(headers), 1) self.assertEqual(len(footers), 0) - def test_long_list_pdf(self) -> None: + def test_long_list_in_pdf(self) -> None: config = get_test_config() any_doc_reader = PdfImageReader(config=config) path = os.path.join(os.path.dirname(__file__), "../data/scanned/doc_with_long_list.pdf") diff --git a/tests/unit_tests/test_format_txt_reader.py b/tests/unit_tests/test_format_txt_reader.py index 70d5a1a2..19f971ea 100644 --- a/tests/unit_tests/test_format_txt_reader.py +++ b/tests/unit_tests/test_format_txt_reader.py @@ -7,17 +7,11 @@ class TestRawTextReader(TestCase): - """ - Class with implemented tests for raw text reader - """ config = get_test_config() reader = RawTextReader(config=config) path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data")) def test_read_law(self) -> None: - """ - Tests reader with law document - """ file = os.path.join(self.path, "laws", "коап_москвы_8_7_2015_utf.txt") uids_set = set() prefix = "txt_6210f1fb59150aae33a09f49c8724baf" # это строка, содержащая хэш файла, который обратаывается ридером @@ -28,9 +22,6 @@ def test_read_law(self) -> None: self.assertEqual(prefix, line.uid[:len(prefix)]) # в поле uid содержится хэш файла, в котором находитс строка, и id самой строки def test_read_tz(self) -> None: - """ - Tests reader with tz document - """ file = os.path.join(self.path, "tz", "tz.txt") uids_set = set() prefix = "txt_0e576a9e0008225ac27f961af60c0bee" @@ -40,9 +31,8 @@ def test_read_tz(self) -> None: uids_set.add(line.uid) self.assertEqual(prefix, line.uid[:len(prefix)]) - def test__get_lines_with_meta(self) -> None: - path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "txt")) - file = os.path.join(path, "pr_17.txt") + def test_get_lines_with_meta(self) -> None: + file = os.path.join(self.path, "txt", "pr_17.txt") reader = RawTextReader(config=get_config()) for line in reader._get_lines_with_meta(path=file, encoding="utf-8"): expected_uid = "txt_1a3cd561910506d56a65db1d1dcb5049_{}".format(line.metadata.line_id) diff --git a/tests/unit_tests/test_misc_annotations.py b/tests/unit_tests/test_misc_annotations.py index 267b35f5..5f1af8a7 100644 --- a/tests/unit_tests/test_misc_annotations.py +++ b/tests/unit_tests/test_misc_annotations.py @@ -8,28 +8,16 @@ class TestAnnotationMerger(unittest.TestCase): - """ - Class with implemented tests for the AnnotationMerger - """ def merge(self, annotations: List[Annotation], text: str) -> Set[Tuple[int, int, str, str]]: - """ - Class method to merge given annotations in a given string - """ res = AnnotationMerger().merge_annotations(annotations, text) return {(annotation.start, annotation.end, annotation.name, annotation.value) for annotation in res} def test_annotation_merge_zero(self) -> None: - """ - Tests merging of empty list of annotations - """ annotations = [] text = "hello my friend" self.assertSetEqual(set(), self.merge(annotations, text)) def test_annotation_merge_one(self) -> None: - """ - Tests merging of list consisting of only one annotation - """ annotations = [Annotation(start=0, end=4, name="size", value="1")] text = "hello my friend" self.assertSetEqual({(0, 4, "size", "1")}, self.merge(annotations, text)) @@ -43,17 +31,11 @@ def test_annotation_merge_one_near_space(self) -> None: self.assertSetEqual({(0, 5, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_same_value(self) -> None: - """ - Tests the case where two annotations match on a space symbol and cover the whole string - """ annotations = [Annotation(start=0, end=5, name="size", value="1"), Annotation(start=5, end=15, name="size", value="1")] text = "hello my friend" self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_same_value_no_spaces(self) -> None: - """ - Tests the case where two annotations match on a letter and cover the whole string - """ annotations = [Annotation(start=0, end=5, name="size", value="1"), Annotation(start=5, end=15, name="size", value="1")] text = "hellomyfriend" self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) @@ -75,25 +57,16 @@ def test_annotation_merge_same_value_separating_by_many_space_end_space(self) -> self.assertSetEqual({(0, 25, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_same_value_separating_by_space(self) -> None: - """ - Tests the case where two annotations are separated by space symbol - """ annotations = [Annotation(start=0, end=5, name="size", value="1"), Annotation(start=6, end=15, name="size", value="1")] text = "hello my friend" self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_same_value_separating_by_tab(self) -> None: - """ - Tests the case where two annotations are separated by tab symbol - """ annotations = [Annotation(start=0, end=5, name="size", value="1"), Annotation(start=6, end=15, name="size", value="1")] text = "hello\tmy friend" self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_same_value_separating_by_newline(self) -> None: - """ - Tests the case where two annotations are separated by newline symbol - """ annotations = [Annotation(start=0, end=5, name="size", value="1"), Annotation(start=6, end=15, name="size", value="1")] text = "hello\nmy friend" self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) @@ -107,62 +80,41 @@ def test_annotation_merge_included(self) -> None: self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_three_annotations(self) -> None: - """ - Tests the case of merging three disjoint annotations that cover the whole string - """ annotations = [Annotation(start=0, end=5, name="size", value="1"), Annotation(start=6, end=10, name="size", value="1"), Annotation(start=10, end=15, name="size", value="1")] text = "hello my friend" self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_three_nested_annotations(self) -> None: - """ - Tests the case of merging three nested annotations that cover the whole string - """ annotations = [Annotation(start=0, end=15, name="size", value="1"), Annotation(start=6, end=10, name="size", value="1"), Annotation(start=3, end=8, name="size", value="1")] text = "hello my friend" self.assertSetEqual({(0, 15, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_three_intersected_annotations(self) -> None: - """ - Tests the case of merging three intersecting annotations - """ annotations = [Annotation(start=0, end=5, name="size", value="1"), Annotation(start=3, end=8, name="size", value="1"), Annotation(start=6, end=9, name="size", value="1")] text = "hello my friend" self.assertSetEqual({(0, 9, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_three_one_intersected_annotations(self) -> None: - """ - Tests the case of merging two intersecting annotations and the one that is separate - """ annotations = [Annotation(start=0, end=3, name="size", value="1"), Annotation(start=3, end=6, name="size", value="1"), Annotation(start=8, end=15, name="size", value="1")] text = "hello my friend" self.assertSetEqual({(0, 6, "size", "1"), (8, 15, "size", "1")}, self.merge(annotations, text)) def test_annotation_merge_different_value(self) -> None: - """ - Tests the case of merging two annotations with different values - """ annotations = [Annotation(start=0, end=5, name="bold", value="True"), Annotation(start=5, end=15, name="italic", value="True")] text = "hello my friend" self.assertSetEqual({(0, 5, "bold", "True"), (5, 15, "italic", "True")}, self.merge(annotations, text)) def test_annotation_merge_mixed(self) -> None: - """ - Tests the case of merging many annotations with mixed values - """ annotations = [Annotation(start=0, end=5, name="bold", value="True"), Annotation(start=5, end=15, name="bold", value="True"), Annotation(start=4, end=6, name="italic", value="True"), Annotation(start=6, end=66, name="italic", value="True")] text = "hello my friend, hello my friend, hello my friend, hello my friend" self.assertSetEqual({(0, 15, "bold", "True"), (4, 66, "italic", "True")}, self.merge(annotations, text)) def test_merge_1000_annotations(self) -> None: - """ - Tests the case of merging one hundred annotations with the same values - """ timeout = 10 n = 1000 annotations = [Annotation(start=i, end=i + 1, name="bold", value="True") for i in range(n)] @@ -172,9 +124,6 @@ def test_merge_1000_annotations(self) -> None: self.assertSetEqual({(0, n, "bold", "True")}, result) def test_merge_1000_pair_annotations(self) -> None: - """ - Tests the case of merging many annotations with the same positions but with different values - """ timeout = 10 n = 1000 annotations = [] @@ -188,9 +137,6 @@ def test_merge_1000_pair_annotations(self) -> None: self.assertSetEqual({(0, n, "bold", "True"), (0, n, "size", "1")}, result) def test_merge_1000_no_intersection(self) -> None: - """ - Tests the case of merging many annotations with no intersections - """ timeout = 10 n = 1000 annotations = [] @@ -204,9 +150,6 @@ def test_merge_1000_no_intersection(self) -> None: class TestAbstractStructureExtractor(unittest.TestCase): - """ - Class with implemented tests for the AbstractStructureExtractor - """ def test_annotation_extractor_left(self) -> None: """ Tests the case where extraction region is one pixel to the left of the annotation region @@ -263,9 +206,6 @@ def test_annotation_extractor_multiple(self) -> None: self.assertEqual(res[1].end, 3) def test_annotation_extractor_zero(self) -> None: - """ - Tests the case with extracting empty list of annotations - """ annotations = [] res = AbstractStructureExtractor._select_annotations(annotations, 1, 4) self.assertEqual(len(res), 0) diff --git a/tests/unit_tests/test_misc_dedoc_manager.py b/tests/unit_tests/test_misc_dedoc_manager.py index 4aed42ac..345d4715 100644 --- a/tests/unit_tests/test_misc_dedoc_manager.py +++ b/tests/unit_tests/test_misc_dedoc_manager.py @@ -7,18 +7,12 @@ class TestDedocManager(TestCase): - """ - Class with implemented tests for DedocManager - """ path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "csvs")) config = get_config() manager_config = get_manager_config(config=config) dedoc_manager = DedocManager.from_config(version="tests", manager_config=manager_config, config=config) def test_parse_file(self) -> None: - """ - Tests .tsv file parsing correctness - """ filename = "csv_tab.tsv" result = self.dedoc_manager.parse_file(os.path.join(self.path, "csv_tab.tsv"), {}) self.assertEqual(filename, result.metadata.file_name) @@ -28,14 +22,8 @@ def test_parse_file(self) -> None: self.assertLessEqual(["5", "3", "1"], result.content.tables[0].cells[2]) def test_version(self) -> None: - """ - Tests DedocManager version - """ self.assertEqual("tests", self.dedoc_manager.version) def test_file_not_exists(self) -> None: - """ - Tests the case of parsing a non-existent file - """ with self.assertRaises(FileNotFoundError): self.dedoc_manager.parse_file("afagahcr", {}) diff --git a/tests/unit_tests/test_misc_feature_extractor.py b/tests/unit_tests/test_misc_feature_extractor.py index f18ec485..7a954c82 100644 --- a/tests/unit_tests/test_misc_feature_extractor.py +++ b/tests/unit_tests/test_misc_feature_extractor.py @@ -6,15 +6,9 @@ class TestRegexpFeatures(unittest.TestCase): - """ - Class with implemented tests for feature extractor - """ result_matrix = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]]) def compare_with_expected(self, expected_matrix: List[List[int]], result: np.ndarray) -> None: - """ - Method for comparison of two matrices - """ self.assertEqual(len(expected_matrix), result.shape[0]) self.assertEqual(len(expected_matrix[0]), result.shape[1]) for row_res, row_exp in zip(result, expected_matrix): diff --git a/tests/unit_tests/test_misc_hierarchy_level.py b/tests/unit_tests/test_misc_hierarchy_level.py index 6ba3d636..ab46a11b 100644 --- a/tests/unit_tests/test_misc_hierarchy_level.py +++ b/tests/unit_tests/test_misc_hierarchy_level.py @@ -4,14 +4,8 @@ class TestHierarchyLevel(unittest.TestCase): - """ - Class with implemented tests for hierarchy level extraction - """ - def test_two_raw_text(self) -> None: - """ - Test for three hierarchy levels comparison - """ + def test_three_lines_equal_levels(self) -> None: h1 = HierarchyLevel.create_raw_text() h2 = HierarchyLevel.create_raw_text() h3 = HierarchyLevel(level_1=1, level_2=2, can_be_multiline=False, line_type=HierarchyLevel.raw_text) @@ -23,9 +17,6 @@ def test_two_raw_text(self) -> None: self.assertTrue(h1 <= h3) def test_raw_text_greater_than_any_other(self) -> None: - """ - Test for hierarchy level comparison where raw text is greater than any other - """ list_item = HierarchyLevel(level_1=2, level_2=1, can_be_multiline=False, line_type=HierarchyLevel.list_item) raw_text = HierarchyLevel.create_raw_text() self.assertFalse(list_item > raw_text) @@ -35,9 +26,6 @@ def test_raw_text_greater_than_any_other(self) -> None: self.assertTrue(list_item <= raw_text) def test_one_greater_than_other_level1(self) -> None: - """ - Test hierarchy comparison of two different levels - """ h1 = HierarchyLevel(level_1=2, level_2=2, can_be_multiline=False, line_type=HierarchyLevel.list_item) h2 = HierarchyLevel(level_1=3, level_2=1, can_be_multiline=False, line_type=HierarchyLevel.list_item) self.assertTrue(h1 < h2) @@ -47,9 +35,6 @@ def test_one_greater_than_other_level1(self) -> None: self.assertFalse(h1 == h2) def test_one_greater_than_other_level2(self) -> None: - """ - Test hierarchy comparison of two different levels - """ h1 = HierarchyLevel(level_1=2, level_2=1, can_be_multiline=False, line_type=HierarchyLevel.list_item) h2 = HierarchyLevel(level_1=2, level_2=2, can_be_multiline=False, line_type=HierarchyLevel.list_item) self.assertTrue(h1 < h2) @@ -58,10 +43,7 @@ def test_one_greater_than_other_level2(self) -> None: self.assertFalse(h1 >= h2) self.assertFalse(h1 == h2) - def test_equal(self) -> None: - """ - Test hierarchy comparison of two equal levels - """ + def test_four_lines_with_mixed_levels(self) -> None: h1 = HierarchyLevel(level_1=3, level_2=3, can_be_multiline=True, line_type=HierarchyLevel.header) h2 = HierarchyLevel(level_1=3, level_2=3, can_be_multiline=True, line_type=HierarchyLevel.header) h3 = HierarchyLevel(level_1=None, level_2=None, can_be_multiline=True, line_type=HierarchyLevel.unknown) diff --git a/tests/unit_tests/test_misc_line_split.py b/tests/unit_tests/test_misc_line_split.py index 333c8804..725afc32 100644 --- a/tests/unit_tests/test_misc_line_split.py +++ b/tests/unit_tests/test_misc_line_split.py @@ -118,7 +118,7 @@ def test_split_empty_line(self) -> None: split = line.split("\n") self.assertListEqual([line], split) - def test_split_empty_sep(self) -> None: + def test_split_empty_separator(self) -> None: line = self._get_line("some text", [BoldAnnotation(0, 3, "True")]) with self.assertRaises(ValueError): line.split("") @@ -188,7 +188,7 @@ def test_no_sep(self) -> None: split = line.split("\n") self.assertListEqual([line], split) - def test_two_annotations(self) -> None: + def test_split_line_with_two_intersecting_annotations(self) -> None: line = self._get_line("some\ntext", [SizeAnnotation(0, 9, "14"), BoldAnnotation(0, 9, "True")]) split = line.split("\n") self.assertEqual(2, len(split)) @@ -204,7 +204,7 @@ def test_two_annotations(self) -> None: self.assertEqual(0, annotation_size.start) self.assertEqual(len(line.line), annotation_size.end) - def test_two_annotations_no_intersection(self) -> None: + def test_line_with_two_annotations_no_intersection(self) -> None: line = self._get_line("some\ntext", [SizeAnnotation(0, 5, "14"), SizeAnnotation(5, 9, "10")]) split = line.split("\n") self.assertEqual(2, len(split)) @@ -230,7 +230,7 @@ def test_two_annotations_no_intersection_by_sep(self) -> None: self.assertEqual(start, annotation_size.start) self.assertEqual(end, annotation_size.end) - def test_up_to_sep(self) -> None: + def test_split_of_one_annotation_ending_close_to_sep(self) -> None: line = self._get_line("some\ntext", [SizeAnnotation(0, 5, "14")]) split = line.split("\n") self.assertEqual(2, len(split)) @@ -239,6 +239,9 @@ def test_up_to_sep(self) -> None: self.assertEqual(0, len(right.annotations)) def __annotation_all_line(self, split: List[LineWithMeta]) -> None: + """ + Tests if annotation has the same length as line + """ for line in split: annotations = line.annotations self.assertEqual(1, len(annotations)) diff --git a/tests/unit_tests/test_misc_list_feature_extractor.py b/tests/unit_tests/test_misc_list_feature_extractor.py index 4b2ded17..bf7b3c5d 100644 --- a/tests/unit_tests/test_misc_list_feature_extractor.py +++ b/tests/unit_tests/test_misc_list_feature_extractor.py @@ -79,7 +79,7 @@ def test_letter(self) -> None: line = self._get_line_with_meta("\tё) some text") self.assertEqual(LetterPrefix("ё)", 10), self.feature_extractor._get_prefix(line)) - def test_nonletter(self) -> None: + def test_non_letter_prefix(self) -> None: line = self._get_line_with_meta("- some text") self.assertEqual(BulletPrefix("-", 10), self.feature_extractor._get_prefix(line)) @@ -89,7 +89,7 @@ def test_nonletter(self) -> None: line = self._get_line_with_meta("+ some text") self.assertEqual(BulletPrefix("+", 10), self.feature_extractor._get_prefix(line)) - def test_empty(self) -> None: + def test_empty_prefix(self) -> None: line = self._get_line_with_meta("some text") self.assertEqual(EmptyPrefix(indent=10), self.feature_extractor._get_prefix(line)) @@ -102,7 +102,7 @@ def test_empty(self) -> None: line = self._get_line_with_meta("\nsome text") self.assertEqual(EmptyPrefix(indent=10), self.feature_extractor._get_prefix(line)) - def test__get_window(self) -> None: + def test_get_window(self) -> None: prefixes = [BracketPrefix("{})".format(i), 1.01 * i) for i in range(0, 300)] doc_size = len(prefixes) assert doc_size == 300 diff --git a/tests/unit_tests/test_misc_list_patcher.py b/tests/unit_tests/test_misc_list_patcher.py index 5d1135ce..35e04cc8 100644 --- a/tests/unit_tests/test_misc_list_patcher.py +++ b/tests/unit_tests/test_misc_list_patcher.py @@ -28,7 +28,7 @@ def test_correct_list(self) -> None: result = self.patcher.patch(lines) self.assertListEqual(self.__get_text(lines), self.__get_text(result)) - def test_hl_raw_text4(self) -> None: + def test_hierarchy_level_raw_text(self) -> None: line1 = self.__get_line("2 item", None, None, HierarchyLevel.raw_text) line2 = self.__get_line("some item", None, None, HierarchyLevel.raw_text) line3 = self.__get_line("2 item", None, None, HierarchyLevel.raw_text) diff --git a/tests/unit_tests/test_misc_tree_node.py b/tests/unit_tests/test_misc_tree_node.py index 9430b99c..35d38dff 100644 --- a/tests/unit_tests/test_misc_tree_node.py +++ b/tests/unit_tests/test_misc_tree_node.py @@ -10,7 +10,7 @@ class TestTreeNode(TestCase): - def test_root_annotations(self) -> None: + def test_root_node_annotations(self) -> None: lines = [LineWithMeta(line="bold text\n", metadata=LineMetadata(hierarchy_level=HierarchyLevel.create_root(), page_id=0, line_id=0), annotations=[BoldAnnotation(start=0, end=10, value="True")]), diff --git a/tests/unit_tests/test_module_attachment_extractor.py b/tests/unit_tests/test_module_attachment_extractor.py index 9a370c5d..58cb8073 100644 --- a/tests/unit_tests/test_module_attachment_extractor.py +++ b/tests/unit_tests/test_module_attachment_extractor.py @@ -11,9 +11,6 @@ class TestAttachmentsExtractor(unittest.TestCase): - """ - Class with implemented tests for the attachment extractor - """ src_dir = os.path.join(os.path.dirname(__file__), "..", "data", "with_attachments") def test_docx_attachments_extractor(self) -> None: @@ -51,9 +48,6 @@ def test_docx_attachments_extractor(self) -> None: self.assertEqual(extracted, len(attachments_name_list)) def test_pptx_attachments_extractor(self) -> None: - """ - Tests attachment extraction from pptx files - """ attachments_name_list = [ "Microsoft_Excel_97-2004_Worksheet.xls", "image3.png", @@ -85,9 +79,6 @@ def test_pptx_attachments_extractor(self) -> None: self.assertEqual(extracted, len(attachments_name_list)) def test_docx_diagrams_extraction(self) -> None: - """ - Tests diagram extraction from docx files - """ docx_attachment_extractor = DocxAttachmentsExtractor() docx_dir = os.path.join(os.path.dirname(__file__), "..", "data", "docx") files = [('diagram_1.docx', 1), ('diagram_2.docx', 5)] @@ -103,15 +94,12 @@ def test_archive_with_slash(self) -> None: file_name_template = "attachments.{}" for extension in "7z", "tar", "tar.gz", "zip": file_name = file_name_template.format(extension) - files = self.__get_list_of_files(file_name) + files = self.__get_list_of_files_in_archive(file_name) self.assertEqual(2, len(files)) self.assertIn(r"som_file⁄wiht\slash.txt", files) self.assertIn("other_file.csv", files) - def __get_list_of_files(self, file_name: str) -> List[str]: - """ - Class method for getting list of files in an archive - """ + def __get_list_of_files_in_archive(self, file_name: str) -> List[str]: with tempfile.TemporaryDirectory() as tmp_dir: file_path = os.path.join(tmp_dir, file_name) shutil.copyfile(os.path.join(self.src_dir, file_name), file_path) diff --git a/tests/unit_tests/test_module_builders.py b/tests/unit_tests/test_module_builders.py index 62bae226..d52da744 100644 --- a/tests/unit_tests/test_module_builders.py +++ b/tests/unit_tests/test_module_builders.py @@ -13,9 +13,6 @@ class TestBuilders(unittest.TestCase): - """ - Class with implemented tests for different hierarchy level builders - """ builders = [HeaderHierarchyLevelBuilder(), BodyLawHierarchyLevelBuilder(), BodyFoivHierarchyLevelBuilder(), @@ -23,10 +20,7 @@ class TestBuilders(unittest.TestCase): ApplicationFoivHierarchyLevelBuilder()] composition_builder = HierarchyLevelBuilderComposition(builders=builders) - def test_item(self) -> None: - """ - Tests different hierarchy level builders - """ + def test_creation_of_builders(self) -> None: builders = self.composition_builder._get_builders(["header"], 'law') self.assertTrue(isinstance(builders[0], HeaderHierarchyLevelBuilder)) diff --git a/tests/unit_tests/test_module_cell_splitter.py b/tests/unit_tests/test_module_cell_splitter.py index 8cdaaa8f..c12aa7a9 100644 --- a/tests/unit_tests/test_module_cell_splitter.py +++ b/tests/unit_tests/test_module_cell_splitter.py @@ -5,15 +5,9 @@ class TestCellSplitter(unittest.TestCase): - """ - Class with implemented tests for cell splitter - """ splitter = CellSplitter() - def test__merge_close_borders(self) -> None: - """ - Test merging multiple cells with close borders - """ + def test_merge_close_borders(self) -> None: cells = [ [Cell(x_top_left=0, y_top_left=0, x_bottom_right=50, y_bottom_right=30), Cell(x_top_left=51, y_top_left=2, x_bottom_right=90, y_bottom_right=29)], @@ -41,10 +35,7 @@ def test__merge_close_borders(self) -> None: self.assertEqual(90, cells_merged[1][1].x_bottom_right) self.assertEqual(50, cells_merged[1][1].y_bottom_right) - def test__merge_close_borders_one_cell(self) -> None: - """ - Test merging of one cell - """ + def test_merge_close_borders_one_cell(self) -> None: cells = [[Cell(x_top_left=0, y_top_left=0, x_bottom_right=50, y_bottom_right=30)]] cells_merged = self.splitter._merge_close_borders(cells) self.assertEqual(0, cells_merged[0][0].x_top_left) @@ -52,26 +43,17 @@ def test__merge_close_borders_one_cell(self) -> None: self.assertEqual(50, cells_merged[0][0].x_bottom_right) self.assertEqual(30, cells_merged[0][0].y_bottom_right) - def test__merge_close_borders_none_cells(self) -> None: - """ - Test merging of zero cells - """ + def test_merge_zero_cells(self) -> None: cells = [[]] cells_merged = self.splitter._merge_close_borders(cells) self.assertListEqual([[]], cells_merged) def test_split_zero_cells(self) -> None: - """ - Test split of zero cells - """ cells = [[]] matrix = self.splitter.split(cells=cells) self.assertListEqual([[]], matrix) def test_split_one_cell(self) -> None: - """ - Test split of one cell - """ cells = [[Cell(x_top_left=0, y_top_left=0, x_bottom_right=10, y_bottom_right=15)]] matrix = self.splitter.split(cells=cells) self.assertEqual(1, len(matrix)) @@ -83,9 +65,6 @@ def test_split_one_cell(self) -> None: self.assertEqual(15, new_cell.y_bottom_right) def test_horizontal_split(self) -> None: - """ - Test for horizontal split - """ cells = [ [ Cell(x_top_left=0, y_top_left=0, x_bottom_right=3, y_bottom_right=5), @@ -121,9 +100,6 @@ def test_horizontal_split(self) -> None: self.assertEqual(5, cell_d.y_bottom_right) def test_vertical_split(self) -> None: - """ - Test for vertical split - """ cells = [ [ Cell(x_top_left=0, y_top_left=0, x_bottom_right=8, y_bottom_right=2), @@ -159,9 +135,6 @@ def test_vertical_split(self) -> None: self.assertEqual(5, cell_d.y_bottom_right) def test_no_split(self) -> None: - """ - Test for the case with no split - """ cells = [[Cell(x_top_left=160, y_top_left=321, x_bottom_right=825, y_bottom_right=369), Cell(x_top_left=825, y_top_left=321, x_bottom_right=1494, y_bottom_right=369)], [Cell(x_top_left=160, y_top_left=374, x_bottom_right=825, y_bottom_right=423), diff --git a/tests/unit_tests/test_module_convertor_docx.py b/tests/unit_tests/test_module_converter_docx.py similarity index 75% rename from tests/unit_tests/test_module_convertor_docx.py rename to tests/unit_tests/test_module_converter_docx.py index f6f85463..5c1fa6fd 100644 --- a/tests/unit_tests/test_module_convertor_docx.py +++ b/tests/unit_tests/test_module_converter_docx.py @@ -6,34 +6,22 @@ class TestDocxConverter(AbstractConverterTest): - """ - Class with implemented tests for DocxConvertor - """ converter = DocxConverter(config={"need_content_analysis": True}) path = os.path.join(AbstractConverterTest.path, "docx") - def test_convert_broken(self) -> None: - """ - Tests the conversion of a broken file - """ + def test_convert_broken_file(self) -> None: extension = ".odt" filename = "broken" with self.assertRaises(ConversionException): self._convert(filename=filename, extension=extension, converter=self.converter) def test_convert_odt(self) -> None: - """ - Tests the conversion of .odt file to .docx - """ filename = "english_doc" extension = ".odt" self._convert(filename=filename, extension=extension, converter=self.converter) def test_convert_doc(self) -> None: - """ - Tests the conversion of .doc file to .docx - """ filename = "english_doc" extension = ".doc" self._convert(filename=filename, extension=extension, converter=self.converter) diff --git a/tests/unit_tests/test_module_convertor_excel.py b/tests/unit_tests/test_module_converter_excel.py similarity index 76% rename from tests/unit_tests/test_module_convertor_excel.py rename to tests/unit_tests/test_module_converter_excel.py index 20cf9885..30915222 100644 --- a/tests/unit_tests/test_module_convertor_excel.py +++ b/tests/unit_tests/test_module_converter_excel.py @@ -6,34 +6,22 @@ class TestExcelConverter(AbstractConverterTest): - """ - Class with implemented tests for ExcelConvertor - """ converter = ExcelConverter(config={"need_content_analysis": True}) path = os.path.join(AbstractConverterTest.path, "xlsx") - def test_convert_broken(self) -> None: - """ - Tests conversion of broken file - """ + def test_convert_broken_file(self) -> None: extension = ".ods" filename = "broken" with self.assertRaises(ConversionException): self._convert(filename=filename, extension=extension, converter=self.converter) def test_convert_ods(self) -> None: - """ - Tests conversion from .ods extension - """ extension = ".ods" filename = "example" self._convert(filename=filename, extension=extension, converter=self.converter) def test_convert_xls(self) -> None: - """ - Tests conversion from .xls extension - """ extension = ".xls" filename = "example" self._convert(filename=filename, extension=extension, converter=self.converter) diff --git a/tests/unit_tests/test_module_convertor_ppt.py b/tests/unit_tests/test_module_converter_ppt.py similarity index 90% rename from tests/unit_tests/test_module_convertor_ppt.py rename to tests/unit_tests/test_module_converter_ppt.py index bd25ffbe..9c97515f 100644 --- a/tests/unit_tests/test_module_convertor_ppt.py +++ b/tests/unit_tests/test_module_converter_ppt.py @@ -5,12 +5,12 @@ from tests.unit_tests.abstract_converter_test import AbstractConverterTest -class TestDocxConverter(AbstractConverterTest): +class TestPPTXConverter(AbstractConverterTest): path = os.path.join(AbstractConverterTest.path, "pptx") converter = PptxConverter(config={"need_content_analysis": True}) - def test_convert_broken(self) -> None: + def test_convert_broken_file(self) -> None: extension = ".odp" filename = "broken" with self.assertRaises(ConversionException): diff --git a/tests/unit_tests/test_module_convertor_txt.py b/tests/unit_tests/test_module_converter_txt.py similarity index 100% rename from tests/unit_tests/test_module_convertor_txt.py rename to tests/unit_tests/test_module_converter_txt.py diff --git a/tests/unit_tests/test_module_font_classifier.py b/tests/unit_tests/test_module_font_classifier.py index 488da7eb..a14365c2 100644 --- a/tests/unit_tests/test_module_font_classifier.py +++ b/tests/unit_tests/test_module_font_classifier.py @@ -1,7 +1,7 @@ import os import unittest -from PIL import Image +import cv2 from dedoc.data_structures.bbox import BBox from dedoc.readers.pdf_reader.data_classes.page_with_bboxes import PageWithBBox @@ -11,18 +11,12 @@ class TestFontClassifier(unittest.TestCase): - """ - Class with implemented tests for font type classifier - """ data_directory_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "scanned")) path_model = os.path.abspath(os.path.join(get_test_config()["resources_path"], "font_classifier.pth")) classifier = FontTypeClassifier(path_model) - def get_page(self, filename) -> PageWithBBox: - """ - Method for getting page with bboxes from single image - """ - image = Image.open(os.path.join(self.data_directory_path, filename)) + def get_page_with_bbox(self, filename) -> PageWithBBox: + image = cv2.imread(os.path.join(self.data_directory_path, filename)) bbox_1 = TextWithBBox(bbox=BBox(10, 20, 11, 23), page_num=0, text="str", line_num=0) bbox_2 = TextWithBBox(bbox=BBox(20, 30, 11, 23), page_num=0, text="rts", line_num=1) @@ -30,11 +24,11 @@ def get_page(self, filename) -> PageWithBBox: return PageWithBBox(image=image, bboxes=bboxes, page_num=0) - def test__page2tensor(self) -> None: + def test_page_with_bbox_converted_to_tensor(self) -> None: """ - Test for font classifier output tensor shape + Tests font classifier output tensor shape correctness """ - page = self.get_page(filename="orient_1.png") + page = self.get_page_with_bbox(filename="orient_1.png") tensor = FontTypeClassifier._page2tensor(page=page) bbox_num, channels, height, width = tensor.shape self.assertEqual(2, bbox_num) @@ -42,11 +36,8 @@ def test__page2tensor(self) -> None: self.assertEqual(15, height) self.assertEqual(300, width) - def test__get_model_predictions(self) -> None: - """ - Test for font classifier predictions - """ - page = self.get_page(filename="orient_1.png") + def test_get_model_predictions(self) -> None: + page = self.get_page_with_bbox(filename="orient_1.png") predictions = self.classifier._get_model_predictions(page) self.assertEqual(predictions.shape[0], 2) self.assertEqual(len(predictions.shape), 2) diff --git a/tests/unit_tests/test_module_scan_rotator.py b/tests/unit_tests/test_module_scan_rotator.py index fb8d7216..42633ead 100644 --- a/tests/unit_tests/test_module_scan_rotator.py +++ b/tests/unit_tests/test_module_scan_rotator.py @@ -13,7 +13,7 @@ def _get_abs_path(self, file_name: str) -> str: data_directory_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data")) return os.path.join(data_directory_path, "scan_rotator", file_name) - def test_short_lines(self) -> None: + def test_documents_with_short_lines(self) -> None: for i in range(1, 6): file_name = f"short_lines-{i}.png" img = cv2.imread(self._get_abs_path(file_name))