-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c505eaa
commit 4ee4791
Showing
19 changed files
with
608 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
37 changes: 35 additions & 2 deletions
37
dedoc/structure_extractors/patterns/bracket_list_pattern.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,44 @@ | ||
from typing import Optional | ||
from typing import Optional, Union | ||
|
||
from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix | ||
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern | ||
|
||
|
||
class BracketListPattern(RegexpPattern): | ||
""" | ||
Pattern for matching numbered lists with brackets, e.g. | ||
:: | ||
1) first element | ||
2) second element | ||
Example of library usage: | ||
.. code-block:: python | ||
from dedoc.structure_extractors import DefaultStructureExtractor | ||
from dedoc.structure_extractors.patterns import BracketListPattern | ||
reader = ... | ||
structure_extractor = DefaultStructureExtractor() | ||
patterns = [BracketListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] | ||
document = reader.read(file_path=file_path) | ||
document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) | ||
Example of API usage: | ||
.. code-block:: python | ||
import requests | ||
patterns = [{"name": "bracket_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] | ||
parameters = {"patterns": str(patterns)} | ||
with open(file_path, "rb") as file: | ||
files = {"file": (file_name, file)} | ||
r = requests.post("http://localhost:1231/upload", files=files, data=parameters) | ||
""" | ||
_name = "bracket_list" | ||
|
||
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None: | ||
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: | ||
super().__init__(regexp=BracketPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) |
43 changes: 41 additions & 2 deletions
43
dedoc/structure_extractors/patterns/bracket_roman_list_pattern.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,50 @@ | ||
from typing import Optional | ||
from typing import Optional, Union | ||
|
||
from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_roman_prefix import BracketRomanPrefix | ||
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern | ||
|
||
|
||
class BracketRomanListPattern(RegexpPattern): | ||
""" | ||
Pattern for matching lists numbered with Roman numerals and brackets, e.g. | ||
:: | ||
i) first item | ||
ii) second item | ||
iii) third item | ||
iv) fourth item | ||
.. note:: | ||
The pattern is case-insensitive (lowercase and uppercase letters are not distinguished). | ||
Example of library usage: | ||
.. code-block:: python | ||
from dedoc.structure_extractors import DefaultStructureExtractor | ||
from dedoc.structure_extractors.patterns import BracketRomanListPattern | ||
reader = ... | ||
structure_extractor = DefaultStructureExtractor() | ||
patterns = [BracketRomanListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] | ||
document = reader.read(file_path=file_path) | ||
document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) | ||
Example of API usage: | ||
.. code-block:: python | ||
import requests | ||
patterns = [{"name": "bracket_roman_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] | ||
parameters = {"patterns": str(patterns)} | ||
with open(file_path, "rb") as file: | ||
files = {"file": (file_name, file)} | ||
r = requests.post("http://localhost:1231/upload", files=files, data=parameters) | ||
""" | ||
_name = "bracket_roman_list" | ||
|
||
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None: | ||
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: | ||
super().__init__(regexp=BracketRomanPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) |
39 changes: 37 additions & 2 deletions
39
dedoc/structure_extractors/patterns/bullet_list_pattern.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,46 @@ | ||
from typing import Optional | ||
from typing import Optional, Union | ||
|
||
from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix | ||
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern | ||
|
||
|
||
class BulletListPattern(RegexpPattern): | ||
""" | ||
Pattern for matching bulleted lists, e.g. | ||
:: | ||
- first item | ||
- second item | ||
or with other bullet markers ``-, —, −, –, ®, ., •, ,, ‚, ©, ⎯, °, *, >, ●, ♣, ①, ▪, *, +``. | ||
Example of library usage: | ||
.. code-block:: python | ||
from dedoc.structure_extractors import DefaultStructureExtractor | ||
from dedoc.structure_extractors.patterns import BulletListPattern | ||
reader = ... | ||
structure_extractor = DefaultStructureExtractor() | ||
patterns = [BulletListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] | ||
document = reader.read(file_path=file_path) | ||
document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) | ||
Example of API usage: | ||
.. code-block:: python | ||
import requests | ||
patterns = [{"name": "bullet_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] | ||
parameters = {"patterns": str(patterns)} | ||
with open(file_path, "rb") as file: | ||
files = {"file": (file_name, file)} | ||
r = requests.post("http://localhost:1231/upload", files=files, data=parameters) | ||
""" | ||
_name = "bullet_list" | ||
|
||
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None: | ||
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: | ||
super().__init__(regexp=BulletPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
49 changes: 47 additions & 2 deletions
49
dedoc/structure_extractors/patterns/letter_list_pattern.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,56 @@ | ||
from typing import Optional | ||
from typing import Optional, Union | ||
|
||
from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix | ||
from dedoc.structure_extractors.patterns.regexp_pattern import RegexpPattern | ||
|
||
|
||
class LetterListPattern(RegexpPattern): | ||
""" | ||
Pattern for matching lists with letters and brackets, e.g. | ||
:: | ||
a) first element | ||
b) second element | ||
or (example for Armenian language) | ||
:: | ||
ա) տեղաբաշխել | ||
բ) Հայաստանի Հանրապետության | ||
գ) սահմանապահ վերակարգերի | ||
.. note:: | ||
The pattern is case-insensitive (lowercase and uppercase letters are not distinguished). | ||
Example of library usage: | ||
.. code-block:: python | ||
from dedoc.structure_extractors import DefaultStructureExtractor | ||
from dedoc.structure_extractors.patterns import LetterListPattern | ||
reader = ... | ||
structure_extractor = DefaultStructureExtractor() | ||
patterns = [LetterListPattern(line_type="list_item", level_1=1, level_2=1, can_be_multiline=False)] | ||
document = reader.read(file_path=file_path) | ||
document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) | ||
Example of API usage: | ||
.. code-block:: python | ||
import requests | ||
patterns = [{"name": "letter_list", "line_type": "list_item", "level_1": 1, "level_2": 1, "can_be_multiline": "false"}] | ||
parameters = {"patterns": str(patterns)} | ||
with open(file_path, "rb") as file: | ||
files = {"file": (file_name, file)} | ||
r = requests.post("http://localhost:1231/upload", files=files, data=parameters) | ||
""" | ||
_name = "letter_list" | ||
|
||
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[bool or str] = None) -> None: | ||
def __init__(self, line_type: str, level_1: int, level_2: int, can_be_multiline: Optional[Union[bool, str]] = None) -> None: | ||
super().__init__(regexp=AnyLetterPrefix.regexp, line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline) |
Oops, something went wrong.