Merge pull request #14 from R1j1t/dev
Allow user to add vocab file
R1j1t authored Jul 20, 2020
2 parents 3499393 + d6b4674 commit 1765bd6
Showing 6 changed files with 7,949 additions and 14 deletions.
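
This PR adds an optional `vocab_path` argument to `ContextualSpellCheck.__init__`, so users can extend the model's vocabulary with their own words. A minimal usage sketch based on the new signature (the file name `my_vocab.txt` is a hypothetical example; the file is expected to contain one token per line, UTF-8 encoded):

```python
from contextualSpellCheck.contextualSpellCheck import ContextualSpellCheck

# "my_vocab.txt" is a hypothetical user-supplied file: one token per line
# (UTF-8), e.g. domain-specific words missing from BERT's default vocab.
checker = ContextualSpellCheck(vocab_path="my_vocab.txt", debug=True)
```

With `debug=True`, the merged vocabulary is also written to `contextualSpellCheck/tests/debugFile.txt`, hence the new `.gitignore` entry below.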
1 change: 1 addition & 0 deletions .gitignore
@@ -141,3 +141,4 @@ dmypy.json
peter's code/
*.pptx
*.ipynb
+contextualSpellCheck/tests/debugFile.txt
2 changes: 1 addition & 1 deletion README.md
@@ -6,7 +6,7 @@ Contextual word checker for better suggestions
[![license](https://img.shields.io/github/license/r1j1t/contextualSpellCheck)](https://github.com/R1j1t/contextualSpellCheck/blob/master/LICENSE)
[![PyPI](https://img.shields.io/pypi/v/contextualSpellCheck?color=green)](https://pypi.org/project/contextualSpellCheck/)
[![Python-Version](https://img.shields.io/badge/Python-3.6+-green)](https://github.com/R1j1t/contextualSpellCheck#install)
-[![Downloads](https://pepy.tech/badge/contextualspellcheck)](https://pepy.tech/project/contextualspellcheck)
+[![Downloads](https://pepy.tech/badge/contextualspellcheck/week)](https://pepy.tech/project/contextualspellcheck/week)
[![GitHub contributors](https://img.shields.io/github/contributors/r1j1t/contextualSpellCheck)](https://github.com/R1j1t/contextualSpellCheck/graphs/contributors)
[![Help Wanted](https://img.shields.io/badge/Help%20Wanted-Task%20List-violet)](https://github.com/R1j1t/contextualSpellCheck#task-list)

67 changes: 59 additions & 8 deletions contextualSpellCheck/contextualSpellCheck.py
@@ -19,16 +19,67 @@ class ContextualSpellCheck(object):
    name = "contextual spellchecker"

    def __init__(self, vocab_path="", debug=False, performance=False):
        """Create an object of this class. It does not require any special
        parameters.

        Args:
            vocab_path (str, optional): Path of the vocabulary file to be used by the model. Defaults to "".
            debug (bool, optional): Print logs as the data flows through the class. Defaults to False.
            performance (bool, optional): Print the time taken by individual steps in spell check. Defaults to False.
        """
        if (
            not isinstance(vocab_path, str)
            or not isinstance(debug, bool)
            or not isinstance(performance, bool)
        ):
            raise TypeError(
                "Please check the datatypes provided. vocab_path should be str; debug and performance should be bool"
            )

        if vocab_path != "":
            try:
                # First open(): read the user-supplied words to add to the vocab
                with open(vocab_path, encoding="utf8") as f:
                    # to drop '[unusedXX]' entries from the vocab, use:
                    # words = [line.rstrip() for line in f if not line.startswith('[unused')]
                    words = [line.strip() for line in f]

                # The code below adds the necessary tokens: numbers, punctuation,
                # and tokenizer-specific entries like [PAD]/[unused0]/##M
                currentPath = os.path.dirname(__file__)
                vocab_path = os.path.join(currentPath, "data", "vocab.txt")
                extraToken = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
                words.extend(extraToken)

                with open(vocab_path, encoding="utf8") as f:
                    # to drop '[unusedXX]' entries from the vocab, use:
                    # words = [line.rstrip() for line in f if not line.startswith('[unused')]
                    for line in f:
                        extraToken = line.strip()
                        if extraToken.startswith("[unused"):
                            words.append(extraToken)
                        elif extraToken.startswith("##"):
                            words.append(extraToken)
                        elif len(extraToken) == 1:
                            words.append(extraToken)
                if debug:
                    debugFilePath = os.path.join(currentPath, "tests", "debugFile.txt")
                    with open(debugFilePath, "w+") as newFile:
                        newFile.write("\n".join(words))
                    print("Final vocab at " + debugFilePath)

            except Exception as e:
                print(e)
                warnings.warn("Using default vocab")
                vocab_path = ""
                words = []
        if vocab_path == "":
            currentPath = os.path.dirname(__file__)
            vocab_path = os.path.join(currentPath, "data/vocab.txt")
            # self.nlp = spacy.load(
            #     "en_core_web_sm", disable=["tagger", "parser"]
            # )  # using default tokeniser with NER
-            with open(vocab_path) as f:
-                # to drop '[unusedXX]' entries from the vocab, use:
-                # words = [line.rstrip() for line in f if not line.startswith('[unused')]
-                words = [line.rstrip() for line in f]
+            with open(vocab_path, encoding="utf8") as f:
+                # to drop '[unusedXX]' entries from the vocab, use:
+                # words = [line.rstrip() for line in f if not line.startswith('[unused')]
+                words = [line.strip() for line in f]

        self.vocab = Vocab(strings=words)
        self.BertTokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        self.BertModel = AutoModelWithLMHead.from_pretrained("bert-base-cased")
@@ -448,7 +499,7 @@ def doc_outcome_spellCheck(self, doc):
    doc = nlp(u"Income was $9.4 milion compared to the prior year of $2.7 milion.")

    print("=" * 20, "Doc Extension Test", "=" * 20)
-    print(doc._.outcome_spellCheck, "\n")
+    print(doc._.outcome_spellCheck)

    print(doc._.contextual_spellCheck)
    print(doc._.performed_spellCheck)
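
Taken together, the `__init__` changes merge the user's words with the pieces of the default BERT vocab the tokenizer still needs: the special tokens, the `[unusedXX]` slots, the `##` subword pieces, and single characters (numbers/punctuation). A standalone sketch of that merge logic, with illustrative names that are not part of the package:

```python
def merge_vocab(user_words, default_vocab_lines):
    # Special tokens the BERT tokenizer always needs (added in __init__ above).
    words = list(user_words) + ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    # From the default vocab, keep only [unusedXX] slots, ##-subword pieces,
    # and single characters; the user's file replaces the whole-word entries.
    for token in (line.strip() for line in default_vocab_lines):
        if (
            token.startswith("[unused")
            or token.startswith("##")
            or len(token) == 1
        ):
            words.append(token)
    return words

# Toy example: two user words plus a four-line default vocab.
print(merge_vocab(["spaCy", "blockchain"], ["[unused0]", "##ing", "?", "apple"]))
# ['spaCy', 'blockchain', '[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '[unused0]', '##ing', '?']
```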