Skip to content

Commit

Permalink
Use unique ngrams to improve low accuracy mode (#235)
Browse the repository at this point in the history
  • Loading branch information
pemistahl committed Sep 20, 2024
1 parent 3a431e0 commit 16b3326
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 97 deletions.
4 changes: 3 additions & 1 deletion lingua/_constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
MULTIPLE_WHITESPACE: Pattern = regex.compile(r"\s+")
NUMBERS: Pattern = regex.compile(r"\p{N}")
PUNCTUATION: Pattern = regex.compile(r"\p{P}")
LETTERS: Pattern = regex.compile(r"\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\p{L}+")
# Matches one-or-more Devanagari letters as a unit, a single Han/Hangul/
# Hiragana/Katakana character, or a run of any other Unicode letters.
LETTERS: Pattern = regex.compile(
    r"\p{Deva}+|\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\p{L}+"
)
TOKENS_WITH_OPTIONAL_WHITESPACE = regex.compile(
r"\s*(?:\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|[\p{L}'-]+)[\p{N}\p{P}]*\s*"
)
Expand Down
191 changes: 103 additions & 88 deletions lingua/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@
_MOST_COMMON_TRIGRAM_MODELS: Dict[Language, FrozenSet[str]] = {}
_MOST_COMMON_QUADRIGRAM_MODELS: Dict[Language, FrozenSet[str]] = {}
_MOST_COMMON_FIVEGRAM_MODELS: Dict[Language, FrozenSet[str]] = {}
_LANGUAGES_WITH_UNIQUE_SCRIPT: FrozenSet[Language] = Language.all_with_unique_script()
_LANGUAGES_WITH_SINGLE_UNIQUE_SCRIPT: FrozenSet[Language] = (
Language.all_with_single_unique_script()
)
_HIGH_ACCURACY_MODE_MAX_TEXT_LENGTH = 120


Expand Down Expand Up @@ -103,7 +105,7 @@ def _collect_languages_with_unique_characters(
)


def _collect_one_language_alphabets(
def _collect_single_language_alphabets(
languages: FrozenSet[Language],
) -> Dict[_Alphabet, Language]:
return {
Expand Down Expand Up @@ -226,7 +228,7 @@ def _from(
languages_with_unique_characters = _collect_languages_with_unique_characters(
languages
)
one_language_alphabets = _collect_one_language_alphabets(languages)
one_language_alphabets = _collect_single_language_alphabets(languages)
detector = LanguageDetector(
languages,
minimum_relative_distance,
Expand Down Expand Up @@ -254,8 +256,10 @@ def _from(
if is_every_language_model_preloaded:
detector._preload_language_models()

if detector._is_built_from_one_language:
if is_built_from_one_language or is_low_accuracy_mode_enabled:
detector._preload_unique_ngram_models()

if is_built_from_one_language:
detector._preload_most_common_ngram_models()

return detector
Expand Down Expand Up @@ -449,31 +453,33 @@ def unload_language_models(self):
This helps to free allocated memory previously consumed by the models.
"""
if self._is_built_from_one_language:
for language in self._languages:
try:
for language in self._languages:
try:
self._trigram_language_models.pop(language)

if not self._is_low_accuracy_mode_enabled:
self._unigram_language_models.pop(language)
self._bigram_language_models.pop(language)
self._quadrigram_language_models.pop(language)
self._fivegram_language_models.pop(language)

if (
self._is_built_from_one_language
or self._is_low_accuracy_mode_enabled
):
self._unique_unigram_language_models.pop(language)
self._unique_bigram_language_models.pop(language)
self._unique_trigram_language_models.pop(language)
self._unique_quadrigram_language_models.pop(language)
self._unique_fivegram_language_models.pop(language)

if self._is_built_from_one_language:
self._most_common_unigram_language_models.pop(language)
self._most_common_bigram_language_models.pop(language)
self._most_common_trigram_language_models.pop(language)
self._most_common_quadrigram_language_models.pop(language)
self._most_common_fivegram_language_models.pop(language)
except KeyError:
pass

for language in self._languages:
try:
self._trigram_language_models.pop(language)

if not self._is_low_accuracy_mode_enabled:
self._unigram_language_models.pop(language)
self._bigram_language_models.pop(language)
self._quadrigram_language_models.pop(language)
self._fivegram_language_models.pop(language)
except KeyError:
pass

Expand Down Expand Up @@ -657,16 +663,19 @@ def compute_language_confidence_values(self, text: str) -> List[ConfidenceValue]

words = _split_text_into_words(text)
if len(words) == 0:
_sort_confidence_values(values)
return values

if self._is_built_from_one_language:
if self._is_built_from_one_language or self._is_low_accuracy_mode_enabled:
language_detected_by_ngrams = (
self._detect_language_with_unique_and_common_ngrams(words)
)
if language_detected_by_ngrams is not None:
values[0] = ConfidenceValue(language_detected_by_ngrams, 1.0)
return values
for i in range(len(values)):
if values[i].language == language_detected_by_ngrams:
values[i] = ConfidenceValue(language_detected_by_ngrams, 1.0)
break
_sort_confidence_values(values)
return values

language_detected_by_rules = self._detect_language_with_rules(words)

Expand All @@ -678,6 +687,9 @@ def compute_language_confidence_values(self, text: str) -> List[ConfidenceValue]
_sort_confidence_values(values)
return values

if self._is_built_from_one_language:
return values

filtered_languages = self._filter_languages_by_rules(words)

if len(filtered_languages) == 1:
Expand Down Expand Up @@ -767,85 +779,88 @@ def compute_language_confidence(self, text: str, language: Language) -> float:
def _detect_language_with_unique_and_common_ngrams(
self, words: List[str]
) -> Optional[Language]:
selected_language = next(iter(self._languages))

fivegrams = _create_ngrams(words, ngram_length=5)

if selected_language in self._unique_fivegram_language_models:
for fivegram in fivegrams:
if fivegram in self._unique_fivegram_language_models[selected_language]:
return selected_language

if selected_language in self._most_common_fivegram_language_models:
for fivegram in fivegrams:
if (
fivegram
in self._most_common_fivegram_language_models[selected_language]
):
return selected_language
for language in self._languages:
if language in self._unique_fivegram_language_models:
for fivegram in fivegrams:
if fivegram in self._unique_fivegram_language_models[language]:
return language

if (
self._is_built_from_one_language
and language in self._most_common_fivegram_language_models
):
for fivegram in fivegrams:
if fivegram in self._most_common_fivegram_language_models[language]:
return language

quadrigrams = _create_ngrams(words, ngram_length=4)

if selected_language in self._unique_quadrigram_language_models:
for quadrigram in quadrigrams:
if (
quadrigram
in self._unique_quadrigram_language_models[selected_language]
):
return selected_language

if selected_language in self._most_common_quadrigram_language_models:
for quadrigram in quadrigrams:
if (
quadrigram
in self._most_common_quadrigram_language_models[selected_language]
):
return selected_language
for language in self._languages:
if language in self._unique_quadrigram_language_models:
for quadrigram in quadrigrams:
if quadrigram in self._unique_quadrigram_language_models[language]:
return language

if (
self._is_built_from_one_language
and language in self._most_common_quadrigram_language_models
):
for quadrigram in quadrigrams:
if (
quadrigram
in self._most_common_quadrigram_language_models[language]
):
return language

trigrams = _create_ngrams(words, ngram_length=3)

if selected_language in self._unique_trigram_language_models:
for trigram in trigrams:
if trigram in self._unique_trigram_language_models[selected_language]:
return selected_language

if selected_language in self._most_common_trigram_language_models:
for trigram in trigrams:
if (
trigram
in self._most_common_trigram_language_models[selected_language]
):
return selected_language
for language in self._languages:
if language in self._unique_trigram_language_models:
for trigram in trigrams:
if trigram in self._unique_trigram_language_models[language]:
return language

if (
self._is_built_from_one_language
and language in self._most_common_trigram_language_models
):
for trigram in trigrams:
if trigram in self._most_common_trigram_language_models[language]:
return language

bigrams = _create_ngrams(words, ngram_length=2)

if selected_language in self._unique_bigram_language_models:
for bigram in bigrams:
if bigram in self._unique_bigram_language_models[selected_language]:
return selected_language

if (
selected_language in _LANGUAGES_WITH_UNIQUE_SCRIPT
or selected_language == Language.HINDI
or selected_language == Language.MARATHI
):
unigrams = _create_ngrams(words, ngram_length=1)
for language in self._languages:
if language in self._unique_bigram_language_models:
for bigram in bigrams:
if bigram in self._unique_bigram_language_models[language]:
return language

if (
language == Language.HINDI
or language == Language.MARATHI
or (language == Language.JAPANESE and self._is_built_from_one_language)
or language in _LANGUAGES_WITH_SINGLE_UNIQUE_SCRIPT
):
unigrams = _create_ngrams(words, ngram_length=1)

if selected_language in self._unique_unigram_language_models:
for unigram in unigrams:
if (
unigram
in self._unique_unigram_language_models[selected_language]
):
return selected_language
if language in self._unique_unigram_language_models:
for unigram in unigrams:
if unigram in self._unique_unigram_language_models[language]:
return language

if selected_language in self._most_common_unigram_language_models:
for unigram in unigrams:
if (
unigram
in self._most_common_unigram_language_models[selected_language]
):
return selected_language
if (
self._is_built_from_one_language
and language in self._most_common_unigram_language_models
):
for unigram in unigrams:
if (
unigram
in self._most_common_unigram_language_models[language]
):
return language

return None

Expand Down
13 changes: 10 additions & 3 deletions lingua/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,9 +421,16 @@ def all_with_latin_script(cls) -> FrozenSet["Language"]:
)

@classmethod
def all_with_unique_script(cls) -> FrozenSet["Language"]:
"""Return a set of all languages supporting a unique script."""
return frozenset(_Alphabet.all_supporting_single_language().values())
def all_with_single_unique_script(cls) -> FrozenSet["Language"]:
    """Return a set of all languages supporting a single unique script."""
    unique_alphabets = _Alphabet.all_supporting_single_language().keys()
    # A language qualifies when it has exactly one alphabet and that
    # alphabet is used by no other language.
    return frozenset(
        language
        for language in Language
        if len(language._alphabets) == 1
        # Short-circuit: the single alphabet is only extracted when it exists.
        and next(iter(language._alphabets)) in unique_alphabets
    )

@classmethod
def from_iso_code_639_1(cls, iso_code: IsoCode639_1) -> "Language":
Expand Down
4 changes: 2 additions & 2 deletions tests/test_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
_QUADRIGRAM_MODELS,
_FIVEGRAM_MODELS,
_collect_languages_with_unique_characters,
_collect_one_language_alphabets,
_collect_single_language_alphabets,
_split_text_into_words,
)
from lingua.language import Language
Expand Down Expand Up @@ -226,7 +226,7 @@ def customized_detector_for_english_and_german(
_languages_with_unique_characters=_collect_languages_with_unique_characters(
languages
),
_one_language_alphabets=_collect_one_language_alphabets(languages),
_one_language_alphabets=_collect_single_language_alphabets(languages),
_unigram_language_models=unigram_models,
_bigram_language_models=bigram_models,
_trigram_language_models=trigram_models,
Expand Down
5 changes: 2 additions & 3 deletions tests/test_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,8 @@ def test_languages_support_latin_script():
)


def test_languages_with_unique_script():
assert Language.all_with_unique_script() == frozenset(
def test_languages_with_single_unique_script():
assert Language.all_with_single_unique_script() == frozenset(
[
Language.ARMENIAN,
Language.BENGALI,
Expand All @@ -274,7 +274,6 @@ def test_languages_with_unique_script():
Language.PUNJABI,
Language.KOREAN,
Language.HEBREW,
Language.JAPANESE,
Language.TAMIL,
Language.TELUGU,
Language.THAI,
Expand Down

0 comments on commit 16b3326

Please sign in to comment.