Skip to content

Commit

Permalink
Fix Chinese and Japanese language detection logic
Browse files Browse the repository at this point in the history
Refactor code to correctly increment language counters and update logic for handling uncertainty between Chinese and Japanese languages. Adjust accuracy reports to reflect updated detection accuracy metrics.
  • Loading branch information
michaelbennieUFL committed Sep 26, 2024
1 parent 951c720 commit e3fee27
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 41 deletions.
2 changes: 1 addition & 1 deletion accuracy-reports/aggregated-accuracy-values.csv
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Bokmal,NaN,NaN,NaN,NaN,49,24,44,80,NaN,NaN,NaN,NaN,49,27,47,74,58,38,58,76
Bosnian,18,4,15,36,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,29,22,28,36,34,29,34,40
Bulgarian,65,31,72,92,69,44,67,96,NaN,NaN,NaN,NaN,77,56,80,96,86,70,91,99
Catalan,37,4,29,79,51,29,45,80,NaN,NaN,NaN,NaN,58,33,60,81,70,50,73,86
Chinese,33,NaN,2,98,100,100,100,100,97,93,98,100,73,61,58,99,95,89,96,100
Chinese,33,NaN,2,98,100,100,100,100,97,93,98,100,100,100,100,100,95,89,96,100
Croatian,51,33,46,72,61,34,54,94,NaN,NaN,NaN,NaN,59,36,57,85,72,53,74,90
Czech,73,50,79,90,63,42,66,82,NaN,NaN,NaN,NaN,70,54,71,87,80,65,84,91
Danish,59,26,56,94,53,31,45,84,NaN,NaN,NaN,NaN,70,45,70,95,81,61,83,97
Expand Down
14 changes: 7 additions & 7 deletions accuracy-reports/lingua-low-accuracy/Chinese.txt
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
##### Chinese #####

>>> Accuracy on average: 73.12%
>>> Accuracy on average: 100%

>> Detection of 1000 single words (average length: 1 chars)
Accuracy: 61.1%
Erroneously classified as Unknown: 38.9%
Accuracy: 100%
Erroneously classified as

>> Detection of 1000 word pairs (average length: 2 chars)
Accuracy: 58.4%
Erroneously classified as Unknown: 41.6%
Accuracy: 100%
Erroneously classified as

>> Detection of 729 sentences (average length: 48 chars)
Accuracy: 99.86%
Erroneously classified as Unknown: 0.13%
Accuracy: 100%
Erroneously classified as

39 changes: 27 additions & 12 deletions src/detector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -813,13 +813,7 @@ impl LanguageDetector {
}

if !is_match {
if cfg!(feature = "chinese") && Alphabet::Han.matches_char(character) {
self.increment_counter(
&mut word_language_counts,
Language::from_str("Chinese").unwrap(),
1,
);
}

if cfg!(feature = "japanese") //we need to test for both and later guess at which one it is
&& JAPANESE_CHARACTER_SET.is_char_match(character)
{
Expand All @@ -828,6 +822,12 @@ impl LanguageDetector {
Language::from_str("Japanese").unwrap(),
1,
);
} if cfg!(feature = "chinese") && Alphabet::Han.matches_char(character) {
self.increment_counter(
&mut word_language_counts,
Language::from_str("Chinese").unwrap(),
1,
);
} else if Alphabet::Latin.matches_char(character)
|| Alphabet::Cyrillic.matches_char(character)
|| Alphabet::Devanagari.matches_char(character)
Expand Down Expand Up @@ -905,13 +905,28 @@ impl LanguageDetector {
if total_language_counts.len() == 2
&& cfg!(feature = "chinese")
&& cfg!(feature = "japanese")
&& total_language_counts.contains_key(&Some(Language::from_str("Chinese").unwrap()))
&& total_language_counts.contains_key(&Some(Language::from_str("Japanese").unwrap()))
&&cjk_lang_uncertainty as f32/words.len() as f32>= cjk_lang_uncertainty_max_ratio
&& self.is_low_accuracy_mode_enabled{
return None;
&& total_language_counts.contains_key(&Some(Language::Chinese))
&& total_language_counts.contains_key(&Some(Language::Japanese))
&& (cjk_lang_uncertainty as f32 / words.len() as f32) >= cjk_lang_uncertainty_max_ratio
&& self.is_low_accuracy_mode_enabled
{
// Retrieve the counts for Chinese and Japanese languages
let chinese_count = *total_language_counts
.get(&Some(Language::Chinese))
.unwrap_or(&0);
let japanese_count = *total_language_counts
.get(&Some(Language::Japanese))
.unwrap_or(&0);
// Compare the counts and return the language with the higher count
if chinese_count >= japanese_count {
return Some(Language::Chinese);
} else {
return Some(Language::Japanese);
}
}



let sorted_total_language_counts = total_language_counts
.into_iter()
.sorted_by(|(_, first_count), (_, second_count)| second_count.cmp(first_count))
Expand Down
42 changes: 21 additions & 21 deletions src/script.rs
Original file line number Diff line number Diff line change
Expand Up @@ -701,27 +701,27 @@ pub const GURMUKHI: &[(char, char)] = &[
];

/// Character ranges belonging to the Han script, stored as `(first, last)` pairs.
///
/// Single-character entries such as `('々', '々')` suggest both ends of each pair
/// are inclusive — TODO confirm against the code that consumes this table.
///
/// NOTE(review): every range is listed twice below, once bare and once with a
/// code-point comment. The two lists contain the same ranges, so this looks like
/// removed and added diff lines shown together rather than intentional
/// duplication — confirm against the actual file.
pub const HAN: &[(char, char)] = &[
('⺀', '⺙'),
('⺛', '⻳'),
('⼀', '⿕'),
('々', '々'),
('〇', '〇'),
('〡', '〩'),
('〸', '〻'),
('㐀', '䶿'),
('一', '鿿'),
('豈', '舘'),
('並', '龎'),
('𖿢', '𖿣'),
('𖿰', '𖿱'),
('𠀀', '𪛟'),
('𪜀', '𫜹'),
('𫝀', '𫠝'),
('𫠠', '𬺡'),
('𬺰', '𮯠'),
('丽', '𪘀'),
('𰀀', '𱍊'),
('𱍐', '𲎯'),
('⺀', '⺙'), // U+2E80 to U+2E99: CJK Radicals Supplement
('⺛', '⻳'), // U+2E9B to U+2EF3: CJK Radicals Supplement (continued; U+2E9A is unassigned)
('⼀', '⿕'), // U+2F00 to U+2FD5: Kangxi radicals
('々', '々'), // U+3005: Ideographic iteration mark
('〇', '〇'), // U+3007: Ideographic number zero
('〡', '〩'), // U+3021 to U+3029: Hangzhou (Suzhou) numerals one to nine
('〸', '〻'), // U+3038 to U+303B: Hangzhou numerals ten/twenty/thirty and vertical iteration mark
('㐀', '䶿'), // U+3400 to U+4DBF: CJK Unified Ideographs Extension A
('一', '鿿'), // U+4E00 to U+9FFF: CJK Unified Ideographs
('豈', '舘'), // U+F900 to U+FA6D: CJK Compatibility Ideographs
('並', '龎'), // U+FA70 to U+FAD9: CJK Compatibility Ideographs (continued)
('𖿢', '𖿣'), // U+16FE2 to U+16FE3: Old Chinese hook mark and iteration mark
('𖿰', '𖿱'), // U+16FF0 to U+16FF1: Vietnamese alternate reading marks
('𠀀', '𪛟'), // U+20000 to U+2A6DF: CJK Unified Ideographs Extension B
('𪜀', '𫜹'), // U+2A700 to U+2B739: CJK Unified Ideographs Extension C
('𫝀', '𫠝'), // U+2B740 to U+2B81D: CJK Unified Ideographs Extension D
('𫠠', '𬺡'), // U+2B820 to U+2CEA1: CJK Unified Ideographs Extension E
('𬺰', '𮯠'), // U+2CEB0 to U+2EBE0: CJK Unified Ideographs Extension F
('丽', '𪘀'), // U+2F800 to U+2FA1D: CJK Compatibility Ideographs Supplement
('𰀀', '𱍊'), // U+30000 to U+3134A: CJK Unified Ideographs Extension G
('𱍐', '𲎯'), // U+31350 to U+323AF: CJK Unified Ideographs Extension H
];

pub const HANGUL: &[(char, char)] = &[
Expand Down

0 comments on commit e3fee27

Please sign in to comment.