Fix Chinese and Japanese language detection logic

Refactor code to correctly increment language counters and update logic for handling uncertainty between Chinese and Japanese languages. Adjust accuracy reports to reflect updated detection accuracy metrics.
pemistahl · Sep 26, 2024 · e3fee27 · e3fee27
1 parent 951c720
commit e3fee27
Show file tree

Hide file tree

Showing 4 changed files with 56 additions and 41 deletions.
diff --git a/accuracy-reports/aggregated-accuracy-values.csv b/accuracy-reports/aggregated-accuracy-values.csv
@@ -11,7 +11,7 @@ Bokmal,NaN,NaN,NaN,NaN,49,24,44,80,NaN,NaN,NaN,NaN,49,27,47,74,58,38,58,76
 Bosnian,18,4,15,36,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,29,22,28,36,34,29,34,40
 Bulgarian,65,31,72,92,69,44,67,96,NaN,NaN,NaN,NaN,77,56,80,96,86,70,91,99
 Catalan,37,4,29,79,51,29,45,80,NaN,NaN,NaN,NaN,58,33,60,81,70,50,73,86
-Chinese,33,NaN,2,98,100,100,100,100,97,93,98,100,73,61,58,99,95,89,96,100
+Chinese,33,NaN,2,98,100,100,100,100,97,93,98,100,100,100,100,100,95,89,96,100
 Croatian,51,33,46,72,61,34,54,94,NaN,NaN,NaN,NaN,59,36,57,85,72,53,74,90
 Czech,73,50,79,90,63,42,66,82,NaN,NaN,NaN,NaN,70,54,71,87,80,65,84,91
 Danish,59,26,56,94,53,31,45,84,NaN,NaN,NaN,NaN,70,45,70,95,81,61,83,97

diff --git a/accuracy-reports/lingua-low-accuracy/Chinese.txt b/accuracy-reports/lingua-low-accuracy/Chinese.txt
@@ -1,16 +1,16 @@
 ##### Chinese #####
 
->>> Accuracy on average: 73.12%
+>>> Accuracy on average: 100%
 
 >> Detection of 1000 single words (average length: 1 chars)
-Accuracy: 61.1%
-Erroneously classified as Unknown: 38.9%
+Accuracy: 100%
+Erroneously classified as 
 
 >> Detection of 1000 word pairs (average length: 2 chars)
-Accuracy: 58.4%
-Erroneously classified as Unknown: 41.6%
+Accuracy: 100%
+Erroneously classified as 
 
 >> Detection of 729 sentences (average length: 48 chars)
-Accuracy: 99.86%
-Erroneously classified as Unknown: 0.13%
+Accuracy: 100%
+Erroneously classified as 
 
diff --git a/src/detector.rs b/src/detector.rs
@@ -813,13 +813,7 @@ impl LanguageDetector {
                 }
 
                 if !is_match {
-                    if cfg!(feature = "chinese") && Alphabet::Han.matches_char(character) {
-                        self.increment_counter(
-                            &mut word_language_counts,
-                            Language::from_str("Chinese").unwrap(),
-                            1,
-                        );
-                    }
+
                     if cfg!(feature = "japanese") //we need to test for both and later guess at which one it is
                         && JAPANESE_CHARACTER_SET.is_char_match(character)
                     {
@@ -828,6 +822,12 @@ impl LanguageDetector {
                             Language::from_str("Japanese").unwrap(),
                             1,
                         );
+                    }                    if cfg!(feature = "chinese") && Alphabet::Han.matches_char(character) {
+                        self.increment_counter(
+                            &mut word_language_counts,
+                            Language::from_str("Chinese").unwrap(),
+                            1,
+                        );
                     } else if Alphabet::Latin.matches_char(character)
                         || Alphabet::Cyrillic.matches_char(character)
                         || Alphabet::Devanagari.matches_char(character)
@@ -905,13 +905,28 @@ impl LanguageDetector {
         if total_language_counts.len() == 2
             && cfg!(feature = "chinese")
             && cfg!(feature = "japanese")
-            && total_language_counts.contains_key(&Some(Language::from_str("Chinese").unwrap()))
-            && total_language_counts.contains_key(&Some(Language::from_str("Japanese").unwrap()))
-            &&cjk_lang_uncertainty as f32/words.len() as f32>= cjk_lang_uncertainty_max_ratio
-            && self.is_low_accuracy_mode_enabled{
-                return None;
+            && total_language_counts.contains_key(&Some(Language::Chinese))
+            && total_language_counts.contains_key(&Some(Language::Japanese))
+            && (cjk_lang_uncertainty as f32 / words.len() as f32) >= cjk_lang_uncertainty_max_ratio
+            && self.is_low_accuracy_mode_enabled
+        {
+            // Retrieve the counts for Chinese and Japanese languages
+            let chinese_count = *total_language_counts
+                .get(&Some(Language::Chinese))
+                .unwrap_or(&0);
+            let japanese_count = *total_language_counts
+                .get(&Some(Language::Japanese))
+                .unwrap_or(&0);
+            // Compare the counts and return the language with the higher count
+            if chinese_count >= japanese_count {
+                return Some(Language::Chinese);
+            } else {
+                return Some(Language::Japanese);
+            }
         }
 
+
+
         let sorted_total_language_counts = total_language_counts
             .into_iter()
             .sorted_by(|(_, first_count), (_, second_count)| second_count.cmp(first_count))

diff --git a/src/script.rs b/src/script.rs
@@ -701,27 +701,27 @@ pub const GURMUKHI: &[(char, char)] = &[
 ];
 
 pub const HAN: &[(char, char)] = &[
-    ('⺀', '⺙'),
-    ('⺛', '⻳'),
-    ('⼀', '⿕'),
-    ('々', '々'),
-    ('〇', '〇'),
-    ('〡', '〩'),
-    ('〸', '〻'),
-    ('㐀', '䶿'),
-    ('一', '鿿'),
-    ('豈', '舘'),
-    ('並', '龎'),
-    ('𖿢', '𖿣'),
-    ('𖿰', '𖿱'),
-    ('𠀀', '𪛟'),
-    ('𪜀', '𫜹'),
-    ('𫝀', '𫠝'),
-    ('𫠠', '𬺡'),
-    ('𬺰', '𮯠'),
-    ('丽', '𪘀'),
-    ('𰀀', '𱍊'),
-    ('𱍐', '𲎯'),
+    ('⺀', '⺙'),   // U+2E80 to U+2E99: Radical symbols
+    ('⺛', '⻳'),   // U+2E9B to U+2EF3: Additional radicals
+    ('⼀', '⿕'),   // U+2F00 to U+2FD5: Kangxi radicals
+    ('々', '々'),   // U+3005: Ideographic iteration mark
+    ('〇', '〇'),   // U+3007: Ideographic number zero
+    ('〡', '〩'),   // U+3021 to U+3029: Suzhou numerals
+    ('〸', '〻'),   // U+3038 to U+303B: Hangzhou numerals and iteration marks
+    ('㐀', '䶿'),   // U+3400 to U+4DBF: CJK Unified Ideographs Extension A
+    ('一', '鿿'),   // U+4E00 to U+9FFF: CJK Unified Ideographs
+    ('豈', '舘'),   // U+F900 to U+FA6D: CJK Compatibility Ideographs
+    ('並', '龎'),   // U+FA70 to U+FAD9: Additional compatibility ideographs
+    ('𖿢', '𖿣'), // U+16FE2 to U+16FE3: Ideographic symbols and punctuation
+    ('𖿰', '𖿱'), // U+16FF0 to U+16FF1: Vietnamese alternate reading marks
+    ('𠀀', '𪛟'), // U+20000 to U+2A6DF: CJK Unified Ideographs Extension B
+    ('𪜀', '𫜹'), // U+2A700 to U+2B739: CJK Unified Ideographs Extension C
+    ('𫝀', '𫠝'), // U+2B740 to U+2B81D: CJK Unified Ideographs Extension D
+    ('𫠠', '𬺡'), // U+2B820 to U+2CEA1: CJK Unified Ideographs Extension E
+    ('𬺰', '𮯠'), // U+2CEB0 to U+2EBE0: CJK Unified Ideographs Extension F
+    ('丽', '𪘀'), // U+2F800 to U+2FA1D: CJK Compatibility Ideographs Supplement
+    ('𰀀', '𱍊'), // U+30000 to U+3134A: CJK Unified Ideographs Extension G
+    ('𱍐', '𲎯'), // U+31350 to U+323AF: CJK Unified Ideographs Extension H
 ];
 
 pub const HANGUL: &[(char, char)] = &[