diff --git a/accuracy-reports/aggregated-accuracy-values.csv b/accuracy-reports/aggregated-accuracy-values.csv index d21fa41d..17651e8e 100644 --- a/accuracy-reports/aggregated-accuracy-values.csv +++ b/accuracy-reports/aggregated-accuracy-values.csv @@ -11,7 +11,7 @@ Bokmal,NaN,NaN,NaN,NaN,49,24,44,80,NaN,NaN,NaN,NaN,49,27,47,74,58,38,58,76 Bosnian,18,4,15,36,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,29,22,28,36,34,29,34,40 Bulgarian,65,31,72,92,69,44,67,96,NaN,NaN,NaN,NaN,77,56,80,96,86,70,91,99 Catalan,37,4,29,79,51,29,45,80,NaN,NaN,NaN,NaN,58,33,60,81,70,50,73,86 -Chinese,33,NaN,2,98,100,100,100,100,97,93,98,100,73,61,58,99,95,89,96,100 +Chinese,33,NaN,2,98,100,100,100,100,97,93,98,100,100,100,100,100,95,89,96,100 Croatian,51,33,46,72,61,34,54,94,NaN,NaN,NaN,NaN,59,36,57,85,72,53,74,90 Czech,73,50,79,90,63,42,66,82,NaN,NaN,NaN,NaN,70,54,71,87,80,65,84,91 Danish,59,26,56,94,53,31,45,84,NaN,NaN,NaN,NaN,70,45,70,95,81,61,83,97 diff --git a/accuracy-reports/lingua-low-accuracy/Chinese.txt b/accuracy-reports/lingua-low-accuracy/Chinese.txt index 422ab3c0..ef70daea 100644 --- a/accuracy-reports/lingua-low-accuracy/Chinese.txt +++ b/accuracy-reports/lingua-low-accuracy/Chinese.txt @@ -1,16 +1,16 @@ ##### Chinese ##### ->>> Accuracy on average: 73.12% +>>> Accuracy on average: 100% >> Detection of 1000 single words (average length: 1 chars) -Accuracy: 61.1% -Erroneously classified as Unknown: 38.9% +Accuracy: 100% +Erroneously classified as >> Detection of 1000 word pairs (average length: 2 chars) -Accuracy: 58.4% -Erroneously classified as Unknown: 41.6% +Accuracy: 100% +Erroneously classified as >> Detection of 729 sentences (average length: 48 chars) -Accuracy: 99.86% -Erroneously classified as Unknown: 0.13% +Accuracy: 100% +Erroneously classified as diff --git a/src/detector.rs b/src/detector.rs index 4cf384ef..e2052fa5 100644 --- a/src/detector.rs +++ b/src/detector.rs @@ -813,13 +813,7 @@ impl LanguageDetector { } if !is_match { - if cfg!(feature = "chinese") && 
Alphabet::Han.matches_char(character) { - self.increment_counter( - &mut word_language_counts, - Language::from_str("Chinese").unwrap(), - 1, - ); - } + if cfg!(feature = "japanese") //we need to test for both and later guess at which one it is && JAPANESE_CHARACTER_SET.is_char_match(character) { @@ -828,6 +822,12 @@ impl LanguageDetector { Language::from_str("Japanese").unwrap(), 1, ); + } if cfg!(feature = "chinese") && Alphabet::Han.matches_char(character) { + self.increment_counter( + &mut word_language_counts, + Language::from_str("Chinese").unwrap(), + 1, + ); } else if Alphabet::Latin.matches_char(character) || Alphabet::Cyrillic.matches_char(character) || Alphabet::Devanagari.matches_char(character) @@ -905,13 +905,28 @@ impl LanguageDetector { if total_language_counts.len() == 2 && cfg!(feature = "chinese") && cfg!(feature = "japanese") - && total_language_counts.contains_key(&Some(Language::from_str("Chinese").unwrap())) - && total_language_counts.contains_key(&Some(Language::from_str("Japanese").unwrap())) - &&cjk_lang_uncertainty as f32/words.len() as f32>= cjk_lang_uncertainty_max_ratio - && self.is_low_accuracy_mode_enabled{ - return None; + && total_language_counts.contains_key(&Some(Language::Chinese)) + && total_language_counts.contains_key(&Some(Language::Japanese)) + && (cjk_lang_uncertainty as f32 / words.len() as f32) >= cjk_lang_uncertainty_max_ratio + && self.is_low_accuracy_mode_enabled + { + // Retrieve the counts for Chinese and Japanese languages + let chinese_count = *total_language_counts + .get(&Some(Language::Chinese)) + .unwrap_or(&0); + let japanese_count = *total_language_counts + .get(&Some(Language::Japanese)) + .unwrap_or(&0); + // Compare the counts and return the language with the higher count + if chinese_count >= japanese_count { + return Some(Language::Chinese); + } else { + return Some(Language::Japanese); + } } + + let sorted_total_language_counts = total_language_counts .into_iter() .sorted_by(|(_, first_count), (_, 
second_count)| second_count.cmp(first_count)) diff --git a/src/script.rs b/src/script.rs index 5de3443d..6e9c4f3d 100644 --- a/src/script.rs +++ b/src/script.rs @@ -701,27 +701,27 @@ pub const GURMUKHI: &[(char, char)] = &[ ]; pub const HAN: &[(char, char)] = &[ - ('⺀', '⺙'), - ('⺛', '⻳'), - ('⼀', '⿕'), - ('々', '々'), - ('〇', '〇'), - ('〡', '〩'), - ('〸', '〻'), - ('㐀', '䶿'), - ('一', '鿿'), - ('豈', '舘'), - ('並', '龎'), - ('𖿢', '𖿣'), - ('𖿰', '𖿱'), - ('𠀀', '𪛟'), - ('𪜀', '𫜹'), - ('𫝀', '𫠝'), - ('𫠠', '𬺡'), - ('𬺰', '𮯠'), - ('丽', '𪘀'), - ('𰀀', '𱍊'), - ('𱍐', '𲎯'), + ('⺀', '⺙'), // U+2E80 to U+2E99: CJK Radicals Supplement + ('⺛', '⻳'), // U+2E9B to U+2EF3: CJK Radicals Supplement (continued; U+2E9A is unassigned) + ('⼀', '⿕'), // U+2F00 to U+2FD5: Kangxi Radicals + ('々', '々'), // U+3005: Ideographic iteration mark + ('〇', '〇'), // U+3007: Ideographic number zero + ('〡', '〩'), // U+3021 to U+3029: Hangzhou (Suzhou) numerals one to nine + ('〸', '〻'), // U+3038 to U+303B: Hangzhou (Suzhou) numerals ten, twenty, thirty and the vertical ideographic iteration mark + ('㐀', '䶿'), // U+3400 to U+4DBF: CJK Unified Ideographs Extension A + ('一', '鿿'), // U+4E00 to U+9FFF: CJK Unified Ideographs + ('豈', '舘'), // U+F900 to U+FA6D: CJK Compatibility Ideographs + ('並', '龎'), // U+FA70 to U+FAD9: CJK Compatibility Ideographs (continued) + ('𖿢', '𖿣'), // U+16FE2 to U+16FE3: Old Chinese hook mark and iteration mark (Ideographic Symbols and Punctuation block) + ('𖿰', '𖿱'), // U+16FF0 to U+16FF1: Vietnamese alternate reading marks (Ideographic Symbols and Punctuation block) + ('𠀀', '𪛟'), // U+20000 to U+2A6DF: CJK Unified Ideographs Extension B + ('𪜀', '𫜹'), // U+2A700 to U+2B739: CJK Unified Ideographs Extension C + ('𫝀', '𫠝'), // U+2B740 to U+2B81D: CJK Unified Ideographs Extension D + ('𫠠', '𬺡'), // U+2B820 to U+2CEA1: CJK Unified Ideographs Extension E + ('𬺰', '𮯠'), // U+2CEB0 to U+2EBE0: CJK Unified Ideographs Extension F + ('丽', '𪘀'), // U+2F800 to U+2FA1D: CJK Compatibility Ideographs Supplement + ('𰀀', '𱍊'), // U+30000 to U+3134A: CJK Unified Ideographs Extension G + ('𱍐', '𲎯'), // U+31350 to U+323AF: CJK Unified Ideographs Extension H ]; pub const HANGUL: &[(char, char)] = &[