From b7790efb82182f949a64d636570f9061c7b561ad Mon Sep 17 00:00:00 2001 From: mh-northlander Date: Fri, 30 Aug 2024 14:14:08 +0900 Subject: [PATCH] test content equality of rebuilt dictionaries --- .../dictionary/DictionaryPrinterTest.java | 102 ++++++++++++------ .../dictionary/DoubleArrayLexiconTest.java | 2 +- src/test/resources/dict/lex.csv | 5 +- 3 files changed, 73 insertions(+), 36 deletions(-) diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.java index 73eac134..da396594 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryPrinterTest.java @@ -26,6 +26,7 @@ import java.io.IOException; import java.io.PrintStream; import java.nio.file.Path; +import java.util.Arrays; import com.worksap.nlp.sudachi.TestDictionary; import com.worksap.nlp.sudachi.Utils; @@ -49,6 +50,14 @@ public void setUp() throws IOException { Utils.copyResource(folder, "/unk.def", "/dict/matrix.def"); } + private String wordInfoToString(int wordId, WordInfo wordInfo) { + return String.format("%d, %s, %d, %d, %s, %d, %s, %s, %s, %s, %s, %s", wordId, wordInfo.getSurface(), + wordInfo.getLength(), wordInfo.getPOSId(), wordInfo.getNormalizedForm(), + wordInfo.getDictionaryFormWordId(), wordInfo.getDictionaryForm(), wordInfo.getReadingForm(), + Arrays.toString(wordInfo.getAunitSplit()), Arrays.toString(wordInfo.getBunitSplit()), + Arrays.toString(wordInfo.getWordStructure()), Arrays.toString(wordInfo.getSynonymGoupIds())); + } + @Test public void printWithSystemDict() throws IOException { File inputFile = new File(temporaryFolder.getRoot(), "system.dic"); @@ -60,7 +69,7 @@ public void printWithSystemDict() throws IOException { printer.printEntries(); actuals = output.toString().split(System.lineSeparator()); } - assertThat(actuals.length, is(39)); + assertThat(actuals.length, is(40)); assertThat(actuals[0], is("た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,*")); } @@ -101,14 +110,13 @@ public void readGrammarWithInvalidFile() throws IOException { } @Test - public void rebuildAndReprintSystem() throws IOException { + public void rebuildSystem() throws IOException { File inputFile = new File(temporaryFolder.getRoot(), "system.dic"); String printed; - try (ByteArrayOutputStream output = new ByteArrayOutputStream(); - PrintStream ps = new PrintStream(output); - BinaryDictionary dict = new BinaryDictionary(inputFile.getPath())) { - DictionaryPrinter printer = new DictionaryPrinter(ps, dict, null); + BinaryDictionary original = new BinaryDictionary(inputFile.getPath()); + try (ByteArrayOutputStream output = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(output)) { + DictionaryPrinter printer = new DictionaryPrinter(ps, original, null); printer.printEntries(); printed = output.toString(); } @@ -123,33 +131,46 @@ public void rebuildAndReprintSystem() throws IOException { DictionaryBuilder.main(new String[] { "-o", rebuiltDict.getPath(), "-m", matrixFile.getPath(), "-d", "rebuild system dict", lexiconFile.getPath() }); - String[] reprinted; - try (ByteArrayOutputStream output = new ByteArrayOutputStream(); - PrintStream ps = new PrintStream(output); - BinaryDictionary dict = new BinaryDictionary(rebuiltDict.getPath())) { - DictionaryPrinter printer = new DictionaryPrinter(ps, dict, null); - printer.printEntries(); - reprinted = output.toString().split(System.lineSeparator()); + BinaryDictionary rebuilt = new BinaryDictionary(rebuiltDict.getPath()); + Long version = original.getDictionaryHeader().getVersion(); + assertThat(rebuilt.getDictionaryHeader().getVersion(), is(version)); + + if (DictionaryVersion.hasGrammar(version)) { + GrammarImpl grammarO = original.getGrammar(); + GrammarImpl grammarR = rebuilt.getGrammar(); + int originalPosSize = grammarO.getPartOfSpeechSize(); + assertThat(grammarR.getPartOfSpeechSize(), is(originalPosSize)); + for (short i = 0; i < originalPosSize; i++) { + assertThat(grammarR.getPartOfSpeechString(i), is(grammarO.getPartOfSpeechString(i))); + } } - assertThat(reprinted, is(printed.split(System.lineSeparator()))); + DoubleArrayLexicon lexO = original.getLexicon(); + DoubleArrayLexicon lexR = rebuilt.getLexicon(); + int wordSize = lexO.size(); + assertThat(lexR.size(), is(wordSize)); + for (int i = 0; i < wordSize; i++) { + WordInfo wio = lexO.getWordInfo(i); + WordInfo wir = lexR.getWordInfo(i); + assertThat(wordInfoToString(i, wir), is(wordInfoToString(i, wio))); + } + + original.close(); + rebuilt.close(); } @Test - public void rebuildAndReprintUser() throws IOException { + public void rebuildUser() throws IOException { File inputFile = new File(temporaryFolder.getRoot(), "user.dic"); File systemDictFile = new File(temporaryFolder.getRoot(), "system.dic"); String printed; - try (BinaryDictionary systemDict = BinaryDictionary.loadSystem(systemDictFile.getPath())) { - try (ByteArrayOutputStream output = new ByteArrayOutputStream(); - PrintStream ps = new PrintStream(output); - BinaryDictionary dict = new BinaryDictionary(inputFile.getPath())) { - DictionaryPrinter printer = new DictionaryPrinter(ps, dict, systemDict); - printer.printEntries(); - - printed = output.toString(); - } + BinaryDictionary systemDict = BinaryDictionary.loadSystem(systemDictFile.getPath()); + BinaryDictionary original = new BinaryDictionary(inputFile.getPath()); + try (ByteArrayOutputStream output = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(output)) { + DictionaryPrinter printer = new DictionaryPrinter(ps, original, systemDict); + printer.printEntries(); + printed = output.toString(); } File lexiconFile = new File(temporaryFolder.getRoot(), "user_lex.csv"); @@ -161,17 +182,32 @@ public void rebuildAndReprintUser() throws IOException { UserDictionaryBuilder.main(new String[] { "-o", rebuiltDict.getPath(), "-s", systemDictFile.getPath(), "-d", "rebuild user dict", lexiconFile.getPath() }); - String[] reprinted; - try (BinaryDictionary systemDict = BinaryDictionary.loadSystem(systemDictFile.getPath())) { - try (ByteArrayOutputStream output = new ByteArrayOutputStream(); - PrintStream ps = new PrintStream(output); - BinaryDictionary dict = new BinaryDictionary(rebuiltDict.getPath())) { - DictionaryPrinter printer = new DictionaryPrinter(ps, dict, systemDict); - printer.printEntries(); - reprinted = output.toString().split(System.lineSeparator()); + BinaryDictionary rebuilt = new BinaryDictionary(rebuiltDict.getPath()); + Long version = original.getDictionaryHeader().getVersion(); + assertThat(rebuilt.getDictionaryHeader().getVersion(), is(version)); + + if (DictionaryVersion.hasGrammar(version)) { + GrammarImpl grammarO = original.getGrammar(); + GrammarImpl grammarR = rebuilt.getGrammar(); + int originalPosSize = grammarO.getPartOfSpeechSize(); + assertThat(grammarR.getPartOfSpeechSize(), is(originalPosSize)); + for (short i = 0; i < originalPosSize; i++) { + assertThat(grammarR.getPartOfSpeechString(i), is(grammarO.getPartOfSpeechString(i))); } } - assertThat(reprinted, is(printed.split(System.lineSeparator()))); + DoubleArrayLexicon lexO = original.getLexicon(); + DoubleArrayLexicon lexR = rebuilt.getLexicon(); + int wordSize = lexO.size(); + assertThat(lexR.size(), is(wordSize)); + for (int i = 0; i < wordSize; i++) { + WordInfo wio = lexO.getWordInfo(i); + WordInfo wir = lexR.getWordInfo(i); + assertThat(wordInfoToString(i, wir), is(wordInfoToString(i, wio))); + } + + original.close(); + rebuilt.close(); + systemDict.close(); } } \ No newline at end of file diff --git a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java index 151b513d..c1785463 100644 --- a/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java +++ b/src/test/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexiconTest.java @@ -124,7 +124,7 @@ public void wordInfoWithLongWord() { @Test public void size() { - assertEquals(39, lexicon.size()); + assertEquals(40, lexicon.size()); } static List iteratorToList(Iterator iterator) { diff --git a/src/test/resources/dict/lex.csv b/src/test/resources/dict/lex.csv index 31d43125..d8214f6b 100644 --- a/src/test/resources/dict/lex.csv +++ b/src/test/resources/dict/lex.csv @@ -10,7 +10,7 @@ 都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,* アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,*,A,*,*,*,* アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,*,A,*,*,*,* -アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,* +アイアイウ,6,6,32766,アイアイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,* 0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,*,A,*,*,*,* 1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,*,A,*,*,*,* 2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,*,A,*,*,*,* @@ -36,4 +36,5 @@ いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,* 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,* 特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,*,A,*,*,*,* -な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,C,11,11,*,* \ No newline at end of file +隠し,-1,-1,0,隠し,名詞,普通名詞,一般,*,*,*,カクシ,隠し,*,A,*,*,*,* +な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,C,11,11,*,*