Skip to content

Commit

Permalink
test content equality of rebuilt dictionaries
Browse files Browse the repository at this point in the history
  • Loading branch information
mh-northlander committed Aug 30, 2024
1 parent bb8096b commit b7790ef
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import java.io.IOException;
import java.io.PrintStream;
import java.nio.file.Path;
import java.util.Arrays;

import com.worksap.nlp.sudachi.TestDictionary;
import com.worksap.nlp.sudachi.Utils;
Expand All @@ -49,6 +50,14 @@ public void setUp() throws IOException {
Utils.copyResource(folder, "/unk.def", "/dict/matrix.def");
}

/**
 * Renders a word id together with the fields of its {@link WordInfo} as a
 * single comma-separated line.
 *
 * <p>
 * Fields appear in this order: word id, surface, length, POS id, normalized
 * form, dictionary form word id, dictionary form, reading form, A-unit split,
 * B-unit split, word structure, synonym group ids. Array-valued fields are
 * rendered with {@link Arrays#toString}.
 *
 * @param wordId
 *            the id of the word being described
 * @param wordInfo
 *            the word information whose fields are rendered
 * @return the formatted one-line description
 */
private String wordInfoToString(int wordId, WordInfo wordInfo) {
    // One placeholder per entry of the field list below, in the same order.
    final String template = "%d, %s, %d, %d, %s, %d, %s, %s, %s, %s, %s, %s";
    final Object[] fields = {
            wordId,
            wordInfo.getSurface(),
            wordInfo.getLength(),
            wordInfo.getPOSId(),
            wordInfo.getNormalizedForm(),
            wordInfo.getDictionaryFormWordId(),
            wordInfo.getDictionaryForm(),
            wordInfo.getReadingForm(),
            Arrays.toString(wordInfo.getAunitSplit()),
            Arrays.toString(wordInfo.getBunitSplit()),
            Arrays.toString(wordInfo.getWordStructure()),
            Arrays.toString(wordInfo.getSynonymGoupIds()),
    };
    return String.format(template, fields);
}

@Test
public void printWithSystemDict() throws IOException {
File inputFile = new File(temporaryFolder.getRoot(), "system.dic");
Expand All @@ -60,7 +69,7 @@ public void printWithSystemDict() throws IOException {
printer.printEntries();
actuals = output.toString().split(System.lineSeparator());
}
assertThat(actuals.length, is(39));
assertThat(actuals.length, is(40));
assertThat(actuals[0], is("た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,*"));
}

Expand Down Expand Up @@ -101,14 +110,13 @@ public void readGrammarWithInvalidFile() throws IOException {
}

@Test
public void rebuildAndReprintSystem() throws IOException {
public void rebuildSystem() throws IOException {
File inputFile = new File(temporaryFolder.getRoot(), "system.dic");

String printed;
try (ByteArrayOutputStream output = new ByteArrayOutputStream();
PrintStream ps = new PrintStream(output);
BinaryDictionary dict = new BinaryDictionary(inputFile.getPath())) {
DictionaryPrinter printer = new DictionaryPrinter(ps, dict, null);
BinaryDictionary original = new BinaryDictionary(inputFile.getPath());
try (ByteArrayOutputStream output = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(output)) {
DictionaryPrinter printer = new DictionaryPrinter(ps, original, null);
printer.printEntries();
printed = output.toString();
}
Expand All @@ -123,33 +131,46 @@ public void rebuildAndReprintSystem() throws IOException {
DictionaryBuilder.main(new String[] { "-o", rebuiltDict.getPath(), "-m", matrixFile.getPath(), "-d",
"rebuild system dict", lexiconFile.getPath() });

String[] reprinted;
try (ByteArrayOutputStream output = new ByteArrayOutputStream();
PrintStream ps = new PrintStream(output);
BinaryDictionary dict = new BinaryDictionary(rebuiltDict.getPath())) {
DictionaryPrinter printer = new DictionaryPrinter(ps, dict, null);
printer.printEntries();
reprinted = output.toString().split(System.lineSeparator());
BinaryDictionary rebuilt = new BinaryDictionary(rebuiltDict.getPath());
Long version = original.getDictionaryHeader().getVersion();
assertThat(rebuilt.getDictionaryHeader().getVersion(), is(version));

if (DictionaryVersion.hasGrammar(version)) {
GrammarImpl grammarO = original.getGrammar();
GrammarImpl grammarR = rebuilt.getGrammar();
int originalPosSize = grammarO.getPartOfSpeechSize();
assertThat(grammarR.getPartOfSpeechSize(), is(originalPosSize));
for (short i = 0; i < originalPosSize; i++) {
assertThat(grammarR.getPartOfSpeechString(i), is(grammarO.getPartOfSpeechString(i)));
}
}

assertThat(reprinted, is(printed.split(System.lineSeparator())));
DoubleArrayLexicon lexO = original.getLexicon();
DoubleArrayLexicon lexR = rebuilt.getLexicon();
int wordSize = lexO.size();
assertThat(lexR.size(), is(wordSize));
for (int i = 0; i < wordSize; i++) {
WordInfo wio = lexO.getWordInfo(i);
WordInfo wir = lexR.getWordInfo(i);
assertThat(wordInfoToString(i, wir), is(wordInfoToString(i, wio)));
}

original.close();
rebuilt.close();
}

@Test
public void rebuildAndReprintUser() throws IOException {
public void rebuildUser() throws IOException {
File inputFile = new File(temporaryFolder.getRoot(), "user.dic");
File systemDictFile = new File(temporaryFolder.getRoot(), "system.dic");

String printed;
try (BinaryDictionary systemDict = BinaryDictionary.loadSystem(systemDictFile.getPath())) {
try (ByteArrayOutputStream output = new ByteArrayOutputStream();
PrintStream ps = new PrintStream(output);
BinaryDictionary dict = new BinaryDictionary(inputFile.getPath())) {
DictionaryPrinter printer = new DictionaryPrinter(ps, dict, systemDict);
printer.printEntries();

printed = output.toString();
}
BinaryDictionary systemDict = BinaryDictionary.loadSystem(systemDictFile.getPath());
BinaryDictionary original = new BinaryDictionary(inputFile.getPath());
try (ByteArrayOutputStream output = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(output)) {
DictionaryPrinter printer = new DictionaryPrinter(ps, original, systemDict);
printer.printEntries();
printed = output.toString();
}

File lexiconFile = new File(temporaryFolder.getRoot(), "user_lex.csv");
Expand All @@ -161,17 +182,32 @@ public void rebuildAndReprintUser() throws IOException {
UserDictionaryBuilder.main(new String[] { "-o", rebuiltDict.getPath(), "-s", systemDictFile.getPath(), "-d",
"rebuild user dict", lexiconFile.getPath() });

String[] reprinted;
try (BinaryDictionary systemDict = BinaryDictionary.loadSystem(systemDictFile.getPath())) {
try (ByteArrayOutputStream output = new ByteArrayOutputStream();
PrintStream ps = new PrintStream(output);
BinaryDictionary dict = new BinaryDictionary(rebuiltDict.getPath())) {
DictionaryPrinter printer = new DictionaryPrinter(ps, dict, systemDict);
printer.printEntries();
reprinted = output.toString().split(System.lineSeparator());
BinaryDictionary rebuilt = new BinaryDictionary(rebuiltDict.getPath());
Long version = original.getDictionaryHeader().getVersion();
assertThat(rebuilt.getDictionaryHeader().getVersion(), is(version));

if (DictionaryVersion.hasGrammar(version)) {
GrammarImpl grammarO = original.getGrammar();
GrammarImpl grammarR = rebuilt.getGrammar();
int originalPosSize = grammarO.getPartOfSpeechSize();
assertThat(grammarR.getPartOfSpeechSize(), is(originalPosSize));
for (short i = 0; i < originalPosSize; i++) {
assertThat(grammarR.getPartOfSpeechString(i), is(grammarO.getPartOfSpeechString(i)));
}
}

assertThat(reprinted, is(printed.split(System.lineSeparator())));
DoubleArrayLexicon lexO = original.getLexicon();
DoubleArrayLexicon lexR = rebuilt.getLexicon();
int wordSize = lexO.size();
assertThat(lexR.size(), is(wordSize));
for (int i = 0; i < wordSize; i++) {
WordInfo wio = lexO.getWordInfo(i);
WordInfo wir = lexR.getWordInfo(i);
assertThat(wordInfoToString(i, wir), is(wordInfoToString(i, wio)));
}

original.close();
rebuilt.close();
systemDict.close();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ public void wordInfoWithLongWord() {

@Test
public void size() {
assertEquals(39, lexicon.size());
assertEquals(40, lexicon.size());
}

static <E> List<E> iteratorToList(Iterator<E> iterator) {
Expand Down
5 changes: 3 additions & 2 deletions src/test/resources/dict/lex.csv
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*
アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,*,A,*,*,*,*
アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,*,A,*,*,*,*
アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,*
アイアイウ,6,6,32766,アイアイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,*
0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,*,A,*,*,*,*
1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,*,A,*,*,*,*
2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,*,A,*,*,*,*
Expand All @@ -36,4 +36,5 @@
いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,*
012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,*
特a,8,8,2914,特a,名詞,普通名詞,一般,*,*,*,トクエー,特a,*,A,*,*,*,*
な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,C,11,11,*,*
隠し,-1,-1,0,隠し,名詞,普通名詞,一般,*,*,*,カクシ,隠し,*,A,*,*,*,*
な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,C,11,11,*,*

0 comments on commit b7790ef

Please sign in to comment.