Skip to content

Commit

Permalink
reduce memory usage during StringStorage compile
Browse files Browse the repository at this point in the history
  • Loading branch information
mh-northlander committed Aug 5, 2024
1 parent d303362 commit c217631
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ private void checkString(String value, String name) {
public void validate() {
    // Run the shared string check on each textual field of this entry.
    // NOTE(review): checkString is declared elsewhere in this class; the
    // accompanying tests suggest it rejects over-long values -- confirm.
    checkString(headword, "headword");
    checkString(reading, "reading");

    // Only a normalized form given as an inline headword carries its own
    // string to check; any other kind of reference has nothing to check here.
    if (!(normalizedForm instanceof WordRef.Headword)) {
        return;
    }
    WordRef.Headword inlineNormalized = (WordRef.Headword) normalizedForm;
    checkString(inlineNormalized.getHeadword(), "normalized form");
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,11 @@ private Item process(String str) {
int end = offsets[j];
String sub = str.substring(start, end);
// Create a possible substring only if
// 1. It does not exist yet
// 2. Can form a valid pointer to it (string pointer requires aligned offset
// 1. It will be used later
// 2. It does not exist yet
// 3. Can form a valid pointer to it (string pointer requires aligned offset
// based on str length)
if (!candidates.containsKey(sub) && ptr.isSubseqValid(start, end)) {
if (strings.containsKey(sub) && !candidates.containsKey(sub) && ptr.isSubseqValid(start, end)) {
Item item = new Item(str, start, end);
item.root = full;
candidates.put(sub, item);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package com.worksap.nlp.sudachi.dictionary.build
import com.worksap.nlp.sudachi.dictionary.StringPtr
import com.worksap.nlp.sudachi.resStream
import java.io.StringReader
import kotlin.test.Ignore
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertFails
Expand Down Expand Up @@ -95,30 +96,28 @@ class RawLexiconReaderTest {
skipVals.removeAt(i)

val text = skipCols.joinToString(",") + "\n" + skipVals.joinToString(",")
assertFails {
val reader = RawLexiconReader(csvtext(text), POSTable(), false)
}
assertFails { RawLexiconReader(csvtext(text), POSTable(), false) }
}
}

@Test
fun failTooLongValue() {
val oversizeWord = "a".repeat(StringPtr.MAX_LENGTH + 1);
{
val oversizeWord = "a".repeat(StringPtr.MAX_LENGTH + 1)
run {
val text =
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure
${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,1,,,"""
val reader = RawLexiconReader(csvtext(text), POSTable(), false)
assertFails { reader.nextEntry() }
}
{
run {
val text =
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure
東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,${oversizeWord},,,1,,,"""
val reader = RawLexiconReader(csvtext(text), POSTable(), false)
assertFails { reader.nextEntry() }
}
{
run {
val text =
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure
東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,${oversizeWord},,1,,,"""
Expand All @@ -137,29 +136,30 @@ ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウ
}

@Test
@Ignore // Currently single split list is allowed.
fun failSingleSplit() {
{
run {
val text =
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure
東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,1,,,"""
val reader = RawLexiconReader(csvtext(text), POSTable(), false)
assertFails { reader.nextEntry() }
}
{
run {
val text =
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure
東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,1,,"""
val reader = RawLexiconReader(csvtext(text), POSTable(), false)
assertFails { reader.nextEntry() }
}
{
run {
val text =
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure
東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,1,"""
val reader = RawLexiconReader(csvtext(text), POSTable(), false)
assertFails { reader.nextEntry() }
}
{
run {
val text =
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure
東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,,1"""
Expand All @@ -171,30 +171,30 @@ ${oversizeWord},6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウ
@Test
fun failTooManySplit() {
val oversizeSplit: String =
generateSequence { "1" }.take(Byte.MAX_VALUE.toInt() + 1).joinToString("/");
generateSequence { "1" }.take(Byte.MAX_VALUE.toInt() + 1).joinToString("/")

{
run {
var text =
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure
東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,${oversizeSplit},,,"""
var reader = RawLexiconReader(csvtext(text), POSTable(), false)
assertFails { reader.nextEntry() }
}
{
run {
val text =
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure
東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,${oversizeSplit},,"""
val reader = RawLexiconReader(csvtext(text), POSTable(), false)
assertFails { reader.nextEntry() }
}
{
run {
val text =
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure
東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,${oversizeSplit},"""
val reader = RawLexiconReader(csvtext(text), POSTable(), false)
assertFails { reader.nextEntry() }
}
{
run {
val text =
"""Surface,LeftId,RightId,Cost,pos1,pos2,pos3,pos4,pos5,pos6,reading_form,normalized_form,DictionaryForm,splita,splitb,splitC,wordstructure
東京都,6,8,5320,名詞,固有名詞,地名,一般,*,*,トウキョウト,,,,,,${oversizeSplit}"""
Expand Down

0 comments on commit c217631

Please sign in to comment.