Skip to content

Commit

Permalink
add constant for wordref split char
Browse files Browse the repository at this point in the history
  • Loading branch information
mh-northlander committed Aug 19, 2024
1 parent 19d0178 commit 5602a25
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import com.worksap.nlp.sudachi.WordId;
import com.worksap.nlp.sudachi.dictionary.build.Progress;
import com.worksap.nlp.sudachi.dictionary.build.RawLexiconReader;
import com.worksap.nlp.sudachi.dictionary.build.WordRef;
import com.worksap.nlp.sudachi.dictionary.build.RawLexiconReader.Column;

import java.io.Console;
Expand All @@ -30,11 +32,6 @@
import java.util.stream.Collectors;

public class DictionaryPrinter {
public static final char wordRefDelimiter = '/';
public static final String wordRefDelimiterStr = String.valueOf(wordRefDelimiter);
public static final char wordRefJoiner = ',';
public static final String wordRefJoinerStr = String.valueOf(wordRefJoiner);

private final PrintStream output;
private final Progress progress = Progress.syserr(20);

Expand Down Expand Up @@ -176,7 +173,8 @@ String wordRef(int wordId) {
parts.addAll(pos);
parts.add(reading);

return String.join(wordRefJoinerStr, parts.stream().map(this::maybeEscapeRefPart).collect(Collectors.toList()));
return String.join(String.valueOf(WordRef.Parser.WORDREF_DELIMITER),
parts.stream().map(this::maybeEscapeRefPart).collect(Collectors.toList()));
}

/** Encode the word entry pointed to by wordId as a WordRef.Headword. */
Expand All @@ -190,7 +188,7 @@ String wordRefHeadword(int wordId, int reference) {
}

/**
 * Encode a list of word ids as one string: every id is converted with
 * {@link #wordRef(int)} and the results are joined with the list delimiter.
 *
 * @param wordIds
 *            word ids to encode
 * @return the encoded references joined into a single string
 */
String wordRefList(int[] wordIds) {
// NOTE(review): this listing is a rendered diff, so both the pre-change line
// (wordRefDelimiterStr) and the post-change line (RawLexiconReader.LIST_DELIMITER)
// appear below. Presumably only the second one remains in the committed file;
// confirm against the repository.
return String.join(wordRefDelimiterStr,
return String.join(String.valueOf(RawLexiconReader.LIST_DELIMITER),
Arrays.stream(wordIds).boxed().map(this::wordRef).collect(Collectors.toList()));
}

Expand All @@ -217,12 +215,12 @@ private String maybeEscapeString(String value) {

/**
 * Escape one part of a WordRef.Triple.
 *
 * <p>
 * The value is returned as is when {@code hasCh} reports neither the list
 * delimiter nor the word-reference delimiter in it; otherwise the value and
 * both characters are handed to {@code unicodeEscape}.
 *
 * @param value
 *            a single part of a word reference
 * @return {@code value} itself, or the result of {@code unicodeEscape}
 */
private String maybeEscapeRefPart(String value) {
// NOTE(review): this listing is a rendered diff. Each statement below that uses
// wordRefDelimiter / wordRefJoiner is the pre-change line, directly followed by
// its post-change replacement using RawLexiconReader.LIST_DELIMITER /
// WordRef.Parser.WORDREF_DELIMITER. Presumably only the replacements remain in
// the committed file; confirm against the repository.
boolean hasDelimiter = hasCh(value, wordRefDelimiter);
boolean hasJoiner = hasCh(value, wordRefJoiner);
boolean hasDelimiter = hasCh(value, RawLexiconReader.LIST_DELIMITER);
boolean hasJoiner = hasCh(value, WordRef.Parser.WORDREF_DELIMITER);
// nothing to escape: hand back the same String instance
if (!hasDelimiter && !hasJoiner) {
return value;
}
return unicodeEscape(value, Arrays.asList(wordRefDelimiter, wordRefJoiner));
return unicodeEscape(value, Arrays.asList(RawLexiconReader.LIST_DELIMITER, WordRef.Parser.WORDREF_DELIMITER));
}

/** Escape the specified characters as Unicode code points. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ public enum Column {
}
}

private static final Pattern INTEGER_REGEX = Pattern.compile("^-?\\d+$");
public static final char LIST_DELIMITER = '/';

private List<String> cachedRow;
private int[] mapping;
private final CSVParser parser;
Expand All @@ -70,8 +73,6 @@ public RawLexiconReader(CSVParser parser, POSTable pos, boolean user) throws IOE
}
}

private static final Pattern INTEGER_REGEX = Pattern.compile("^-?\\d+$");

/** assume legacy column layout if header line is not present */
private boolean isLegacyColumnLayout() {
return mapping == null;
Expand Down Expand Up @@ -181,7 +182,7 @@ private Ints getInts(List<String> data, Column column) {
if (value == null || value.isEmpty() || "*".equals(value)) {
return Ints.wrap(Ints.EMPTY_ARRAY);
}
String[] parts = value.split("/");
String[] parts = value.split(String.valueOf(LIST_DELIMITER));
if (parts.length > Byte.MAX_VALUE) {
throw new InputFileException(parser.getName(), parser.getRowCount(), column.name(),
new IllegalArgumentException("int list contained more than 127 entries: " + value));
Expand All @@ -199,7 +200,7 @@ private List<WordRef> getWordRefs(List<String> data, Column column, WordRef.Pars
if (value == null || value.isEmpty() || "*".equals(value)) {
return new ArrayList<>();
}
String[] parts = value.split("/");
String[] parts = value.split(String.valueOf(LIST_DELIMITER));
if (parts.length > Byte.MAX_VALUE) {
throw new InputFileException(parser.getName(), parser.getRowCount(), column.name(),
new IllegalArgumentException("reference list contained more than 127 entries: " + value));
Expand Down

0 comments on commit 5602a25

Please sign in to comment.