diff --git a/src/main/java/com/github/oeuvres/alix/cli/Load.java b/src/main/java/com/github/oeuvres/alix/cli/Load.java index 5b434c52..e14c5f9e 100644 --- a/src/main/java/com/github/oeuvres/alix/cli/Load.java +++ b/src/main/java/com/github/oeuvres/alix/cli/Load.java @@ -47,14 +47,10 @@ import java.util.Scanner; import java.util.concurrent.Callable; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.transform.TransformerException; - import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexWriter; -import org.xml.sax.SAXException; import com.github.oeuvres.alix.lucene.Alix; import com.github.oeuvres.alix.lucene.XMLIndexer; diff --git a/src/main/java/com/github/oeuvres/alix/fr/Tag.java b/src/main/java/com/github/oeuvres/alix/fr/Tag.java index c266e28e..e21f68cf 100644 --- a/src/main/java/com/github/oeuvres/alix/fr/Tag.java +++ b/src/main/java/com/github/oeuvres/alix/fr/Tag.java @@ -213,6 +213,8 @@ public enum Tag final public int flag; /** The first hexa digit, used as a type grouping */ final public int parent; + /** A name without spaces */ + final public String name; /** A french label for humans */ final public String label; /** A line of explanation */ @@ -225,16 +227,17 @@ public enum Tag this.label = label; this.desc = desc; this.parent = flag & 0xF0; + this.name = this.toString(); } /** Array to get a tag by number */ - private static final Tag[] byFlag = new Tag[256]; + private static final Tag[] Flag4name = new Tag[256]; /** Dictionary to get number of a tag by name */ - private static final Map byName = new HashMap(); + private static final Map Name4flag = new HashMap(); static { for (Tag tag : Tag.values()) { - byFlag[tag.flag] = tag; - byName.put(tag.toString(), tag.flag); + Flag4name[tag.flag] = tag; + Name4flag.put(tag.toString(), tag.flag); } } @@ -255,7 +258,7 @@ public boolean sameParent(final int flag) */ static public Tag parent(final int flag) { - Tag ret = byFlag[flag & 0xF0]; + Tag ret = Flag4name[flag & 0xF0]; if (ret == null) return UNKNOWN; return ret; @@ -292,28 +295,29 @@ public int flag() /** * - * @param label A Tag label. + * @param name A Tag name. * @return The Identifier number of a Tag. */ - public static int flag(final Chain label) + public static int flag(final Chain name) { @SuppressWarnings("unlikely-arg-type") - Integer ret = byName.get(label); + Integer ret = Name4flag.get(name); if (ret == null) { - LOGGER.log(Level.FINEST, "[Alix] unknown tag:" + label); + LOGGER.log(Level.FINEST, "[Alix] unknown tag:" + name); return UNKNOWN.flag; } return ret; } /** + * Get Tag number by name. * - * @param label A tag label. + * @param name A tag name. * @return The identifier number of a Tag. */ - public static int flag(final String label) + public static int flag(final String name) { - Integer ret = byName.get(label); + Integer ret = Name4flag.get(name); if (ret == null) return UNKNOWN.flag; return ret; @@ -329,22 +333,22 @@ public static Tag tag(int flag) { // the int may be used as a more complex bit flag flag = flag & 0xFF; - return byFlag[flag]; + return Flag4name[flag]; } /** - * Get Tag label by number identifier. + * Get Tag name by number identifier. * @param flag Tag identifier number. - * @return A human understandable label. + * @return Name of a Tag. */ - public static String label(int flag) + public static String name(int flag) { flag = flag & 0xFF; - Tag tag = byFlag[flag]; + Tag tag = Flag4name[flag]; if (tag == null) return null; - return tag.label; + return tag.name; } /** @@ -458,7 +462,7 @@ public String toString() StringBuilder sb = new StringBuilder(); for (int tag = 0; tag < 256; tag++) { if ((tag % 16) == 0) - sb.append(Tag.label(tag)).append("\t"); + sb.append(Tag.name(tag)).append("\t"); if (rule[tag]) sb.append(1); else diff --git a/src/main/java/com/github/oeuvres/alix/lucene/Alix.java b/src/main/java/com/github/oeuvres/alix/lucene/Alix.java index d53a6cd9..02b33e7c 100644 --- a/src/main/java/com/github/oeuvres/alix/lucene/Alix.java +++ b/src/main/java/com/github/oeuvres/alix/lucene/Alix.java @@ -38,7 +38,6 @@ import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collection; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.Locale; @@ -90,7 +89,6 @@ import static com.github.oeuvres.alix.Names.*; -import com.github.oeuvres.alix.lucene.analysis.FrAnalyzer; import com.github.oeuvres.alix.lucene.search.FieldFacet; import com.github.oeuvres.alix.lucene.search.Scale; import com.github.oeuvres.alix.lucene.search.FieldText; diff --git a/src/main/java/com/github/oeuvres/alix/lucene/AlixDocument.java b/src/main/java/com/github/oeuvres/alix/lucene/AlixDocument.java index 0dc79e85..2fab1757 100644 --- a/src/main/java/com/github/oeuvres/alix/lucene/AlixDocument.java +++ b/src/main/java/com/github/oeuvres/alix/lucene/AlixDocument.java @@ -4,7 +4,11 @@ import java.security.InvalidParameterException; import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntPoint; @@ -15,6 +19,7 @@ import org.apache.lucene.document.StringField; import org.apache.lucene.util.BytesRef; +import com.github.oeuvres.alix.lucene.analysis.MetaAnalyzer; import com.github.oeuvres.alix.util.ML; /** @@ -31,6 +36,8 @@ public class AlixDocument HashSet uniks = new HashSet<>(); /** Required fields for this collection */ String[] required; + /** Simple analyzer for recall */ + Analyzer metaAnalyzer = new MetaAnalyzer(); /** * Create document indexer with a list of required fields, tested when lucene document is requested. * @param required Array of field names. @@ -105,7 +112,20 @@ public AlixDocument author(String html) facetField("author", html); return this; } - + + /** + * Set a searchable bibliographic line for a document. + * Not for grouping, sorting, or hiliting. + * + * @param html Value, html tags allowed. + * @return This for chaining. + */ + public AlixDocument bibl(String html) + { + facetField("bibl", html); + return this; + } + /** * A field type unique for a document, usually mandatory, like a title or byline ; * maybe a covering class among a corpus. Could be used for sorting. @@ -123,14 +143,16 @@ public AlixDocument catField(String name, String html) doc.add(new StoredField(name, html)); String txt = ML.detag(html); BytesRef bytes = new BytesRef(txt); - // doc.add(new SortedDocValuesField(name, bytes)); + doc.add(new SortedDocValuesField(name, bytes)); + // NO, or lucene // doc.add(new SortedSetDocValuesField(name, bytes)); doc.add(new StringField(name, bytes, Field.Store.NO)); return this; } /** - * A field type repeatable for a document. + * A field repeatable for a document, like authors, or tags. + * Not searchable by word. * * @param name Field name. * @param html Field value, tags allowed. @@ -152,6 +174,23 @@ public AlixDocument facetField(String name, String html) return this; } + /** + * A searchable field allowing hiliting and frequency stats. + * + * @param name Field name. + * @param html Field value, tags allowed. + * @return This for chaining. + */ + public AlixDocument metaField(String name, String html) + { + if (bad(html)) return this; + doc.add(new StoredField(name, html)); // (TokenStream fields cannot be stored) + String txt = ML.detag(html); + TokenStream ts = metaAnalyzer.tokenStream("meta", txt); // renew token stream + doc.add(new Field(name, ts, Alix.ftypeMeta)); // indexation of the chosen tokens + return this; + } + /** * An int field, unique for a document, for sorting and grouping, ex: year. * @@ -174,10 +213,10 @@ public AlixDocument intField(final String name, final int value) /** * Set the body, caller should have strip un-necessary contents * - * @param html Field value. + * @param html Value, html tags allowed. * @return This for chaining. */ - public AlixDocument text(final String html) + public AlixDocument textField(final String html) { final String name = TEXT; if (bad(html)) { @@ -194,24 +233,32 @@ public AlixDocument text(final String html) } /** - * Returns the builded document + * Check which required field has not been set. + * + * @return An array of missing fields. */ - public Document document() + public String[] missing() { - boolean first =true; - String error = ""; + List missing = new LinkedList<>(); + if (required == null || required.length == 0) { + return null; + } for (String name: required) { + if (name == null || "".equals(name.trim())) { + continue; + } if (!uniks.contains(name)) { - if (first) { - error += ", "; - } - else { - first = false; - } - error += name; + missing.add(name); } } - // throw error or log ? + return missing.toArray(new String[0]); + } + + /** + * Returns the builded document + */ + public Document document() + { return doc; } diff --git a/src/main/java/com/github/oeuvres/alix/lucene/search/Doc.java b/src/main/java/com/github/oeuvres/alix/lucene/search/Doc.java index 855373c2..656e6d3c 100644 --- a/src/main/java/com/github/oeuvres/alix/lucene/search/Doc.java +++ b/src/main/java/com/github/oeuvres/alix/lucene/search/Doc.java @@ -316,22 +316,45 @@ public int docId() return docId; } + public int freq(final String field, final String[] forms) throws NoSuchFieldException, IOException + { + return freq( + alix.reader(), + this.docId, + field, + forms + ); + } /** - * Count occurences of terms in the doc + * Count occurences of terms in a doc. + * + * @param reader A Lucene reader to get stats from. + * @param doc Internal id of doc. + * @param field Field name. + * @param forms Array of forms. + * @return Occurrences count for founded forms. + * @throws NoSuchFieldException + * @throws IOException */ - public int freq(final String field, final String[] forms) throws NoSuchFieldException, IOException + static public int freq( + final IndexReader reader, + final int doc, + final String field, + final String[] forms + ) throws NoSuchFieldException, IOException { if (forms == null || forms.length < 1) return 0; Arrays.sort(forms); // may optimize term seekink, useful for uniq - Terms tvek = getTermVector(field); + Terms tvek = reader.termVectors().get(doc, field); + if (!tvek.hasFreqs()) { - throw new NoSuchFieldException("Missing freqs in TermVector for field=" + field + " docId=" + docId); + throw new NoSuchFieldException("Missing freqs in TermVector for field=" + field + " doc=" + doc); } int freq = 0; TermsEnum tenum = tvek.iterator(); if (tenum == null) { - throw new NoSuchFieldException("Missing freqs in TermVector for field=" + field + " docId=" + docId); + throw new NoSuchFieldException("Missing freqs in TermVector for field=" + field + " doc=" + doc); } PostingsEnum postings = null; String last = null; @@ -383,10 +406,6 @@ public String get(String field) throws NoSuchFieldException */ public Terms getTermVector(String field) throws IOException, NoSuchFieldException { - /* - * Shall we cache ? Terms tvek = vectors.get(field); if (tvek != null) return - * tvek; // cache OK - */ // new lucene API, not tested Terms tvek = alix.reader().termVectors().get(docId, field); if (tvek == null)