Skip to content

Commit

Permalink
Tag.name()
Browse files Browse the repository at this point in the history
  • Loading branch information
glorieux-f committed Jan 12, 2024
1 parent 9f4f91a commit eaae1e8
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 51 deletions.
4 changes: 0 additions & 4 deletions src/main/java/com/github/oeuvres/alix/cli/Load.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,10 @@
import java.util.Scanner;
import java.util.concurrent.Callable;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;

import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.xml.sax.SAXException;

import com.github.oeuvres.alix.lucene.Alix;
import com.github.oeuvres.alix.lucene.XMLIndexer;
Expand Down
42 changes: 23 additions & 19 deletions src/main/java/com/github/oeuvres/alix/fr/Tag.java
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ public enum Tag
final public int flag;
/** The first hexa digit, used as a type grouping */
final public int parent;
/** A name without spaces */
final public String name;
/** A french label for humans */
final public String label;
/** A line of explanation */
Expand All @@ -225,16 +227,17 @@ public enum Tag
this.label = label;
this.desc = desc;
this.parent = flag & 0xF0;
this.name = this.toString();
}

/** Array to get a tag by number */
private static final Tag[] byFlag = new Tag[256];
private static final Tag[] Flag4name = new Tag[256];
/** Dictionary to get number of a tag by name */
private static final Map<String, Integer> byName = new HashMap<String, Integer>();
private static final Map<String, Integer> Name4flag = new HashMap<String, Integer>();
static {
for (Tag tag : Tag.values()) {
byFlag[tag.flag] = tag;
byName.put(tag.toString(), tag.flag);
Flag4name[tag.flag] = tag;
Name4flag.put(tag.toString(), tag.flag);
}
}

Expand All @@ -255,7 +258,7 @@ public boolean sameParent(final int flag)
*/
static public Tag parent(final int flag)
{
Tag ret = byFlag[flag & 0xF0];
Tag ret = Flag4name[flag & 0xF0];
if (ret == null)
return UNKNOWN;
return ret;
Expand Down Expand Up @@ -292,28 +295,29 @@ public int flag()

/**
*
* @param label A Tag label.
* @param name A Tag name.
* @return The Identifier number of a Tag.
*/
public static int flag(final Chain label)
public static int flag(final Chain name)
{
@SuppressWarnings("unlikely-arg-type")
Integer ret = byName.get(label);
Integer ret = Name4flag.get(name);
if (ret == null) {
LOGGER.log(Level.FINEST, "[Alix] unknown tag:" + label);
LOGGER.log(Level.FINEST, "[Alix] unknown tag:" + name);
return UNKNOWN.flag;
}
return ret;
}

/**
* Get Tag number by name.
*
* @param label A tag label.
* @param name A tag name.
* @return The identifier number of a Tag.
*/
public static int flag(final String label)
public static int flag(final String name)
{
Integer ret = byName.get(label);
Integer ret = Name4flag.get(name);
if (ret == null)
return UNKNOWN.flag;
return ret;
Expand All @@ -329,22 +333,22 @@ public static Tag tag(int flag)
{
// the int may be used as a more complex bit flag
flag = flag & 0xFF;
return byFlag[flag];
return Flag4name[flag];
}


/**
* Get Tag label by number identifier.
* Get Tag name by number identifier.
* @param flag Tag identifier number.
* @return A human understandable label.
* @return Name of a Tag.
*/
public static String label(int flag)
public static String name(int flag)
{
flag = flag & 0xFF;
Tag tag = byFlag[flag];
Tag tag = Flag4name[flag];
if (tag == null)
return null;
return tag.label;
return tag.name;
}

/**
Expand Down Expand Up @@ -458,7 +462,7 @@ public String toString()
StringBuilder sb = new StringBuilder();
for (int tag = 0; tag < 256; tag++) {
if ((tag % 16) == 0)
sb.append(Tag.label(tag)).append("\t");
sb.append(Tag.name(tag)).append("\t");
if (rule[tag])
sb.append(1);
else
Expand Down
2 changes: 0 additions & 2 deletions src/main/java/com/github/oeuvres/alix/lucene/Alix.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Locale;
Expand Down Expand Up @@ -90,7 +89,6 @@

import static com.github.oeuvres.alix.Names.*;

import com.github.oeuvres.alix.lucene.analysis.FrAnalyzer;
import com.github.oeuvres.alix.lucene.search.FieldFacet;
import com.github.oeuvres.alix.lucene.search.Scale;
import com.github.oeuvres.alix.lucene.search.FieldText;
Expand Down
81 changes: 64 additions & 17 deletions src/main/java/com/github/oeuvres/alix/lucene/AlixDocument.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@

import java.security.InvalidParameterException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntPoint;
Expand All @@ -15,6 +19,7 @@
import org.apache.lucene.document.StringField;
import org.apache.lucene.util.BytesRef;

import com.github.oeuvres.alix.lucene.analysis.MetaAnalyzer;
import com.github.oeuvres.alix.util.ML;

/**
Expand All @@ -31,6 +36,8 @@ public class AlixDocument
HashSet<String> uniks = new HashSet<>();
/** Required fields for this collection */
String[] required;
/** Simple analyzer for recall */
Analyzer metaAnalyzer = new MetaAnalyzer();
/**
* Create document indexer with a list of required fields, tested when lucene document is requested.
* @param required Array of field names.
Expand Down Expand Up @@ -105,7 +112,20 @@ public AlixDocument author(String html)
facetField("author", html);
return this;
}


/**
 * Record the bibliographic line of a document under the field name
 * {@code "bibl"}, by delegating to {@link #facetField(String, String)}.
 * Not meant for grouping, sorting, or highlighting.
 *
 * NOTE(review): facetField is documented as not searchable by word;
 * confirm it is the intended field type if this line must be searchable.
 *
 * @param html Value, html tags allowed.
 * @return This for chaining.
 */
public AlixDocument bibl(String html)
{
    final String field = "bibl";
    this.facetField(field, html);
    return this;
}

/**
* A field type unique for a document, usually mandatory, like a title or byline ;
* maybe a covering class among a corpus. Could be used for sorting.
Expand All @@ -123,14 +143,16 @@ public AlixDocument catField(String name, String html)
doc.add(new StoredField(name, html));
String txt = ML.detag(html);
BytesRef bytes = new BytesRef(txt);
// doc.add(new SortedDocValuesField(name, bytes));
doc.add(new SortedDocValuesField(name, bytes));
// NO, or lucene
// doc.add(new SortedSetDocValuesField(name, bytes));
doc.add(new StringField(name, bytes, Field.Store.NO));
return this;
}

/**
* A field type repeatable for a document.
* A field repeatable for a document, like authors, or tags.
* Not searchable by word.
*
* @param name Field name.
* @param html Field value, tags allowed.
Expand All @@ -152,6 +174,23 @@ public AlixDocument facetField(String name, String html)
return this;
}

/**
 * Add a field intended for word search, highlighting and frequency stats.
 * The html value is kept as a stored field; a detagged copy is tokenized
 * by {@code metaAnalyzer} and indexed with the {@code Alix.ftypeMeta} type.
 * Nothing is added when {@code bad(html)} is true.
 *
 * NOTE(review): Lucene analyzers commonly reuse their token stream per
 * thread; confirm that several calls on the same document, before it is
 * indexed, do not share one stream.
 *
 * @param name Field name.
 * @param html Field value, tags allowed.
 * @return This for chaining.
 */
public AlixDocument metaField(String name, String html)
{
    if (bad(html)) {
        return this;
    }
    // a Field built on a TokenStream cannot be stored, so store the html apart
    doc.add(new StoredField(name, html));
    // "meta" is the name given to the analyzer, the indexed field keeps the caller's name
    final TokenStream tokens = metaAnalyzer.tokenStream("meta", ML.detag(html));
    doc.add(new Field(name, tokens, Alix.ftypeMeta));
    return this;
}

/**
* An int field, unique for a document, for sorting and grouping, ex: year.
*
Expand All @@ -174,10 +213,10 @@ public AlixDocument intField(final String name, final int value)
/**
* Set the body; the caller should have stripped unnecessary content.
*
* @param html Field value.
* @param html Value, html tags allowed.
* @return This for chaining.
*/
public AlixDocument text(final String html)
public AlixDocument textField(final String html)
{
final String name = TEXT;
if (bad(html)) {
Expand All @@ -194,24 +233,32 @@ public AlixDocument text(final String html)
}

/**
* Returns the builded document
* Check which required field has not been set.
*
* @return An array of missing fields.
*/
public Document document()
public String[] missing()
{
boolean first =true;
String error = "";
List<String> missing = new LinkedList<>();
if (required == null || required.length == 0) {
return null;
}
for (String name: required) {
if (name == null || "".equals(name.trim())) {
continue;
}
if (!uniks.contains(name)) {
if (first) {
error += ", ";
}
else {
first = false;
}
error += name;
missing.add(name);
}
}
// throw error or log ?
return missing.toArray(new String[0]);
}

/**
 * Get the Lucene document populated by the field methods of this object.
 * No check of required fields is done here.
 *
 * @return The built document.
 */
public Document document()
{
    return this.doc;
}

Expand Down
37 changes: 28 additions & 9 deletions src/main/java/com/github/oeuvres/alix/lucene/search/Doc.java
Original file line number Diff line number Diff line change
Expand Up @@ -316,22 +316,45 @@ public int docId()
return docId;
}

/**
 * Count occurrences of forms in this document, by delegating to the static
 * {@code freq(IndexReader, int, String, String[])} with the reader of
 * {@code alix} and the {@code docId} of this object.
 * The delegate sorts the {@code forms} array in place.
 *
 * @param field Field name.
 * @param forms Array of forms.
 * @return Occurrences count for found forms.
 * @throws NoSuchFieldException Propagated from the static overload.
 * @throws IOException Propagated from the static overload.
 */
public int freq(final String field, final String[] forms) throws NoSuchFieldException, IOException
{
    return Doc.freq(alix.reader(), this.docId, field, forms);
}
/**
* Count occurences of terms in the doc
* Count occurrences of terms in a doc.
*
* @param reader A Lucene reader to get stats from.
* @param doc Internal id of doc.
* @param field Field name.
* @param forms Array of forms.
* @return Occurrences count for found forms.
* @throws NoSuchFieldException
* @throws IOException
*/
public int freq(final String field, final String[] forms) throws NoSuchFieldException, IOException
static public int freq(
final IndexReader reader,
final int doc,
final String field,
final String[] forms
) throws NoSuchFieldException, IOException
{
if (forms == null || forms.length < 1)
return 0;
Arrays.sort(forms); // may optimize term seekink, useful for uniq
Terms tvek = getTermVector(field);
Terms tvek = reader.termVectors().get(doc, field);

if (!tvek.hasFreqs()) {
throw new NoSuchFieldException("Missing freqs in TermVector for field=" + field + " docId=" + docId);
throw new NoSuchFieldException("Missing freqs in TermVector for field=" + field + " doc=" + doc);
}
int freq = 0;
TermsEnum tenum = tvek.iterator();
if (tenum == null) {
throw new NoSuchFieldException("Missing freqs in TermVector for field=" + field + " docId=" + docId);
throw new NoSuchFieldException("Missing freqs in TermVector for field=" + field + " doc=" + doc);
}
PostingsEnum postings = null;
String last = null;
Expand Down Expand Up @@ -383,10 +406,6 @@ public String get(String field) throws NoSuchFieldException
*/
public Terms getTermVector(String field) throws IOException, NoSuchFieldException
{
/*
* Shall we cache ? Terms tvek = vectors.get(field); if (tvek != null) return
* tvek; // cache OK
*/
// new lucene API, not tested
Terms tvek = alix.reader().termVectors().get(docId, field);
if (tvek == null)
Expand Down

0 comments on commit eaae1e8

Please sign in to comment.