Skip to content

Commit

Permalink
Gallicobvie ready
Browse files Browse the repository at this point in the history
  • Loading branch information
glorieux-f committed Jan 9, 2024
1 parent 1d025dc commit 4f1f4c6
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 30 deletions.
36 changes: 36 additions & 0 deletions src/main/java/com/github/oeuvres/alix/lucene/Alix.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Locale;
Expand Down Expand Up @@ -88,6 +89,8 @@
import org.apache.lucene.util.Bits;

import static com.github.oeuvres.alix.Names.*;

import com.github.oeuvres.alix.lucene.analysis.FrAnalyzer;
import com.github.oeuvres.alix.lucene.search.FieldFacet;
import com.github.oeuvres.alix.lucene.search.Scale;
import com.github.oeuvres.alix.lucene.search.FieldText;
Expand Down Expand Up @@ -388,6 +391,7 @@ public FieldInt fieldInt(final String fieldName) throws IOException
return ints;
}


/**
* Get a co-occurrences reader.
*
Expand Down Expand Up @@ -915,4 +919,36 @@ public IndexWriter writer(final Similarity similarity) throws IOException
writer = new IndexWriter(dir, conf);
return writer;
}

/**
 * Open a new writer on a filesystem lucene index. The writer is not kept by
 * this class, the caller owns it and should close it when indexing is done.
 *
 * <p>
 * The writer is configured for batch indexing: no compound files, a 1024 MB
 * RAM buffer (the JVM max heap may need to be increased, e.g. -Xmx1g or more),
 * {@link OpenMode#CREATE_OR_APPEND} and a {@link BM25Similarity}.
 * </p>
 *
 * <p>
 * NOTE(review): closing the returned writer presumably does not close the
 * underlying directory; confirm, and release it through
 * {@code writer.getDirectory()} if needed.
 * </p>
 *
 * @param path     Folder of the lucene index, created or appended.
 * @param analyzer Analyzer used by the writer to index text fields.
 * @return A new writer on the index.
 * @throws IOException If the directory can not be opened, or the writer can
 *                     not be created on it.
 */
public static IndexWriter writer(Path path, Analyzer analyzer) throws IOException
{
    final IndexWriterConfig conf = new IndexWriterConfig(analyzer);
    // Use false for batch indexing with very large ram buffer settings.
    conf.setUseCompoundFile(false);
    // May need an increased max heap size for the JVM (e.g. add -Xmx512m or
    // -Xmx1g).
    conf.setRAMBufferSizeMB(1024.0);
    conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
    // default similarity
    conf.setSimilarity(new BM25Similarity());
    // Tuning a ConcurrentMergeScheduler (more merge threads, no auto IO
    // throttle) has been tried with no effect found, the default is kept.
    // No index sort is set (e.g. by year): no functionality should rely on
    // the order of docId after merge.
    final FSDirectory dir = FSDirectory.open(path);
    try {
        return new IndexWriter(dir, conf);
    }
    catch (IOException | RuntimeException e) {
        // The writer has not been created (e.g. lock already held), nobody
        // else will release the directory just opened.
        try {
            dir.close();
        }
        catch (IOException closeError) {
            e.addSuppressed(closeError);
        }
        throw e;
    }
}
}
136 changes: 106 additions & 30 deletions src/main/java/com/github/oeuvres/alix/lucene/AlixDocument.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,61 +13,96 @@
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.util.BytesRef;
import org.xml.sax.SAXException;

import com.github.oeuvres.alix.util.ML;

/**
* From HTML data, Create a lucene/alix document ready to index, with right
* fields names and types.
* From HTML data, Populate a lucene/alix document ready to index, with right
* fields names and types. Lucene document should be reusable, it will be cleared
* each time an id is set.
*
*/
public class AlixDocument
{
/** Lucene document to populate */
Document doc;
Document doc = new Document();
/** Non repeatable fields */
HashSet<String> uniks;
HashSet<String> uniks = new HashSet<>();
/** Required fields for this collection */
String[] required;
/**
 * Create a document builder with a list of required fields, checked when the
 * lucene document is requested.
 *
 * @param required Array of field names. A null array means no required
 *                 field. The array is copied, so that later modifications
 *                 by the caller do not affect this builder.
 */
public AlixDocument(final String[] required)
{
    // Defensive copy, and never keep null: the list is iterated when the
    // document is requested.
    this.required = (required == null) ? new String[0] : required.clone();
}

/**
* Create document, id is required, caller should ensure unicity.
* @param id
* Set id, caller should ensure unicity.
* @param id Unique among a collection.
* @return This for chaining.
*/
public AlixDocument(String id, String title, String byline)
public AlixDocument id(final String id)
{
if (bad(id)) {
throw new InvalidParameterException("An id is required for recall of documents, caller should ensure unicity.");
}
if (bad(title)) {
throw new InvalidParameterException("A title is mandatory, all document could have one.");
}
doc = new Document();
uniks = new HashSet<String>();
doc.clear();
uniks.clear();
catField(ALIX_ID, id);
catField(ALIX_TYPE, ARTICLE);
catField("title", title);

return this;
}

/**
 * Record the title of the current document, in the non repeatable field
 * "title". A blank value, or a title already set, is silently ignored by
 * {@link #catField(String, String)}.
 *
 * @param html Title of the document, tags allowed.
 * @return This for chaining.
 */
public AlixDocument title(String html)
{
    // catField() always gives back this builder
    return catField("title", html);
}

/**
 * Record the byline of the current document, in the non repeatable field
 * "byline". A blank value, or a byline already set, is silently ignored by
 * {@link #catField(String, String)}.
 *
 * @param html Byline of the document, tags allowed.
 * @return This for chaining.
 */
public AlixDocument byline(String html)
{
    // catField() always gives back this builder
    return catField("byline", html);
}

/**
* Set only one year by document.
*
* @param year
* @return This for chaining.
*/
public void year(int year)
public AlixDocument year(int year)
{

intField("year", year);
return this;
}

/**
* Add an author, repetition allowed.
*
* @param html Field value, tags allowed.
* @return This for chaining.
*/
public void author(String html)
public AlixDocument author(String html)
{
facetField("author", html);
return this;
}

/**
Expand All @@ -77,29 +112,32 @@ public void author(String html)
*
* @param name Field name.
* @param html Field value, tags allowed.
* @return This for chaining.
*/
public void catField(String name, String html)
public AlixDocument catField(String name, String html)
{
if (bad(html)) return;
if (uniks.contains(name)) return;
if (bad(html)) return this;
if (uniks.contains(name)) return this;
uniks.add(name);
doc.add(new StoredField(name, html));
String txt = ML.detag(html);
BytesRef bytes = new BytesRef(txt);
doc.add(new SortedDocValuesField(name, bytes));
doc.add(new SortedSetDocValuesField(name, bytes));
doc.add(new StringField(name, bytes, Field.Store.NO));
return this;
}

/**
* A field type repeatable for a document.
*
* @param name
* @param html
* @param name Field name.
* @param html Field value, tags allowed.
* @return This for chaining.
*/
public void facetField(String name, String html)
public AlixDocument facetField(String name, String html)
{
if (bad(html)) return;
if (bad(html)) return this;
// first field of this name, replicate content it with name1, for sorting
if (!uniks.contains(name)) {
catField(name+"1", html);
Expand All @@ -110,20 +148,35 @@ public void facetField(String name, String html)
BytesRef bytes = new BytesRef(txt);
doc.add(new StringField(name, bytes, Field.Store.NO));
doc.add(new SortedSetDocValuesField(name, bytes));
return this;
}

public void intField(final String name, final int value)
/**
* An int field, unique for a document, for sorting and grouping, ex: year.
*
* @param name Field name.
* @param value Field value.
* @return This for chaining.
*/
public AlixDocument intField(final String name, final int value)
{
if (uniks.contains(name)) {
return this;
}
uniks.add(name);
doc.add(new IntPoint(name, value)); // to search
doc.add(new StoredField(name, value)); // to show
doc.add(new NumericDocValuesField(name, value)); // to sort
return this;
}

/**
* Set the body, caller should have strip unnecessary contents
* @param html
* Set the body, caller should have stripped unnecessary content.
*
* @param html Field value.
* @return This for chaining.
*/
public void text(final String html)
public AlixDocument text(final String html)
{
final String name = TEXT;
if (bad(html)) {
Expand All @@ -136,8 +189,31 @@ public void text(final String html)
doc.add(new StoredField(name, html)); // text has to be stored for snippets and conc
doc.add(new Field(name, html, Alix.ftypeText)); // lemmas
doc.add(new Field(name + "_orth", html, Alix.ftypeText)); // orthographic forms
return this;
}

/**
 * Returns the built document. Before returning, the names of the required
 * fields are compared to the non repeatable fields already set; if some are
 * missing, a warning listing them is logged. The document is returned in
 * all cases, no exception is thrown for a missing field.
 *
 * @return The lucene document populated so far.
 */
public Document document()
{
    final StringBuilder missing = new StringBuilder();
    int count = 0;
    // tolerate a builder created without a list of required fields
    if (required != null) {
        for (final String name : required) {
            if (uniks.contains(name)) {
                continue;
            }
            // separator between names only, not before the first one
            if (count > 0) {
                missing.append(", ");
            }
            missing.append(name);
            count++;
        }
    }
    if (count > 0) {
        // Log rather than throw, callers relying on a document always
        // returned are not broken.
        java.util.logging.Logger.getLogger(AlixDocument.class.getName())
            .warning("Missing required field(s): " + missing);
    }
    return doc;
}

/**
* Check for blank strings
* @param string
Expand Down
25 changes: 25 additions & 0 deletions src/main/java/com/github/oeuvres/alix/util/RandomName.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package com.github.oeuvres.alix.util;

import java.util.concurrent.ThreadLocalRandom;

/**
 * Generator of random names, made of ASCII letters, digits and underscore.
 */
public class RandomName {
    /** Alphabet from which the chars of a name are drawn */
    private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz"
        + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        + "1234567890_";
    /** Count of chars available in the alphabet */
    private static final int ALPHABET_SIZE = ALPHABET.length();

    /**
     * Get a random name of a requested size.
     *
     * @param size Count of chars wanted; zero or less gives an empty name.
     * @return A name of {@code size} chars, each picked at random in the alphabet.
     */
    public static String name(int size)
    {
        // generator of the current thread, same instance for all the chars
        final ThreadLocalRandom random = ThreadLocalRandom.current();
        final StringBuilder name = new StringBuilder();
        for (int left = size; left > 0; left--) {
            final int index = random.nextInt(0, ALPHABET_SIZE);
            name.append(ALPHABET.charAt(index));
        }
        return name.toString();
    }
}

0 comments on commit 4f1f4c6

Please sign in to comment.