Skip to content

Commit

Permalink
OK for indexation
Browse files Browse the repository at this point in the history
  • Loading branch information
glorieux-f committed Dec 28, 2023
1 parent 71756b4 commit 15296d4
Show file tree
Hide file tree
Showing 9 changed files with 297 additions and 274 deletions.
4 changes: 2 additions & 2 deletions src/main/java/com/github/oeuvres/alix/cli/Load.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ public class Load implements Callable<Integer>
File[] conflist;
@Option(names = { "-u", "--unsafe" }, description = "For windows filesystem, no temp lucene index")
boolean unsafe;

@Option(names = {"-t", "--threads"}, description = "Number of threads fo indexation")
int threads;
/** File globs to index, populated by parsing base properties */
ArrayList<File> globs = new ArrayList<>();
Expand Down Expand Up @@ -416,7 +416,7 @@ public void writeUnsafe(final File dstdir, final String name) throws Exception
}
try {
// only one thread
threads = 1;
// threads = 1;
write(name, theDir.toPath());
}
catch (Exception e) {
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/com/github/oeuvres/alix/lucene/Alix.java
Original file line number Diff line number Diff line change
Expand Up @@ -867,10 +867,11 @@ public IndexWriter writer(final Similarity similarity) throws IOException
if (writer != null && writer.isOpen())
return writer;
IndexWriterConfig conf = new IndexWriterConfig(analyzer);
conf.setUseCompoundFile(false); // show separate file by segment
// Use false for batch indexing with very large ram buffer settings.
conf.setUseCompoundFile(false);
// If needed, increase the JVM max heap size (e.g. add -Xmx512m or
// -Xmx1g):
conf.setRAMBufferSizeMB(2048.0);
conf.setRAMBufferSizeMB(1024.0);
conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
//
if (similarity != null)
Expand Down
8 changes: 5 additions & 3 deletions src/main/java/com/github/oeuvres/alix/lucene/SAXIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@
import java.util.logging.Logger;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
Expand Down Expand Up @@ -192,9 +194,9 @@ else if (book != null)
break;
case TEXT:
// at this point, impossible to get document stats, tokens will be played when
// writer will
// add document(s). Caching tokens is a bad idea for big books, forget, do not
// retry.
// writer will add document(s).
// cachingTokenFilter used to be memory expensive
// TeeSinkTokenFilter will need to define analysis strategy here
doc.add(new StoredField(name, text)); // text has to be stored for snippets and conc
doc.add(new Field(name, text, Alix.ftypeText));
String name_orth = name + "_orth";
Expand Down
Original file line number Diff line number Diff line change
@@ -1,158 +1,164 @@
/*
* Alix, A Lucene Indexer for XML documents.
*
* Copyright 2009 Pierre Dittgen <[email protected]>
* Frédéric Glorieux <[email protected]>
* Copyright 2016 Frédéric Glorieux <[email protected]>
*
* Alix is a java library to index and search XML text documents
* with Lucene https://lucene.apache.org/core/
* including linguistic expertness for French,
* available under Apache license.
*
* Alix has been started in 2009 under the javacrim project
* https://sf.net/projects/javacrim/
* for a java course at Inalco http://www.er-tim.fr/
* Alix continues the concepts of SDX under another licence
* «Système de Documentation XML»
* 2000-2010 Ministère de la culture et de la communication (France), AJLSM.
* http://savannah.nongnu.org/projects/sdx/
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.oeuvres.alix.lucene.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import com.github.oeuvres.alix.fr.Tag;
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.CharsLemAtt;
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.CharsOrthAtt;

/**
 * A final token filter before indexation, to plug after a lemmatizer filter,
 * providing the most significant tokens for a word cloud.
 * Index lemma instead of forms when available.
 * Strip punctuation and numbers.
 * Positions of stripped tokens are deleted.
 * This allows simple computation of a token context
 * (ex: span queries, co-occurrences).
 */
public class FlagCloudFilter extends TokenFilter
{
    /** The term provided by the Tokenizer */
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    /** The position increment (updated when positions are stripped) */
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    /** A linguistic category as a short number, see {@link Tag} */
    private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
    /** A normalized orthographic form */
    private final CharsOrthAtt orthAtt = addAttribute(CharsOrthAtt.class);
    /** A lemma when possible */
    private final CharsLemAtt lemAtt = addAttribute(CharsLemAtt.class);
    /** Output a token at punctuation positions, or strip it? */
    boolean pun;
    /** Positions skipped since the last accepted token, to keep right position order */
    private int skippedPositions;

    /**
     * Build the filter.
     *
     * @param in  the upstream token stream
     * @param pun true to keep a token at punctuation positions (emitted as an
     *            empty term), false to strip punctuation
     */
    public FlagCloudFilter(TokenStream in, boolean pun)
    {
        super(in);
        this.pun = pun;
    }

    @Override
    public final boolean incrementToken() throws IOException
    {
        // Skipping positions creates holes: the count of tokens will differ from
        // the count of positions. Accumulate the increments of rejected tokens so
        // the next accepted token carries them.
        skippedPositions = 0;
        while (input.incrementToken()) {
            if (accept()) {
                if (skippedPositions != 0) {
                    posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
                }
                return true;
            }
            skippedPositions += posIncrAtt.getPositionIncrement();
        }
        return false;
    }

    /**
     * Most of the tokens are not rejected but rewritten in place: punctuation is
     * blanked (unless punctuation is kept), numbers are unified under "NUM", and
     * the term is replaced by its lemma, or its normalized orthographic form,
     * when available.
     *
     * @return true if the current token should be emitted
     * @throws IOException from the underlying stream
     */
    protected boolean accept() throws IOException
    {
        int tag = flagsAtt.getFlags();
        // debug trace for tokens explicitly tagged for testing
        if (tag == Tag.TEST.flag) {
            System.out.println(termAtt + " — " + orthAtt);
        }
        // record an empty token at punctuation position
        if (Tag.PUN.sameParent(tag) && !pun) {
            termAtt.setEmpty();
        }
        // unify numbers
        else if (Tag.NUM.sameParent(tag)) {
            termAtt.setEmpty().append("NUM");
        }
        // replace term by lemma when available
        else if (lemAtt.length() != 0) {
            termAtt.setEmpty().append(lemAtt);
        }
        // or take the normalized form
        else if (orthAtt.length() != 0) {
            termAtt.setEmpty().append(orthAtt);
        }
        // names pass through unchanged (previous experiments filtering initials
        // and first names were abandoned)
        return true;
    }

    @Override
    public void reset() throws IOException
    {
        super.reset();
        skippedPositions = 0;
    }

    @Override
    public void end() throws IOException
    {
        super.end();
        // report positions skipped after the last accepted token, as required by
        // the TokenStream end() contract (see Lucene FilteringTokenFilter)
        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
    }

}
/*
* Alix, A Lucene Indexer for XML documents.
*
* Copyright 2009 Pierre Dittgen <[email protected]>
* Frédéric Glorieux <[email protected]>
* Copyright 2016 Frédéric Glorieux <[email protected]>
*
* Alix is a java library to index and search XML text documents
* with Lucene https://lucene.apache.org/core/
* including linguistic expertness for French,
* available under Apache license.
*
* Alix has been started in 2009 under the javacrim project
* https://sf.net/projects/javacrim/
* for a java course at Inalco http://www.er-tim.fr/
* Alix continues the concepts of SDX under another licence
* «Système de Documentation XML»
* 2000-2010 Ministère de la culture et de la communication (France), AJLSM.
* http://savannah.nongnu.org/projects/sdx/
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.oeuvres.alix.lucene.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import com.github.oeuvres.alix.fr.Tag;
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.CharsLemAtt;
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.CharsOrthAtt;

/**
 * A final token filter before indexation, to plug after a lemmatizer filter,
 * providing the most significant tokens for a word cloud.
 * Index lemma instead of forms when available.
 * Strip punctuation and numbers.
 * Positions of stripped tokens are deleted.
 * This allows simple computation of a token context
 * (ex: span queries, co-occurrences).
 */
public class FlagCloudFilter extends TokenFilter
{
    /** The term provided by the Tokenizer */
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    /** The position increment (updated when positions are stripped) */
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    /** A linguistic category as a short number, see {@link Tag} */
    private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
    /** A normalized orthographic form */
    private final CharsOrthAtt orthAtt = addAttribute(CharsOrthAtt.class);
    /** A lemma when possible */
    private final CharsLemAtt lemAtt = addAttribute(CharsLemAtt.class);
    /** Output a token at punctuation positions, or strip it? */
    boolean pun;
    /** Positions skipped since the last accepted token, to keep right position order */
    private int skippedPositions;

    /**
     * Build the filter, stripping punctuation.
     *
     * @param in the upstream token stream
     */
    public FlagCloudFilter(TokenStream in)
    {
        // delegate instead of duplicating the 2-arg constructor body
        this(in, false);
    }

    /**
     * Build the filter.
     *
     * @param in  the upstream token stream
     * @param pun true to keep a token at punctuation positions (emitted as an
     *            empty term), false to strip punctuation
     */
    public FlagCloudFilter(TokenStream in, boolean pun)
    {
        super(in);
        this.pun = pun;
    }

    @Override
    public final boolean incrementToken() throws IOException
    {
        // Skipping positions creates holes: the count of tokens will differ from
        // the count of positions. Accumulate the increments of rejected tokens so
        // the next accepted token carries them.
        skippedPositions = 0;
        while (input.incrementToken()) {
            if (accept()) {
                if (skippedPositions != 0) {
                    posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
                }
                return true;
            }
            skippedPositions += posIncrAtt.getPositionIncrement();
        }
        return false;
    }

    /**
     * Most of the tokens are not rejected but rewritten in place: punctuation is
     * blanked (unless punctuation is kept), numbers are unified under "NUM", and
     * the term is replaced by its lemma, or its normalized orthographic form,
     * when available.
     *
     * @return true if the current token should be emitted
     * @throws IOException from the underlying stream
     */
    protected boolean accept() throws IOException
    {
        int tag = flagsAtt.getFlags();
        // debug trace for tokens explicitly tagged for testing
        if (tag == Tag.TEST.flag) {
            System.out.println(termAtt + " — " + orthAtt);
        }
        // record an empty token at punctuation position
        if (Tag.PUN.sameParent(tag) && !pun) {
            termAtt.setEmpty();
        }
        // unify numbers
        else if (Tag.NUM.sameParent(tag)) {
            termAtt.setEmpty().append("NUM");
        }
        // replace term by lemma when available
        else if (lemAtt.length() != 0) {
            termAtt.setEmpty().append(lemAtt);
        }
        // or take the normalized form
        else if (orthAtt.length() != 0) {
            termAtt.setEmpty().append(orthAtt);
        }
        // names pass through unchanged (previous experiments filtering initials
        // and first names were abandoned)
        return true;
    }

    @Override
    public void reset() throws IOException
    {
        super.reset();
        skippedPositions = 0;
    }

    @Override
    public void end() throws IOException
    {
        super.end();
        // report positions skipped after the last accepted token, as required by
        // the TokenStream end() contract (see Lucene FilteringTokenFilter)
        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
    }

}
Loading

0 comments on commit 15296d4

Please sign in to comment.