Skip to content

Commit

Permalink
OK for indexation
Browse files Browse the repository at this point in the history
  • Loading branch information
glorieux-f committed Dec 28, 2023
1 parent 71756b4 commit 15296d4
Show file tree
Hide file tree
Showing 9 changed files with 297 additions and 274 deletions.
4 changes: 2 additions & 2 deletions src/main/java/com/github/oeuvres/alix/cli/Load.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ public class Load implements Callable<Integer>
File[] conflist;
@Option(names = { "-u", "--unsafe" }, description = "For windows filesystem, no temp lucene index")
boolean unsafe;

@Option(names = {"-t", "--threads"}, description = "Number of threads fo indexation")
int threads;
/** File globs to index, populated by parsing base properties */
ArrayList<File> globs = new ArrayList<>();
Expand Down Expand Up @@ -416,7 +416,7 @@ public void writeUnsafe(final File dstdir, final String name) throws Exception
}
try {
// only one thread
threads = 1;
// threads = 1;
write(name, theDir.toPath());
}
catch (Exception e) {
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/com/github/oeuvres/alix/lucene/Alix.java
Original file line number Diff line number Diff line change
Expand Up @@ -867,10 +867,11 @@ public IndexWriter writer(final Similarity similarity) throws IOException
if (writer != null && writer.isOpen())
return writer;
IndexWriterConfig conf = new IndexWriterConfig(analyzer);
conf.setUseCompoundFile(false); // show separate file by segment
// Use false for batch indexing with very large ram buffer settings.
conf.setUseCompoundFile(false);
// If needed, increase the JVM max heap size (e.g. add -Xmx512m or
// -Xmx1g):
conf.setRAMBufferSizeMB(2048.0);
conf.setRAMBufferSizeMB(1024.0);
conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
//
if (similarity != null)
Expand Down
8 changes: 5 additions & 3 deletions src/main/java/com/github/oeuvres/alix/lucene/SAXIndexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@
import java.util.logging.Logger;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
Expand Down Expand Up @@ -192,9 +194,9 @@ else if (book != null)
break;
case TEXT:
// at this point, impossible to get document stats, tokens will be played when
// writer will
// add document(s). Caching tokens is a bad idea for big books, forget, do not
// retry.
// writer will add document(s).
// cachingTokenFilter used to be memory expensive
// TeeSinkTokenFilter will need to define analysis strategy here
doc.add(new StoredField(name, text)); // text has to be stored for snippets and conc
doc.add(new Field(name, text, Alix.ftypeText));
String name_orth = name + "_orth";
Expand Down
Original file line number Diff line number Diff line change
@@ -1,158 +1,164 @@
/*
* Alix, A Lucene Indexer for XML documents.
*
* Copyright 2009 Pierre Dittgen <[email protected]>
* Frédéric Glorieux <[email protected]>
* Copyright 2016 Frédéric Glorieux <[email protected]>
*
* Alix is a java library to index and search XML text documents
* with Lucene https://lucene.apache.org/core/
* including linguistic expertness for French,
* available under Apache license.
*
* Alix has been started in 2009 under the javacrim project
* https://sf.net/projects/javacrim/
* for a java course at Inalco http://www.er-tim.fr/
* Alix continues the concepts of SDX under another licence
* «Système de Documentation XML»
* 2000-2010 Ministère de la culture et de la communication (France), AJLSM.
* http://savannah.nongnu.org/projects/sdx/
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.oeuvres.alix.lucene.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import com.github.oeuvres.alix.fr.Tag;
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.CharsLemAtt;
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.CharsOrthAtt;

/**
 * A final token filter before indexation, to plug after a lemmatizer filter,
 * providing the most significant tokens for a word cloud.
 * Index lemma instead of forms when available.
 * Strip punctuation and numbers.
 * Positions of stripped tokens are deleted.
 * This allows simple computation of a token context
 * (ex: span queries, co-occurrences).
 */
public class FlagCloudFilter extends TokenFilter
{
    /** The term provided by the Tokenizer */
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    /** The position increment (updated when positions are stripped) */
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    /** A linguistic category as a short number, see {@link Tag} */
    private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
    /** A normalized orthographic form */
    private final CharsOrthAtt orthAtt = addAttribute(CharsOrthAtt.class);
    /** A lemma when possible */
    private final CharsLemAtt lemAtt = addAttribute(CharsLemAtt.class);
    /** Output a token at punctuation positions, or strip it? */
    boolean pun;
    /** Positions skipped since the last accepted token, to keep right position order */
    private int skippedPositions;

    /**
     * Build the filter.
     *
     * @param in  the upstream token stream
     * @param pun true to keep a token at punctuation positions (emitted as an
     *            empty term), false to strip punctuation
     */
    public FlagCloudFilter(TokenStream in, boolean pun)
    {
        super(in);
        this.pun = pun;
    }

    @Override
    public final boolean incrementToken() throws IOException
    {
        // Skipping positions creates holes: the count of tokens will differ from
        // the count of positions. Accumulate the increments of rejected tokens so
        // the next accepted token carries them.
        skippedPositions = 0;
        while (input.incrementToken()) {
            if (accept()) {
                if (skippedPositions != 0) {
                    posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
                }
                return true;
            }
            skippedPositions += posIncrAtt.getPositionIncrement();
        }
        return false;
    }

    /**
     * Most of the tokens are not rejected but rewritten in place: punctuation is
     * blanked (unless punctuation is kept), numbers are unified under "NUM", and
     * the term is replaced by its lemma, or its normalized orthographic form,
     * when available.
     *
     * @return true if the current token should be emitted
     * @throws IOException from the underlying stream
     */
    protected boolean accept() throws IOException
    {
        int tag = flagsAtt.getFlags();
        // debug trace for tokens explicitly tagged for testing
        if (tag == Tag.TEST.flag) {
            System.out.println(termAtt + " — " + orthAtt);
        }
        // record an empty token at punctuation position
        if (Tag.PUN.sameParent(tag) && !pun) {
            termAtt.setEmpty();
        }
        // unify numbers
        else if (Tag.NUM.sameParent(tag)) {
            termAtt.setEmpty().append("NUM");
        }
        // replace term by lemma when available
        else if (lemAtt.length() != 0) {
            termAtt.setEmpty().append(lemAtt);
        }
        // or take the normalized form
        else if (orthAtt.length() != 0) {
            termAtt.setEmpty().append(orthAtt);
        }
        // names pass through unchanged (previous experiments filtering initials
        // and first names were abandoned)
        return true;
    }

    @Override
    public void reset() throws IOException
    {
        super.reset();
        skippedPositions = 0;
    }

    @Override
    public void end() throws IOException
    {
        super.end();
        // report positions skipped after the last accepted token, as required by
        // the TokenStream end() contract (see Lucene FilteringTokenFilter)
        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
    }

}
/*
* Alix, A Lucene Indexer for XML documents.
*
* Copyright 2009 Pierre Dittgen <[email protected]>
* Frédéric Glorieux <[email protected]>
* Copyright 2016 Frédéric Glorieux <[email protected]>
*
* Alix is a java library to index and search XML text documents
* with Lucene https://lucene.apache.org/core/
* including linguistic expertness for French,
* available under Apache license.
*
* Alix has been started in 2009 under the javacrim project
* https://sf.net/projects/javacrim/
* for a java course at Inalco http://www.er-tim.fr/
* Alix continues the concepts of SDX under another licence
* «Système de Documentation XML»
* 2000-2010 Ministère de la culture et de la communication (France), AJLSM.
* http://savannah.nongnu.org/projects/sdx/
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.oeuvres.alix.lucene.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import com.github.oeuvres.alix.fr.Tag;
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.CharsLemAtt;
import com.github.oeuvres.alix.lucene.analysis.tokenattributes.CharsOrthAtt;

/**
 * A final token filter before indexation, to plug after a lemmatizer filter,
 * providing the most significant tokens for a word cloud.
 * Index lemma instead of forms when available.
 * Strip punctuation and numbers.
 * Positions of stripped tokens are deleted.
 * This allows simple computation of a token context
 * (ex: span queries, co-occurrences).
 */
public class FlagCloudFilter extends TokenFilter
{
    /** The term provided by the Tokenizer */
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    /** The position increment (updated when positions are stripped) */
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    /** A linguistic category as a short number, see {@link Tag} */
    private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
    /** A normalized orthographic form */
    private final CharsOrthAtt orthAtt = addAttribute(CharsOrthAtt.class);
    /** A lemma when possible */
    private final CharsLemAtt lemAtt = addAttribute(CharsLemAtt.class);
    /** Output a token at punctuation positions, or strip it? */
    boolean pun;
    /** Positions skipped since the last accepted token, to keep right position order */
    private int skippedPositions;

    /**
     * Build the filter, stripping punctuation.
     *
     * @param in the upstream token stream
     */
    public FlagCloudFilter(TokenStream in)
    {
        // delegate instead of duplicating the 2-arg constructor body
        this(in, false);
    }

    /**
     * Build the filter.
     *
     * @param in  the upstream token stream
     * @param pun true to keep a token at punctuation positions (emitted as an
     *            empty term), false to strip punctuation
     */
    public FlagCloudFilter(TokenStream in, boolean pun)
    {
        super(in);
        this.pun = pun;
    }

    @Override
    public final boolean incrementToken() throws IOException
    {
        // Skipping positions creates holes: the count of tokens will differ from
        // the count of positions. Accumulate the increments of rejected tokens so
        // the next accepted token carries them.
        skippedPositions = 0;
        while (input.incrementToken()) {
            if (accept()) {
                if (skippedPositions != 0) {
                    posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
                }
                return true;
            }
            skippedPositions += posIncrAtt.getPositionIncrement();
        }
        return false;
    }

    /**
     * Most of the tokens are not rejected but rewritten in place: punctuation is
     * blanked (unless punctuation is kept), numbers are unified under "NUM", and
     * the term is replaced by its lemma, or its normalized orthographic form,
     * when available.
     *
     * @return true if the current token should be emitted
     * @throws IOException from the underlying stream
     */
    protected boolean accept() throws IOException
    {
        int tag = flagsAtt.getFlags();
        // debug trace for tokens explicitly tagged for testing
        if (tag == Tag.TEST.flag) {
            System.out.println(termAtt + " — " + orthAtt);
        }
        // record an empty token at punctuation position
        if (Tag.PUN.sameParent(tag) && !pun) {
            termAtt.setEmpty();
        }
        // unify numbers
        else if (Tag.NUM.sameParent(tag)) {
            termAtt.setEmpty().append("NUM");
        }
        // replace term by lemma when available
        else if (lemAtt.length() != 0) {
            termAtt.setEmpty().append(lemAtt);
        }
        // or take the normalized form
        else if (orthAtt.length() != 0) {
            termAtt.setEmpty().append(orthAtt);
        }
        // names pass through unchanged (previous experiments filtering initials
        // and first names were abandoned)
        return true;
    }

    @Override
    public void reset() throws IOException
    {
        super.reset();
        skippedPositions = 0;
    }

    @Override
    public void end() throws IOException
    {
        super.end();
        // report positions skipped after the last accepted token, as required by
        // the TokenStream end() contract (see Lucene FilteringTokenFilter)
        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
    }

}
Loading

0 comments on commit 15296d4

Please sign in to comment.