diff --git a/PCRE2_API.md b/PCRE2_API.md index 5cb8d31..34faa33 100644 --- a/PCRE2_API.md +++ b/PCRE2_API.md @@ -27,9 +27,9 @@ Here's the list of the PCRE2 API functions exposed via `org.pcre4j.api.IPcre2` a | ✅ | [pcre2_get_ovector_count](https://www.pcre.org/current/doc/html/pcre2_get_ovector_count.html) | Get the ovector count | | ✅ | [pcre2_get_ovector_pointer](https://www.pcre.org/current/doc/html/pcre2_get_ovector_pointer.html) | Get a pointer to the ovector | | | [pcre2_get_startchar](https://www.pcre.org/current/doc/html/pcre2_get_startchar.html) | Get the starting character offset | -| | [pcre2_jit_compile](https://www.pcre.org/current/doc/html/pcre2_jit_compile.html) | Process a compiled pattern with the JIT compiler | +| ✅ | [pcre2_jit_compile](https://www.pcre.org/current/doc/html/pcre2_jit_compile.html) | Process a compiled pattern with the JIT compiler | | | [pcre2_jit_free_unused_memory](https://www.pcre.org/current/doc/html/pcre2_jit_free_unused_memory.html) | Free unused JIT memory | -| | [pcre2_jit_match](https://www.pcre.org/current/doc/html/pcre2_jit_match.html) | Fast path interface to JIT matching | +| ✅ | [pcre2_jit_match](https://www.pcre.org/current/doc/html/pcre2_jit_match.html) | Fast path interface to JIT matching | | | [pcre2_jit_stack_assign](https://www.pcre.org/current/doc/html/pcre2_jit_stack_assign.html) | Assign stack for JIT matching | | | [pcre2_jit_stack_create](https://www.pcre.org/current/doc/html/pcre2_jit_stack_create.html) | Create a stack for JIT matching | | | [pcre2_jit_stack_free](https://www.pcre.org/current/doc/html/pcre2_jit_stack_free.html) | Free a JIT matching stack | diff --git a/README.md b/README.md index 8303a2c..d68ef15 100644 --- a/README.md +++ b/README.md @@ -91,12 +91,7 @@ Add the following dependencies to your `pom.xml` file: Proceed using the PCRE4J library in your Java code: ```java -import org.pcre4j.Pcre2Code; -import org.pcre4j.Pcre2CompileOption; -import org.pcre4j.Pcre2MatchData; -import 
org.pcre4j.Pcre2MatchOption; -import org.pcre4j.Pcre4j; -import org.pcre4j.Pcre4jUtils; +import org.pcre4j.*; // TODO: Select one of the following imports for the backend you want to use: import org.pcre4j.jna.Pcre2; // import org.pcre4j.ffm.Pcre2; @@ -107,11 +102,21 @@ public class Usage { } public static String[] example(String pattern, String subject) { - final var code = new Pcre2Code( - pattern, - EnumSet.noneOf(Pcre2CompileOption.class), - null - ); + final Pcre2Code code; + if (Pcre4jUtils.isJitSupported(Pcre4j.api())) { + code = new Pcre2JitCode( + pattern, + EnumSet.noneOf(Pcre2CompileOption.class), + null, + null + ); + } else { + code = new Pcre2Code( + pattern, + EnumSet.noneOf(Pcre2CompileOption.class), + null + ); + } final var matchData = new Pcre2MatchData(code); code.match( subject, @@ -175,8 +180,8 @@ The PCRE4J library supports several backends to invoke the `pcre2` API. ### `jna` The `jna` backend uses the [Java Native Access](https://github.com/java-native-access/jna) library to invoke the `pcre2` -shared library. For this backend to work, the `pcre2` shared library must be installed on the system and be visible to -the JNA. +shared library. For this backend to work, the `pcre2` shared library must be installed on the system and be visible via +`jna.library.path`. ### `ffm` diff --git a/api/src/main/java/org/pcre4j/api/IPcre2.java b/api/src/main/java/org/pcre4j/api/IPcre2.java index 15b17fc..29eec27 100644 --- a/api/src/main/java/org/pcre4j/api/IPcre2.java +++ b/api/src/main/java/org/pcre4j/api/IPcre2.java @@ -824,6 +824,29 @@ public interface IPcre2 { */ public int patternInfo(long code, int what, ByteBuffer where); + /** + * JIT-compile a compiled pattern. + * + * @param code the compiled pattern handle + * @param options option bits + * @return 0 on success, otherwise a negative error code + */ + public int jitCompile(long code, int options); + + /** + * Match a compiled pattern against a subject string. 
+ * + * @param code the compiled pattern handle + * @param subject the subject string + * @param startoffset the starting offset in the subject string + * @param options option bits + * @param matchData the match data handle + * @param mcontext the match context handle + * @return the number of captures plus one, zero if the {@code matchData} is too small, or a negative value if there + * was no match or an actual error occurred + */ + public int jitMatch(long code, String subject, int startoffset, int options, long matchData, long mcontext); + /** * Create a new match data block. * diff --git a/ffm/src/main/java/org/pcre4j/ffm/Pcre2.java b/ffm/src/main/java/org/pcre4j/ffm/Pcre2.java index e4a68db..f5cc9e8 100644 --- a/ffm/src/main/java/org/pcre4j/ffm/Pcre2.java +++ b/ffm/src/main/java/org/pcre4j/ffm/Pcre2.java @@ -45,6 +45,9 @@ public class Pcre2 implements IPcre2 { private final MethodHandle pcre2_get_error_message; private final MethodHandle pcre2_pattern_info; + private final MethodHandle pcre2_jit_compile; + private final MethodHandle pcre2_jit_match; + private final MethodHandle pcre2_match_data_create; private final MethodHandle pcre2_match_data_create_from_pattern; private final MethodHandle pcre2_match_data_free; @@ -178,6 +181,27 @@ public Pcre2(String library, String suffix) { ) ); + pcre2_jit_compile = LINKER.downcallHandle( + SYMBOL_LOOKUP.find("pcre2_jit_compile" + suffix).orElseThrow(), + FunctionDescriptor.of(ValueLayout.JAVA_INT, // int + ValueLayout.ADDRESS, // pcre2_code* + ValueLayout.JAVA_INT // int + ) + ); + + pcre2_jit_match = LINKER.downcallHandle( + SYMBOL_LOOKUP.find("pcre2_jit_match" + suffix).orElseThrow(), + FunctionDescriptor.of(ValueLayout.JAVA_INT, // int + ValueLayout.ADDRESS, // pcre2_code* + ValueLayout.ADDRESS, // PCRE2_SPTR + ValueLayout.ADDRESS, // PCRE2_SIZE + ValueLayout.ADDRESS, // PCRE2_SIZE + ValueLayout.JAVA_INT, // int + ValueLayout.ADDRESS, // pcre2_match_data* + ValueLayout.ADDRESS // pcre2_match_context* + ) + ); + 
pcre2_match_data_create = LINKER.downcallHandle( SYMBOL_LOOKUP.find("pcre2_match_data_create" + suffix).orElseThrow(), FunctionDescriptor.of(ValueLayout.ADDRESS, // pcre2_match_data* @@ -569,6 +593,48 @@ public int patternInfo(long code, int what, ByteBuffer where) { } } + @Override + public int jitCompile(long code, int options) { + try (var arena = Arena.ofConfined()) { + final var pCode = MemorySegment.ofAddress(code); + + return (int) pcre2_jit_compile.invokeExact( + pCode, + options + ); + } catch (Throwable e) { + throw new RuntimeException(e); + } + } + + @Override + public int jitMatch(long code, String subject, int startoffset, int options, long matchData, long mcontext) { + if (subject == null) { + throw new IllegalArgumentException("subject must not be null"); + } + + try (var arena = Arena.ofConfined()) { + final var pCode = MemorySegment.ofAddress(code); + final var pszSubject = arena.allocateUtf8String(subject); + final var subjectLength = MemorySegment.ofAddress(pszSubject.byteSize() - 1); + final var startOffset = MemorySegment.ofAddress(startoffset); + final var pMatchData = MemorySegment.ofAddress(matchData); + final var pMatchContext = MemorySegment.ofAddress(mcontext); + + return (int) pcre2_jit_match.invokeExact( + pCode, + pszSubject, + subjectLength, + startOffset, + options, + pMatchData, + pMatchContext + ); + } catch (Throwable e) { + throw new RuntimeException(e); + } + } + @Override public long matchDataCreate(int ovecsize, long gcontext) { try (var arena = Arena.ofConfined()) { diff --git a/jna/src/main/java/org/pcre4j/jna/Pcre2.java b/jna/src/main/java/org/pcre4j/jna/Pcre2.java index 05a34e1..a4e693f 100644 --- a/jna/src/main/java/org/pcre4j/jna/Pcre2.java +++ b/jna/src/main/java/org/pcre4j/jna/Pcre2.java @@ -238,6 +238,30 @@ public int patternInfo(long code, int what, ByteBuffer where) { return result; } + @Override + public int jitCompile(long code, int options) { + return library.pcre2_jit_compile(new Pointer(code), options); + } + 
+ @Override + public int jitMatch(long code, String subject, int startoffset, int options, long matchData, long mcontext) { + if (subject == null) { + throw new IllegalArgumentException("subject must not be null"); + } + + final var pszSubject = subject.getBytes(StandardCharsets.UTF_8); + + return library.pcre2_jit_match( + new Pointer(code), + pszSubject, + pszSubject.length, + startoffset, + options, + new Pointer(matchData), + new Pointer(mcontext) + ); + } + @Override public long matchDataCreate(int ovecsize, long gcontext) { Pointer matchData = library.pcre2_match_data_create(ovecsize, new Pointer(gcontext)); @@ -336,6 +360,18 @@ Pointer pcre2_compile( int pcre2_pattern_info(Pointer code, int what, Pointer where); + int pcre2_jit_compile(Pointer code, int options); + + int pcre2_jit_match( + Pointer code, + byte[] subject, + long length, + long startoffset, + int options, + Pointer matchData, + Pointer mcontext + ); + Pointer pcre2_match_data_create(int ovecsize, Pointer gcontext); Pointer pcre2_match_data_create_from_pattern(Pointer code, Pointer gcontext); diff --git a/lib/src/main/java/org/pcre4j/Pcre2Code.java b/lib/src/main/java/org/pcre4j/Pcre2Code.java index af9557f..4e4fd9c 100644 --- a/lib/src/main/java/org/pcre4j/Pcre2Code.java +++ b/lib/src/main/java/org/pcre4j/Pcre2Code.java @@ -22,6 +22,9 @@ import org.pcre4j.api.IPcre2; +/** + * A compiled pattern. 
+ */ public class Pcre2Code { private static final Cleaner cleaner = Cleaner.create(); @@ -45,10 +48,22 @@ public class Pcre2Code { * Constructor for Pcre2Code * * @param pattern the pattern to compile - * @param options the flags to compile the pattern with, see {@link Pcre2CompileOption} + * @param options the flags to compile the pattern with, see {@link Pcre2CompileOption} or null for default + * options * @param compileContext the compile context to use or null */ - public Pcre2Code(String pattern, EnumSet options, Pcre2CompileContext compileContext) { + public Pcre2Code( + String pattern, + EnumSet options, + Pcre2CompileContext compileContext + ) { + if (pattern == null) { + throw new IllegalArgumentException("pattern cannot be null"); + } + if (options == null) { + options = EnumSet.noneOf(Pcre2CompileOption.class); + } + final var api = Pcre4j.api(); final var errorcode = new int[1]; @@ -413,25 +428,10 @@ public int match( throw new IllegalArgumentException("matchData must not be null"); } - // For the UTF-8, convert the startOffset from characters to bytes - var startOffsetInBytes = 0; - for (var charIndex = 0; charIndex < startOffset; charIndex++) { - final var theChar = subject.charAt(charIndex); - if (theChar <= 0x007F) { - startOffsetInBytes += 1; - } else if (theChar <= 0x07FF) { - startOffsetInBytes += 2; - } else if (Character.isHighSurrogate(theChar) || Character.isLowSurrogate(theChar)) { - startOffsetInBytes += 2; - } else { - startOffsetInBytes += 3; - } - } - return api.match( handle, subject, - startOffsetInBytes, + Pcre4jUtils.convertCharacterIndexToByteOffset(subject, startOffset), options .stream() .mapToInt(Pcre2MatchOption::value) diff --git a/lib/src/main/java/org/pcre4j/Pcre2JitCode.java b/lib/src/main/java/org/pcre4j/Pcre2JitCode.java new file mode 100644 index 0000000..f9f9ceb --- /dev/null +++ b/lib/src/main/java/org/pcre4j/Pcre2JitCode.java @@ -0,0 +1,98 @@ +package org.pcre4j; + +import java.util.EnumSet; + +/** + * A JIT-compiled 
pattern. + */ +public class Pcre2JitCode extends Pcre2Code { + + /** + * The supported match options for JIT-compiled patterns. + */ + private final static EnumSet SUPPORTED_MATCH_OPTIONS = EnumSet.of( + Pcre2MatchOption.NOTBOL, + Pcre2MatchOption.NOTEOL, + Pcre2MatchOption.NOTEMPTY, + Pcre2MatchOption.NOTEMPTY_ATSTART, + Pcre2MatchOption.PARTIAL_HARD, + Pcre2MatchOption.PARTIAL_SOFT + ); + + /** + * Get the supported match options for JIT-compiled patterns. + * + * @return the supported match options + */ + public static EnumSet getSupportedMatchOptions() { + return EnumSet.copyOf(SUPPORTED_MATCH_OPTIONS); + } + + /** + * Constructor for Pcre2JitCode + * + * @param pattern the pattern to compile + * @param options the flags to compile the pattern with, see {@link Pcre2CompileOption} or null for default + * options + * @param jitOptions the flags to compile the pattern with JIT, see {@link Pcre2JitOption} or null for default + * options + * @param compileContext the compile context to use or null + */ + public Pcre2JitCode( + String pattern, + EnumSet options, + EnumSet jitOptions, + Pcre2CompileContext compileContext + ) { + super(pattern, options, compileContext); + + if (jitOptions == null) { + jitOptions = EnumSet.of( + Pcre2JitOption.COMPLETE, + Pcre2JitOption.PARTIAL_SOFT, + Pcre2JitOption.PARTIAL_HARD + ); + } + + final var jitResult = api.jitCompile( + handle, + jitOptions + .stream() + .mapToInt(Pcre2JitOption::value).sum() + ); + if (jitResult != 0) { + throw new IllegalStateException(Pcre4jUtils.getErrorMessage(api, jitResult)); + } + } + + @Override + public int match( + String subject, + int startOffset, + EnumSet options, + Pcre2MatchData matchData, + Pcre2MatchContext matchContext + ) { + if (subject == null) { + throw new IllegalArgumentException("subject must not be null"); + } + if (startOffset < 0) { + throw new IllegalArgumentException("startOffset must be greater than or equal to zero"); + } + if (startOffset >= subject.length()) { + throw new 
IllegalArgumentException("startOffset must be less than the length of the subject"); + } + if (matchData == null) { + throw new IllegalArgumentException("matchData must not be null"); + } + + return api.jitMatch( + handle, + subject, + Pcre4jUtils.convertCharacterIndexToByteOffset(subject, startOffset), + options != null ? options.stream().mapToInt(Pcre2MatchOption::value).sum() : 0, + matchData.handle, + matchContext != null ? matchContext.handle : 0 + ); + } +} diff --git a/lib/src/main/java/org/pcre4j/Pcre4jUtils.java b/lib/src/main/java/org/pcre4j/Pcre4jUtils.java index a07d2cb..3696054 100644 --- a/lib/src/main/java/org/pcre4j/Pcre4jUtils.java +++ b/lib/src/main/java/org/pcre4j/Pcre4jUtils.java @@ -319,6 +319,41 @@ public static EnumSet getCompiledWidths(IPcre2 api) { return widths; } + /** + * Convert a character (UTF-16 code unit) index to the corresponding UTF-8 byte offset. + * + * @param input the input string + * @param index the character index, must be in the range {@code [0, input.length())} + * @return the number of UTF-8 bytes that precede the character at {@code index} + */ + public static int convertCharacterIndexToByteOffset(String input, int index) { + if (input == null) { + throw new IllegalArgumentException("input must not be null"); + } + if (index < 0) { + throw new IllegalArgumentException("index must be non-negative"); + } + if (index >= input.length()) { + throw new IllegalArgumentException("index must be within the bounds of the input string"); + } + + var offset = 0; + for (var charIndex = 0; charIndex < index; charIndex++) { + final var theChar = input.charAt(charIndex); + if (theChar <= 0x007F) { + offset += 1; + } else if (theChar <= 0x07FF) { + offset += 2; + } else if (Character.isHighSurrogate(theChar) || Character.isLowSurrogate(theChar)) { + offset += 2; + } else { + offset += 3; + } + } + + return offset; + } + /** * Get what \R matches by default. 
* diff --git a/regex/src/main/java/org/pcre4j/regex/Matcher.java b/regex/src/main/java/org/pcre4j/regex/Matcher.java index 92bdff2..7b2ac08 100644 --- a/regex/src/main/java/org/pcre4j/regex/Matcher.java +++ b/regex/src/main/java/org/pcre4j/regex/Matcher.java @@ -14,9 +14,7 @@ */ package org.pcre4j.regex; -import org.pcre4j.Pcre2MatchData; -import org.pcre4j.Pcre2MatchOption; -import org.pcre4j.Pcre4jUtils; +import org.pcre4j.*; import org.pcre4j.api.IPcre2; import java.nio.charset.StandardCharsets; @@ -263,11 +261,21 @@ public boolean hasMatch() { * @return {@code true} if the input sequence region starts with the pattern, otherwise {@code false} */ public boolean lookingAt() { - final var matchData = new Pcre2MatchData(pattern.code); - final var result = pattern.code.match( + final EnumSet matchOptions; + final Pcre2Code lookingAtCode; + if (pattern.lookingAtCode != null) { + lookingAtCode = pattern.lookingAtCode; + matchOptions = EnumSet.noneOf(Pcre2MatchOption.class); + } else { + lookingAtCode = pattern.code; + matchOptions = EnumSet.of(Pcre2MatchOption.ANCHORED); + } + + final var matchData = new Pcre2MatchData(lookingAtCode); + final var result = lookingAtCode.match( input.subSequence(0, regionEnd).toString(), regionStart, - EnumSet.of(Pcre2MatchOption.ANCHORED), + matchOptions, matchData, null ); @@ -276,7 +284,7 @@ public boolean lookingAt() { return false; } - final var errorMessage = Pcre4jUtils.getErrorMessage(pattern.code.api(), result); + final var errorMessage = Pcre4jUtils.getErrorMessage(pattern.lookingAtCode.api(), result); throw new RuntimeException("Failed to find an anchored match", new IllegalStateException(errorMessage)); } @@ -292,11 +300,21 @@ public boolean lookingAt() { * @return {@code true} if the entire input sequence region matches the pattern, otherwise {@code false} */ public boolean matches() { - final var matchData = new Pcre2MatchData(pattern.code); - final var result = pattern.code.match( + final Pcre2Code matchingCode; + final 
EnumSet matchOptions; + if (pattern.matchingCode != null) { + matchingCode = pattern.matchingCode; + matchOptions = EnumSet.noneOf(Pcre2MatchOption.class); + } else { + matchingCode = pattern.code; + matchOptions = EnumSet.of(Pcre2MatchOption.ANCHORED, Pcre2MatchOption.ENDANCHORED); + } + + final var matchData = new Pcre2MatchData(matchingCode); + final var result = matchingCode.match( input.subSequence(0, regionEnd).toString(), regionStart, - EnumSet.of(Pcre2MatchOption.ANCHORED, Pcre2MatchOption.ENDANCHORED), + matchOptions, matchData, null ); @@ -305,7 +323,7 @@ public boolean matches() { return false; } - final var errorMessage = Pcre4jUtils.getErrorMessage(pattern.code.api(), result); + final var errorMessage = Pcre4jUtils.getErrorMessage(matchingCode.api(), result); throw new RuntimeException("Failed to find an anchored match", new IllegalStateException(errorMessage)); } diff --git a/regex/src/main/java/org/pcre4j/regex/Pattern.java b/regex/src/main/java/org/pcre4j/regex/Pattern.java index 11742aa..d0930ba 100644 --- a/regex/src/main/java/org/pcre4j/regex/Pattern.java +++ b/regex/src/main/java/org/pcre4j/regex/Pattern.java @@ -14,9 +14,7 @@ */ package org.pcre4j.regex; -import org.pcre4j.Pcre2Code; -import org.pcre4j.Pcre2CompileError; -import org.pcre4j.Pcre2CompileOption; +import org.pcre4j.*; import java.util.ArrayList; import java.util.EnumSet; @@ -62,6 +60,8 @@ public class Pattern { // TODO: public static final int UNIX_LINES = java.util.regex.Pattern.UNIX_LINES; /* package-private */ final Pcre2Code code; + /* package-private */ final Pcre2Code matchingCode; + /* package-private */ final Pcre2Code lookingAtCode; private final Map namedGroups; private final String regex; private final int flags; @@ -70,28 +70,62 @@ private Pattern(String regex, int flags) { this.regex = regex; this.flags = flags; - final var options = EnumSet.of(Pcre2CompileOption.UTF); + final var compileOptions = EnumSet.of(Pcre2CompileOption.UTF); if ((flags & 
CASE_INSENSITIVE) != 0) { - options.add(Pcre2CompileOption.CASELESS); + compileOptions.add(Pcre2CompileOption.CASELESS); } if ((flags & DOTALL) != 0) { - options.add(Pcre2CompileOption.DOTALL); + compileOptions.add(Pcre2CompileOption.DOTALL); } if ((flags & LITERAL) != 0) { - options.add(Pcre2CompileOption.LITERAL); + compileOptions.add(Pcre2CompileOption.LITERAL); } if ((flags & MULTILINE) != 0) { - options.add(Pcre2CompileOption.MULTILINE); + compileOptions.add(Pcre2CompileOption.MULTILINE); } try { - this.code = new Pcre2Code(regex, options, null); + if (Pcre4jUtils.isJitSupported(Pcre4j.api())) { + this.code = new Pcre2JitCode( + regex, + compileOptions, + EnumSet.of(Pcre2JitOption.COMPLETE), + null + ); + + final var matchingCompileOptions = EnumSet.copyOf(compileOptions); + matchingCompileOptions.add(Pcre2CompileOption.ANCHORED); + matchingCompileOptions.add(Pcre2CompileOption.ENDANCHORED); + this.matchingCode = new Pcre2JitCode( + regex, + matchingCompileOptions, + EnumSet.of(Pcre2JitOption.COMPLETE), + null + ); + + final var lookingAtCompileOptions = EnumSet.copyOf(compileOptions); + lookingAtCompileOptions.add(Pcre2CompileOption.ANCHORED); + this.lookingAtCode = new Pcre2JitCode( + regex, + lookingAtCompileOptions, + EnumSet.of(Pcre2JitOption.COMPLETE), + null + ); + } else { + this.code = new Pcre2Code( + regex, + compileOptions, + null + ); + this.matchingCode = null; + this.lookingAtCode = null; + } } catch (Pcre2CompileError e) { throw new PatternSyntaxException(e.message(), e.pattern(), (int) e.offset()); } namedGroups = new HashMap<>(); - for (var nameTableEntry : code.nameTable()) { + for (var nameTableEntry : this.code.nameTable()) { namedGroups.put(nameTableEntry.name(), nameTableEntry.group()); } } diff --git a/regex/src/test/java/org/pcre4j/regex/PatternTests.java b/regex/src/test/java/org/pcre4j/regex/PatternTests.java index 773ac2f..cafa2f6 100644 --- a/regex/src/test/java/org/pcre4j/regex/PatternTests.java +++ 
b/regex/src/test/java/org/pcre4j/regex/PatternTests.java @@ -27,7 +27,7 @@ public class PatternTests { static { - Pcre4j.setup(new Pcre2()); + Pcre4j.setup(new Pcre2()); } @Test