I am using `PatternCaptureGroupTokenFilter` in my code to generate tokens based on multiple regular expressions, with the goal of highlighting any matches found within the string. I am working with Lucene 9, but I am running into the following problem: no exception is thrown, but the output is not what I expect.
I'm using one of the latest Lucene jars (9.11.1) for `PatternCaptureGroupTokenFilter`, but the token positions are not as expected, making it difficult to accurately highlight the search results.
Here are the tokens generated for the string:
{
"tokens" : [
{
"token" : "test:data",
"start_offset" : 0,
"end_offset" : 9,
"type" : "word",
"position" : 0
},
{
"token" : "test",
"start_offset" : 0,
"end_offset" : 9,
"type" : "word",
"position" : 0
},
{
"token" : ":data",
"start_offset" : 0,
"end_offset" : 9,
"type" : "word",
"position" : 0
},
{
"token" : "test:",
"start_offset" : 0,
"end_offset" : 9,
"type" : "word",
"position" : 0
},
{
"token" : "test",
"start_offset" : 10,
"end_offset" : 14,
"type" : "word",
"position" : 1
}
]
}
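The positions and offsets are not suitable for highlighting: every token extracted from `test:data` is reported with the same position (0) and the same offsets (0-9) as the original token, so a match on `test` or `:data` cannot be mapped back to its own span in the text. Here is the source of the filter: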
/**
* Token filter that, for every token of the wrapped stream, emits the non-empty capture groups of
* the configured regular expressions as tokens, optionally keeping the original token as well.
*
* <p>This class only writes the term text and the position increment. Each additional capture is
* produced by restoring the attribute state captured for the original input token, setting the
* position increment to 0 and replacing the term text with the captured substring. No other
* attribute is modified here. NOTE(review): as a consequence, any offsets set upstream would be
* identical for the original token and all captures derived from it - confirm against the
* tokenizer in use.
*/
public final class PatternCaptureGroupTokenFilter extends TokenFilter {
// Term text of the token currently being emitted.
private final CharTermAttribute charTermAttr = addAttribute(CharTermAttribute.class);
// Position increment of the token currently being emitted.
private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
// Attribute state captured right after the most recent input token was read; restored before
// each additional capture is emitted. Null until the first input token has been read.
private State state;
// One reusable matcher per configured pattern, re-targeted at each new input token.
private final Matcher[] matchers;
// Copy of the current input token's text; all matchers run against this copy.
private final CharsRefBuilder spare = new CharsRefBuilder();
// Number of capture groups of each pattern (same index as matchers).
private final int[] groupCounts;
// Whether the unmodified input token is emitted before its captures.
private final boolean preserveOriginal;
// Per matcher: -1 = find() must be called, 0 = no (further) match in the current token,
// otherwise the index of the next capture group to examine in the current match.
private int[] currentGroup;
// Index of the matcher selected by the last nextCapture() call, or -1 if there is none.
// Also used as a "captures may be pending" flag by incrementToken().
private int currentMatcher;
/**
* Creates a filter that applies the given patterns to every token of {@code input}.
*
* @param input the input {@link TokenStream}
* @param preserveOriginal set to true to return the original token even if one of the patterns
* matches
* @param patterns an array of {@link Pattern} objects to match against each token
*/
public PatternCaptureGroupTokenFilter(
TokenStream input, boolean preserveOriginal, Pattern... patterns) {
super(input);
this.preserveOriginal = preserveOriginal;
this.matchers = new Matcher[patterns.length];
this.groupCounts = new int[patterns.length];
this.currentGroup = new int[patterns.length];
for (int i = 0; i < patterns.length; i++) {
// Matchers start on an empty input; incrementToken() resets them onto each token's text.
this.matchers[i] = patterns[i].matcher("");
this.groupCounts[i] = this.matchers[i].groupCount();
this.currentGroup[i] = -1;
}
}
/**
* Advances every matcher to its next usable capture group and selects the matcher whose group
* starts earliest in the token text (the lowest matcher index wins on equal start).
*
* <p>On success, {@code currentMatcher} and {@code currentGroup[currentMatcher]} identify the
* capture to emit; the caller is responsible for incrementing that group index afterwards.
*
* @return true if a capture was selected, false if no matcher has anything left for this token
*/
private boolean nextCapture() {
int min_offset = Integer.MAX_VALUE;
currentMatcher = -1;
Matcher matcher;
for (int i = 0; i < matchers.length; i++) {
matcher = matchers[i];
// -1 means this matcher needs a (new) match: start at group 1 on success, or mark as done (0).
if (currentGroup[i] == -1) {
currentGroup[i] = matcher.find() ? 1 : 0;
}
if (currentGroup[i] != 0) {
while (currentGroup[i] < groupCounts[i] + 1) {
final int start = matcher.start(currentGroup[i]);
final int end = matcher.end(currentGroup[i]);
// Skip groups with start == end, and - when the original token is emitted anyway - groups
// that span the whole token text, since they would duplicate the original.
if (start == end || preserveOriginal && start == 0 && spare.length() == end) {
currentGroup[i]++;
continue;
}
// Strict '<' keeps the earlier matcher when two candidates start at the same index.
if (start < min_offset) {
min_offset = start;
currentMatcher = i;
}
break;
}
// All groups of the current match are used up: request a new find() and revisit the same
// matcher by stepping the loop index back.
if (currentGroup[i] == groupCounts[i] + 1) {
currentGroup[i] = -1;
i--;
}
}
}
return currentMatcher != -1;
}
/**
* Emits the next token: either a pending capture of the current input token, or the next input
* token (replaced by its first capture when {@code preserveOriginal} is false and one exists).
*
* @return false once the wrapped stream is exhausted and no captures are pending, true otherwise
* @throws IOException if the wrapped stream throws it
*/
@Override
public boolean incrementToken() throws IOException {
// A previous call left captures potentially pending for the current input token.
if (currentMatcher != -1 && nextCapture()) {
assert state != null;
clearAttributes();
// Start from the original token's attributes, then override only increment and term text.
restoreState(state);
final int start = matchers[currentMatcher].start(currentGroup[currentMatcher]);
final int end = matchers[currentMatcher].end(currentGroup[currentMatcher]);
posAttr.setPositionIncrement(0);
charTermAttr.copyBuffer(spare.chars(), start, end - start);
// Mark this group as consumed so the next nextCapture() call moves on.
currentGroup[currentMatcher]++;
return true;
}
if (!input.incrementToken()) {
return false;
}
char[] buffer = charTermAttr.buffer();
int length = charTermAttr.length();
// Keep a private copy of the term text so the term attribute can be overwritten safely.
spare.copyChars(buffer, 0, length);
state = captureState();
for (int i = 0; i < matchers.length; i++) {
matchers[i].reset(spare.get());
currentGroup[i] = -1;
}
if (preserveOriginal) {
// Return the input token untouched; a non-negative value makes the next call try captures.
currentMatcher = 0;
} else if (nextCapture()) {
final int start = matchers[currentMatcher].start(currentGroup[currentMatcher]);
final int end = matchers[currentMatcher].end(currentGroup[currentMatcher]);
// The term attribute still holds the full token text, so a capture that starts at index 0
// only needs the length shortened; no copy is required.
if (start == 0) {
charTermAttr.setLength(end);
} else {
charTermAttr.copyBuffer(spare.chars(), start, end - start);
}
currentGroup[currentMatcher]++;
}
// If no capture was found, the input token is passed through unchanged.
return true;
}
/**
* Resets the wrapped stream via {@code super.reset()} and discards the captured state and any
* pending captures.
*
* @throws IOException if {@code super.reset()} throws it
*/
@Override
public void reset() throws IOException {
super.reset();
state = null;
currentMatcher = -1;
}
}
Can anyone provide guidance or suggestions on how to resolve this issue and generate the expected tokens?