I'm using the PatternCaptureGroupTokenFilter
in my code to generate tokens based on multiple regular expressions and highlight matches in the string. I'm working with Lucene 9, but it's returning the following error.
{"error":{"root_cause":[{"type":"illegal_argument_exception","reason":"startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards startOffset=0,endOffset=5,lastStartOffset=4 for field 'title_special.en'"}],"type":"illegal_argument_exception","reason":"startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards startOffset=0,endOffset=5,lastStartOffset=4 for field 'title_special.en'"},"status":400}
{
“tokens” : [
{
“token” : “test:data”,
“start_offset” : 0,
“end_offset” : 9,
“type” : “word”,
“position” : 0
},
{
“token” : “test”,
“start_offset” : 0,
“end_offset” : 4,
“type” : “word”,
“position” : 0
},
{
“token” : “:data”,
“start_offset” : 4,
“end_offset” : 9,
“type” : “word”,
“position” : 0
},
{
“token” : “test:”,
“start_offset” : 0,
“end_offset” : 5,
“type” : “word”,
“position” : 0
},
{
“token” : “test”,
“start_offset” : 10,
“end_offset” : 14,
“type” : “word”,
“position” : 1
}
]
}
I am using the following Java code
public final class PatternCaptureGroupTokenFilter extends TokenFilter {
private final CharTermAttribute charTermAttr = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
private State state;
private final Matcher[] matchers;
private final CharsRefBuilder spare = new CharsRefBuilder();
private final int[] groupCounts;
private final boolean preserveOriginal;
private int[] currentGroup;
private int currentMatcher;
private int main_token_start;
private int main_token_end;
public PatternCaptureGroupTokenFilter(TokenStream input,
boolean preserveOriginal, Pattern... patterns) {
super(input);
this.preserveOriginal = preserveOriginal;
this.matchers = new Matcher[patterns.length];
this.groupCounts = new int[patterns.length];
this.currentGroup = new int[patterns.length];
for (int i = 0; i < patterns.length; i++) {
this.matchers[i] = patterns[i].matcher("");
this.groupCounts[i] = this.matchers[i].groupCount();
this.currentGroup[i] = -1;
}
}
private boolean nextCapture() {
int min_offset = Integer.MAX_VALUE;
currentMatcher = -1;
Matcher matcher;
for (int i = 0; i < matchers.length; i++) {
matcher = matchers[i];
if (currentGroup[i] == -1) {
currentGroup[i] = matcher.find() ? 1 : 0;
}
if (currentGroup[i] != 0) {
while (currentGroup[i] < groupCounts[i] + 1) {
final int start = matcher.start(currentGroup[i]);
final int end = matcher.end(currentGroup[i]);
if (start == end || preserveOriginal && start == 0
&& spare.length() == end) {
currentGroup[i]++;
continue;
}
if (start < min_offset) {
min_offset = start;
currentMatcher = i;
}
break;
}
if (currentGroup[i] == groupCounts[i] + 1) {
currentGroup[i] = -1;
i--;
}
}
}
return currentMatcher != -1;
}
@Override
public boolean incrementToken() throws IOException {
if (currentMatcher != -1 && nextCapture()) {
assert state != null;
clearAttributes();
restoreState(state);
final int start = matchers[currentMatcher]
.start(currentGroup[currentMatcher]);
final int end = matchers[currentMatcher]
.end(currentGroup[currentMatcher]);
// modified code starts
main_token_start = offsetAtt.startOffset();
main_token_end = offsetAtt.endOffset();
final int newStart = start + main_token_start;
final int newEnd = end + main_token_start;
offsetAtt.setOffset(newStart, newEnd);
// modified code ends
posAttr.setPositionIncrement(0);
charTermAttr.copyBuffer(spare.chars(), start, end - start);
currentGroup[currentMatcher]++;
return true;
}
if (!input.incrementToken()) {
return false;
}
char[] buffer = charTermAttr.buffer();
int length = charTermAttr.length();
spare.copyChars(buffer, 0, length);
state = captureState();
for (int i = 0; i < matchers.length; i++) {
matchers[i].reset(spare.get());
currentGroup[i] = -1;
}
if (preserveOriginal) {
currentMatcher = 0;
} else if (nextCapture()) {
final int start = matchers[currentMatcher]
.start(currentGroup[currentMatcher]);
final int end = matchers[currentMatcher]
.end(currentGroup[currentMatcher]);
// if we start at 0 we can simply set the length and save the copy
if (start == 0) {
charTermAttr.setLength(end);
} else {
charTermAttr.copyBuffer(spare.chars(), start, end - start);
}
currentGroup[currentMatcher]++;
}
return true;
}
@Override
public void reset() throws IOException {
super.reset();
state = null;
currentMatcher = -1;
}
}
This error is occurring in Lucene's latest versions. Please refer to: Start offset going backwards has a legitimate purpose [LUCENE-8776] · Issue #9820 · apache/lucene · GitHub
Can anyone suggest how I can achieve this?