Making PatternCaptureGroupTokenFilter match globally

I have written a regex, but it returns only the first match and then stops matching. In my own Java code I know I could call the `find()` and `group()` methods in a loop to get all the matches, but in my use case I can't use them. I am using the regex below:

"(?<![^\p{Punct}])(\p{Alnum}+)"

It returns the first match correctly, but after that it does not return any further matches.

The source file containing the code for the pattern token filter is:

package org.apache.lucene.analysis.pattern;
 
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRefBuilder;
 
/**
 * Token filter that runs one or more regular expressions over each incoming
 * token and emits the non-empty capture groups of every match as tokens.
 *
 * <p>For each input token the filter repeatedly calls {@link Matcher#find()} on
 * every pattern, so all matches inside the token are visited, not only the
 * first one. Among the pending captures of the different patterns, the one
 * that starts at the smallest offset is emitted next. Every capture after the
 * first token produced for an input token is emitted with a position
 * increment of 0.
 *
 * <p>If {@code preserveOriginal} is {@code true} the unmodified input token is
 * emitted first and captures that span the whole token are skipped. If it is
 * {@code false} the first capture replaces the original term; when nothing is
 * captured the original token is passed through unchanged.
 *
 * <p>Only numbered groups 1..groupCount are considered; group 0 (the whole
 * match) is never emitted, so a pattern without capture groups yields no
 * capture tokens.
 */
public final class PatternCaptureGroupTokenFilter extends TokenFilter {
 
  /** Term text of the token currently being produced. */
  private final CharTermAttribute charTermAttr = addAttribute(CharTermAttribute.class);
  /** Position increment of the token currently being produced. */
  private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
  /** Attribute state captured from the current input token; restored before each extra capture. */
  private State state;
  /** One reusable matcher per pattern, re-pointed at each new input token. */
  private final Matcher[] matchers;
  /** Private copy of the current input token's characters; all matching runs against it. */
  private final CharsRefBuilder spare = new CharsRefBuilder();
  /** Number of capture groups of each pattern, indexed like {@link #matchers}. */
  private final int[] groupCounts;
  /** Whether the unmodified input token is emitted in addition to the captures. */
  private final boolean preserveOriginal;
  /**
   * Per-matcher cursor: {@code -1} means "call find() for the next match",
   * {@code 0} means "find() failed, this matcher is exhausted for the current
   * token", and any value {@code >= 1} is the index of the next capture group
   * to examine in the current match.
   */
  private int[] currentGroup;
  /**
   * Index of the matcher selected by the last {@link #nextCapture()} call, or
   * {@code -1} if none. Any value other than {@code -1} also tells
   * {@link #incrementToken()} to look for further captures of the current
   * input token before pulling a new one.
   */
  private int currentMatcher;
 
  /**
   * Creates the filter.
   *
   * @param input the token stream to read tokens from
   * @param preserveOriginal if {@code true}, each input token is emitted
   *        unchanged before its captures
   * @param patterns the patterns whose capture groups are emitted as tokens
   */
  public PatternCaptureGroupTokenFilter(TokenStream input,
      boolean preserveOriginal, Pattern... patterns) {
    super(input);
    this.preserveOriginal = preserveOriginal;
    this.matchers = new Matcher[patterns.length];
    this.groupCounts = new int[patterns.length];
    this.currentGroup = new int[patterns.length];
    for (int i = 0; i < patterns.length; i++) {
      // Matchers start on an empty input; incrementToken() resets them per token.
      this.matchers[i] = patterns[i].matcher("");
      this.groupCounts[i] = this.matchers[i].groupCount();
      this.currentGroup[i] = -1;
    }
  }
 
  /**
   * Advances every matcher to its next emittable capture group and selects the
   * matcher whose pending group starts earliest in the token.
   *
   * <p>On success {@link #currentMatcher} holds the selected matcher and
   * {@code currentGroup[currentMatcher]} the group to emit; the caller is
   * responsible for incrementing that group index after emitting it.
   *
   * @return {@code true} if a capture is available, {@code false} if all
   *         matchers are exhausted for the current token
   */
  private boolean nextCapture() {
    int min_offset = Integer.MAX_VALUE;
    currentMatcher = -1;
    Matcher matcher;
 
    for (int i = 0; i < matchers.length; i++) {
      matcher = matchers[i];
      if (currentGroup[i] == -1) {
        // Move to the next match; 0 marks this matcher as exhausted.
        currentGroup[i] = matcher.find() ? 1 : 0;
      }
      if (currentGroup[i] != 0) {
        while (currentGroup[i] < groupCounts[i] + 1) {
          final int start = matcher.start(currentGroup[i]);
          final int end = matcher.end(currentGroup[i]);
          // Skip empty groups (this also covers groups that did not
          // participate in the match, which Matcher reports as -1/-1) and,
          // when the original is preserved, groups equal to the whole token.
          if (start == end || preserveOriginal && start == 0
              && spare.length() == end) {
            currentGroup[i]++;
            continue;
          }
          // Strict '<' means ties go to the matcher with the lower index.
          if (start < min_offset) {
            min_offset = start;
            currentMatcher = i;
          }
          break;
        }
        if (currentGroup[i] == groupCounts[i] + 1) {
          // All groups of this match are consumed: request a new find() and
          // revisit the same matcher on the next loop iteration.
          currentGroup[i] = -1;
          i--;
        }
      }
    }
    return currentMatcher != -1;
  }
 
  /**
   * Produces the next token: either a further capture of the current input
   * token or, once those are exhausted, the first token for the next input
   * token.
   *
   * @return {@code false} when the wrapped stream has no more tokens,
   *         {@code true} otherwise
   * @throws IOException if the wrapped stream throws while advancing
   */
  @Override
  public boolean incrementToken() throws IOException {
 
    // Still working on the previous input token: emit its next capture.
    if (currentMatcher != -1 && nextCapture()) {
      assert state != null;
      clearAttributes();
      // Start from the original token's attributes, then overwrite term and position.
      restoreState(state);
      final int start = matchers[currentMatcher]
          .start(currentGroup[currentMatcher]);
      final int end = matchers[currentMatcher]
          .end(currentGroup[currentMatcher]);
 
      // Increment 0: this capture shares the position of the token emitted before it.
      posAttr.setPositionIncrement(0);
      charTermAttr.copyBuffer(spare.chars(), start, end - start);
      // Mark this group as consumed.
      currentGroup[currentMatcher]++;
      return true;
    }
 
    if (!input.incrementToken()) {
      return false;
    }
 
    // Snapshot the new input token: its text goes to 'spare', its attributes to 'state'.
    char[] buffer = charTermAttr.buffer();
    int length = charTermAttr.length();
    spare.copyChars(buffer, 0, length);
    state = captureState();
 
    // Point every matcher at the new text and restart its group cursor.
    for (int i = 0; i < matchers.length; i++) {
      matchers[i].reset(spare.get());
      currentGroup[i] = -1;
    }
 
    if (preserveOriginal) {
      // Emit the token untouched; any value other than -1 makes the next call try captures.
      currentMatcher = 0;
    } else if (nextCapture()) {
      // Replace the original term with the first capture, keeping its other attributes.
      final int start = matchers[currentMatcher]
          .start(currentGroup[currentMatcher]);
      final int end = matchers[currentMatcher]
          .end(currentGroup[currentMatcher]);
 
      // if we start at 0 we can simply set the length and save the copy
      if (start == 0) {
        charTermAttr.setLength(end);
      } else {
        charTermAttr.copyBuffer(spare.chars(), start, end - start);
      }
      currentGroup[currentMatcher]++;
    }
    // If nothing was captured, nextCapture() left currentMatcher at -1 and the
    // original token is returned unchanged.
    return true;
 
  }
 
  /**
   * Resets the wrapped stream and discards the per-token capture state so the
   * next {@link #incrementToken()} call starts by pulling a fresh input token.
   *
   * @throws IOException if resetting the wrapped stream fails
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    state = null;
    currentMatcher = -1;
  }
 
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.