Hi!
We have a custom Lucene tokenizer that, after upgrading from Elasticsearch 5.5.2 to 6.1.2, no longer seems to handle offsets correctly when the indexed field has multiple values.
For example, when indexing the children.name field of a document that has more than one child:
{
  "name": "Foo",
  "children": [
    {"name": "Foo Bar"},
    {"name": "Bar Xyzzy"}
  ]
}
we get an error saying:
startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards startOffset=1,endOffset=3,lastStartOffset=3 for field 'children.name.our_custom_analyzer'
This is, in essence, our tokenizer (in Scala), which worked fine in Elasticsearch 5.5.2:
import java.io.BufferedReader
import org.apache.lucene.analysis.Tokenizer
import org.apache.lucene.analysis.tokenattributes.{CharTermAttribute, OffsetAttribute, PositionIncrementAttribute}

class MyCustomTokenizer extends Tokenizer {
  val termAtt = addAttribute(classOf[CharTermAttribute])
  val offsetAtt = addAttribute(classOf[OffsetAttribute])

  @volatile var tokens: Iterator[String] = null
  @volatile var startOffset = 0

  override def reset(): Unit = {
    super.reset()
    tokens = null
    startOffset = 0
    Option(getAttribute(classOf[PositionIncrementAttribute]))
      .foreach(_.setPositionIncrement(1))
  }

  override def incrementToken(): Boolean = {
    if (tokens == null) {
      // Read the whole input, joining lines with a space, and tokenize it in one go.
      val in = new BufferedReader(input)
      val lines = Iterator.continually(in.readLine()).takeWhile(_ != null)
      val inputStr = lines.mkString(" ")
      val tokenList: List[String] = OurCodeThatConvertsInputTextIntoListOfTokens.doIt(inputStr)
      tokens = tokenList.iterator
    }
    if (tokens.hasNext) {
      val token = tokens.next()
      // Offsets are accumulated from token lengths only, ignoring any separators in the original text.
      val endOffset = startOffset + token.length
      termAtt.setEmpty().append(token)
      offsetAtt.setOffset(startOffset, endOffset)
      startOffset = endOffset
      true
    } else false
  }
}
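To look at what the tokenizer reports per value outside Elasticsearch, we can poke at it with a small harness like the sketch below (OffsetDebug and dump are just made-up names for this example, and the tokenizer still depends on our internal splitter):

import java.io.StringReader
import org.apache.lucene.analysis.tokenattributes.{CharTermAttribute, OffsetAttribute}

object OffsetDebug {
  // Run one field value through the tokenizer and print each token with its offsets.
  def dump(value: String): Unit = {
    val tokenizer = new MyCustomTokenizer
    tokenizer.setReader(new StringReader(value))
    tokenizer.reset()
    val term = tokenizer.getAttribute(classOf[CharTermAttribute])
    val offset = tokenizer.getAttribute(classOf[OffsetAttribute])
    while (tokenizer.incrementToken()) {
      println(s"'${term.toString}' -> ${offset.startOffset}-${offset.endOffset}")
    }
    tokenizer.end()   // we do not override end(), so nothing extra is reported here
    tokenizer.close()
  }

  def main(args: Array[String]): Unit = {
    dump("Foo Bar")    // first value of children.name
    dump("Bar Xyzzy")  // second value of children.name
  }
}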
So what are we doing wrong here? If reset() is called between the children (in this case "Foo Bar" and "Bar Xyzzy"), how are we supposed to track the "correct" offset?
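For instance, is overriding end() to report the final offset of each value (so the gap between values can be accounted for) roughly what is expected? Something along these lines is just a guess on our part:

// Just a guess, untested: report where the last value ended.
override def end(): Unit = {
  super.end()
  // startOffset currently equals the endOffset of the last token we emitted
  val finalOffset = correctOffset(startOffset)
  offsetAtt.setOffset(finalOffset, finalOffset)
}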