Hi,
I'm currently using the standard tokenizer, which includes whitespace tokenization.
The problem is that I want to split a sentence into tokens on whitespace, but I also want the entire sentence as a single token. How is this possible?
Concrete example:
I want to parse "Hello How Are You" into these tokens --> ["hello","how","are","you","hello how are you"].
My current config:
{
  "analysis": {
    "analyzer": {
      "smd_analyzer": {
        "type": "custom",
        "tokenizer": "standard",
        "char_filter": [
          "html_strip",
          "smd_filter"
        ],
        "filter": [
          "lowercase",
          "asciifolding",
          "smd_length",
          "smd_stop"
        ]
      }
    },
    "char_filter": {
      "smd_filter": {
        "type": "pattern_replace",
        "pattern": "(\\p{L}+)'(\\p{L}+)",
        "replacement": "$0 $1 $2"
      }
    },
    "filter": {
      "smd_length": {
        "type": "length",
        "min": 2
      },
      "smd_stop": {
        "type": "stop",
        "ignore_case": true,
        "stopwords": ["LE", "LA", "LES", "DU", "DES", "OU", "ET", "SI", "STE", "CIE", "SOC", "GEN", "GIE", "NV", "SA", "SARL", "ST", "BS", "CP", "CV", "DA", "DS", "OAT", "TP", "TSDI", "TSR", "ZZ"]
      }
    }
  }
}