OK, as a test dataset I have 100 or so song lyrics by Frank Ocean (yes, Frank Ocean).
I am using the following index settings to build up my autocomplete filter:
PUT /test_songs
{
  "settings": {
    "index": {
      "analysis": {
        "filter": {
          "stemmer": {
            "type": "stemmer",
            "language": "english"
          },
          "autocompleteFilter": {
            "type": "shingle",
            "min_shingle_size": 4,
            "max_shingle_size": 10,
            "output_unigrams": false,
            "output_unigrams_if_no_shingles": true
          },
          "stopwords": {
            "type": "stop",
            "stopwords": ["_english_"]
          }
        },
        "analyzer": {
          "didYouMean": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase"]
          },
          "autocomplete": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "autocompleteFilter"]
          },
          "default": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "stopwords", "stemmer"]
          }
        }
      }
    }
  }
}
Here is an example query:
GET /test_songs/_search/
{
  "size": 0,
  "query": {
    "prefix": {
      "autocomplete": {
        "value": "i want"
      }
    }
  },
  "aggs": {
    "autocomplete": {
      "terms": {
        "field": "autocomplete",
        "include": {
          "pattern": "i want.*"
        },
        "order": {
          "_count": "desc"
        }
      }
    }
  }
}
And here is the response:
"aggregations": {
"autocomplete": {
"doc_count_error_upper_bound": 1,
"sum_other_doc_count": 158,
"buckets": [
{
**"key": "i want to be",**
"doc_count": 3
},
{
"key": "i want to be the",
"doc_count": 2
},
{
**"key": "i want you to",**
"doc_count": 2
},
{
"key": "i want and baby",
"doc_count": 1
},
{
"key": "i want and baby i",
"doc_count": 1
},
{
"key": "i want and baby i ain't",
"doc_count": 1
},
{
"key": "i want and baby i ain't never",
"doc_count": 1
},
{
"key": "i want and baby i ain't never fell",
"doc_count": 1
},
{
"key": "i want and baby i ain't never fell in",
"doc_count": 1
},
{
**"key": "i want and baby i ain't never fell in love",**
"doc_count": 1
}
]
}
}
I need more results like these:
- "i want to be"
- "i want you to"
- "i want and baby i ain't never fell in love"
And fewer of the intermediate buckets that merely build up to those results (e.g. "i want and baby", "i want and baby i", and so on).
Any suggestions?