I have an Elasticsearch index of about 15M items; I am using version 6.7.0, hosted on AWS. We adopted a denormalised design: each project has many items, and the project's fields are repeated on every one of its items. A project has a title, description, tags, popularity, project_score, ... and each item has its own fields: height, width, overall_score, likes_count, category, ... Here is a snippet of the item index mapping:
{
"item_v1": {
"mappings": {
"_doc": {
"properties": {
"project_description": {
"type": "text"
},
"project_title": {
"type": "text"
},
"project_tags": {
"type": "keyword"
},
"project_popularity": {
"type": "long"
},
"project_score": {
"type": "long"
},
"project_id": {
"type": "long"
},
"project_location": {
"type": "geo_point"
},
"height": {
"type": "short"
},
"width": {
"type": "short"
},
"likes_count": {
"type": "long"
}
}
}
}
}
}
We query the index with a fairly complex function_score query. Because every item of a project carries the same project title, the results come back grouped by project. For example, suppose I search for the keyword word1, and word1 appears in the titles of project_1, project_2 and project_3: in the result set I get all of project_1's items first, then all of project_2's items, and finally all of project_3's items. To avoid this duplication, I tried aggregating the results by project_id, but this takes too long (more than 600ms). Here is a snippet of our query:
{
"from": 0,
"size": 15,
"query": {
"function_score": {
"query": {
"bool": {
"filter": [
],
"should": [
{
"multi_match": {
"query": "word1",
"fields": [
"description^0.5",
"tags^12.0",
"title^5.0"
],
"type": "phrase",
"operator": "AND",
"slop": 0,
"prefix_length": 0,
"max_expansions": 50,
"zero_terms_query": "NONE",
"auto_generate_synonyms_phrase_query": true,
"fuzzy_transpositions": true,
"boost": 8.0
}
},
{
"multi_match": {
"query": "word1",
"fields": [
"description^0.5",
"tags^12.0",
"title^5.0"
],
"type": "cross_fields",
"operator": "AND",
"slop": 0,
"prefix_length": 0,
"max_expansions": 50,
"zero_terms_query": "NONE",
"auto_generate_synonyms_phrase_query": true,
"fuzzy_transpositions": true,
"boost": 1.0
}
}
],
"adjust_pure_negative": true,
"minimum_should_match": "1",
"boost": 1.0
}
},
"functions": [
{
"filter": {
"match_all": {
"boost": 1.0
}
},
"weight": 000,
"script_score": {
"script": {
"source": "0.99 * _score / (1 + _score)",
"lang": "painless"
}
}
},
{
"filter": {
"term": {
"xxxx": {
"value": true,
"boost": 1.0
}
}
},
"weight": 0000,
"linear": {
"project_location": {
"origin": {
"lat": 0000,
"lon": 0000
},
"scale": "75km",
"offset": "10km",
"decay": 0.3
},
"multi_value_mode": "MIN"
}
},
{
"filter": {
"term": {
"xxxxx": {
"value": false,
"boost": 1.0
}
}
},
"weight": 0000,
"exp": {
"location": {
"origin": {
"lat": 00000,
"lon": 00000
},
"scale": "10km",
"offset": "10km",
"decay": 0.5
},
"multi_value_mode": "MIN"
}
},
{
"filter": {
"exists": {
"field": "xxxxxx",
"boost": 1.0
}
},
"weight": 1.0
},
{
"filter": {
"term": {
"xxxxx": {
"value": "yy",
"boost": 1.0
}
}
},
"weight": 0000
},
{
"filter": {
"match_all": {
"boost": 1.0
}
},
"weight": 0000,
"exp": {
"created_at": {
"origin": "2020-10-19",
"scale": "100d",
"offset": "7d",
"decay": 0.6
},
"multi_value_mode": "MIN"
}
},
{
"filter": {
"range": {
"overall_score": {
"from": 1,
"to": null,
"include_lower": true,
"include_upper": true,
"boost": 1.0
}
}
},
"weight": 0000,
"script_score": {
"script": {
"source": "Math.log(1 + doc['overall_score'].value)",
"lang": "painless"
}
}
}
],
"score_mode": "sum",
"boost_mode": "replace",
"max_boost": 3.4028235E38,
"min_score": 0.0,
"boost": 1.0
}
},
"aggs": {
"users": {
"terms": {
"field": "project_id"
}
}
}
}