I am trying to improve the performance of an Elasticsearch application.
In this case there is a single Elasticsearch node (non-sharded) with about 7M docs (the index is roughly 40 GB).
We were running one query to get the top docs (about 1000), and then a second query that filters on the ids of those 1000 docs and runs a significant_terms aggregation, scored with mutual_information, on some fields of those docs.
I had thought that using a sampler aggregation would help by doing one query instead of two, and that this would speed things up, but I am not seeing that currently.
Here is the original query:
{
"from" : 0,
"size" : 1000,
"query" : {
"bool" : {
"should" : [
{
"match" : {
"name" : {
"query" : "workout",
"operator" : "AND",
"prefix_length" : 0,
"max_expansions" : 50,
"fuzzy_transpositions" : true,
"lenient" : false,
"zero_terms_query" : "NONE",
"boost" : 1.0
}
}
}
],
"disable_coord" : false,
"adjust_pure_negative" : true,
"boost" : 1.0
}
},
"min_score" : 7.0,
"_source" : false,
"sort" : [
{
"_score" : {
"order" : "desc"
}
},
{
"mau" : {
"order" : "desc"
}
}
]
}
followed by:
{
"from" : 0,
"size" : 1000,
"query" : {
"bool" : {
"filter" : [
{
"ids" : {
"type" : [ ],
"values" : [
"AWBNMzCn5eVrMnnV89Iw",
.
.
. (lots of these)
],
"boost" : 1.0
}
}
],
"disable_coord" : false,
"adjust_pure_negative" : true,
"boost" : 1.0
}
},
"sort" : [
{
"_score" : {
"order" : "desc"
}
},
{
"mau" : {
"order" : "desc"
}
}
],
"aggregations" : {
"tracks" : {
"significant_terms" : {
"field" : "tracks.raw",
"size" : 1000,
"min_doc_count" : 2,
"shard_min_doc_count" : 0,
"mutual_information" : {
"include_negatives" : false,
"background_is_superset" : true
}
}
}
}
}
versus the new single query, which uses a sampler aggregation:
{
"from" : 0,
"size" : 0,
"query" : {
"bool" : {
"should" : [
{
"match" : {
"name" : {
"query" : "workout",
"operator" : "AND",
"prefix_length" : 0,
"max_expansions" : 50,
"fuzzy_transpositions" : true,
"lenient" : false,
"zero_terms_query" : "NONE",
"boost" : 1.0
}
}
}
],
"disable_coord" : false,
"adjust_pure_negative" : true,
"boost" : 1.0
}
},
"min_score" : 7.0,
"_source" : false,
"sort" : [
{
"_score" : {
"order" : "desc"
}
},
{
"mau" : {
"order" : "desc"
}
}
],
"aggregations" : {
"tracks" : {
"sampler" : {
"shard_size" : 1000
},
"aggregations" : {
"tracks" : {
"significant_terms" : {
"field" : "tracks.raw",
"size" : 1000,
"min_doc_count" : 2,
"shard_min_doc_count" : 2,
"mutual_information" : {
"include_negatives" : false,
"background_is_superset" : true
}
}
}
}
}
}
}
Does that make sense?
Thanks,
Adam