Strange shard cache misses with huge evictions


(Py Silver) #1

Hi! I'm having a hard time making the shard cache work for my aggregations. I've spent hours trying to figure out what the cause is, and I'm afraid I'm out of ideas. My indices.requests.cache.size is 10%, and there are 4 machines dedicated to ES, each with a 32GB heap and 64GB of total memory available.

Is it possible that most of the searches are unique, and that this is what causes such a huge eviction rate? But it's not like there are lots of them: barely 1.2M per day (Googlebot & co. crawl through a product aggregation website).

A typical request uses multi-search with an agg query + a hits query (I expect the agg query to be cached/fetched from the cache – this works locally on a single machine, but not in production):

https://pastebin.com/eZjmADm9

(Poor) Cache usage stats:

https://pastebin.com/z2QNCEWT

Nodes stats:
Fields: https://www.elastic.co/guide/en/elasticsearch/reference/6.5/cat-nodes.html

$ curl -X GET "localhost:9200/_cat/nodes?v&h=id,v,m,dup,hp,hm,rp,rm,fm,fe,qcm,qce,rcm,rce,rchc,rcmc,ft,idc,idto,iic,iito,iif,mc,mt,mtd,rto,scrcc"
id   v     m   dup hp     hm rp     rm    fm fe    qcm    qce   rcm   rce rchc   rcmc ft idc idto iic iito iif mc mt  mtd rto scrcc
E-Tc 6.2.4 - 14.22 38 31.8gb 99 62.9gb 1.4mb  0 28.3mb 426320 3.1gb 55784  290 121309 40   0    0   0   57   0  0  3  239 271     4
8krG 6.2.4 - 14.30 40 31.8gb 99 62.9gb 1.4mb  0 30.4mb 537357 3.1gb 50737  302 121381 42   0    0   0   51   0  0  1  542 274     4
8jT2 6.2.4 * 14.36 40 31.8gb 99 62.9gb 1.4mb  0   29mb 535770 3.1gb 56134  322 121574 39   0    0   0   60   0  0  1  389 266     4
yDct 6.2.4 - 14.26 40 31.8gb 99 62.9gb 1.4mb  0 30.1mb 527464 3.1gb 51137  332 121294 37   0    0   0   42   0  0  3 1659 254     4
jUjg 6.2.4 - 13.99 40 31.8gb 97 62.9gb 1.4mb  0 31.1mb 436395 3.1gb 50837  273 121193 43   0    0   0   60   0  0  2 5140 283     4

Cluster stats:

$ curl -X GET "localhost:9200/_cluster/stats?human&pretty"
{
  "_nodes" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "cluster_name" : "elasticsearch",
  "timestamp" : 1547182159269,
  "status" : "green",
  "indices" : {
    "count" : 10,
    "shards" : {
      "total" : 150,
      "primaries" : 50,
      "replication" : 2.0,
      "index" : {
        "shards" : {
          "min" : 15,
          "max" : 15,
          "avg" : 15.0
        },
        "primaries" : {
          "min" : 5,
          "max" : 5,
          "avg" : 5.0
        },
        "replication" : {
          "min" : 2.0,
          "max" : 2.0,
          "avg" : 2.0
        }
      }
    },
    "docs" : {
      "count" : 8247168,
      "deleted" : 3755256
    },
    "store" : {
      "size" : "171.4gb",
      "size_in_bytes" : 184121016371
    },
    "fielddata" : {
      "memory_size" : "7.3mb",
      "memory_size_in_bytes" : 7691488,
      "evictions" : 0
    },
    "query_cache" : {
      "memory_size" : "146.8mb",
      "memory_size_in_bytes" : 153961819,
      "total_count" : 140669771,
      "hit_count" : 57522292,
      "miss_count" : 83147479,
      "cache_size" : 24892,
      "cache_count" : 2135173,
      "evictions" : 2110281
    },
    "completion" : {
      "size" : "0b",
      "size_in_bytes" : 0
    },
    "segments" : {
      "count" : 2798,
      "memory" : "546.3mb",
      "memory_in_bytes" : 572872596,
      "terms_memory" : "424.1mb",
      "terms_memory_in_bytes" : 444726409,
      "stored_fields_memory" : "33mb",
      "stored_fields_memory_in_bytes" : 34607912,
      "term_vectors_memory" : "0b",
      "term_vectors_memory_in_bytes" : 0,
      "norms_memory" : "9.3mb",
      "norms_memory_in_bytes" : 9805824,
      "points_memory" : "11.7mb",
      "points_memory_in_bytes" : 12332739,
      "doc_values_memory" : "68mb",
      "doc_values_memory_in_bytes" : 71399712,
      "index_writer_memory" : "0b",
      "index_writer_memory_in_bytes" : 0,
      "version_map_memory" : "0b",
      "version_map_memory_in_bytes" : 0,
      "fixed_bit_set" : "0b",
      "fixed_bit_set_memory_in_bytes" : 0,
      "max_unsafe_auto_id_timestamp" : -1,
      "file_sizes" : { }
    }
  },
  "nodes" : {
    "count" : {
      "total" : 5,
      "data" : 5,
      "coordinating_only" : 0,
      "master" : 5,
      "ingest" : 5
    },
    "versions" : [
      "6.2.4"
    ],
    "os" : {
      "available_processors" : 80,
      "allocated_processors" : 80,
      "names" : [
        {
          "name" : "Linux",
          "count" : 5
        }
      ],
      "mem" : {
        "total" : "314.5gb",
        "total_in_bytes" : 337764552704,
        "free" : "4.2gb",
        "free_in_bytes" : 4548927488,
        "used" : "310.3gb",
        "used_in_bytes" : 333215625216,
        "free_percent" : 1,
        "used_percent" : 99
      }
    },
    "process" : {
      "cpu" : {
        "percent" : 135
      },
      "open_file_descriptors" : {
        "min" : 1356,
        "max" : 1365,
        "avg" : 1361
      }
    },
    "jvm" : {
      "max_uptime" : "42.3m",
      "max_uptime_in_millis" : 2542560,
      "versions" : [
        {
          "version" : "1.8.0_191",
          "vm_name" : "OpenJDK 64-Bit Server VM",
          "vm_version" : "25.191-b12",
          "vm_vendor" : "Oracle Corporation",
          "count" : 5
        }
      ],
      "mem" : {
        "heap_used" : "55.9gb",
        "heap_used_in_bytes" : 60060317832,
        "heap_max" : "159.4gb",
        "heap_max_in_bytes" : 171231805440
      },
      "threads" : 886
    },
    "fs" : {
      "total" : "2.1tb",
      "total_in_bytes" : 2342573998080,
      "free" : "1.9tb",
      "free_in_bytes" : 2128665124864,
      "available" : "1.8tb",
      "available_in_bytes" : 2009311944704
    },
    "plugins" : [
      {
        "name" : "analysis-icu",
        "version" : "6.2.4",
        "description" : "The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components.",
        "classname" : "org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false,
        "requires_keystore" : false
      },
      {
        "name" : "analysis-stempel",
        "version" : "6.2.4",
        "description" : "The Stempel (Polish) Analysis plugin integrates Lucene stempel (polish) analysis module into elasticsearch.",
        "classname" : "org.elasticsearch.plugin.analysis.stempel.AnalysisStempelPlugin",
        "extended_plugins" : [ ],
        "has_native_controller" : false,
        "requires_keystore" : false
      }
    ],
    "network_types" : {
      "transport_types" : {
        "netty4" : 5
      },
      "http_types" : {
        "netty4" : 5
      }
    }
  }
}

And finally, the current tasks (the index is not being updated at the moment):

https://pastebin.com/6eKxymh3

Thanks for any suggestions!