Elasticsearch cluster goes down whenever we run a query covering more than 2 days

Hi All,

I am new to Elasticsearch, so I don't have much idea about its performance, but whenever I try to run a visualization report covering more than 2 days, the cluster goes down. We have a heavy load, around 10 million documents per day in the impacted index.

We have a 6-node cluster.

i). 1 coordinating node
ii). 3 master + data nodes
iii). 2 data nodes

We have created each index with 5 shards and 1 replica.

1 Like

What is the output from GET /_cluster/stats?human&pretty?

Hi @warkolm: Thanks for your response. Please find the output below:

{
"_nodes" : {
"total" : 6,
"successful" : 6,
"failed" : 0
},
"cluster_name" : "Testing",
"cluster_uuid" : "zckRg1AbQey2LgnX3Xe5Mg",
"timestamp" : 1600671502433,
"status" : "yellow",
"indices" : {
"count" : 47,
"shards" : {
"total" : 194,
"primaries" : 99,
"replication" : 0.9595959595959596,
"index" : {
"shards" : {
"min" : 2,
"max" : 10,
"avg" : 4.127659574468085
},
"primaries" : {
"min" : 1,
"max" : 5,
"avg" : 2.106382978723404
},
"replication" : {
"min" : 0.6,
"max" : 1.0,
"avg" : 0.9829787234042554
}
}
},
"docs" : {
"count" : 13353744729,
"deleted" : 1358
},
"store" : {
"size" : "16tb",
"size_in_bytes" : 17644496139954
},
"fielddata" : {
"memory_size" : "4.8kb",
"memory_size_in_bytes" : 4976,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "64.5kb",
"memory_size_in_bytes" : 66079,
"total_count" : 713,
"hit_count" : 297,
"miss_count" : 416,
"cache_size" : 24,
"cache_count" : 24,
"evictions" : 0
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 6667,
"memory" : "184.6mb",
"memory_in_bytes" : 193630968,
"terms_memory" : "85.5mb",
"terms_memory_in_bytes" : 89746416,
"stored_fields_memory" : "78.3mb",
"stored_fields_memory_in_bytes" : 82169848,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "11mb",
"norms_memory_in_bytes" : 11582400,
"points_memory" : "0b",
"points_memory_in_bytes" : 0,
"doc_values_memory" : "9.6mb",
"doc_values_memory_in_bytes" : 10132304,
"index_writer_memory" : "490.9mb",
"index_writer_memory_in_bytes" : 514779868,
"version_map_memory" : "37.7kb",
"version_map_memory_in_bytes" : 38686,
"fixed_bit_set" : "100.2kb",
"fixed_bit_set_memory_in_bytes" : 102640,
"max_unsafe_auto_id_timestamp" : 1600671102338,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "alias",
"count" : 6,
"index_count" : 2
},
{
"name" : "binary",
"count" : 7,
"index_count" : 2
},
{
"name" : "boolean",
"count" : 211,
"index_count" : 31
},
{
"name" : "byte",
"count" : 10,
"index_count" : 10
},
{
"name" : "date",
"count" : 259,
"index_count" : 45
},
{
"name" : "double",
"count" : 203,
"index_count" : 3
},
{
"name" : "flattened",
"count" : 1,
"index_count" : 1
},
{
"name" : "float",
"count" : 285,
"index_count" : 13
},
{
"name" : "geo_point",
"count" : 14,
"index_count" : 2
},
{
"name" : "geo_shape",
"count" : 1,
"index_count" : 1
},
{
"name" : "half_float",
"count" : 56,
"index_count" : 14
},
{
"name" : "integer",
"count" : 181,
"index_count" : 10
},
{
"name" : "ip",
"count" : 42,
"index_count" : 2
},
{
"name" : "keyword",
"count" : 2849,
"index_count" : 46
},
{
"name" : "long",
"count" : 5826,
"index_count" : 32
},
{
"name" : "nested",
"count" : 29,
"index_count" : 9
},
{
"name" : "object",
"count" : 5816,
"index_count" : 37
},
{
"name" : "scaled_float",
"count" : 224,
"index_count" : 2
},
{
"name" : "short",
"count" : 16,
"index_count" : 8
},
{
"name" : "text",
"count" : 459,
"index_count" : 34
}
]
},
"analysis" : {
"char_filter_types" : ,
"tokenizer_types" : ,
"filter_types" : [
{
"name" : "pattern_capture",
"count" : 1,
"index_count" : 1
}
],
"analyzer_types" : [
{
"name" : "custom",
"count" : 1,
"index_count" : 1
}
],
"built_in_char_filters" : ,
"built_in_tokenizers" : [
{
"name" : "uax_url_email",
"count" : 1,
"index_count" : 1
}
],
"built_in_filters" : [
{
"name" : "lowercase",
"count" : 1,
"index_count" : 1
},
{
"name" : "unique",
"count" : 1,
"index_count" : 1
}
],
"built_in_analyzers" :
}
},
"nodes" : {
"count" : {
"total" : 6,
"coordinating_only" : 0,
"data" : 5,
"ingest" : 5,
"master" : 3,
"ml" : 6,
"remote_cluster_client" : 6,
"transform" : 5,
"voting_only" : 0
},
"versions" : [
"7.7.0"
],
"os" : {
"available_processors" : 48,
"allocated_processors" : 48,
"names" : [
{
"name" : "Linux",
"count" : 6
}
],
"pretty_names" : [
{
"pretty_name" : "Red Hat Enterprise Linux Server 7.8 (Maipo)",
"count" : 6
}
],
"mem" : {
"total" : "376.5gb",
"total_in_bytes" : 404268306432,
"free" : "39.3gb",
"free_in_bytes" : 42282094592,
"used" : "337.1gb",
"used_in_bytes" : 361986211840,
"free_percent" : 10,
"used_percent" : 90
}
},
"process" : {
"cpu" : {
"percent" : 201
},
"open_file_descriptors" : {
"min" : 565,
"max" : 1265,
"avg" : 1026
}
},
"jvm" : {
"max_uptime" : "2.7d",
"max_uptime_in_millis" : 240952954,
"versions" : [
{
"version" : "14",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "14+36",
"vm_vendor" : "AdoptOpenJDK",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 6
}
],
"mem" : {
"heap_used" : "3.6gb",
"heap_used_in_bytes" : 3931130080,
"heap_max" : "6gb",
"heap_max_in_bytes" : 6442450944
},
"threads" : 660
},
"fs" : {
"total" : "41.8tb",
"total_in_bytes" : 46054047244288,
"free" : "23.6tb",
"free_in_bytes" : 26053512101888,
"available" : "23.6tb",
"available_in_bytes" : 26053512101888
},
"plugins" : [
{
"name" : "mapper-size",
"version" : "7.7.0",
"elasticsearch_version" : "7.7.0",
"java_version" : "1.8",
"description" : "The Mapper Size plugin allows document to record their uncompressed size at index time.",
"classname" : "org.elasticsearch.plugin.mapper.MapperSizePlugin",
"extended_plugins" : ,
"has_native_controller" : false
}
],
"network_types" : {
"transport_types" : {
"security4" : 6
},
"http_types" : {
"security4" : 6
}
},
"discovery_types" : {
"zen" : 6
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "rpm",
"count" : 6
}
],
"ingest" : {
"number_of_pipelines" : 2,
"processor_stats" : {
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
}
}
}
}
}

Are you running with a heap size of 1GB per node with that amount of data in the cluster? If so I suspect you need to increase it significantly.

@Christian_Dahlqvist: Can you suggest how much heap size we need?

PS: This data is for just one index; the total across all indices is around 20 million documents per day.

@warkolm : Can you please help here.

Set it to 50% of available RAM (or max 30GB) and see if that helps.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.