Hi everyone,
we have an Elasticsearch 7.17 cluster with 4 nodes (master & data):
3x (ELK):
- 1x AMD Ryzen 7 3700X, 8 cores, 3.6 GHz
- 64 GB DDR4 RAM
- 2x SSD, 2 TB
1x (APM + Kibana):
- 1x Intel i7-7700, 4 cores, 4.2 GHz
- 32 GB RAM
- 2x SSD, 500 GB
The main issue here is this error (plus the screenshot at the bottom):
{"error":{"root_cause":[{"type":"circuit_breaking_exception","reason":"[parent] Data too large, data for [<http_request>] would be [49761516418/46.3gb], which is larger than the limit of [48962627174/45.5gb], real usage: [49761513464/46.3gb], new bytes reserved: [2954/2.8kb], usages [request=0/0b, fielddata=21889252977/20.3gb, in_flight_requests=2954/2.8kb, model_inference=0/0b, eql_sequence=0/0b, accounting=77020098/73.4mb]","bytes_wanted":49761516418,"bytes_limit":48962627174,"durability":"PERMANENT"}],"type":"circuit_breaking_exception","reason":"[parent] Data too large, data for [<http_request>] would be [49761516418/46.3gb], which is larger than the limit of [48962627174/45.5gb], real usage: [49761513464/46.3gb], new bytes reserved: [2954/2.8kb], usages [request=0/0b, fielddata=21889252977/20.3gb, in_flight_requests=2954/2.8kb, model_inference=0/0b, eql_sequence=0/0b, accounting=77020098/73.4mb]","bytes_wanted":49761516418,"bytes_limit":48962627174,"durability":"PERMANENT"},"status":429}
We have a Rails application (if it's relevant) that indexes a large number of events (~80-100 million per 24h) per index.
As I understand/guess, the problem is that in a short period of time we try to index too many events, so the heap fills up faster than GC can release excess memory, and then the circuit breaker steps in to stop it. (Notably, the breaker output above shows fielddata alone holding ~20.3 GB on that node.)
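For reference, per-node breaker usage can be checked directly (same request style as the _cluster/stats output below); this shows which breaker is actually holding the memory:

GET _nodes/stats/breaker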
I've already read that the heap should not exceed ~30 GB (the compressed-oops threshold), and yet we have 48 GB. Going higher than 48 GB will probably cause an OOM; going lower, on the other hand, will trigger the circuit breaker sooner.
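If we did drop below the compressed-oops threshold, the jvm.options change itself would just be the following (30g is an assumption on our side; the exact ceiling is around 31 GB, and the Elasticsearch startup log states whether compressed ordinary object pointers are in use):

-Xms30g
-Xmx30g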
I've noticed that in some indices, where we have a steady growth of events, the circuit breaker isn't triggered.
So the questions are:
- should we increase the number of data nodes? Run dedicated master nodes instead of combining them with data nodes?
- set lower/higher Xms/Xmx values? Or change other Elasticsearch/JVM settings? (see the fielddata sketch after this list)
- slow down throughput? (we wouldn't want to; a client-side backoff sketch follows this list anyway)
- any other suggestions?
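Regarding the settings question: the _cluster/stats output below shows 58.7 GB of fielddata cluster-wide with zero evictions, and the breaker error shows fielddata holding ~20.3 GB on the failing node, so one candidate change is capping the fielddata cache in elasticsearch.yml (the 20% value is a guess on our side, not something we've validated):

indices.fielddata.cache.size: 20%

Regarding slowing down throughput: since the breaker answers with HTTP 429, the indexing client could at least back off and retry instead of failing outright. A minimal sketch in Python (our app is Rails, so this is illustrative only; BULK_URL and the retry parameters are hypothetical):

import time
import requests

BULK_URL = "http://localhost:9200/_bulk"  # hypothetical host/endpoint

def bulk_with_backoff(ndjson_payload: str, max_retries: int = 5):
    """POST a _bulk body; on HTTP 429 (breaker tripped), wait and retry."""
    delay = 1.0
    for _ in range(max_retries):
        resp = requests.post(
            BULK_URL,
            data=ndjson_payload,
            headers={"Content-Type": "application/x-ndjson"},
        )
        if resp.status_code != 429:
            return resp  # success or a non-breaker error; caller inspects
        time.sleep(delay)           # give GC time to reclaim heap
        delay = min(delay * 2, 60)  # exponential backoff, capped at 60s
    raise RuntimeError("bulk request kept tripping the circuit breaker")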
_cluster/stats output:
{
"_nodes" : {
"total" : 4,
"successful" : 4,
"failed" : 0
},
"cluster_name" : "cluster",
"cluster_uuid" : "YGLfhht1Te2PeFsoHPlWPQ",
"timestamp" : 1686596175875,
"status" : "green",
"indices" : {
"count" : 238,
"shards" : {
"total" : 832,
"primaries" : 416,
"replication" : 1.0,
"index" : {
"shards" : {
"min" : 2,
"max" : 10,
"avg" : 3.495798319327731
},
"primaries" : {
"min" : 1,
"max" : 5,
"avg" : 1.7478991596638656
},
"replication" : {
"min" : 1.0,
"max" : 1.0,
"avg" : 1.0
}
}
},
"docs" : {
"count" : 4909419394,
"deleted" : 159867100
},
"store" : {
"size" : "5.7tb",
"size_in_bytes" : 6373864339929,
"total_data_set_size" : "5.7tb",
"total_data_set_size_in_bytes" : 6373864339929,
"reserved" : "0b",
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size" : "58.7gb",
"memory_size_in_bytes" : 63121067008,
"evictions" : 0
},
"query_cache" : {
"memory_size" : "1.8gb",
"memory_size_in_bytes" : 2037112529,
"total_count" : 33516493,
"hit_count" : 4273014,
"miss_count" : 29243479,
"cache_size" : 124817,
"cache_count" : 238417,
"evictions" : 113600
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 16845,
"memory" : "219.9mb",
"memory_in_bytes" : 230635262,
"terms_memory" : "168.8mb",
"terms_memory_in_bytes" : 177017784,
"stored_fields_memory" : "21.7mb",
"stored_fields_memory_in_bytes" : 22755384,
"term_vectors_memory" : "0b",
"term_vectors_memory_in_bytes" : 0,
"norms_memory" : "2.6mb",
"norms_memory_in_bytes" : 2817472,
"points_memory" : "0b",
"points_memory_in_bytes" : 0,
"doc_values_memory" : "26.7mb",
"doc_values_memory_in_bytes" : 28044622,
"index_writer_memory" : "2.5gb",
"index_writer_memory_in_bytes" : 2790097300,
"version_map_memory" : "586.1mb",
"version_map_memory_in_bytes" : 614665896,
"fixed_bit_set" : "23.6mb",
"fixed_bit_set_memory_in_bytes" : 24826208,
"max_unsafe_auto_id_timestamp" : 1686528003373,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "alias",
"count" : 301,
"index_count" : 85,
"script_count" : 0
},
{
"name" : "boolean",
"count" : 4708,
"index_count" : 131,
"script_count" : 0
},
{
"name" : "byte",
"count" : 85,
"index_count" : 85,
"script_count" : 0
},
{
"name" : "constant_keyword",
"count" : 282,
"index_count" : 94,
"script_count" : 0
},
{
"name" : "date",
"count" : 7847,
"index_count" : 207,
"script_count" : 0
},
{
"name" : "double",
"count" : 666,
"index_count" : 18,
"script_count" : 0
},
{
"name" : "flattened",
"count" : 1560,
"index_count" : 85,
"script_count" : 0
},
{
"name" : "float",
"count" : 1086,
"index_count" : 111,
"script_count" : 0
},
{
"name" : "geo_point",
"count" : 969,
"index_count" : 159,
"script_count" : 0
},
{
"name" : "half_float",
"count" : 56,
"index_count" : 14,
"script_count" : 0
},
{
"name" : "histogram",
"count" : 67,
"index_count" : 67,
"script_count" : 0
},
{
"name" : "integer",
"count" : 154,
"index_count" : 7,
"script_count" : 0
},
{
"name" : "ip",
"count" : 3511,
"index_count" : 94,
"script_count" : 0
},
{
"name" : "ip_range",
"count" : 18,
"index_count" : 18,
"script_count" : 0
},
{
"name" : "keyword",
"count" : 153285,
"index_count" : 208,
"script_count" : 0
},
{
"name" : "long",
"count" : 32082,
"index_count" : 152,
"script_count" : 0
},
{
"name" : "match_only_text",
"count" : 3886,
"index_count" : 67,
"script_count" : 0
},
{
"name" : "nested",
"count" : 1188,
"index_count" : 100,
"script_count" : 0
},
{
"name" : "object",
"count" : 38320,
"index_count" : 207,
"script_count" : 0
},
{
"name" : "scaled_float",
"count" : 554,
"index_count" : 85,
"script_count" : 0
},
{
"name" : "short",
"count" : 1854,
"index_count" : 18,
"script_count" : 0
},
{
"name" : "text",
"count" : 2831,
"index_count" : 188,
"script_count" : 0
},
{
"name" : "version",
"count" : 8,
"index_count" : 8,
"script_count" : 0
},
{
"name" : "wildcard",
"count" : 1023,
"index_count" : 85,
"script_count" : 0
}
],
"runtime_field_types" : [ ]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [ ],
"analyzer_types" : [ ],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [ ],
"built_in_filters" : [ ],
"built_in_analyzers" : [ ]
},
"versions" : [
{
"version" : "6.5.4",
"index_count" : 2,
"primary_shard_count" : 6,
"total_primary_size" : "63.5kb",
"total_primary_bytes" : 65120
},
{
"version" : "6.8.9",
"index_count" : 2,
"primary_shard_count" : 2,
"total_primary_size" : "881.5kb",
"total_primary_bytes" : 902744
},
{
"version" : "7.15.2",
"index_count" : 139,
"primary_shard_count" : 209,
"total_primary_size" : "813gb",
"total_primary_bytes" : 873009093350
},
{
"version" : "7.17.9",
"index_count" : 95,
"primary_shard_count" : 199,
"total_primary_size" : "2.1tb",
"total_primary_bytes" : 2312505239339
}
]
},
"nodes" : {
"count" : {
"total" : 4,
"coordinating_only" : 0,
"data" : 4,
"data_cold" : 4,
"data_content" : 4,
"data_frozen" : 4,
"data_hot" : 4,
"data_warm" : 4,
"ingest" : 4,
"master" : 3,
"ml" : 4,
"remote_cluster_client" : 4,
"transform" : 4,
"voting_only" : 0
},
"versions" : [
"7.17.9"
],
"os" : {
"available_processors" : 56,
"allocated_processors" : 56,
"names" : [
{
"name" : "Linux",
"count" : 4
}
],
"pretty_names" : [
{
"pretty_name" : "CentOS Linux 7 (Core)",
"count" : 1
},
{
"pretty_name" : "Debian GNU/Linux 10 (buster)",
"count" : 3
}
],
"architectures" : [
{
"arch" : "amd64",
"count" : 4
}
],
"mem" : {
"total" : "219.3gb",
"total_in_bytes" : 235498385408,
"free" : "2.3gb",
"free_in_bytes" : 2527797248,
"used" : "216.9gb",
"used_in_bytes" : 232970588160,
"free_percent" : 1,
"used_percent" : 99
}
},
"process" : {
"cpu" : {
"percent" : 0
},
"open_file_descriptors" : {
"min" : 419,
"max" : 3975,
"avg" : 2904
}
},
"jvm" : {
"max_uptime" : "123.4d",
"max_uptime_in_millis" : 10668815477,
"versions" : [
{
"version" : "19.0.2",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "19.0.2+7-44",
"vm_vendor" : "Oracle Corporation",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 4
}
],
"mem" : {
"heap_used" : "100.1gb",
"heap_used_in_bytes" : 107528858080,
"heap_max" : "145gb",
"heap_max_in_bytes" : 155692564480
},
"threads" : 546
},
"fs" : {
"total" : "9.2tb",
"total_in_bytes" : 10217112739840,
"free" : "1.9tb",
"free_in_bytes" : 2189648367616,
"available" : "1.5tb",
"available_in_bytes" : 1690145968128
},
"plugins" : [ ],
"network_types" : {
"transport_types" : {
"security4" : 4
},
"http_types" : {
"security4" : 4
}
},
"discovery_types" : {
"zen" : 4
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "rpm",
"count" : 1
},
{
"flavor" : "default",
"type" : "deb",
"count" : 3
}
],
"ingest" : {
"number_of_pipelines" : 31,
"processor_stats" : {
"conditional" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"convert" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"date" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"dot_expander" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"geoip" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"grok" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"json" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"pipeline" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"remove" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"rename" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"set" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"set_security_user" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"split" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
},
"user_agent" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time" : "0s",
"time_in_millis" : 0
}
}
}
}
}
jvm.options:
-Xms48g
-Xmx48g
## GC configuration
#-XX:+UseConcMarkSweepGC
#-XX:CMSInitiatingOccupancyFraction=75
#-XX:+UseCMSInitiatingOccupancyOnly
## optimizations
# pre-touch memory pages used by the JVM during initialization
-XX:+AlwaysPreTouch
## basic
# explicitly set the stack size
-Xss1m
# set to headless, just in case
-Djava.awt.headless=true
# ensure UTF-8 encoding by default (e.g. filenames)
-Dfile.encoding=UTF-8
# use our provided JNA always versus the system one
-Djna.nosys=true
-XX:-OmitStackTraceInFastThrow
# flags to configure Netty
-Dio.netty.noUnsafe=true
-Dio.netty.noKeySetOptimization=true
-Dio.netty.recycler.maxCapacityPerThread=0
# log4j 2
-Dlog4j.shutdownHookEnabled=false
-Dlog4j2.disable.jmx=true
-Djava.io.tmpdir=${ES_TMPDIR}
## heap dumps
# generate a heap dump when an allocation from the Java heap fails
# heap dumps are created in the working directory of the JVM
-XX:+HeapDumpOnOutOfMemoryError
# specify an alternative path for heap dumps
# ensure the directory exists and has sufficient space
-XX:HeapDumpPath=/var/lib/elasticsearch
## JDK 8 GC logging
8:-XX:+PrintGCDetails
8:-XX:+PrintGCDateStamps
8:-XX:+PrintTenuringDistribution
8:-XX:+PrintGCApplicationStoppedTime
8:-Xloggc:/var/log/elasticsearch/gc.log
8:-XX:+UseGCLogFileRotation
8:-XX:NumberOfGCLogFiles=32
8:-XX:GCLogFileSize=64m
# JDK 9+ GC logging
9-:-Xlog:gc*,gc+age=trace,safepoint:file=/var/log/elasticsearch/gc.log:utctime,pid,tags:filecount=32,filesize=64m
# due to internationalization enhancements in JDK 9 Elasticsearch needs to set the provider to COMPAT otherwise
# time/date parsing will break in an incompatible way for some date patterns and locales
9-:-Djava.locale.providers=COMPAT
# log4j mitigation (CVE-2021-44228 / Log4Shell)
-Dlog4j2.formatMsgNoLookups=true
# GC tuning: CMS on JDK 8-13, G1 on JDK 14+ (reserve more free heap, start concurrent marking earlier)
8-13:-XX:+UseConcMarkSweepGC
8-13:-XX:CMSInitiatingOccupancyFraction=75
8-13:-XX:+UseCMSInitiatingOccupancyOnly
14-:-XX:+UseG1GC
14-:-XX:G1ReservePercent=25
14-:-XX:InitiatingHeapOccupancyPercent=30
elasticsearch.yml:
# disable disk watermark checks for shard allocation
cluster.routing.allocation.disk.threshold_enabled: false
# capacity of the pending write/bulk request queue; queued requests are held on the heap
thread_pool.write.queue_size: 1000
# maximum number of aggregation buckets a single response may create
search.max_buckets: 130000
Screenshot of one node from Kibana Stack Monitoring when the problem occurred: