Elasticsearch - high load CPU

Hello guys,

We have a problem with Elasticsearch performance.
My env is:

3 masters:

  • AWS t3.large
  • VM on premises CPU: 2 Memory: 8GB
  • VM on premises CPU: 2 Memory: 8GB

15 data nodes:

  • AWS 5 EC2 instance m5.4xlarge
  • 10 VMs on premises each with: CPU: 16 Memory 64GB

We have had this cluster for at least a year, but we are now having problems with high CPU usage and load. We upgraded Elasticsearch to 7.12.0 and saw a slight improvement, but not enough to use it in production.

One more piece of information:

Welcome to our community! :smiley:
We aren't all guys though.

Just to be clear here, you have nodes split between EC2 and something on premises?

Hello, thanks a lot for your answer. :slight_smile:

Yes, we have a hybrid env.

Ok, that's not really supported due to Elasticsearch being sensitive to network latency. Is there a reason you have it like this?

Just resilience. Is this not supported by Elasticsearch even when using AWS Direct Connect? And could this be the cause of my high CPU usage?

The interesting thing is that we took the AWS nodes out of the cluster for testing, and we still have the same problem.

It'll work, it's just not a supported configuration. Mostly if we encounter this we'd ask that you replicate it without that in place, which you just indicated you have done.

What does Monitoring show when you see this load?

We don't use the Monitoring feature; we only use New Relic for this.

Ok, what does it show?

What does hot threads or slow log show? What about Elasticsearch logs?
What is the output from the _cluster/stats?pretty&human API?

at this point the clusters are yellow because we are doing some more tests.

{
"_nodes" : {
"total" : 21,
"successful" : 21,
"failed" : 0
},
"cluster_name" : "",
"cluster_uuid" : "6OKGsd_3RM28HnjzRkozVQ",
"timestamp" : 1616563807996,
"status" : "yellow",
"indices" : {
"count" : 92,
"shards" : {
"total" : 1276,
"primaries" : 452,
"replication" : 1.823008849557522,
"index" : {
"shards" : {
"min" : 2,
"max" : 15,
"avg" : 13.869565217391305
},
"primaries" : {
"min" : 1,
"max" : 5,
"avg" : 4.913043478260869
},
"replication" : {
"min" : 0.8,
"max" : 2.0,
"avg" : 1.8086956521739135
}
}
},
"docs" : {
"count" : 8439488678,
"deleted" : 1652525275
},
"store" : {
"size_in_bytes" : 29372336501676,
"reserved_in_bytes" : 0
},
"fielddata" : {
"memory_size_in_bytes" : 1256,
"evictions" : 0
},
"query_cache" : {
"memory_size_in_bytes" : 3865641671,
"total_count" : 1999791335,
"hit_count" : 10303862,
"miss_count" : 1989487473,
"cache_size" : 7942882,
"cache_count" : 10566729,
"evictions" : 2623847
},
"completion" : {
"size_in_bytes" : 0
},
"segments" : {
"count" : 33853,
"memory_in_bytes" : 6534447550,
"terms_memory_in_bytes" : 297488808,
"stored_fields_memory_in_bytes" : 6151237352,
"term_vectors_memory_in_bytes" : 0,
"norms_memory_in_bytes" : 8048064,
"points_memory_in_bytes" : 0,
"doc_values_memory_in_bytes" : 77673326,
"index_writer_memory_in_bytes" : 0,
"version_map_memory_in_bytes" : 171,
"fixed_bit_set_memory_in_bytes" : 304,
"max_unsafe_auto_id_timestamp" : 1579788397426,
"file_sizes" : { }
},
"mappings" : {
"field_types" : [
{
"name" : "boolean",
"count" : 458,
"index_count" : 82
},
{
"name" : "date",
"count" : 350,
"index_count" : 82
},
{
"name" : "float",
"count" : 2,
"index_count" : 1
},
{
"name" : "keyword",
"count" : 3221,
"index_count" : 90
},
{
"name" : "long",
"count" : 1468,
"index_count" : 89
},
{
"name" : "object",
"count" : 5,
"index_count" : 2
},
{
"name" : "scaled_float",
"count" : 498,
"index_count" : 81
},
{
"name" : "text",
"count" : 335,
"index_count" : 82
}
]
},
"analysis" : {
"char_filter_types" : [ ],
"tokenizer_types" : [ ],
"filter_types" : [ ],
"analyzer_types" : [ ],
"built_in_char_filters" : [ ],
"built_in_tokenizers" : [ ],
"built_in_filters" : [ ],
"built_in_analyzers" : [ ]
},
"versions" : [
{
"version" : "6.2.4",
"index_count" : 1,
"primary_shard_count" : 5,
"total_primary_bytes" : 164942
},
{
"version" : "7.1.0",
"index_count" : 84,
"primary_shard_count" : 412,
"total_primary_bytes" : 9800627442726
},
{
"version" : "7.10.2",
"index_count" : 7,
"primary_shard_count" : 35,
"total_primary_bytes" : 582627349608
}
]
},
"nodes" : {
"count" : {
"total" : 21,
"coordinating_only" : 0,
"data" : 17,
"data_cold" : 17,
"data_content" : 17,
"data_frozen" : 17,
"data_hot" : 17,
"data_warm" : 17,
"ingest" : 0,
"master" : 4,
"ml" : 21,
"remote_cluster_client" : 0,
"transform" : 17,
"voting_only" : 1
},
"versions" : [
"7.12.0"
],
"os" : {
"available_processors" : 282,
"allocated_processors" : 282,
"names" : [
{
"name" : "Linux",
"count" : 21
}
],
"pretty_names" : [
{
"pretty_name" : "SUSE Linux Enterprise Server 15 SP1",
"count" : 12
},
{
"pretty_name" : "Amazon Linux 2",
"count" : 9
}
],
"architectures" : [
{
"arch" : "amd64",
"count" : 21
}
],
"mem" : {
"total_in_bytes" : 1167424684032,
"free_in_bytes" : 12850872320,
"used_in_bytes" : 1154573811712,
"free_percent" : 1,
"used_percent" : 99
}
},
"process" : {
"cpu" : {
"percent" : 194
},
"open_file_descriptors" : {
"min" : 763,
"max" : 1846,
"avg" : 1539
}
},
"jvm" : {
"max_uptime_in_millis" : 35720852,
"versions" : [
{
"version" : "15.0.1",
"vm_name" : "OpenJDK 64-Bit Server VM",
"vm_version" : "15.0.1+9",
"vm_vendor" : "AdoptOpenJDK",
"bundled_jdk" : true,
"using_bundled_jdk" : true,
"count" : 21
}
],
"mem" : {
"heap_used_in_bytes" : 230952439000,
"heap_max_in_bytes" : 564788199424
},
"threads" : 2268
},
"fs" : {
"total_in_bytes" : 66021777629184,
"free_in_bytes" : 36484486242304,
"available_in_bytes" : 36480210264064
},
"plugins" : [
{
"name" : "repository-s3",
"version" : "7.12.0",
"elasticsearch_version" : "7.12.0",
"java_version" : "1.8",
"description" : "The S3 repository plugin adds S3 repositories",
"classname" : "org.elasticsearch.repositories.s3.S3RepositoryPlugin",
"extended_plugins" : [ ],
"has_native_controller" : false,
"licensed" : false,
"type" : "isolated"
}
],
"network_types" : {
"transport_types" : {
"security4" : 21
},
"http_types" : {
"security4" : 21
}
},
"discovery_types" : {
"zen" : 21
},
"packaging_types" : [
{
"flavor" : "default",
"type" : "rpm",
"count" : 21
}
],
"ingest" : {
"number_of_pipelines" : 2,
"processor_stats" : {
"gsub" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
},
"script" : {
"count" : 0,
"failed" : 0,
"current" : 0,
"time_in_millis" : 0
}
}
}
}
}

I didn't find any mention of errors in the log.

Thanks.
Anything that is in the log might be helpful.

Please format your code/logs/config using the </> button, or markdown style back ticks. It helps to make things easy to read which helps us help you :slight_smile:

tail -f -n 60 elasticsearch-prod.log
[2021-03-24T00:31:41,278][INFO ][o.e.t.NettyAllocator     ] [transsrch-data-16] creating NettyAllocator with the following configs: [name=elasticsearch_configured, chunk_size=1mb, suggested_max_allocation_size=1mb, factors={es.unsafe.use_netty_default_chunk_and_page_size=false, g1gc_enabled=true, g1gc_region_size=16mb}]
[2021-03-24T00:31:41,330][INFO ][o.e.d.DiscoveryModule    ] [transsrch-data-16] using discovery type [zen] and seed hosts providers [settings]
[2021-03-24T00:31:41,667][INFO ][o.e.g.DanglingIndicesState] [transsrch-data-16] gateway.auto_import_dangling_indices is disabled, dangling indices will not be automatically detected or imported and must be managed manually
[2021-03-24T00:31:42,027][INFO ][o.e.n.Node               ] [transsrch-data-16] initialized
[2021-03-24T00:31:42,028][INFO ][o.e.n.Node               ] [transsrch-data-16] starting ...
[2021-03-24T00:31:42,043][INFO ][o.e.x.s.c.PersistentCache] [transsrch-data-16] persistent cache index loaded
[2021-03-24T00:31:42,121][INFO ][o.e.t.TransportService   ] [transsrch-data-16] publish_address XXXXXXXXXXXXXXX, bound_addresses {[::]:9300}
[2021-03-24T00:31:42,709][INFO ][o.e.b.BootstrapChecks    ] [transsrch-data-16] bound or publishing to a non-loopback address, enforcing bootstrap checks
[2021-03-24T00:31:42,726][INFO ][o.e.c.c.Coordinator      ] [transsrch-data-16] cluster UUID [6OKGsd_3RM28HnjzRkozVQ]
[2021-03-24T00:31:42,965][INFO ][o.e.c.s.ClusterApplierService] [transsrch-data-16] master node changed {previous [], current [{transsrch-master-2}{7pw1aibRRUaUrCXNV4ZrsA}{x6tXCrPLRWimx_BBTgG1-g}{XXXXXXXXXXXXXXX}XXXXXXXXXXXXXXX{lm}{rack_id=aws, ml.machine_memory=8257146880, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=4294967296, transform.node=false}]}, added {{transsrch-data-15}{qxvxWyaATVmAB
68558AERQ}{sipwKUMdQai-lZZ54Ujb1w}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=aws, ml.machine_memory=66008072192, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-4}{w-JrZB7xQSi5khSXbDc7GA}{WaAdKd72Q_aatpwEtQhmaQ}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=glete, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-1}{BlrcLmwSRHmvQDvIDK3MNg}{bwkq6jbnQyCI1a9tQGFmDw}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=glete, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-10}{h5WYnupBSOSo8DSX67LgNQ}{FYgX5TPTTz-xWp0GlU3lpQ}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfh
lstw}{rack_id=glete, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-0}{edrkItDVQVaOcDWdpw2AGA}{czKHqzfXRfyM8yFcZ
dV6iQ}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=glete, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-dat
a-13}{IVH96DGIQ0a631rs0Gwuyg}{lrqfHS4nRdK96uZjzBiVZA}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=tambore, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-11}{VM3mgkAURK-Cq6ydjZvgXQ}{lRijrrYYR42Ua3MaBpAGIQ}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=glete, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-5}{n8FBDPbUTEa7BZhjB4dbAA}{7EXqNsG5RBeO8K8YYcqkTg}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=tambore, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-12}{yizZ0IHvSICVnTF5uFIS4Q}{btgQBEB2QEur45xj2K9GvQ}{XXXXXXXXX}XXXXXXXXXXXXXXX{cdfhlstw}{rack_id=tambore, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-19}{dolU-0RlRqmU
7xVEVzYbxA}{l6PDy83tS0uxTwrv6pIrGQ}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=aws, ml.machine_memory=66008072192, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.
node=true},{transsrch-master-0}{g5d-5sCFRPKhc6D9XasHlQ}{nqp0FbZHSuCYhuuxpWUtaA}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{lm}{rack_id=tambore, ml.machine_memory=8079503360, ml.max_open_jobs=20, xpack.installed=tr
ue, ml.max_jvm_size=4294967296, transform.node=false},{transsrch-data-20}{3XJJ233ERNOeboYxAmhO7w}{Vufw_eUHQHGRMQglp7y9dA}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=aws, ml.machine_memory=66008072192
, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-8}{KlQwz73nS3W-gxiy-0CIWw}{1hSojsF7QDW4HdsTfwfuaQ}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw
}{rack_id=tambore, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-21}{YQbD88NDTNG6fUN7kU8AhQ}{iGhy3f-YRsWzVk8xUP
GZKg}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=aws, ml.machine_memory=66008072192, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-6}{QI0uEJQQRiqTMhCs9OgK7g}{EJQKGu9aQxiZ3lsCfK8MPg}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=tambore, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-7}{TU6Oa0DvQwewai6Hm1e-2A}{_bRPWclES8GXdEtSYVemcg}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=tambore, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-9}{R7o5CivOT2a-4_nEHgFW1w}{nbTJwsXtRAy2IeSIL_kGGA}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=tambore, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-3}{fbDGMXHgT3CvOVZoPw-dbg}{XawMY7FlQNeCGeUGYZ20EQ}XXXXXXXXXXXXXXX{xxxxxxxxxx}{cdfhlstw}{rack_id=glete, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-18}{gibvCVWBQNaqNF8R
2PmTtw}{m7Lj2Sv3Th6e-1UQxt_fZg}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=aws, ml.machine_memory=66008072192, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node
=true},{transsrch-data-14}{ZnJvrjUoTyODC4v_6d8mDw}{K8qjETI6QjWoyglTsB7Fqg}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=glete, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true
, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-master-2}{7pw1aibRRUaUrCXNV4ZrsA}{x6tXCrPLRWimx_BBTgG1-g}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{lm}{rack_id=aws, ml.machine_memory=8257146880, ml.max
_open_jobs=20, xpack.installed=true, ml.max_jvm_size=4294967296, transform.node=false},{transsrch-master-1}{vtDFPn-QRxCVs7NvTZAwqA}{j4cA11ZSQZu8qQVmeTIqkQ}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{lm}{rack_id=gl
ete, ml.machine_memory=8079495168, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=4294967296, transform.node=false},{transsrch-data-17}{VI7lgP5pRWKdK6BpCCBxag}{w-QJ6asXSJG-fgb_st2nww}{xxxxxxxxxxx}XXXXXXXXXXXXXXX{cdfhlstw}{rack_id=aws, ml.machine_memory=66008072192, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.node=true},{transsrch-data-2}{HYtOJTslRfSUgjiQk
tlBgA}{SbKyQrp5ThWTKNXAKp-FAQ}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{cdfhlstw}{rack_id=glete, ml.machine_memory=67269488640, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=32212254720, transform.n
ode=true},{transsrch-master-3-vo}{Ey0vmyC0TjOMDxBg3pH6Jw}{gCKTqvdxQ7Gs5Fg8VF6N3w}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{lmv}{rack_id=aws, ml.machine_memory=8257146880, ml.max_open_jobs=20, xpack.installed=true, m
l.max_jvm_size=4294967296, transform.node=false}}, term: 40, version: 1549057, reason: ApplyCommitRequest{term=40, version=1549057, sourceNode={transsrch-master-2}{7pw1aibRRUaUrCXNV4ZrsA}{x6tXCrPLRWimx_BBT
gG1-g}XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX{lm}{rack_id=aws, ml.machine_memory=8257146880, ml.max_open_jobs=20, xpack.installed=true, ml.max_jvm_size=4294967296, transform.node=false}}
1 Like

I exchanged my IPs for xxxxxxxx, we have some security policies in the company.

1 Like

Thanks for that, hugely appreciated!

So transsrch-data-16 is the node with the issues? Is that all the log?
It'd be helpful if you could grab hot threads from the node if this happens again.

Also what does newrelic show at that time outside CPU, memory, anything else that spikes?

We don't have a specific node that is causing the problems.

In the first graph it looks like there are 3 nodes that show a higher CPU usage that is somewhat in alignment. I noticed from the stats that you have replication set to 2 for most indices meaning there are three copies of each shard. Could this be related to something happening indexing wise on some specific index at that time? If you are using rack awareness, are those nodes located across all zones?

It looks like you might have a lot of update operations and a lot of small segments. Can you describe your use case and data model? Are you using features that might require occasional rebuilding of data structures in memory, e.g. parent-child or maybe nested documents?

We do a lot of updates to our documents, but the odd thing is that we have two clusters with the same configuration (hardware, number of nodes, data, templates...): the "legacy" cluster works well, but the new one has this problem.

This is our template:

    {
      "order": 1,
      "index_patterns": [
        "ps*"
      ],
      "settings": {
        "index": {
          "number_of_shards": "5",
          "number_of_replicas": "2",
          "write": {
            "wait_for_active_shards": "1"
          }
        }
      },
      "mappings": {
        "_source": {
          "enabled": true
        },
        "dynamic_templates": [
          {
            "ids": {
              "match_pattern": "regex",
              "mapping": {
                "null_value": 0,
                "type": "long"
              },
              "match": ".*[Ii]d$"
            }
          },
          {
            "dates": {
              "mapping": {
                "format": "date_hour_minute_second",
                "type": "date"
              },
              "match": "*Date"
            }
          },
          {
            "values": {
              "match_pattern": "regex",
              "mapping": {
                "null_value": 0,
                "scaling_factor": 100,
                "type": "scaled_float"
              },
              "match": ".*[Vv]alue$"
            }
          },
          {
            "names": {
              "match_pattern": "regex",
              "mapping": {
                "type": "text"
              },
              "match_mapping_type": "string",
              "match": ".*[Nn]ame$"
            }
          },
          {
            "bools": {
              "mapping": {
                "type": "boolean"
              },
              "match": "is*"
            }
          },
          {
            "others": {
              "mapping": {
                "type": "keyword"
              },
              "match_mapping_type": "string",
              "match": "*"
            }
          }
        ]
      },
      "aliases": {}
    }