Cluster state update taking more than 10 seconds warnings

Hi,

I'm getting the following warning every time I update a cluster setting:

took [10.6s], which is over [10s], to compute cluster state update for [update-settings]

Is there any way to identify where it is taking more time to acknowledge cluster updates?

My config is:

3 master nodes
3 data nodes
3 ingest nodes

Which version are you running? What is the output of the cluster stats API?

Version 7.6.0

    {
        "_nodes": {
            "total": 9,
            "successful": 9,
            "failed": 0
        },
        "cluster_name": "elkcorp-cluster",
        "cluster_uuid": "YptW__s4TZyXsYWo5lZOxg",
        "timestamp": 1596572267359,
        "status": "yellow",
        "indices": {
            "count": 1015,
            "shards": {
                "total": 2332,
                "primaries": 1015,
                "replication": 1.2975369458128079,
                "index": {
                    "shards": {
                        "min": 2,
                        "max": 3,
                        "avg": 2.297536945812808
                    },
                    "primaries": {
                        "min": 1,
                        "max": 1,
                        "avg": 1.0
                    },
                    "replication": {
                        "min": 1.0,
                        "max": 2.0,
                        "avg": 1.2975369458128079
                    }
                }
            },
            "docs": {
                "count": 9182461780,
                "deleted": 175
            },
            "store": {
                "size_in_bytes": 9725188662488
            },
            "fielddata": {
                "memory_size_in_bytes": 6840328,
                "evictions": 0
            },
            "query_cache": {
                "memory_size_in_bytes": 496038280,
                "total_count": 1645373,
                "hit_count": 36801,
                "miss_count": 1608572,
                "cache_size": 12020,
                "cache_count": 12084,
                "evictions": 64
            },
            "completion": {
                "size_in_bytes": 0
            },
            "segments": {
                "count": 41691,
                "memory_in_bytes": 9991103459,
                "terms_memory_in_bytes": 5406852079,
                "stored_fields_memory_in_bytes": 4498307056,
                "term_vectors_memory_in_bytes": 0,
                "norms_memory_in_bytes": 61719744,
                "points_memory_in_bytes": 0,
                "doc_values_memory_in_bytes": 24224580,
                "index_writer_memory_in_bytes": 4596288,
                "version_map_memory_in_bytes": 0,
                "fixed_bit_set_memory_in_bytes": 31026640,
                "max_unsafe_auto_id_timestamp": 1596570085532,
                "file_sizes": {}
            }
        },
        "nodes": {
            "count": {
                "total": 9,
                "coordinating_only": 0,
                "data": 3,
                "ingest": 3,
                "master": 3,
                "ml": 9,
                "voting_only": 0
            },
            "versions": [
                "7.6.0"
            ],
            "os": {
                "available_processors": 504,
                "allocated_processors": 504,
                "names": [
                    {
                        "name": "Linux",
                        "count": 9
                    }
                ],
                "pretty_names": [
                    {
                        "pretty_name": "Oracle Linux Server 7.5",
                        "count": 9
                    }
                ],
                "mem": {
                    "total_in_bytes": 2525252702208,
                    "free_in_bytes": 69380960256,
                    "used_in_bytes": 2455871741952,
                    "free_percent": 3,
                    "used_percent": 97
                }
            },
            "process": {
                "cpu": {
                    "percent": 49
                },
                "open_file_descriptors": {
                    "min": 485,
                    "max": 14639,
                    "avg": 4959
                }
            },
            "jvm": {
                "max_uptime_in_millis": 448615349,
                "versions": [
                    {
                        "version": "13.0.2",
                        "vm_name": "OpenJDK 64-Bit Server VM",
                        "vm_version": "13.0.2+8",
                        "vm_vendor": "AdoptOpenJDK",
                        "bundled_jdk": true,
                        "using_bundled_jdk": true,
                        "count": 9
                    }
                ],
                "mem": {
                    "heap_used_in_bytes": 51132148392,
                    "heap_max_in_bytes": 106300440576
                },
                "threads": 3567
            },
            "fs": {
                "total_in_bytes": 20029425844224,
                "free_in_bytes": 8902947749888,
                "available_in_bytes": 8902947749888
            },
            "plugins": [],
            "network_types": {
                "transport_types": {
                    "security4": 9
                },
                "http_types": {
                    "security4": 9
                }
            },
            "discovery_types": {
                "zen": 9
            },
            "packaging_types": [
                {
                    "flavor": "default",
                    "type": "tar",
                    "count": 9
                }
            ],
            "ingest": {
                "number_of_pipelines": 2,
                "processor_stats": {
                    "gsub": {
                        "count": 0,
                        "failed": 0,
                        "current": 0,
                        "time_in_millis": 0
                    },
                    "script": {
                        "count": 0,
                        "failed": 0,
                        "current": 0,
                        "time_in_millis": 0
                    }
                }
            }
        }
    }

That looks OK as far as I can see. What type of hardware, especially storage, is this cluster deployed on? Is the cluster under heavy load?

Master Nodes are in 3 Virtual Machines:
8 vCPU - 32GB RAM - 1GB Heap (G1GC)
Storage in VMDK

Data Nodes and Ingest Nodes are in 3 Physical Machines:
2x20 Core CPU - 384GB RAM
Storage in Local attached SAS Disks
Data Node 30GB Heap (G1GC)
Ingest Node 8GB Heap (G1GC)

It's more indexing than searching currently.

Write I/O is ~900/s
Search I/O ~10/s

Check the logs of your master-eligible nodes for signs of long and/or frequent GC. If you find any, it is quite possible that they need more heap assigned; 1GB sounds a bit low. If there are no GC issues, it may be an issue with the performance of the storage they use.

1 Like

A 1GB heap is very small and makes no sense on 32GB VMs, or with several TB of data, a thousand indexes, etc. Why not follow the 50% recommendation and use 16GB? Otherwise all the other RAM is mostly wasted, especially on the masters, which need no file cache.

1 Like

Thank you for your answers.

I'll increase the heap to 16GB and monitor the logs.

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.