Kibana crashes nodes when doing an aggregation

We use Graylog to gather our logs because we found out about it before we found ELK. We're going to migrate to ELK, but we've got issues with Kibana to solve first.

Graylog can run a search across our top 100 log sources, including a donut chart of the top 20 for the past 24 hours, with no problems. But when I try to replicate that donut chart with just the top 10 for the past 15 minutes in Kibana, one of my three nodes crashes, and sometimes two of them do.
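For reference, that visualization boils down to roughly this kind of request against Elasticsearch (the index pattern and field name below are placeholders, not our real ones):

# Roughly the request behind a top-10 donut chart over the last 15 minutes.
# "graylog_*" and "source" are placeholder names for the index pattern and field.
curl -XPOST 'http://localhost:9200/graylog_*/_search?pretty' -H 'Content-Type: application/json' -d '
{
  "size": 0,
  "query": {
    "range": { "@timestamp": { "gte": "now-15m" } }
  },
  "aggs": {
    "top_sources": {
      "terms": { "field": "source", "size": 10 }
    }
  }
}'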

We've got three data nodes and one master node. They're all physical machines with a mix of 4-, 12-, and 16-core Xeon CPUs. The data nodes have 16GB of RAM and the master has 96GB (it's one of our old VMware hosts). What would cause this, and how can I troubleshoot the issue?

I think this is more of an Elasticsearch question than a Kibana one.
Can you give us more details, like what the response from Elasticsearch looks like? Do you get any errors in the ES logs? If it's something in there, we can help; otherwise I'd suggest you create a topic in the ES part of the forum. Normally this shouldn't happen in Kibana, but it depends on a whole lot of factors relating to Elasticsearch and how your data is structured.
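If nothing obvious shows up, on a default package install the Elasticsearch log file is named after the cluster; the exact path depends on how ES was installed, but it's usually something like:

# Default log location for package installs; the file is named after the cluster.
tail -f /var/log/elasticsearch/<cluster_name>.log
# Or, if Elasticsearch runs under systemd:
journalctl -u elasticsearch -f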

Since Graylog can successfully complete a much more intensive search than the one in Kibana, I assumed the issue was related to Kibana. I accidentally closed the browser tab, and when I reopened it, two nodes bombed out. I'm letting the cluster fully recover right now. Once that's done, I'll try again and post the errors from the logs.
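I'm keeping an eye on the recovery with a basic health check (assuming the default HTTP port on one of the nodes):

# Watch overall cluster health until the status goes back to green
curl -XGET 'http://localhost:9200/_cluster/health?pretty'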

Please let me know if you think this is still more of an ES issue and I'll re-post in that area.

Is Graylog deployed on similar hardware? Which version of Elasticsearch are you using? How much data do you have in the cluster? Are you using time-based indices? What is the full output of the cluster stats API?
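For that last one, something like this should capture the full picture:

# Full cluster stats with human-readable sizes
curl -XGET 'http://localhost:9200/_cluster/stats?human&pretty'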

Graylog is deployed on a VM with 4 cores and 16GB of RAM, and I had/have the same issue when running the query through Kibana on that server. I thought there might simply not be enough resources on that VM, so I created a new master server just this past week, with no improvement. The Graylog server is now not running ES at all.

Here's the output of the cluster stats:

{
  "_nodes": {
    "total": 4,
    "successful": 4,
    "failed": 0
  },
  "cluster_name": "cor-es",
  "timestamp": 1512664069648,
  "status": "green",
  "indices": {
    "count": 176,
    "shards": {
      "total": 2583,
      "primaries": 876,
      "replication": 1.9486301369863013,
      "index": {
        "shards": {
          "min": 3,
          "max": 15,
          "avg": 14.676136363636363
        },
        "primaries": {
          "min": 1,
          "max": 5,
          "avg": 4.9772727272727275
        },
        "replication": {
          "min": 1,
          "max": 2,
          "avg": 1.9488636363636365
        }
      }
    },
    "docs": {
      "count": 3402695448,
      "deleted": 2981
    },
    "store": {
      "size": "9.7tb",
      "size_in_bytes": 10757508071521,
      "throttle_time": "0s",
      "throttle_time_in_millis": 0
    },
    "fielddata": {
      "memory_size": "0b",
      "memory_size_in_bytes": 0,
      "evictions": 0
    },
    "query_cache": {
      "memory_size": "2.9mb",
      "memory_size_in_bytes": 3045576,
      "total_count": 85232,
      "hit_count": 207,
      "miss_count": 85025,
      "cache_size": 18,
      "cache_count": 18,
      "evictions": 0
    },
    "completion": {
      "size": "0b",
      "size_in_bytes": 0
    },
    "segments": {
      "count": 3972,
      "memory": "12.9gb",
      "memory_in_bytes": 13906403665,
      "terms_memory": "8.9gb",
      "terms_memory_in_bytes": 9621613126,
      "stored_fields_memory": "3.6gb",
      "stored_fields_memory_in_bytes": 3966458000,
      "term_vectors_memory": "0b",
      "term_vectors_memory_in_bytes": 0,
      "norms_memory": "994.1kb",
      "norms_memory_in_bytes": 1017984,
      "points_memory": "162.7mb",
      "points_memory_in_bytes": 170667547,
      "doc_values_memory": "139.8mb",
      "doc_values_memory_in_bytes": 146647008,
      "index_writer_memory": "9mb",
      "index_writer_memory_in_bytes": 9513096,
      "version_map_memory": "5.7kb",
      "version_map_memory_in_bytes": 5863,
      "fixed_bit_set": "0b",
      "fixed_bit_set_memory_in_bytes": 0,
      "max_unsafe_auto_id_timestamp": 9223372036854776000,
      "file_sizes": {}
    }
  },
  "nodes": {
    "count": {
      "total": 4,
      "data": 3,
      "coordinating_only": 0,
      "master": 1,
      "ingest": 4
    },
    "versions": [
      "5.5.2"
    ],
    "os": {
      "available_processors": 48,
      "allocated_processors": 48,
      "names": [
        {
          "name": "Linux",
          "count": 4
        }
      ],
      "mem": {
        "total": "137.4gb",
        "total_in_bytes": 147555135488,
        "free": "57.5gb",
        "free_in_bytes": 61796896768,
        "used": "79.8gb",
        "used_in_bytes": 85758238720,
        "free_percent": 42,
        "used_percent": 58
      }
    },
    "process": {
      "cpu": {
        "percent": 47
      },
      "open_file_descriptors": {
        "min": 466,
        "max": 2497,
        "avg": 1784
      }
    },
    "jvm": {
      "max_uptime": "57.3m",
      "max_uptime_in_millis": 3440014,
      "versions": [
        {
          "version": "1.8.0_151",
          "vm_name": "OpenJDK 64-Bit Server VM",
          "vm_version": "25.151-b12",
          "vm_vendor": "Oracle Corporation",
          "count": 1
        },
        {
          "version": "1.8.0_144",
          "vm_name": "Java HotSpot(TM) 64-Bit Server VM",
          "vm_version": "25.144-b01",
          "vm_vendor": "Oracle Corporation",
          "count": 2
        },
        {
          "version": "1.8.0_131",
          "vm_name": "OpenJDK 64-Bit Server VM",
          "vm_version": "25.131-b11",
          "vm_vendor": "Oracle Corporation",
          "count": 1
        }
      ],
      "mem": {
        "heap_used": "17.7gb",
        "heap_used_in_bytes": 19039917064,
        "heap_max": "49.6gb",
        "heap_max_in_bytes": 53338243072
      },
      "threads": 568
    },
    "fs": {
      "total": "20.5tb",
      "total_in_bytes": 22628050759680,
      "free": "10.7tb",
      "free_in_bytes": 11810631925760,
      "available": "9.7tb",
      "available_in_bytes": 10689890062336,
      "spins": "true"
    },
    "plugins": [],
    "network_types": {
      "transport_types": {
        "netty4": 4
      },
      "http_types": {
        "netty4": 4
      }
    }
  }
}

This topic was automatically closed 28 days after the last reply. New replies are no longer allowed.