So I've got this Elasticsearch single-node instance running out on an EC2
m3.2xlarge in the cloud. I'm sending about 2,000 documents per second to it
for writing. But it's dropping almost all of them on the floor, and any
search queries I try to make against it are taking an extraordinarily long
time to complete, if they ever do at all. I'm a novice at Elasticsearch
system administration, so I'm hoping the group here can help me figure out
why my Elasticsearch is so unhealthy. Here's what I get from curl -XGET
'http://localhost:9200/_nodes/stats', which takes a couple minutes to
compute:
{
"cluster_name": "campfire.production.local",
"nodes": {
"ZkP5567KSNmab6x0oq6HaQ": {
"timestamp": 1429411379818,
"name": "node-0ba1d794-5a6a-41fd-8c23-6ccc45fb35fb",
"transport_address": [redacted],
"host": [redacted],
"ip": [
"inet[redacted]",
"NONE"
],
"indices": {
"docs": {
"count": 2257831457,
"deleted": 20
},
"store": {
"size_in_bytes": 677010690614,
"throttle_time_in_millis": 340863
},
"indexing": {
"index_total": 11743407,
"index_time_in_millis": 95117790,
"index_current": 69923,
"delete_total": 6,
"delete_time_in_millis": 40,
"delete_current": 0
},
"get": {
"total": 6,
"time_in_millis": 75,
"exists_total": 6,
"exists_time_in_millis": 75,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 99643,
"query_time_in_millis": 208069,
"query_current": 0,
"fetch_total": 32537,
"fetch_time_in_millis": 46771,
"fetch_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 9259,
"total_time_in_millis": 132148793,
"total_docs": 81942369,
"total_size_in_bytes": 26105798379
},
"refresh": {
"total": 83142,
"total_time_in_millis": 17075927
},
"flush": {
"total": 404,
"total_time_in_millis": 182115
},
"warmer": {
"current": 0,
"total": 166077,
"total_time_in_millis": 113904
},
"filter_cache": {
"memory_size_in_bytes": 13076,
"evictions": 0
},
"id_cache": {
"memory_size_in_bytes": 0
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"percolate": {
"total": 0,
"time_in_millis": 0,
"current": 0,
"memory_size_in_bytes": -1,
"memory_size": "-1b",
"queries": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 9225,
"memory_in_bytes": 22644267832,
"index_writer_memory_in_bytes": 1885176,
"version_map_memory_in_bytes": 95472
},
"translog": {
"operations": 39008,
"size_in_bytes": 0
},
"suggest": {
"total": 0,
"time_in_millis": 0,
"current": 0
}
},
"os": {
"timestamp": 1429411459205,
"uptime_in_millis": 2863626,
"load_average": [
3.16,
3.01,
3.19
],
"cpu": {
"sys": 1,
"user": 35,
"idle": 62,
"usage": 36,
"stolen": 0
},
"mem": {
"free_in_bytes": 183566336,
"used_in_bytes": 31380459520,
"free_percent": 30,
"used_percent": 69,
"actual_free_in_bytes": 9597153280,
"actual_used_in_bytes": 21966872576
},
"swap": {
"used_in_bytes": 0,
"free_in_bytes": 0
}
},
"process": {
"timestamp": 1429411459206,
"open_file_descriptors": 25719,
"cpu": {
"percent": 142,
"sys_in_millis": 6301800,
"user_in_millis": 191977900,
"total_in_millis": 198279700
},
"mem": {
"resident_in_bytes": 8216096768,
"share_in_bytes": 14045184,
"total_virtual_in_bytes": 269005692928
}
},
"jvm": {
"timestamp": 1429411459236,
"uptime_in_millis": 131281770,
"mem": {
"heap_used_in_bytes": 7815295968,
"heap_used_percent": 99,
"heap_committed_in_bytes": 7821852672,
"heap_max_in_bytes": 7821852672,
"non_heap_used_in_bytes": 54967760,
"non_heap_committed_in_bytes": 84406272,
"pools": {
"young": {
"used_in_bytes": 558432256,
"max_in_bytes": 558432256,
"peak_used_in_bytes": 558432256,
"peak_max_in_bytes": 558432256
},
"survivor": {
"used_in_bytes": 65672464,
"max_in_bytes": 69730304,
"peak_used_in_bytes": 69730304,
"peak_max_in_bytes": 69730304
},
"old": {
"used_in_bytes": 7191197144,
"max_in_bytes": 7193690112,
"peak_used_in_bytes": 7193690112,
"peak_max_in_bytes": 7193690112
}
}
},
"threads": {
"count": 133,
"peak_count": 140
},
"gc": {
"collectors": {
"young": {
"collection_count": 1056,
"collection_time_in_millis": 6987
},
"old": {
"collection_count": 50480,
"collection_time_in_millis": 82300221
}
}
},
"buffer_pools": {
"direct": {
"count": 147,
"used_in_bytes": 25199773,
"total_capacity_in_bytes": 25199773
},
"mapped": {
"count": 9265,
"used_in_bytes": 256402636679,
"total_capacity_in_bytes": 256402636679
}
}
},
"thread_pool": {
"generic": {
"threads": 1,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 5,
"completed": 14687
},
"index": {
"threads": 8,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 8,
"completed": 14
},
"snapshot_data": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"bench": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"get": {
"threads": 6,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 6,
"completed": 6
},
"snapshot": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"merge": {
"threads": 4,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 79309
},
"suggest": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"bulk": {
"threads": 8,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 8,
"completed": 64924
},
"optimize": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"warmer": {
"threads": 3,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 83012
},
"flush": {
"threads": 4,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 50856
},
"search": {
"threads": 24,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 24,
"completed": 132495
},
"percolate": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"management": {
"threads": 5,
"queue": 0,
"active": 1,
"rejected": 0,
"largest": 5,
"completed": 2467169
},
"refresh": {
"threads": 4,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 81978
}
},
"network": {
"tcp": {
"active_opens": 20988754,
"passive_opens": 23904325,
"curr_estab": 61,
"in_segs": 3495928683,
"out_segs": 4792521542,
"retrans_segs": 158505,
"estab_resets": 52113,
"attempt_fails": 284654,
"in_errs": 260,
"out_rsts": 447516
}
},
"fs": {
"timestamp": 1429411459237,
"total": {
"total_in_bytes": 1082125373440,
"free_in_bytes": 236036149248,
"available_in_bytes": 181043793920,
"disk_reads": 67918592,
"disk_writes": 93727805,
"disk_io_op": 161646397,
"disk_read_size_in_bytes": 2662825100288,
"disk_write_size_in_bytes": 9759008243712,
"disk_io_size_in_bytes": 12421833344000
},
"data": [
{
"path": [redacted],
"mount": [redacted],
"dev": "/dev/xvdf",
"total_in_bytes": 1082125373440,
"free_in_bytes": 236036149248,
"available_in_bytes": 181043793920,
"disk_reads": 67918592,
"disk_writes": 93727805,
"disk_io_op": 161646397,
"disk_read_size_in_bytes": 2662825100288,
"disk_write_size_in_bytes": 9759008243712,
"disk_io_size_in_bytes": 12421833344000
}
]
},
"transport": {
"server_open": 13,
"rx_count": 10,
"rx_size_in_bytes": 3870,
"tx_count": 10,
"tx_size_in_bytes": 3870
},
"http": {
"current_open": 6,
"total_opened": 5457
},
"fielddata_breaker": {
"maximum_size_in_bytes": 4693111603,
"maximum_size": "4.3gb",
"estimated_size_in_bytes": 0,
"estimated_size": "0b",
"overhead": 1.03,
"tripped": 0
}
}
}
}
And here's curl -XGET 'http://localhost:9200/_cluster/stats?human&pretty'
(also took a couple minutes to complete):
{
"timestamp" : 1429412322912,
"cluster_name" : "campfire.production.local",
"status" : "green",
"indices" : {
"count" : 236,
"shards" : {
"total" : 1180,
"primaries" : 1180,
"replication" : 0.0,
"index" : {
"shards" : {
"min" : 5,
"max" : 5,
"avg" : 5.0
},
"primaries" : {
"min" : 5,
"max" : 5,
"avg" : 5.0
},
"replication" : {
"min" : 0.0,
"max" : 0.0,
"avg" : 0.0
}
}
},
"docs" : {
"count" : 2257932671,
"deleted" : 20
},
"store" : {
"size" : "630.5gb",
"size_in_bytes" : 677043574900,
"throttle_time" : "5.7m",
"throttle_time_in_millis" : 342217
},
"fielddata" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0,
"evictions" : 0
},
"filter_cache" : {
"memory_size" : "12.7kb",
"memory_size_in_bytes" : 13076,
"evictions" : 0
},
"id_cache" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 9230,
"memory" : "21gb",
"memory_in_bytes" : 22645584136,
"index_writer_memory" : "0b",
"index_writer_memory_in_bytes" : 0,
"version_map_memory" : "0b",
"version_map_memory_in_bytes" : 0
},
"percolate" : {
"total" : 0,
"get_time" : "0s",
"time_in_millis" : 0,
"current" : 0,
"memory_size_in_bytes" : -1,
"memory_size" : "-1b",
"queries" : 0
}
},
"nodes" : {
"count" : {
"total" : 1,
"master_only" : 0,
"data_only" : 0,
"master_data" : 1,
"client" : 0
},
"versions" : [ "1.3.2" ],
"os" : {
"available_processors" : 8,
"mem" : {
"total" : "29.3gb",
"total_in_bytes" : 31564025856
},
"cpu" : [ {
"vendor" : "Intel",
"model" : "Xeon",
"mhz" : 2500,
"total_cores" : 8,
"total_sockets" : 8,
"cores_per_socket" : 32,
"cache_size" : "25kb",
"cache_size_in_bytes" : 25600,
"count" : 1
} ]
},
"process" : {
"cpu" : {
"percent" : 168
},
"open_file_descriptors" : {
"min" : 25733,
"max" : 25733,
"avg" : 25733
}
},
"jvm" : {
"max_uptime" : "1.5d",
"max_uptime_in_millis" : 132087449,
"versions" : [ {
"version" : "1.7.0_72",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.72-b04",
"vm_vendor" : "Oracle Corporation",
"count" : 1
} ],
"mem" : {
"heap_used" : "7.2gb",
"heap_used_in_bytes" : 7751476496,
"heap_max" : "7.2gb",
"heap_max_in_bytes" : 7821852672
},
"threads" : 134
},
"fs" : {
"total" : "1007.8gb",
"total_in_bytes" : 1082125373440,
"free" : "219.6gb",
"free_in_bytes" : 235869388800,
"available" : "168.4gb",
"available_in_bytes" : 180877033472,
"disk_reads" : 67926240,
"disk_writes" : 93745279,
"disk_io_op" : 161671519,
"disk_read_size" : "2.4tb",
"disk_read_size_in_bytes" : 2663667340288,
"disk_write_size" : "8.8tb",
"disk_write_size_in_bytes" : 9760728731648,
"disk_io_size" : "11.2tb",
"disk_io_size_in_bytes" : 12424396071936
},
"plugins" : [ ]
}
}
I'm fairly confident that somewhere in that morass of information there are
some big, glowing red flags that say what's going wrong, but the output is
largely Greek to me; the documentation is pretty sparse on the matter. Any
help figuring out what's wrong with my Elasticsearch would be greatly
appreciated. And let me know if there are any other diagnostics I can run to
help debug. Thanks!
--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/elasticsearch/6212a969-0ec2-4fc4-9f0e-696cf83b548f%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.