Very sluggish Elasticsearch node; not sure why

So I've got this Elasticsearch single-node instance running out on an EC2
m3.2xlarge in the cloud. I'm sending about 2,000 documents per second to it
for writing. But it's dropping almost all of them on the floor, and any
search queries I try to make against it are taking an extraordinarily long
time to complete, if they ever do at all. I'm a novice at Elasticsearch
system administration, so I'm hoping the group here can help me figure out
why my Elasticsearch is so unhealthy. Here's what I get from curl -XGET
'http://localhost:9200/_nodes/stats', which takes a couple minutes to
compute:

{
"cluster_name": "campfire.production.local",
"nodes": {
"ZkP5567KSNmab6x0oq6HaQ": {
"timestamp": 1429411379818,
"name": "node-0ba1d794-5a6a-41fd-8c23-6ccc45fb35fb",
"transport_address": [redacted],
"host": [redacted],
"ip": [
"inet[redacted]",
"NONE"
],
"indices": {
"docs": {
"count": 2257831457,
"deleted": 20
},
"store": {
"size_in_bytes": 677010690614,
"throttle_time_in_millis": 340863
},
"indexing": {
"index_total": 11743407,
"index_time_in_millis": 95117790,
"index_current": 69923,
"delete_total": 6,
"delete_time_in_millis": 40,
"delete_current": 0
},
"get": {
"total": 6,
"time_in_millis": 75,
"exists_total": 6,
"exists_time_in_millis": 75,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 99643,
"query_time_in_millis": 208069,
"query_current": 0,
"fetch_total": 32537,
"fetch_time_in_millis": 46771,
"fetch_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 9259,
"total_time_in_millis": 132148793,
"total_docs": 81942369,
"total_size_in_bytes": 26105798379
},
"refresh": {
"total": 83142,
"total_time_in_millis": 17075927
},
"flush": {
"total": 404,
"total_time_in_millis": 182115
},
"warmer": {
"current": 0,
"total": 166077,
"total_time_in_millis": 113904
},
"filter_cache": {
"memory_size_in_bytes": 13076,
"evictions": 0
},
"id_cache": {
"memory_size_in_bytes": 0
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"percolate": {
"total": 0,
"time_in_millis": 0,
"current": 0,
"memory_size_in_bytes": -1,
"memory_size": "-1b",
"queries": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 9225,
"memory_in_bytes": 22644267832,
"index_writer_memory_in_bytes": 1885176,
"version_map_memory_in_bytes": 95472
},
"translog": {
"operations": 39008,
"size_in_bytes": 0
},
"suggest": {
"total": 0,
"time_in_millis": 0,
"current": 0
}
},
"os": {
"timestamp": 1429411459205,
"uptime_in_millis": 2863626,
"load_average": [
3.16,
3.01,
3.19
],
"cpu": {
"sys": 1,
"user": 35,
"idle": 62,
"usage": 36,
"stolen": 0
},
"mem": {
"free_in_bytes": 183566336,
"used_in_bytes": 31380459520,
"free_percent": 30,
"used_percent": 69,
"actual_free_in_bytes": 9597153280,
"actual_used_in_bytes": 21966872576
},
"swap": {
"used_in_bytes": 0,
"free_in_bytes": 0
}
},
"process": {
"timestamp": 1429411459206,
"open_file_descriptors": 25719,
"cpu": {
"percent": 142,
"sys_in_millis": 6301800,
"user_in_millis": 191977900,
"total_in_millis": 198279700
},
"mem": {
"resident_in_bytes": 8216096768,
"share_in_bytes": 14045184,
"total_virtual_in_bytes": 269005692928
}
},
"jvm": {
"timestamp": 1429411459236,
"uptime_in_millis": 131281770,
"mem": {
"heap_used_in_bytes": 7815295968,
"heap_used_percent": 99,
"heap_committed_in_bytes": 7821852672,
"heap_max_in_bytes": 7821852672,
"non_heap_used_in_bytes": 54967760,
"non_heap_committed_in_bytes": 84406272,
"pools": {
"young": {
"used_in_bytes": 558432256,
"max_in_bytes": 558432256,
"peak_used_in_bytes": 558432256,
"peak_max_in_bytes": 558432256
},
"survivor": {
"used_in_bytes": 65672464,
"max_in_bytes": 69730304,
"peak_used_in_bytes": 69730304,
"peak_max_in_bytes": 69730304
},
"old": {
"used_in_bytes": 7191197144,
"max_in_bytes": 7193690112,
"peak_used_in_bytes": 7193690112,
"peak_max_in_bytes": 7193690112
}
}
},
"threads": {
"count": 133,
"peak_count": 140
},
"gc": {
"collectors": {
"young": {
"collection_count": 1056,
"collection_time_in_millis": 6987
},
"old": {
"collection_count": 50480,
"collection_time_in_millis": 82300221
}
}
},
"buffer_pools": {
"direct": {
"count": 147,
"used_in_bytes": 25199773,
"total_capacity_in_bytes": 25199773
},
"mapped": {
"count": 9265,
"used_in_bytes": 256402636679,
"total_capacity_in_bytes": 256402636679
}
}
},
"thread_pool": {
"generic": {
"threads": 1,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 5,
"completed": 14687
},
"index": {
"threads": 8,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 8,
"completed": 14
},
"snapshot_data": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"bench": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"get": {
"threads": 6,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 6,
"completed": 6
},
"snapshot": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"merge": {
"threads": 4,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 79309
},
"suggest": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"bulk": {
"threads": 8,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 8,
"completed": 64924
},
"optimize": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"warmer": {
"threads": 3,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 83012
},
"flush": {
"threads": 4,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 50856
},
"search": {
"threads": 24,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 24,
"completed": 132495
},
"percolate": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"management": {
"threads": 5,
"queue": 0,
"active": 1,
"rejected": 0,
"largest": 5,
"completed": 2467169
},
"refresh": {
"threads": 4,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 81978
}
},
"network": {
"tcp": {
"active_opens": 20988754,
"passive_opens": 23904325,
"curr_estab": 61,
"in_segs": 3495928683,
"out_segs": 4792521542,
"retrans_segs": 158505,
"estab_resets": 52113,
"attempt_fails": 284654,
"in_errs": 260,
"out_rsts": 447516
}
},
"fs": {
"timestamp": 1429411459237,
"total": {
"total_in_bytes": 1082125373440,
"free_in_bytes": 236036149248,
"available_in_bytes": 181043793920,
"disk_reads": 67918592,
"disk_writes": 93727805,
"disk_io_op": 161646397,
"disk_read_size_in_bytes": 2662825100288,
"disk_write_size_in_bytes": 9759008243712,
"disk_io_size_in_bytes": 12421833344000
},
"data": [
{
"path": [redacted],
"mount": [redacted],
"dev": "/dev/xvdf",
"total_in_bytes": 1082125373440,
"free_in_bytes": 236036149248,
"available_in_bytes": 181043793920,
"disk_reads": 67918592,
"disk_writes": 93727805,
"disk_io_op": 161646397,
"disk_read_size_in_bytes": 2662825100288,
"disk_write_size_in_bytes": 9759008243712,
"disk_io_size_in_bytes": 12421833344000
}
]
},
"transport": {
"server_open": 13,
"rx_count": 10,
"rx_size_in_bytes": 3870,
"tx_count": 10,
"tx_size_in_bytes": 3870
},
"http": {
"current_open": 6,
"total_opened": 5457
},
"fielddata_breaker": {
"maximum_size_in_bytes": 4693111603,
"maximum_size": "4.3gb",
"estimated_size_in_bytes": 0,
"estimated_size": "0b",
"overhead": 1.03,
"tripped": 0
}
}
}
}

And here's curl -XGET 'http://localhost:9200/_cluster/stats?human&pretty'
(also took a couple minutes to complete):

{
"timestamp" : 1429412322912,
"cluster_name" : "campfire.production.local",
"status" : "green",
"indices" : {
"count" : 236,
"shards" : {
"total" : 1180,
"primaries" : 1180,
"replication" : 0.0,
"index" : {
"shards" : {
"min" : 5,
"max" : 5,
"avg" : 5.0
},
"primaries" : {
"min" : 5,
"max" : 5,
"avg" : 5.0
},
"replication" : {
"min" : 0.0,
"max" : 0.0,
"avg" : 0.0
}
}
},
"docs" : {
"count" : 2257932671,
"deleted" : 20
},
"store" : {
"size" : "630.5gb",
"size_in_bytes" : 677043574900,
"throttle_time" : "5.7m",
"throttle_time_in_millis" : 342217
},
"fielddata" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0,
"evictions" : 0
},
"filter_cache" : {
"memory_size" : "12.7kb",
"memory_size_in_bytes" : 13076,
"evictions" : 0
},
"id_cache" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 9230,
"memory" : "21gb",
"memory_in_bytes" : 22645584136,
"index_writer_memory" : "0b",
"index_writer_memory_in_bytes" : 0,
"version_map_memory" : "0b",
"version_map_memory_in_bytes" : 0
},
"percolate" : {
"total" : 0,
"get_time" : "0s",
"time_in_millis" : 0,
"current" : 0,
"memory_size_in_bytes" : -1,
"memory_size" : "-1b",
"queries" : 0
}
},
"nodes" : {
"count" : {
"total" : 1,
"master_only" : 0,
"data_only" : 0,
"master_data" : 1,
"client" : 0
},
"versions" : [ "1.3.2" ],
"os" : {
"available_processors" : 8,
"mem" : {
"total" : "29.3gb",
"total_in_bytes" : 31564025856
},
"cpu" : [ {
"vendor" : "Intel",
"model" : "Xeon",
"mhz" : 2500,
"total_cores" : 8,
"total_sockets" : 8,
"cores_per_socket" : 32,
"cache_size" : "25kb",
"cache_size_in_bytes" : 25600,
"count" : 1
} ]
},
"process" : {
"cpu" : {
"percent" : 168
},
"open_file_descriptors" : {
"min" : 25733,
"max" : 25733,
"avg" : 25733
}
},
"jvm" : {
"max_uptime" : "1.5d",
"max_uptime_in_millis" : 132087449,
"versions" : [ {
"version" : "1.7.0_72",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.72-b04",
"vm_vendor" : "Oracle Corporation",
"count" : 1
} ],
"mem" : {
"heap_used" : "7.2gb",
"heap_used_in_bytes" : 7751476496,
"heap_max" : "7.2gb",
"heap_max_in_bytes" : 7821852672
},
"threads" : 134
},
"fs" : {
"total" : "1007.8gb",
"total_in_bytes" : 1082125373440,
"free" : "219.6gb",
"free_in_bytes" : 235869388800,
"available" : "168.4gb",
"available_in_bytes" : 180877033472,
"disk_reads" : 67926240,
"disk_writes" : 93745279,
"disk_io_op" : 161671519,
"disk_read_size" : "2.4tb",
"disk_read_size_in_bytes" : 2663667340288,
"disk_write_size" : "8.8tb",
"disk_write_size_in_bytes" : 9760728731648,
"disk_io_size" : "11.2tb",
"disk_io_size_in_bytes" : 12424396071936
},
"plugins" : [ ]
}
}

I'm fairly confident that somewhere in that morass of information there are
some big, glowing red flags that say what's going wrong, but the output is
largely Greek to me; the documentation is pretty sparse on the matter. Any
help figuring out what's wrong with my Elasticsearch would be greatly
appreciated. And let me know if there are any other diagnostics I can run to
help debug. Thanks!

--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/elasticsearch/6212a969-0ec2-4fc4-9f0e-696cf83b548f%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

I'd guess that you are hitting the capacity of the node.

Try closing/deleting indices or upgrading the instance.

On 19 April 2015 at 13:03, Dave Galbraith david92galbraith@gmail.com
wrote:

So I've got this Elasticsearch single-node instance running out on an EC2
m3.2xlarge in the cloud. I'm sending about 2,000 documents per second to it
for writing. But it's dropping almost all of them on the floor, and any
search queries I try to make against it are taking an extraordinarily long
time to complete, if they ever do at all. So I'm a novice at Elasticsearch
system administration, so I'm hoping the group here can help me figure out
why my Elasticsearch is so unhealthy. Here's what I get from curl -XGET
'http://localhost:9200/_nodes/stats', which takes a couple minutes to
compute:

{
"cluster_name": "campfire.production.local",
"nodes": {
"ZkP5567KSNmab6x0oq6HaQ": {
"timestamp": 1429411379818,
"name": "node-0ba1d794-5a6a-41fd-8c23-6ccc45fb35fb",
"transport_address": [redacted],
"host": [redacted],
"ip": [
"inet[redacted]",
"NONE"
],
"indices": {
"docs": {
"count": 2257831457,
"deleted": 20
},
"store": {
"size_in_bytes": 677010690614,
"throttle_time_in_millis": 340863
},
"indexing": {
"index_total": 11743407,
"index_time_in_millis": 95117790,
"index_current": 69923,
"delete_total": 6,
"delete_time_in_millis": 40,
"delete_current": 0
},
"get": {
"total": 6,
"time_in_millis": 75,
"exists_total": 6,
"exists_time_in_millis": 75,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 99643,
"query_time_in_millis": 208069,
"query_current": 0,
"fetch_total": 32537,
"fetch_time_in_millis": 46771,
"fetch_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 9259,
"total_time_in_millis": 132148793,
"total_docs": 81942369,
"total_size_in_bytes": 26105798379
},
"refresh": {
"total": 83142,
"total_time_in_millis": 17075927
},
"flush": {
"total": 404,
"total_time_in_millis": 182115
},
"warmer": {
"current": 0,
"total": 166077,
"total_time_in_millis": 113904
},
"filter_cache": {
"memory_size_in_bytes": 13076,
"evictions": 0
},
"id_cache": {
"memory_size_in_bytes": 0
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"percolate": {
"total": 0,
"time_in_millis": 0,
"current": 0,
"memory_size_in_bytes": -1,
"memory_size": "-1b",
"queries": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 9225,
"memory_in_bytes": 22644267832,
"index_writer_memory_in_bytes": 1885176,
"version_map_memory_in_bytes": 95472
},
"translog": {
"operations": 39008,
"size_in_bytes": 0
},
"suggest": {
"total": 0,
"time_in_millis": 0,
"current": 0
}
},
"os": {
"timestamp": 1429411459205,
"uptime_in_millis": 2863626,
"load_average": [
3.16,
3.01,
3.19
],
"cpu": {
"sys": 1,
"user": 35,
"idle": 62,
"usage": 36,
"stolen": 0
},
"mem": {
"free_in_bytes": 183566336,
"used_in_bytes": 31380459520,
"free_percent": 30,
"used_percent": 69,
"actual_free_in_bytes": 9597153280,
"actual_used_in_bytes": 21966872576
},
"swap": {
"used_in_bytes": 0,
"free_in_bytes": 0
}
},
"process": {
"timestamp": 1429411459206,
"open_file_descriptors": 25719,
"cpu": {
"percent": 142,
"sys_in_millis": 6301800,
"user_in_millis": 191977900,
"total_in_millis": 198279700
},
"mem": {
"resident_in_bytes": 8216096768,
"share_in_bytes": 14045184,
"total_virtual_in_bytes": 269005692928
}
},
"jvm": {
"timestamp": 1429411459236,
"uptime_in_millis": 131281770,
"mem": {
"heap_used_in_bytes": 7815295968,
"heap_used_percent": 99,
"heap_committed_in_bytes": 7821852672,
"heap_max_in_bytes": 7821852672,
"non_heap_used_in_bytes": 54967760,
"non_heap_committed_in_bytes": 84406272,
"pools": {
"young": {
"used_in_bytes": 558432256,
"max_in_bytes": 558432256,
"peak_used_in_bytes": 558432256,
"peak_max_in_bytes": 558432256
},
"survivor": {
"used_in_bytes": 65672464,
"max_in_bytes": 69730304,
"peak_used_in_bytes": 69730304,
"peak_max_in_bytes": 69730304
},
"old": {
"used_in_bytes": 7191197144,
"max_in_bytes": 7193690112,
"peak_used_in_bytes": 7193690112,
"peak_max_in_bytes": 7193690112
}
}
},
"threads": {
"count": 133,
"peak_count": 140
},
"gc": {
"collectors": {
"young": {
"collection_count": 1056,
"collection_time_in_millis": 6987
},
"old": {
"collection_count": 50480,
"collection_time_in_millis": 82300221
}
}
},
"buffer_pools": {
"direct": {
"count": 147,
"used_in_bytes": 25199773,
"total_capacity_in_bytes": 25199773
},
"mapped": {
"count": 9265,
"used_in_bytes": 256402636679,
"total_capacity_in_bytes": 256402636679
}
}
},
"thread_pool": {
"generic": {
"threads": 1,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 5,
"completed": 14687
},
"index": {
"threads": 8,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 8,
"completed": 14
},
"snapshot_data": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"bench": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"get": {
"threads": 6,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 6,
"completed": 6
},
"snapshot": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"merge": {
"threads": 4,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 79309
},
"suggest": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"bulk": {
"threads": 8,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 8,
"completed": 64924
},
"optimize": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"warmer": {
"threads": 3,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 83012
},
"flush": {
"threads": 4,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 50856
},
"search": {
"threads": 24,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 24,
"completed": 132495
},
"percolate": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"management": {
"threads": 5,
"queue": 0,
"active": 1,
"rejected": 0,
"largest": 5,
"completed": 2467169
},
"refresh": {
"threads": 4,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 81978
}
},
"network": {
"tcp": {
"active_opens": 20988754,
"passive_opens": 23904325,
"curr_estab": 61,
"in_segs": 3495928683,
"out_segs": 4792521542,
"retrans_segs": 158505,
"estab_resets": 52113,
"attempt_fails": 284654,
"in_errs": 260,
"out_rsts": 447516
}
},
"fs": {
"timestamp": 1429411459237,
"total": {
"total_in_bytes": 1082125373440,
"free_in_bytes": 236036149248,
"available_in_bytes": 181043793920,
"disk_reads": 67918592,
"disk_writes": 93727805,
"disk_io_op": 161646397,
"disk_read_size_in_bytes": 2662825100288,
"disk_write_size_in_bytes": 9759008243712,
"disk_io_size_in_bytes": 12421833344000
},
"data": [
{
"path": [redacted],
"mount": [redacted],
"dev": "/dev/xvdf",
"total_in_bytes": 1082125373440,
"free_in_bytes": 236036149248,
"available_in_bytes": 181043793920,
"disk_reads": 67918592,
"disk_writes": 93727805,
"disk_io_op": 161646397,
"disk_read_size_in_bytes": 2662825100288,
"disk_write_size_in_bytes": 9759008243712,
"disk_io_size_in_bytes": 12421833344000
}
]
},
"transport": {
"server_open": 13,
"rx_count": 10,
"rx_size_in_bytes": 3870,
"tx_count": 10,
"tx_size_in_bytes": 3870
},
"http": {
"current_open": 6,
"total_opened": 5457
},
"fielddata_breaker": {
"maximum_size_in_bytes": 4693111603,
"maximum_size": "4.3gb",
"estimated_size_in_bytes": 0,
"estimated_size": "0b",
"overhead": 1.03,
"tripped": 0
}
}
}
}

And here's curl -XGET 'http://localhost:9200/_cluster/stats?human&pretty'
(also took a couple minutes to complete):

{
"timestamp" : 1429412322912,
"cluster_name" : "campfire.production.local",
"status" : "green",
"indices" : {
"count" : 236,
"shards" : {
"total" : 1180,
"primaries" : 1180,
"replication" : 0.0,
"index" : {
"shards" : {
"min" : 5,
"max" : 5,
"avg" : 5.0
},
"primaries" : {
"min" : 5,
"max" : 5,
"avg" : 5.0
},
"replication" : {
"min" : 0.0,
"max" : 0.0,
"avg" : 0.0
}
}
},
"docs" : {
"count" : 2257932671,
"deleted" : 20
},
"store" : {
"size" : "630.5gb",
"size_in_bytes" : 677043574900,
"throttle_time" : "5.7m",
"throttle_time_in_millis" : 342217
},
"fielddata" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0,
"evictions" : 0
},
"filter_cache" : {
"memory_size" : "12.7kb",
"memory_size_in_bytes" : 13076,
"evictions" : 0
},
"id_cache" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 9230,
"memory" : "21gb",
"memory_in_bytes" : 22645584136,
"index_writer_memory" : "0b",
"index_writer_memory_in_bytes" : 0,
"version_map_memory" : "0b",
"version_map_memory_in_bytes" : 0
},
"percolate" : {
"total" : 0,
"get_time" : "0s",
"time_in_millis" : 0,
"current" : 0,
"memory_size_in_bytes" : -1,
"memory_size" : "-1b",
"queries" : 0
}
},
"nodes" : {
"count" : {
"total" : 1,
"master_only" : 0,
"data_only" : 0,
"master_data" : 1,
"client" : 0
},
"versions" : [ "1.3.2" ],
"os" : {
"available_processors" : 8,
"mem" : {
"total" : "29.3gb",
"total_in_bytes" : 31564025856
},
"cpu" : [ {
"vendor" : "Intel",
"model" : "Xeon",
"mhz" : 2500,
"total_cores" : 8,
"total_sockets" : 8,
"cores_per_socket" : 32,
"cache_size" : "25kb",
"cache_size_in_bytes" : 25600,
"count" : 1
} ]
},
"process" : {
"cpu" : {
"percent" : 168
},
"open_file_descriptors" : {
"min" : 25733,
"max" : 25733,
"avg" : 25733
}
},
"jvm" : {
"max_uptime" : "1.5d",
"max_uptime_in_millis" : 132087449,
"versions" : [ {
"version" : "1.7.0_72",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.72-b04",
"vm_vendor" : "Oracle Corporation",
"count" : 1
} ],
"mem" : {
"heap_used" : "7.2gb",
"heap_used_in_bytes" : 7751476496,
"heap_max" : "7.2gb",
"heap_max_in_bytes" : 7821852672
},
"threads" : 134
},
"fs" : {
"total" : "1007.8gb",
"total_in_bytes" : 1082125373440,
"free" : "219.6gb",
"free_in_bytes" : 235869388800,
"available" : "168.4gb",
"available_in_bytes" : 180877033472,
"disk_reads" : 67926240,
"disk_writes" : 93745279,
"disk_io_op" : 161671519,
"disk_read_size" : "2.4tb",
"disk_read_size_in_bytes" : 2663667340288,
"disk_write_size" : "8.8tb",
"disk_write_size_in_bytes" : 9760728731648,
"disk_io_size" : "11.2tb",
"disk_io_size_in_bytes" : 12424396071936
},
"plugins" : [ ]
}
}

I'm fairly confident that somewhere in that morass of information there
are some big, glowing red flags that say what's going wrong, but the output
is largely Greek to me; the documentation is pretty sparse on the matter.
Any help figuring out what's wrong with my Elasticsearch would be greatly
appreciated. And let me know if there's any other diagnostics I can run to
help debug. Thanks!

--
You received this message because you are subscribed to the Google Groups
"elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an
email to elasticsearch+unsubscribe@googlegroups.com.
To view this discussion on the web visit
https://groups.google.com/d/msgid/elasticsearch/6212a969-0ec2-4fc4-9f0e-696cf83b548f%40googlegroups.com
https://groups.google.com/d/msgid/elasticsearch/6212a969-0ec2-4fc4-9f0e-696cf83b548f%40googlegroups.com?utm_medium=email&utm_source=footer
.
For more options, visit https://groups.google.com/d/optout.

--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/elasticsearch/CAEYi1X_TeUaueuKa3vgen--m3W%2BfLD6yTjmhJW_5Z-V%3D5NrCEw%40mail.gmail.com.
For more options, visit https://groups.google.com/d/optout.

Hi,

You seem to have quite a large number of shards (1180) for a single node
with only 7GB heap. As the total data volume is a bit over 600GB, the
average shard size is only a bit over 500MB, which is not very large. As
each shard is a separate Lucene index and carries some overhead, you would
probably benefit from having fewer, larger shards.

With 236 indices it looks like you still have the default 5 shards per
index. I would recommend reducing the number of shards per index to 1 going
forward in order to increase the average shard size.

It is also generally recommended to allocate 50% of the available RAM to
heap, so you may want to increase it from 7GB to 15GB as you have 30GB
available on an EC2 m3.2xlarge instance.

As there is some throttling going on I would also recommend taking a look
at iostat to check if you are limited by the performance of your storage.

If this does not help it is possible that you, as Mark pointed out, are
hitting the capacity of the node.

Best regards,

Christian

On Sunday, April 19, 2015 at 4:03:35 AM UTC+1, Dave Galbraith wrote:

So I've got this Elasticsearch single-node instance running out on an EC2
m3.2xlarge in the cloud. I'm sending about 2,000 documents per second to it
for writing. But it's dropping almost all of them on the floor, and any
search queries I try to make against it are taking an extraordinarily long
time to complete, if they ever do at all. So I'm a novice at Elasticsearch
system administration, so I'm hoping the group here can help me figure out
why my Elasticsearch is so unhealthy. Here's what I get from curl -XGET
'http://localhost:9200/_nodes/stats', which takes a couple minutes to
compute:

{
"cluster_name": "campfire.production.local",
"nodes": {
"ZkP5567KSNmab6x0oq6HaQ": {
"timestamp": 1429411379818,
"name": "node-0ba1d794-5a6a-41fd-8c23-6ccc45fb35fb",
"transport_address": [redacted],
"host": [redacted],
"ip": [
"inet[redacted]",
"NONE"
],
"indices": {
"docs": {
"count": 2257831457,
"deleted": 20
},
"store": {
"size_in_bytes": 677010690614,
"throttle_time_in_millis": 340863
},
"indexing": {
"index_total": 11743407,
"index_time_in_millis": 95117790,
"index_current": 69923,
"delete_total": 6,
"delete_time_in_millis": 40,
"delete_current": 0
},
"get": {
"total": 6,
"time_in_millis": 75,
"exists_total": 6,
"exists_time_in_millis": 75,
"missing_total": 0,
"missing_time_in_millis": 0,
"current": 0
},
"search": {
"open_contexts": 0,
"query_total": 99643,
"query_time_in_millis": 208069,
"query_current": 0,
"fetch_total": 32537,
"fetch_time_in_millis": 46771,
"fetch_current": 0
},
"merges": {
"current": 0,
"current_docs": 0,
"current_size_in_bytes": 0,
"total": 9259,
"total_time_in_millis": 132148793,
"total_docs": 81942369,
"total_size_in_bytes": 26105798379
},
"refresh": {
"total": 83142,
"total_time_in_millis": 17075927
},
"flush": {
"total": 404,
"total_time_in_millis": 182115
},
"warmer": {
"current": 0,
"total": 166077,
"total_time_in_millis": 113904
},
"filter_cache": {
"memory_size_in_bytes": 13076,
"evictions": 0
},
"id_cache": {
"memory_size_in_bytes": 0
},
"fielddata": {
"memory_size_in_bytes": 0,
"evictions": 0
},
"percolate": {
"total": 0,
"time_in_millis": 0,
"current": 0,
"memory_size_in_bytes": -1,
"memory_size": "-1b",
"queries": 0
},
"completion": {
"size_in_bytes": 0
},
"segments": {
"count": 9225,
"memory_in_bytes": 22644267832,
"index_writer_memory_in_bytes": 1885176,
"version_map_memory_in_bytes": 95472
},
"translog": {
"operations": 39008,
"size_in_bytes": 0
},
"suggest": {
"total": 0,
"time_in_millis": 0,
"current": 0
}
},
"os": {
"timestamp": 1429411459205,
"uptime_in_millis": 2863626,
"load_average": [
3.16,
3.01,
3.19
],
"cpu": {
"sys": 1,
"user": 35,
"idle": 62,
"usage": 36,
"stolen": 0
},
"mem": {
"free_in_bytes": 183566336,
"used_in_bytes": 31380459520,
"free_percent": 30,
"used_percent": 69,
"actual_free_in_bytes": 9597153280,
"actual_used_in_bytes": 21966872576
},
"swap": {
"used_in_bytes": 0,
"free_in_bytes": 0
}
},
"process": {
"timestamp": 1429411459206,
"open_file_descriptors": 25719,
"cpu": {
"percent": 142,
"sys_in_millis": 6301800,
"user_in_millis": 191977900,
"total_in_millis": 198279700
},
"mem": {
"resident_in_bytes": 8216096768,
"share_in_bytes": 14045184,
"total_virtual_in_bytes": 269005692928
}
},
"jvm": {
"timestamp": 1429411459236,
"uptime_in_millis": 131281770,
"mem": {
"heap_used_in_bytes": 7815295968,
"heap_used_percent": 99,
"heap_committed_in_bytes": 7821852672,
"heap_max_in_bytes": 7821852672,
"non_heap_used_in_bytes": 54967760,
"non_heap_committed_in_bytes": 84406272,
"pools": {
"young": {
"used_in_bytes": 558432256,
"max_in_bytes": 558432256,
"peak_used_in_bytes": 558432256,
"peak_max_in_bytes": 558432256
},
"survivor": {
"used_in_bytes": 65672464,
"max_in_bytes": 69730304,
"peak_used_in_bytes": 69730304,
"peak_max_in_bytes": 69730304
},
"old": {
"used_in_bytes": 7191197144,
"max_in_bytes": 7193690112,
"peak_used_in_bytes": 7193690112,
"peak_max_in_bytes": 7193690112
}
}
},
"threads": {
"count": 133,
"peak_count": 140
},
"gc": {
"collectors": {
"young": {
"collection_count": 1056,
"collection_time_in_millis": 6987
},
"old": {
"collection_count": 50480,
"collection_time_in_millis": 82300221
}
}
},
"buffer_pools": {
"direct": {
"count": 147,
"used_in_bytes": 25199773,
"total_capacity_in_bytes": 25199773
},
"mapped": {
"count": 9265,
"used_in_bytes": 256402636679,
"total_capacity_in_bytes": 256402636679
}
}
},
"thread_pool": {
"generic": {
"threads": 1,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 5,
"completed": 14687
},
"index": {
"threads": 8,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 8,
"completed": 14
},
"snapshot_data": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"bench": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"get": {
"threads": 6,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 6,
"completed": 6
},
"snapshot": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"merge": {
"threads": 4,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 79309
},
"suggest": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"bulk": {
"threads": 8,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 8,
"completed": 64924
},
"optimize": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"warmer": {
"threads": 3,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 83012
},
"flush": {
"threads": 4,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 50856
},
"search": {
"threads": 24,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 24,
"completed": 132495
},
"percolate": {
"threads": 0,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 0,
"completed": 0
},
"management": {
"threads": 5,
"queue": 0,
"active": 1,
"rejected": 0,
"largest": 5,
"completed": 2467169
},
"refresh": {
"threads": 4,
"queue": 0,
"active": 0,
"rejected": 0,
"largest": 4,
"completed": 81978
}
},
"network": {
"tcp": {
"active_opens": 20988754,
"passive_opens": 23904325,
"curr_estab": 61,
"in_segs": 3495928683,
"out_segs": 4792521542,
"retrans_segs": 158505,
"estab_resets": 52113,
"attempt_fails": 284654,
"in_errs": 260,
"out_rsts": 447516
}
},
"fs": {
"timestamp": 1429411459237,
"total": {
"total_in_bytes": 1082125373440,
"free_in_bytes": 236036149248,
"available_in_bytes": 181043793920,
"disk_reads": 67918592,
"disk_writes": 93727805,
"disk_io_op": 161646397,
"disk_read_size_in_bytes": 2662825100288,
"disk_write_size_in_bytes": 9759008243712,
"disk_io_size_in_bytes": 12421833344000
},
"data": [
{
"path": [redacted],
"mount": [redacted],
"dev": "/dev/xvdf",
"total_in_bytes": 1082125373440,
"free_in_bytes": 236036149248,
"available_in_bytes": 181043793920,
"disk_reads": 67918592,
"disk_writes": 93727805,
"disk_io_op": 161646397,
"disk_read_size_in_bytes": 2662825100288,
"disk_write_size_in_bytes": 9759008243712,
"disk_io_size_in_bytes": 12421833344000
}
]
},
"transport": {
"server_open": 13,
"rx_count": 10,
"rx_size_in_bytes": 3870,
"tx_count": 10,
"tx_size_in_bytes": 3870
},
"http": {
"current_open": 6,
"total_opened": 5457
},
"fielddata_breaker": {
"maximum_size_in_bytes": 4693111603,
"maximum_size": "4.3gb",
"estimated_size_in_bytes": 0,
"estimated_size": "0b",
"overhead": 1.03,
"tripped": 0
}
}
}
}

And here's curl -XGET 'http://localhost:9200/_cluster/stats?human&pretty'
(also took a couple minutes to complete):

{
"timestamp" : 1429412322912,
"cluster_name" : "campfire.production.local",
"status" : "green",
"indices" : {
"count" : 236,
"shards" : {
"total" : 1180,
"primaries" : 1180,
"replication" : 0.0,
"index" : {
"shards" : {
"min" : 5,
"max" : 5,
"avg" : 5.0
},
"primaries" : {
"min" : 5,
"max" : 5,
"avg" : 5.0
},
"replication" : {
"min" : 0.0,
"max" : 0.0,
"avg" : 0.0
}
}
},
"docs" : {
"count" : 2257932671,
"deleted" : 20
},
"store" : {
"size" : "630.5gb",
"size_in_bytes" : 677043574900,
"throttle_time" : "5.7m",
"throttle_time_in_millis" : 342217
},
"fielddata" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0,
"evictions" : 0
},
"filter_cache" : {
"memory_size" : "12.7kb",
"memory_size_in_bytes" : 13076,
"evictions" : 0
},
"id_cache" : {
"memory_size" : "0b",
"memory_size_in_bytes" : 0
},
"completion" : {
"size" : "0b",
"size_in_bytes" : 0
},
"segments" : {
"count" : 9230,
"memory" : "21gb",
"memory_in_bytes" : 22645584136,
"index_writer_memory" : "0b",
"index_writer_memory_in_bytes" : 0,
"version_map_memory" : "0b",
"version_map_memory_in_bytes" : 0
},
"percolate" : {
"total" : 0,
"get_time" : "0s",
"time_in_millis" : 0,
"current" : 0,
"memory_size_in_bytes" : -1,
"memory_size" : "-1b",
"queries" : 0
}
},
"nodes" : {
"count" : {
"total" : 1,
"master_only" : 0,
"data_only" : 0,
"master_data" : 1,
"client" : 0
},
"versions" : [ "1.3.2" ],
"os" : {
"available_processors" : 8,
"mem" : {
"total" : "29.3gb",
"total_in_bytes" : 31564025856
},
"cpu" : [ {
"vendor" : "Intel",
"model" : "Xeon",
"mhz" : 2500,
"total_cores" : 8,
"total_sockets" : 8,
"cores_per_socket" : 32,
"cache_size" : "25kb",
"cache_size_in_bytes" : 25600,
"count" : 1
} ]
},
"process" : {
"cpu" : {
"percent" : 168
},
"open_file_descriptors" : {
"min" : 25733,
"max" : 25733,
"avg" : 25733
}
},
"jvm" : {
"max_uptime" : "1.5d",
"max_uptime_in_millis" : 132087449,
"versions" : [ {
"version" : "1.7.0_72",
"vm_name" : "Java HotSpot(TM) 64-Bit Server VM",
"vm_version" : "24.72-b04",
"vm_vendor" : "Oracle Corporation",
"count" : 1
} ],
"mem" : {
"heap_used" : "7.2gb",
"heap_used_in_bytes" : 7751476496,
"heap_max" : "7.2gb",
"heap_max_in_bytes" : 7821852672
},
"threads" : 134
},
"fs" : {
"total" : "1007.8gb",
"total_in_bytes" : 1082125373440,
"free" : "219.6gb",
"free_in_bytes" : 235869388800,
"available" : "168.4gb",
"available_in_bytes" : 180877033472,
"disk_reads" : 67926240,
"disk_writes" : 93745279,
"disk_io_op" : 161671519,
"disk_read_size" : "2.4tb",
"disk_read_size_in_bytes" : 2663667340288,
"disk_write_size" : "8.8tb",
"disk_write_size_in_bytes" : 9760728731648,
"disk_io_size" : "11.2tb",
"disk_io_size_in_bytes" : 12424396071936
},
"plugins" : [ ]
}
}

I'm fairly confident that somewhere in that morass of information there
are some big, glowing red flags that say what's going wrong, but the output
is largely Greek to me; the documentation is pretty sparse on the matter.
Any help figuring out what's wrong with my Elasticsearch would be greatly
appreciated. And let me know if there's any other diagnostics I can run to
help debug. Thanks!

--
You received this message because you are subscribed to the Google Groups "elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email to elasticsearch+unsubscribe@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/elasticsearch/c28f6a00-2e0f-4ca9-a0c7-d5a28059273f%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.