Frequent GC, evictions, next steps?

Hi,

We have two clusters, cluster A and cluster B. I believe we're hitting resource constraints, more so in cluster A.

In both clusters we see the heap fill, empty, fill, empty, and so on, with the cycle becoming more frequent during periods of heavy usage. This is presumably garbage collection kicking in and cleaning out several GB at a time. Is a pattern like this expected, or is it a problem? Is there a way to smooth that garbage collection out?
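
For reference, here is a minimal sketch of how the pattern can be watched over time via the node stats API. The host/port and the stats path are assumptions for our setup and may differ by Elasticsearch version; adjust as needed.

import json
import time
import urllib.request

# Assumed endpoint for our setup; the stats path may differ by version.
STATS_URL = "http://localhost:9200/_cluster/nodes/stats"

while True:
    # Fetch the node stats and print a timestamped heap sample per node.
    with urllib.request.urlopen(STATS_URL) as resp:
        stats = json.load(resp)
    for node_id, node in stats["nodes"].items():
        mem = node["jvm"]["mem"]
        print(time.strftime("%H:%M:%S"), node.get("name", node_id),
              "heap_used=" + mem["heap_used"],
              "heap_committed=" + mem["heap_committed"])
    time.sleep(60)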

In cluster A we are now seeing field cache evictions. Here are the node stats from that cluster:

{
  "cluster_name" : "elasticsearch",
  "nodes" : {
    "VwxhfESBSEazqUUunn8rYQ" : {
      "name" : "Gibney, Kyle",
      "indices" : {
        "size" : "5.5gb",
        "size_in_bytes" : 6004262913,
        "docs" : {
          "num_docs" : 3686945
        },
        "cache" : {
          "field_evictions" : 610,
          "field_size" : "538.8mb",
          "field_size_in_bytes" : 564976986,
          "filter_size" : "1.6gb",
          "filter_size_in_bytes" : 1806101592
        },
        "merges" : {
          "current" : 0,
          "total" : 19961,
          "total_time" : "31.9m",
          "total_time_in_millis" : 1919798
        }
      },
      "os" : {
        "timestamp" : 1304385315490,
        "uptime" : "15 weeks, 5 days, 21 hours, 54 minutes and 47 seconds",
        "uptime_in_millis" : 9582887000,
        "load_average" : [ 0.38, 0.52, 0.52 ],
        "cpu" : {
          "sys" : 0,
          "user" : 2,
          "idle" : 95
        },
        "mem" : {
          "free" : "13gb",
          "free_in_bytes" : 14043938816,
          "used" : "18.2gb",
          "used_in_bytes" : 19626369024,
          "free_percent" : 67,
          "used_percent" : 32,
          "actual_free" : "21.1gb",
          "actual_free_in_bytes" : 22664294400,
          "actual_used" : "10.2gb",
          "actual_used_in_bytes" : 11006013440
        },
        "swap" : {
          "used" : "0b",
          "used_in_bytes" : 0,
          "free" : "0b",
          "free_in_bytes" : 0
        }
      },
      "process" : {
        "timestamp" : 1304385315490,
        "cpu" : {
          "percent" : 16,
          "sys" : "2 hours, 1 minute, 28 seconds and 60 milliseconds",
          "sys_in_millis" : 7288060,
          "user" : "3 days, 5 hours, 20 minutes, 31 seconds and 140 milliseconds",
          "user_in_millis" : 278431140,
          "total" : "-1 milliseconds",
          "total_in_millis" : -1
        },
        "mem" : {
          "resident" : "9.1gb",
          "resident_in_bytes" : 9819578368,
          "share" : "11.2mb",
          "share_in_bytes" : 11821056,
          "total_virtual" : "9.5gb",
          "total_virtual_in_bytes" : 10278338560
        },
        "fd" : {
          "total" : 1793
        }
      },
      "jvm" : {
        "timestamp" : 1304385315492,
        "uptime" : "1 week, 5 days, 19 hours, 10 minutes, 29 seconds and 150 milliseconds",
        "uptime_in_millis" : 1105829150,
        "mem" : {
          "heap_used" : "4.5gb",
          "heap_used_in_bytes" : 4912341176,
          "heap_committed" : "7.9gb",
          "heap_committed_in_bytes" : 8581218304,
          "non_heap_used" : "50.4mb",
          "non_heap_used_in_bytes" : 52858584,
          "non_heap_committed" : "77.4mb",
          "non_heap_committed_in_bytes" : 81203200
        },
        "threads" : {
          "count" : 61,
          "peak_count" : 327
        },
        "gc" : {
          "collection_count" : 151048,
          "collection_time" : "1 hour, 25 minutes, 50 seconds and 117 milliseconds",
          "collection_time_in_millis" : 5150117,
          "collectors" : {
            "ParNew" : {
              "collection_count" : 148544,
              "collection_time" : "1 hour, 23 minutes, 40 seconds and 954 milliseconds",
              "collection_time_in_millis" : 5020954
            },
            "ConcurrentMarkSweep" : {
              "collection_count" : 2504,
              "collection_time" : "2 minutes, 9 seconds and 163 milliseconds",
              "collection_time_in_millis" : 129163
            }
          }
        }
      },
      "network" : {
        "tcp" : {
          "active_opens" : 656097,
          "passive_opens" : 6009321,
          "curr_estab" : 146,
          "in_segs" : 352904730,
          "out_segs" : 331738122,
          "retrans_segs" : 9077,
          "estab_resets" : 1110,
          "attempt_fails" : 50263,
          "in_errs" : 2,
          "out_rsts" : 51859
        }
      },
      "transport" : {
        "rx_count" : 23492645,
        "rx_size" : "5.9gb",
        "rx_size_in_bytes" : 6346979194,
        "tx_count" : 23439275,
        "tx_size" : "6.1gb",
        "tx_size_in_bytes" : 6631081728
      }
    },
    "wrjYOoseQ721H9DB4YLhWA" : {
      "name" : "Kro",
      "indices" : {
        "size" : "5.5gb",
        "size_in_bytes" : 5986822332,
        "docs" : {
          "num_docs" : 3686946
        },
        "cache" : {
          "field_evictions" : 525,
          "field_size" : "535.4mb",
          "field_size_in_bytes" : 561417978,
          "filter_size" : "1.9gb",
          "filter_size_in_bytes" : 2099494008
        },
        "merges" : {
          "current" : 0,
          "total" : 19368,
          "total_time" : "34.7m",
          "total_time_in_millis" : 2084985
        }
      },
      "os" : {
        "timestamp" : 1304385315457,
        "uptime" : "2 weeks, 2 days, 18 hours, 4 minutes and 49 seconds",
        "uptime_in_millis" : 1447489000,
        "load_average" : [ 0.63, 0.75, 0.65 ],
        "cpu" : {
          "sys" : 0,
          "user" : 3,
          "idle" : 94
        },
        "mem" : {
          "free" : "14.4gb",
          "free_in_bytes" : 15566430208,
          "used" : "16.8gb",
          "used_in_bytes" : 18103877632,
          "free_percent" : 67,
          "used_percent" : 32,
          "actual_free" : "21.2gb",
          "actual_free_in_bytes" : 22850248704,
          "actual_used" : "10gb",
          "actual_used_in_bytes" : 10820059136
        },
        "swap" : {
          "used" : "0b",
          "used_in_bytes" : 0,
          "free" : "0b",
          "free_in_bytes" : 0
        }
      },
      "process" : {
        "timestamp" : 1304385315457,
        "cpu" : {
          "percent" : 24,
          "sys" : "1 hour, 58 minutes, 27 seconds and 430 milliseconds",
          "sys_in_millis" : 7107430,
          "user" : "3 days, 43 minutes, 59 seconds and 700 milliseconds",
          "user_in_millis" : 261839700,
          "total" : "-1 milliseconds",
          "total_in_millis" : -1
        },
        "mem" : {
          "resident" : "9.1gb",
          "resident_in_bytes" : 9786376192,
          "share" : "11.2mb",
          "share_in_bytes" : 11816960,
          "total_virtual" : "9.5gb",
          "total_virtual_in_bytes" : 10210037760
        },
        "fd" : {
          "total" : 1522
        }
      },
      "jvm" : {
        "timestamp" : 1304385315458,
        "uptime" : "1 week, 5 days, 19 hours, 8 minutes, 45 seconds and 450 milliseconds",
        "uptime_in_millis" : 1105725450,
        "mem" : {
          "heap_used" : "6.6gb",
          "heap_used_in_bytes" : 7119475328,
          "heap_committed" : "7.9gb",
          "heap_committed_in_bytes" : 8581218304,
          "non_heap_used" : "49.3mb",
          "non_heap_used_in_bytes" : 51784616,
          "non_heap_committed" : "75.5mb",
          "non_heap_committed_in_bytes" : 79192064
        },
        "threads" : {
          "count" : 65,
          "peak_count" : 125
        },
        "gc" : {
          "collection_count" : 149366,
          "collection_time" : "1 hour, 25 minutes, 1 second and 607 milliseconds",
          "collection_time_in_millis" : 5101607,
          "collectors" : {
            "ParNew" : {
              "collection_count" : 147464,
              "collection_time" : "1 hour, 23 minutes, 45 seconds and 2 milliseconds",
              "collection_time_in_millis" : 5025002
            },
            "ConcurrentMarkSweep" : {
              "collection_count" : 1902,
              "collection_time" : "1 minute, 16 seconds and 605 milliseconds",
              "collection_time_in_millis" : 76605
            }
          }
        }
      },
      "network" : {
        "tcp" : {
          "active_opens" : 115479,
          "passive_opens" : 2205698,
          "curr_estab" : 132,
          "in_segs" : 72025611,
          "out_segs" : 65572988,
          "retrans_segs" : 1050,
          "estab_resets" : 78,
          "attempt_fails" : 23293,
          "in_errs" : 0,
          "out_rsts" : 23240
        }
      },
      "transport" : {
        "rx_count" : 23492644,
        "rx_size" : "6.1gb",
        "rx_size_in_bytes" : 6631081692,
        "tx_count" : 23403105,
        "tx_size" : "5.9gb",
        "tx_size_in_bytes" : 6346979158
      }
    }
  }
}

What conclusions would you draw from the above? I assume the frequent GC plus the evictions point to insufficient memory for the indices and the field/filter caches?
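
To put rough numbers on that assumption, here is a back-of-the-envelope check using the byte values from the stats above (the framing that the field and filter caches should fit comfortably inside the heap is my assumption):

# Rough check: how much of each node's committed heap is occupied by the
# field and filter caches (byte values copied from the node stats above).
nodes = {
    "Gibney, Kyle": {"heap_committed": 8581218304,
                     "field_cache": 564976986, "filter_cache": 1806101592},
    "Kro": {"heap_committed": 8581218304,
            "field_cache": 561417978, "filter_cache": 2099494008},
}

GB = 1024 ** 3
for name, n in nodes.items():
    caches = n["field_cache"] + n["filter_cache"]
    share = 100.0 * caches / n["heap_committed"]
    print("%-14s caches %.1fgb of %.1fgb heap (~%.0f%%)"
          % (name, caches / GB, n["heap_committed"] / GB, share))

That works out to roughly 2.2gb and 2.5gb of the ~8gb committed heaps, i.e. around a quarter to a third of each heap in caches alone.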

What would you recommend as a next step? Adding a single node with the same specs and the same shard and replica counts, or increasing the heap allocated to the current nodes?

Cheers,
David.