Hello!
I inherited a 3-node Elasticsearch 7.17 cluster, and yesterday the health status was red.
_cluster/health?pretty=true
{
"cluster_name" : "graylog-production",
"status" : "red",
"timed_out" : false,
"number_of_nodes" : 3,
"number_of_data_nodes" : 3,
"active_primary_shards" : 2208,
"active_shards" : 2253,
"relocating_shards" : 0,
"initializing_shards" : 0,
"unassigned_shards" : 3,
"delayed_unassigned_shards" : 0,
"number_of_pending_tasks" : 0,
"number_of_in_flight_fetch" : 0,
"task_max_waiting_in_queue_millis" : 0,
"active_shards_percent_as_number" : 99.86702127659575
}
Doing a rolling restart of the nodes usually clears things up, but not this time. I reviewed the Common Cluster Issues documents and also noted that we have no snapshots to fall back to, so I am loath to work with the reroute API until I understand it a little more.
I did note that the unassigned shards report `no_valid_shard_copy`, but I also saw that there is a 4th shard that looks available; its explain output says "the cluster has unassigned shards and cluster setting [cluster.routing.allocation.allow_rebalance] is set to [indices_all_active]". All 4 shards for this index are primaries, but there are only 3 nodes.
I was thinking there might be a way to copy the "good" shard onto the others (is that rebalancing?), and I am willing to accept data loss (hopefully only as a last resort), as it is log data. Any suggestions or clarifications are appreciated.
I am appending the `_cluster/allocation/explain` output for the bad shards, followed by the good shard for reference:
"Bad Shards"
root@elastic-03-in-prod:~# curl -X GET "http://elastic-03-in-prod:9200/_cluster/allocation/explain?pretty" -H 'Content-Type: application/json' -d'{"index": "graylog_39","shard": 0,"primary": true}'
{
"index" : "graylog_39",
"shard" : 0,
"primary" : true,
"current_state" : "unassigned",
"unassigned_info" : {
"reason" : "ALLOCATION_FAILED",
"at" : "2023-06-21T16:45:40.991Z",
"failed_allocation_attempts" : 2,
"details" : "failed shard on node [POHn_aN0R-CE7kteDGVRfA]: shard failure, reason [corrupt file (source: [start])], failure CorruptIndexException[Problem reading index. (resource=/var/lib/elasticsearch/nodes/0/indices/utxx9Zc_TKaeZqRwpTRkfw/0/index/_b_Lucene84_0.tim)]; nested: NoSuchFileException[/var/lib/elasticsearch/nodes/0/indices/utxx9Zc_TKaeZqRwpTRkfw/0/index/_b_Lucene84_0.tim]; ",
"last_allocation_status" : "no_valid_shard_copy"
},
"can_allocate" : "no_valid_shard_copy",
"allocate_explanation" : "cannot allocate because all found copies of the shard are either stale or corrupt",
"node_allocation_decisions" : [
{
"node_id" : "POHn_aN0R-CE7kteDGVRfA",
"node_name" : "elastic-02-in-prod",
"transport_address" : "xx.xx.xx.xx:9300",
"node_attributes" : {
"ml.machine_memory" : "16787419136",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "15032385536",
"transform.node" : "true"
},
"node_decision" : "no",
"store" : {
"in_sync" : true,
"allocation_id" : "C-m7pTmoRJKzZglZBrydRg",
"store_exception" : {
"type" : "corrupt_index_exception",
"reason" : "failed engine (reason: [corrupt file (source: [start])]) (resource=preexisting_corruption)",
"caused_by" : {
"type" : "i_o_exception",
"reason" : "failed engine (reason: [corrupt file (source: [start])])",
"caused_by" : {
"type" : "corrupt_index_exception",
"reason" : "Problem reading index. (resource=/var/lib/elasticsearch/nodes/0/indices/utxx9Zc_TKaeZqRwpTRkfw/0/index/_b_Lucene84_0.tim)",
"caused_by" : {
"type" : "no_such_file_exception",
"reason" : "/var/lib/elasticsearch/nodes/0/indices/utxx9Zc_TKaeZqRwpTRkfw/0/index/_b_Lucene84_0.tim"
}
}
}
}
}
},
{
"node_id" : "UZTEb2h7Q7SiiBWAEvU8yg",
"node_name" : "elastic-03-in-prod",
"transport_address" : "xx.xx.xx.xx:9300",
"node_attributes" : {
"ml.machine_memory" : "16787484672",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "15032385536",
"transform.node" : "true"
},
"node_decision" : "no",
"store" : {
"found" : false
}
},
{
"node_id" : "tv0duWT1Q_OQ9Xslv7drEQ",
"node_name" : "elastic-01-in-prod",
"transport_address" : "xx.xx.xx.xx:9300",
"node_attributes" : {
"ml.machine_memory" : "16787484672",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "15032385536",
"transform.node" : "true"
},
"node_decision" : "no",
"store" : {
"found" : false
}
}
]
}
root@elastic-03-in-prod:~# curl -X GET "http://elastic-03-in-prod:9200/_cluster/allocation/explain?pretty" -H 'Content-Type: application/json' -d'{"index": "graylog_39","shard": 2,"primary": true}'
{
"index" : "graylog_39",
"shard" : 2,
"primary" : true,
"current_state" : "unassigned",
"unassigned_info" : {
"reason" : "ALLOCATION_FAILED",
"at" : "2023-06-21T16:45:40.837Z",
"failed_allocation_attempts" : 2,
"details" : "failed shard on node [tv0duWT1Q_OQ9Xslv7drEQ]: shard failure, reason [corrupt file (source: [start])], failure CorruptIndexException[Problem reading index. (resource=/var/lib/elasticsearch/nodes/0/indices/utxx9Zc_TKaeZqRwpTRkfw/2/index/_b_Lucene84_0.tim)]; nested: NoSuchFileException[/var/lib/elasticsearch/nodes/0/indices/utxx9Zc_TKaeZqRwpTRkfw/2/index/_b_Lucene84_0.tim]; ",
"last_allocation_status" : "no_valid_shard_copy"
},
"can_allocate" : "no_valid_shard_copy",
"allocate_explanation" : "cannot allocate because all found copies of the shard are either stale or corrupt",
"node_allocation_decisions" : [
{
"node_id" : "POHn_aN0R-CE7kteDGVRfA",
"node_name" : "elastic-02-in-prod",
"transport_address" : "xx.xx.xx.xx:9300",
"node_attributes" : {
"ml.machine_memory" : "16787419136",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "15032385536",
"transform.node" : "true"
},
"node_decision" : "no",
"store" : {
"found" : false
}
},
{
"node_id" : "UZTEb2h7Q7SiiBWAEvU8yg",
"node_name" : "elastic-03-in-prod",
"transport_address" : "xx.xx.xx.xx:9300",
"node_attributes" : {
"ml.machine_memory" : "16787484672",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "15032385536",
"transform.node" : "true"
},
"node_decision" : "no",
"store" : {
"found" : false
}
},
{
"node_id" : "tv0duWT1Q_OQ9Xslv7drEQ",
"node_name" : "elastic-01-in-prod",
"transport_address" : "xx.xx.xx.xx:9300",
"node_attributes" : {
"ml.machine_memory" : "16787484672",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "15032385536",
"transform.node" : "true"
},
"node_decision" : "no",
"store" : {
"in_sync" : true,
"allocation_id" : "Du7lJexcQwWG7jcH45obTw",
"store_exception" : {
"type" : "corrupt_index_exception",
"reason" : "failed engine (reason: [corrupt file (source: [start])]) (resource=preexisting_corruption)",
"caused_by" : {
"type" : "i_o_exception",
"reason" : "failed engine (reason: [corrupt file (source: [start])])",
"caused_by" : {
"type" : "corrupt_index_exception",
"reason" : "Problem reading index. (resource=/var/lib/elasticsearch/nodes/0/indices/utxx9Zc_TKaeZqRwpTRkfw/2/index/_b_Lucene84_0.tim)",
"caused_by" : {
"type" : "no_such_file_exception",
"reason" : "/var/lib/elasticsearch/nodes/0/indices/utxx9Zc_TKaeZqRwpTRkfw/2/index/_b_Lucene84_0.tim"
}
}
}
}
}
}
]
}
root@elastic-03-in-prod:~# curl -X GET "http://elastic-03-in-prod:9200/_cluster/allocation/explain?pretty" -H 'Content-Type: application/json' -d'{"index": "graylog_39","shard": 3,"primary": true}'
{
"index" : "graylog_39",
"shard" : 3,
"primary" : true,
"current_state" : "unassigned",
"unassigned_info" : {
"reason" : "ALLOCATION_FAILED",
"at" : "2023-06-21T16:39:25.682Z",
"failed_allocation_attempts" : 2,
"details" : "failed shard on node [POHn_aN0R-CE7kteDGVRfA]: shard failure, reason [corrupt file (source: [start])], failure CorruptIndexException[Problem reading index. (resource=/var/lib/elasticsearch/nodes/0/indices/utxx9Zc_TKaeZqRwpTRkfw/3/index/_l_Lucene84_0.tim)]; nested: NoSuchFileException[/var/lib/elasticsearch/nodes/0/indices/utxx9Zc_TKaeZqRwpTRkfw/3/index/_l_Lucene84_0.tim]; ",
"last_allocation_status" : "no_valid_shard_copy"
},
"can_allocate" : "no_valid_shard_copy",
"allocate_explanation" : "cannot allocate because all found copies of the shard are either stale or corrupt",
"node_allocation_decisions" : [
{
"node_id" : "POHn_aN0R-CE7kteDGVRfA",
"node_name" : "elastic-02-in-prod",
"transport_address" : "xx.xx.xx.xx:9300",
"node_attributes" : {
"ml.machine_memory" : "16787419136",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "15032385536",
"transform.node" : "true"
},
"node_decision" : "no",
"store" : {
"in_sync" : true,
"allocation_id" : "KnnvS0goSM2tlczol_G5Rg",
"store_exception" : {
"type" : "corrupt_index_exception",
"reason" : "failed engine (reason: [corrupt file (source: [start])]) (resource=preexisting_corruption)",
"caused_by" : {
"type" : "i_o_exception",
"reason" : "failed engine (reason: [corrupt file (source: [start])])",
"caused_by" : {
"type" : "corrupt_index_exception",
"reason" : "Problem reading index. (resource=/var/lib/elasticsearch/nodes/0/indices/utxx9Zc_TKaeZqRwpTRkfw/3/index/_l_Lucene84_0.tim)",
"caused_by" : {
"type" : "no_such_file_exception",
"reason" : "/var/lib/elasticsearch/nodes/0/indices/utxx9Zc_TKaeZqRwpTRkfw/3/index/_l_Lucene84_0.tim"
}
}
}
}
}
},
{
"node_id" : "UZTEb2h7Q7SiiBWAEvU8yg",
"node_name" : "elastic-03-in-prod",
"transport_address" : "xx.xx.xx.xx:9300",
"node_attributes" : {
"ml.machine_memory" : "16787484672",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "15032385536",
"transform.node" : "true"
},
"node_decision" : "no",
"store" : {
"found" : false
}
},
{
"node_id" : "tv0duWT1Q_OQ9Xslv7drEQ",
"node_name" : "elastic-01-in-prod",
"transport_address" : "xx.xx.xx.xx:9300",
"node_attributes" : {
"ml.machine_memory" : "16787484672",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "15032385536",
"transform.node" : "true"
},
"node_decision" : "no",
"store" : {
"found" : false
}
}
]
}
"Good Shard"
root@elastic-03-in-prod:~# curl -X GET "http://elastic-03-in-prod.webtech.uits.iu.edu:9200/_cluster/allocation/explain?pretty" -H 'Content-Type: application/json' -d'{"index": "graylog_39","shard": 1,"primary": true}'
{
"index" : "graylog_39",
"shard" : 1,
"primary" : true,
"current_state" : "started",
"current_node" : {
"id" : "UZTEb2h7Q7SiiBWAEvU8yg",
"name" : "elastic-03-in-prod",
"transport_address" : "xx.xx.xx.xx:9300",
"attributes" : {
"ml.machine_memory" : "16787484672",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "15032385536",
"transform.node" : "true"
},
"weight_ranking" : 2
},
"can_remain_on_current_node" : "yes",
"can_rebalance_cluster" : "no",
"can_rebalance_cluster_decisions" : [
{
"decider" : "cluster_rebalance",
"decision" : "NO",
"explanation" : "the cluster has unassigned shards and cluster setting [cluster.routing.allocation.allow_rebalance] is set to [indices_all_active]"
}
],
"can_rebalance_to_other_node" : "no",
"rebalance_explanation" : "rebalancing is not allowed, even though there is at least one node on which the shard can be allocated",
"node_allocation_decisions" : [
{
"node_id" : "tv0duWT1Q_OQ9Xslv7drEQ",
"node_name" : "elastic-01-in-prod",
"transport_address" : "xx.xx.xx.xx:9300",
"node_attributes" : {
"ml.machine_memory" : "16787484672",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "15032385536",
"transform.node" : "true"
},
"node_decision" : "yes",
"weight_ranking" : 1
},
{
"node_id" : "POHn_aN0R-CE7kteDGVRfA",
"node_name" : "elastic-02-in-prod",
"transport_address" : "xx.xx.xx.xx:9300",
"node_attributes" : {
"ml.machine_memory" : "16787419136",
"ml.max_open_jobs" : "512",
"xpack.installed" : "true",
"ml.max_jvm_size" : "15032385536",
"transform.node" : "true"
},
"node_decision" : "worse_balance",
"weight_ranking" : 3
}
]
}
Again, my thanks!
Robert