Yesterday I did a full reindex of an index with ~8 million documents.
I had 3 nodes on EC2 large instances. There were 8 clients indexing
documents at the rate of ~1100 documents per minute per client.
Today I ran some match_all count queries to verify the index size, and
I noticed that every other result is different: sometimes 5m, sometimes
8m. I looked at the index status and saw that two shards have primaries
and replicas of very different sizes: the primaries have <120k
documents, and the replicas have >1.3m documents.
I tried stopping nodes to see if I could get the correct replicas to
become the primaries, but it seems those shards may be stored in the
gateway at the much smaller size rather than the correct size. It
appears the behavior is that if a replica goes down, it is restored
from the primary, but if a primary goes down, it is restored from the
gateway.
Here is the output from the index status command. Notice that shard 4
has a big difference between the size of the replica and the primary
(1,415,174 documents on the replica versus 38,319 on the primary).
Shard 3 originally had the same issue, but the correct replica was
lost when the node with that replica was accidentally restarted
(oops!).
{
"ok" : true,
"_shards" : {
"total" : 12,
"successful" : 12,
"failed" : 0
},
"indices" : {
"stats" : {
"aliases" : [ ],
"settings" : {
"index.number_of_shards" : "6",
"index.number_of_replicas" : "1"
},
"store_size" : "20.3g",
"store_size_in_bytes" : 21814342869,
"estimated_flushable_memory_size" : "96.5m",
"estimated_flushable_memory_size_in_bytes" : 101234949,
"translog_operations" : 102062,
"docs" : {
"num_docs" : 5738311,
"max_doc" : 5744803,
"deleted_docs" : 6492
},
"shards" : {
"0" : [ {
"routing" : {
"state" : "STARTED",
"primary" : false,
"node" : "9a54c6aa-469d-46ff-a574-dc24c969a0b0",
"relocating_node" : null,
"shard" : 0,
"index" : "stats"
},
"state" : "STARTED",
"store_size" : "2.6g",
"store_size_in_bytes" : 2842186930,
"estimated_flushable_memory_size" : "298.8k",
"estimated_flushable_memory_size_in_bytes" : 306021,
"translog_id" : 0,
"translog_operations" : 306,
"docs" : {
"num_docs" : 1352920,
"max_doc" : 1352920,
"deleted_docs" : 0
}
}, {
"routing" : {
"state" : "RELOCATING",
"primary" : true,
"node" : "ccc84aea-aabb-40e3-837c-af1bcb1b764a",
"relocating_node" : "411f522e-
bfbc-4e0b-8a70-6451d1bef1da",
"shard" : 0,
"index" : "stats"
},
"state" : "STARTED",
"store_size" : "2.6g",
"store_size_in_bytes" : 2842196644,
"estimated_flushable_memory_size" : "298.8k",
"estimated_flushable_memory_size_in_bytes" : 306021,
"translog_id" : 1,
"translog_operations" : 306,
"docs" : {
"num_docs" : 1352920,
"max_doc" : 1352920,
"deleted_docs" : 0
}
} ],
"1" : [ {
"routing" : {
"state" : "STARTED",
"primary" : false,
"node" : "9a54c6aa-469d-46ff-a574-dc24c969a0b0",
"relocating_node" : null,
"shard" : 1,
"index" : "stats"
},
"state" : "STARTED",
"store_size" : "1.9g",
"store_size_in_bytes" : 2109353140,
"estimated_flushable_memory_size" : "304.8k",
"estimated_flushable_memory_size_in_bytes" : 312123,
"translog_id" : 0,
"translog_operations" : 323,
"docs" : {
"num_docs" : 1413888,
"max_doc" : 1418829,
"deleted_docs" : 4941
}
}, {
"routing" : {
"state" : "RELOCATING",
"primary" : true,
"node" : "ccc84aea-aabb-40e3-837c-af1bcb1b764a",
"relocating_node" : "411f522e-
bfbc-4e0b-8a70-6451d1bef1da",
"shard" : 1,
"index" : "stats"
},
"state" : "STARTED",
"store_size" : "1.9g",
"store_size_in_bytes" : 2109369148,
"estimated_flushable_memory_size" : "304.8k",
"estimated_flushable_memory_size_in_bytes" : 312123,
"translog_id" : 1,
"translog_operations" : 323,
"docs" : {
"num_docs" : 1413888,
"max_doc" : 1418829,
"deleted_docs" : 4941
}
} ],
"2" : [ {
"routing" : {
"state" : "RELOCATING",
"primary" : false,
"node" : "9a54c6aa-469d-46ff-a574-dc24c969a0b0",
"relocating_node" : "411f522e-
bfbc-4e0b-8a70-6451d1bef1da",
"shard" : 2,
"index" : "stats"
},
"state" : "STARTED",
"store_size" : "1.9g",
"store_size_in_bytes" : 2050764870,
"estimated_flushable_memory_size" : "877.4k",
"estimated_flushable_memory_size_in_bytes" : 898541,
"translog_id" : 0,
"translog_operations" : 897,
"docs" : {
"num_docs" : 1412645,
"max_doc" : 1413300,
"deleted_docs" : 655
}
}, {
"routing" : {
"state" : "STARTED",
"primary" : true,
"node" : "ccc84aea-aabb-40e3-837c-af1bcb1b764a",
"relocating_node" : null,
"shard" : 2,
"index" : "stats"
},
"state" : "STARTED",
"store_size" : "1.9g",
"store_size_in_bytes" : 2050751585,
"estimated_flushable_memory_size" : "876.7k",
"estimated_flushable_memory_size_in_bytes" : 897826,
"translog_id" : 0,
"translog_operations" : 896,
"docs" : {
"num_docs" : 1412644,
"max_doc" : 1413299,
"deleted_docs" : 655
}
} ],
"3" : [ {
"routing" : {
"state" : "RELOCATING",
"primary" : true,
"node" : "9a54c6aa-469d-46ff-a574-dc24c969a0b0",
"relocating_node" : "411f522e-
bfbc-4e0b-8a70-6451d1bef1da",
"shard" : 3,
"index" : "stats"
},
"state" : "STARTED",
"store_size" : "195m",
"store_size_in_bytes" : 204574035,
"estimated_flushable_memory_size" : "3.7m",
"estimated_flushable_memory_size_in_bytes" : 3949835,
"translog_id" : 0,
"translog_operations" : 3842,
"docs" : {
"num_docs" : 103921,
"max_doc" : 103921,
"deleted_docs" : 0
}
}, {
"routing" : {
"state" : "STARTED",
"primary" : false,
"node" : "ccc84aea-aabb-40e3-837c-af1bcb1b764a",
"relocating_node" : null,
"shard" : 3,
"index" : "stats"
},
"state" : "STARTED",
"store_size" : "195m",
"store_size_in_bytes" : 204572396,
"estimated_flushable_memory_size" : "3.7m",
"estimated_flushable_memory_size_in_bytes" : 3949835,
"translog_id" : 0,
"translog_operations" : 3842,
"docs" : {
"num_docs" : 103921,
"max_doc" : 103921,
"deleted_docs" : 0
}
} ],
"4" : [ {
"routing" : {
"state" : "STARTED",
"primary" : false,
"node" : "9a54c6aa-469d-46ff-a574-dc24c969a0b0",
"relocating_node" : null,
"shard" : 4,
"index" : "stats"
},
"state" : "STARTED",
"store_size" : "1.9g",
"store_size_in_bytes" : 2106604638,
"estimated_flushable_memory_size" : "40.9m",
"estimated_flushable_memory_size_in_bytes" : 42929689,
"translog_id" : 262,
"translog_operations" : 43410,
"docs" : {
"num_docs" : 1415174,
"max_doc" : 1419993,
"deleted_docs" : 4819
}
}, {
"routing" : {
"state" : "STARTED",
"primary" : true,
"node" : "ccc84aea-aabb-40e3-837c-af1bcb1b764a",
"relocating_node" : null,
"shard" : 4,
"index" : "stats"
},
"state" : "STARTED",
"store_size" : "104.9m",
"store_size_in_bytes" : 110022541,
"estimated_flushable_memory_size" : "3.2m",
"estimated_flushable_memory_size_in_bytes" : 3392079,
"translog_id" : 0,
"translog_operations" : 3311,
"docs" : {
"num_docs" : 38319,
"max_doc" : 38319,
"deleted_docs" : 0
}
} ],
"5" : [ {
"routing" : {
"state" : "STARTED",
"primary" : true,
"node" : "9a54c6aa-469d-46ff-a574-dc24c969a0b0",
"relocating_node" : null,
"shard" : 5,
"index" : "stats"
},
"state" : "STARTED",
"store_size" : "2.8g",
"store_size_in_bytes" : 3074996380,
"estimated_flushable_memory_size" : "41.9m",
"estimated_flushable_memory_size_in_bytes" : 43980856,
"translog_id" : 276,
"translog_operations" : 44606,
"docs" : {
"num_docs" : 1416619,
"max_doc" : 1417515,
"deleted_docs" : 896
}
}, {
"routing" : {
"state" : "STARTED",
"primary" : false,
"node" : "ccc84aea-aabb-40e3-837c-af1bcb1b764a",
"relocating_node" : null,
"shard" : 5,
"index" : "stats"
},
"state" : "STARTED",
"store_size" : "1.9g",
"store_size_in_bytes" : 2108950562,
"estimated_flushable_memory_size" : "0",
"estimated_flushable_memory_size_in_bytes" : 0,
"translog_id" : 1,
"translog_operations" : 0,
"docs" : {
"num_docs" : 1416619,
"max_doc" : 1417515,
"deleted_docs" : 896
}
} ]
}
}
}