Hi
We've recently started seeing duplicate documents in our search results when paginating. We're running ES 1.5.0 on a 5-node cluster (1 primary + 2 replicas per shard).
Using a simple bash one-liner I'm sending six paginated queries (from=0 up to from=100, 20 results per page), extracting the doc ids, sorting them, and looking for duplicates. Here are several consecutive runs showing that the duplicate ids are similar but not consistent:
barakcohen@elasticsearch-app-prod-6of6:~$ for i in {0..5}; do curl -s "http://localhost:9200/slices_v3/_search?pretty&from=$(expr $i \* 20)&size=20" -d @query; done > results; grep '"_id' results | sort | uniq -cd
2 "_id" : "ss::344589::472181639::cId::1000179",
barakcohen@elasticsearch-app-prod-6of6:~$ for i in {0..5}; do curl -s "http://localhost:9200/slices_v3/_search?pretty&from=$(expr $i \* 20)&size=20" -d @query; done > results; grep '"_id' results | sort | uniq -cd
2 "_id" : "ss::344589::472181639::cId::1000179",
barakcohen@elasticsearch-app-prod-6of6:~$ for i in {0..5}; do curl -s "http://localhost:9200/slices_v3/_search?pretty&from=$(expr $i \* 20)&size=20" -d @query; done > results; grep '"_id' results | sort | uniq -cd
2 "_id" : "ss::344589::472181639::cId::1000179",
barakcohen@elasticsearch-app-prod-6of6:~$ for i in {0..5}; do curl -s "http://localhost:9200/slices_v3/_search?pretty&from=$(expr $i \* 20)&size=20" -d @query; done > results; grep '"_id' results | sort | uniq -cd
2 "_id" : "ss::344589::472181639::cId::1000179",
barakcohen@elasticsearch-app-prod-6of6:~$ for i in {0..5}; do curl -s "http://localhost:9200/slices_v3/_search?pretty&from=$(expr $i \* 20)&size=20" -d @query; done > results; grep '"_id' results | sort | uniq -cd
2 "_id" : "ss::350226::446665245::color::570106872_2938",
barakcohen@elasticsearch-app-prod-6of6:~$ for i in {0..5}; do curl -s "http://localhost:9200/slices_v3/_search?pretty&from=$(expr $i \* 20)&size=20" -d @query; done > results; grep '"_id' results | sort | uniq -cd
2 "_id" : "ss::344589::468036666::cId::1322501",
barakcohen@elasticsearch-app-prod-6of6:~$ for i in {0..5}; do curl -s "http://localhost:9200/slices_v3/_search?pretty&from=$(expr $i \* 20)&size=20" -d @query; done > results; grep '"_id' results | sort | uniq -cd
2 "_id" : "ss::344589::468036666::cId::1322501",
barakcohen@elasticsearch-app-prod-6of6:~$ for i in {0..5}; do curl -s "http://localhost:9200/slices_v3/_search?pretty&from=$(expr $i \* 20)&size=20" -d @query; done > results; grep '"_id' results | sort | uniq -cd
2 "_id" : "ss::344589::468036666::cId::1322501",
barakcohen@elasticsearch-app-prod-6of6:~$ for i in {0..5}; do curl -s "http://localhost:9200/slices_v3/_search?pretty&from=$(expr $i \* 20)&size=20" -d @query; done > results; grep '"_id' results | sort | uniq -cd
2 "_id" : "ss::344589::468036666::cId::1322501",
barakcohen@elasticsearch-app-prod-6of6:~$ for i in {0..5}; do curl -s "http://localhost:9200/slices_v3/_search?pretty&from=$(expr $i \* 20)&size=20" -d @query; done > results; grep '"_id' results | sort | uniq -cd
2 "_id" : "ss::344589::468036666::cId::1322501",
barakcohen@elasticsearch-app-prod-6of6:~$ for i in {0..5}; do curl -s "http://localhost:9200/slices_v3/_search?pretty&from=$(expr $i \* 20)&size=20" -d @query; done > results; grep '"_id' results | sort | uniq -cd
2 "_id" : "ss::344589::472181639::cId::1000179",
barakcohen@elasticsearch-app-prod-6of6:~$ for i in {0..5}; do curl -s "http://localhost:9200/slices_v3/_search?pretty&from=$(expr $i \* 20)&size=20" -d @query; done > results; grep '"_id' results | sort | uniq -cd
2 "_id" : "ss::344589::472181639::cId::1000179",
This is the query used:
{
  "version": true,
  "query": {
    "filtered": {
      "filter": {
        "bool": {
          "must": [
            {
              "range": {
                "stock_quantity": {
                  "gt": 0
                }
              }
            },
            {
              "term": {
                "is_available": true
              }
            }
          ]
        }
      },
      "query": {
        "multi_match": {
          "minimum_should_match": "2",
          "fields": [
            "store_name.stemmed^1",
            "color^0.5",
            "brand^1",
            "name.stemmed^1",
            "department_name.stemmed^1",
            "category_path^1",
            "name^1",
            "department_name^1",
            "store_name^1",
            "brand.stemmed^1",
            "category_path.stemmed^1"
          ],
          "type": "cross_fields",
          "query": "jeans jacket"
        }
      }
    }
  },
  "aggs": {
    "free_shipping": {
      "terms": {
        "field": "free_shipping"
      }
    },
    "sizes": {
      "terms": {
        "field": "sizes.raw",
        "size": 100
      }
    },
    "price": {
      "stats": {
        "field": "price"
      }
    },
    "on_sale": {
      "range": {
        "ranges": [
          {
            "to": 20,
            "key": "False"
          },
          {
            "from": 20,
            "key": "True"
          }
        ],
        "field": "discount",
        "keyed": true
      }
    },
    "department_name": {
      "terms": {
        "field": "department_name.raw",
        "size": 50
      }
    },
    "store_name": {
      "terms": {
        "field": "store_name.raw",
        "size": 200
      }
    },
    "brand": {
      "terms": {
        "field": "brand.raw",
        "size": 200
      }
    }
  }
}
Reducing the query's complexity reduces the number of duplicates, but does not eliminate them altogether.
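To illustrate, one stripped-down variant looks roughly like this (aggregations removed, only a few of the multi_match fields kept; this is just a sketch of the kind of simplification we tried, the exact reductions varied):

curl -s "http://localhost:9200/slices_v3/_search?pretty&from=0&size=20" -d '{
  "version": true,
  "query": {
    "filtered": {
      "filter": { "term": { "is_available": true } },
      "query": {
        "multi_match": {
          "minimum_should_match": "2",
          "fields": [ "name^1", "name.stemmed^1", "brand^1" ],
          "type": "cross_fields",
          "query": "jeans jacket"
        }
      }
    }
  }
}'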
Any help is greatly appreciated, as this currently looks like a bug in ES.
Thanks,
Barak